In [3]:
from app.loader import load_docs
from app.splitter import TextSplitter, token_size
from nomic import embed
from app.vector_store import VectorStore
from pdfminer.high_level import extract_text

In [2]:
# Load the source documents through the project's loader. The captured output
# below reports 11 PDF documents and roughly 45 s of wall time, so re-running
# this cell is not free.
docs = load_docs()

100%|██████████| 11/11 [00:45<00:00,  4.10s/it]

Loaded 11 PDF documents





In [5]:
# Split every loaded document with one shared splitter and pool the pieces
# into a single flat list. chunk_size=512 is presumably measured in tokens,
# judging by the token/chunk ratio printed below -- confirm in app.splitter.
text_splitter = TextSplitter(chunk_size=512)
chunks = []

for i, doc in enumerate(docs):
    doc_chunks = text_splitter(doc)
    chunks.extend(doc_chunks)
    # Per-document summary: size of the input and how many chunks it produced.
    print(f'Doc {i+1}: {token_size(doc)} tokens, {len(doc_chunks)} chunks')

print('\nTotal chunks', len(chunks))


Doc 1: 45366 tokens, 91 chunks
Doc 2: 65671 tokens, 131 chunks
Doc 3: 25519 tokens, 51 chunks
Doc 4: 46689 tokens, 94 chunks
Doc 5: 48980 tokens, 98 chunks
Doc 6: 61540 tokens, 126 chunks
Doc 7: 24178 tokens, 49 chunks
Doc 8: 50730 tokens, 102 chunks
Doc 9: 41999 tokens, 84 chunks
Doc 10: 46333 tokens, 93 chunks
Doc 11: 50055 tokens, 101 chunks

Total chunks 1020


In [15]:
# Narrow the experiment to a single PDF.
# NOTE: this rebinds `chunks` (and `doc` / `text_splitter`), replacing the
# list that the all-documents cell built; later cells see only this
# document's chunks.
text_splitter = TextSplitter(chunk_size=512)
doc = extract_text('data/docs/Inception.pdf')
chunks = text_splitter(doc)
print(f'Doc: {token_size(doc)} tokens, {len(chunks)} chunks')

Doc: 46333 tokens, 93 chunks


In [16]:
# Embed every chunk with the Nomic text model, using the 'search_document'
# task type (the query is embedded separately as 'search_query').
output = embed.text(
    texts=chunks,
    model='nomic-embed-text-v1.5',
    task_type='search_document'
)
# The outer f-string uses double quotes: reusing the same quote character
# inside the replacement fields is a SyntaxError before Python 3.12 (PEP 701).
# The printed text is unchanged.
print(f"Embeddings: {len(output['embeddings'])}, Usage: {output['usage']}")

Embeddings: 93, Usage: {'prompt_tokens': 39334, 'total_tokens': 39334}


In [17]:
# Pair each embedding with the chunk text it was computed from, add the pairs
# to a fresh VectorStore, then call save() on it.
# zip() stops at the shorter input, so a count mismatch between embeddings and
# chunks would silently drop entries from the store; fail loudly instead.
if len(output['embeddings']) != len(chunks):
    raise ValueError(
        f"Got {len(output['embeddings'])} embeddings for {len(chunks)} chunks"
    )

vector_store = VectorStore()
vectors = [{'vector': vector, 'text': text} for vector, text in zip(output['embeddings'], chunks)]
vector_store.add(vectors)
print(f'{len(vectors)} vectors added to vector store')
vector_store.save()

93 vectors added to vector store


In [4]:
# Start from a fresh VectorStore instance and call load() on it.
# Presumably this restores what an earlier vector_store.save() wrote, so that
# querying does not require re-embedding -- confirm in app.vector_store.
vector_store = VectorStore()
vector_store.load()

In [15]:
# Embed the question with the same model that was used for the chunks, but
# with the 'search_query' task type instead of 'search_document'.
query = "What's the most resilient parasite?"
query_embed = embed.text(
    model='nomic-embed-text-v1.5',
    task_type='search_query',
    texts=[query],
)
# Exactly one text was submitted; keep the first embedding returned.
query_vector = query_embed['embeddings'][0]

In [16]:
# Look up the stored chunks for the query vector and dump each hit:
# its score, its text, a 50-asterisk rule, then a run of blank lines.
results = vector_store.query(query_vector)
for res in results:
    # One print with sep='\n' emits the same bytes as four consecutive prints.
    print(res['score'], res['text'], '*'*50, '\n\n\n\n\n', sep='\n')

Shape (93,)
0.6931548258389311
I’ve seen one before. Many, many 
years ago...

The Elderly Japanese Man STARES at the top mesmerized.

ELDERLY JAPANESE MAN

It belonged to a man I met in a 
half-remembered dream...

MOVE IN on the GRACEFULLY SPINNING TOP...

ELDERLY JAPANESE MAN

A man possessed of some radical 
notions...

The Elderly Japanese Man STARES, remembering... 

What’s the most resilient parasite?

COBB (V.O.)

CUT TO:

INT. SAME ELEGANT DINING ROOM - NIGHT (YEARS EARLIER)

The speaker, COBB, is 35, handsome, tailored. A young 
Japanese man, SAITO, eats as he listens. 

A bacteria? A virus?

COBB

Cobb gestures at their feast with his wine glass-

An intestinal worm?

COBB

Saito’s fork pauses, mid-air. Cobb GRINS. A third man is at 
the table- ARTHUR. He jumps in to save the pitch- 

What Mr. Cobb is trying to say-

ARTHUR

An idea.

COBB

Saito looks at Cobb, curious.

3.

COBB

Resilient, highly contagious. Once 
an idea’s taken hold in the brain 
it’s almost impossible 