In [1]:
from app.loader import load_docs
from app.splitter import TextSplitter, token_size
from nomic import embed
from app.vector_store import VectorStore
from pdfminer.high_level import extract_text
from app.config import settings
from groq import Groq

In [2]:
# Load the source documents through the project loader; the captured output
# below reports 11 PDF documents.
docs = load_docs()

100%|██████████| 11/11 [00:45<00:00,  4.10s/it]

Loaded 11 PDF documents





In [5]:
# Split every loaded document with one shared splitter and pool the pieces
# into a single flat list, reporting per-document sizes along the way.
# chunk_size=512 is presumably measured in tokens (the printed token/chunk
# ratios are close to 500) -- confirm against app.splitter.
text_splitter = TextSplitter(chunk_size=512)
chunks = []

for doc_number, doc in enumerate(docs, start=1):
    doc_chunks = text_splitter(doc)
    chunks.extend(doc_chunks)
    print(f'Doc {doc_number}: {token_size(doc)} tokens, {len(doc_chunks)} chunks')

print('\nTotal chunks', len(chunks))


Doc 1: 45366 tokens, 91 chunks
Doc 2: 65671 tokens, 131 chunks
Doc 3: 25519 tokens, 51 chunks
Doc 4: 46689 tokens, 94 chunks
Doc 5: 48980 tokens, 98 chunks
Doc 6: 61540 tokens, 126 chunks
Doc 7: 24178 tokens, 49 chunks
Doc 8: 50730 tokens, 102 chunks
Doc 9: 41999 tokens, 84 chunks
Doc 10: 46333 tokens, 93 chunks
Doc 11: 50055 tokens, 101 chunks

Total chunks 1020


In [15]:
# Extract and split a single screenplay PDF.
# NOTE(review): this rebinds `doc`, `text_splitter` and `chunks`; after this
# cell `chunks` holds only this file's chunks, not the pooled list built from
# `docs`, and the embedding cell below reads `chunks`.
doc = extract_text('data/docs/Inception.pdf')
text_splitter = TextSplitter(chunk_size=512)
chunks = text_splitter(doc)
print(f'Doc: {token_size(doc)} tokens, {len(chunks)} chunks')

Doc: 46333 tokens, 93 chunks


In [16]:
# Embed every chunk with task_type='search_document'; the response is indexed
# below by its 'embeddings' and 'usage' keys.
output = embed.text(
    texts=chunks,
    model='nomic-embed-text-v1.5',
    task_type='search_document'
)
# Outer double quotes: reusing the enclosing quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
print(f"Embeddings: {len(output['embeddings'])}, Usage: {output['usage']}")

Embeddings: 93, Usage: {'prompt_tokens': 39334, 'total_tokens': 39334}


In [17]:
# Pair each embedding with the chunk text at the same position, add the
# records to a new VectorStore, then call save() on it.
vector_store = VectorStore()
vectors = [
    {'vector': embedding, 'text': chunk_text}
    for embedding, chunk_text in zip(output['embeddings'], chunks)
]
vector_store.add(vectors)
print(f'{len(vectors)} vectors added to vector store')
vector_store.save()

93 vectors added to vector store


In [5]:
# Create a new VectorStore and call load() on it -- presumably this restores
# what an earlier save() wrote; confirm in app.vector_store.
vector_store = VectorStore()
vector_store.load()

In [26]:
# Embed one sample question with task_type='search_query' (the chunks were
# embedded with 'search_document' using the same model name) and keep the
# single resulting vector for the similarity lookup.
query = "What's the most resilient parasite?"
query_embed = embed.text(
    texts=[query],
    model='nomic-embed-text-v1.5',
    task_type='search_query'
)
query_vector = query_embed['embeddings'][0]

In [None]:
# Query the store with the sample vector and dump each hit: its score, its
# text, a 50-star rule, then a run of blank lines to separate the hits.
results = vector_store.query(query_vector)
for res in results:
    print(res['score'], res['text'], '*'*50, '\n\n\n\n\n', sep='\n')

In [2]:
# Groq client for chat completions; the API key is read from the project
# settings object rather than being hard-coded here.
groq_client = Groq(api_key=settings.GROQ_API_KEY)

# NOTE(review): this repeats the VectorStore creation + load() done in an
# earlier cell; on a top-to-bottom run one of the two is redundant.
vector_store = VectorStore()
vector_store.load()

In [58]:
# System message sent with every chat request.
SYSTEM_PROMPT = """You are an assistant that answers user questions about a collection of movie screenplays."""

# User-message template. It is filled with str.format and expects exactly two
# named fields: {context} (the retrieved chunk texts) and {question}.
USER_PROMPT = """
Use the following pieces of context from movie screenplays to answer the user question.
You must only use the facts from the context to answer. If the answer cannot be found in the context, say that you don't have enough information to answer the question and provide any facts from the context that could be relevant to the answer.
Don't address the context or the scripts directly in your answer, just answer the question like it's your own knowledge.

Context:
{context}

User Question:
{question}
"""

In [59]:
def answer_question(question: str, *, embed_model: str = 'nomic-embed-text-v1.5',
                    chat_model: str = 'llama3-70b-8192'):
    """Answer a question using chunks retrieved from the vector store as context.

    The question is embedded with task_type='search_query', the vector store
    is queried with that embedding, and the 'text' of every returned chunk is
    joined into the {context} field of USER_PROMPT. The filled template is
    sent, together with SYSTEM_PROMPT, to the Groq chat completions endpoint.

    Relies on the notebook-level names `embed`, `vector_store`, `groq_client`,
    `SYSTEM_PROMPT` and `USER_PROMPT` being defined before the call.

    Args:
        question: The user question to answer.
        embed_model: Embedding model name passed to `embed.text`. Defaults to
            the model name this notebook uses when embedding the chunks.
        chat_model: Chat model name passed to the Groq client.

    Returns:
        The message content of the first completion choice.

    Side effects:
        Prints the completion's total token usage.
    """
    query_embed = embed.text(
        texts=[question],
        model=embed_model,
        task_type='search_query'
    )
    query_vector = query_embed['embeddings'][0]

    # Named `retrieved_chunks` so it cannot be confused with the notebook-level
    # `chunks` list built in the splitting cells.
    retrieved_chunks = vector_store.query(query_vector)

    # Chunks are separated by '---' rules, with a closing rule after the last
    # one, so the model can tell where each excerpt ends.
    context = '\n\n---\n\n'.join(chunk['text'] for chunk in retrieved_chunks) + '\n\n---'
    user_message = USER_PROMPT.format(context=context, question=question)
    messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': user_message}
    ]

    chat_completion = groq_client.chat.completions.create(
        messages=messages, model=chat_model
    )
    print('Total tokens:', chat_completion.usage.total_tokens)
    return chat_completion.choices[0].message.content

In [46]:
# Sample questions run end-to-end through answer_question.
# NOTE(review): several questions spell the name "Fisher" while the captured
# answers below spell it "Fischer"; the captured output also lists the
# questions in a different order than this list and appears to predate the
# current list -- re-run before relying on it.
questions = [
    "What's the most resilient parasite?",
    "What's the name of Cobb's wife",
    "What's the name of Fisher's company",
    "what was Fisher's relationship with his father like?",
    "What happened to Cobb's wife?",
    "How did Cobb die?"
]

# answer_question prints the token usage itself, so each question's output is:
# the usage line, the question, then the answer followed by a blank line.
for question in questions:
    answer = answer_question(question)
    print(question)
    print(answer, '\n')

Total tokens: 5172
What's the most resilient parasite?
According to Cobb, the most resilient parasite is an idea. He explains that once an idea takes hold in someone's brain, it's almost impossible to eradicate, even if the person tries to cover it up or ignore it. 

Total tokens: 5167
What's the name of Cobb's wife
The name of Cobb's wife is Mal. 

Total tokens: 5200
What's the name of Fisher's company
The name of Robert Fischer's company is Fischer Morrow. It's an energy conglomerate. 

Total tokens: 5188
How did Cobb die?
I don't have enough information to answer this question. The provided context does not explicitly state how Cobb died. In fact, there is no indication that Cobb died at all. 

Total tokens: 5320
what was Fisher's relationship with his father like?
According to the script, Fischer's relationship with his father, Maurice Fischer, was strained and difficult. In a conversation with Browning, Fischer says that after his mother died, he went to his father in his grief, b