# Lab 1 - Overview of embeddings-based retrieval

Welcome! Here are a few notes about the Chroma course notebooks.
 - A number of warnings pop up when running the notebooks. These are normal and can be ignored.
 - Some operations, such as calling an LLM or an operation using generated data, return unpredictable results, so your notebook outputs may differ from the video.
  
Enjoy the course!

In [1]:
from rich import print

In [None]:
import os

# The notebook lives in the eval folder; step up to the project root.
# Guarding on the folder name keeps the cell idempotent: re-running it (or
# running another "cd .." cell later) will not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "eval":
    os.chdir("..")
os.getcwd()  # To verify the current working directory

In [None]:
from pypdf import PdfReader

reader = PdfReader("eval/microsoft_annual_report_2022.pdf")

# Extract the text of every page with surrounding whitespace trimmed, and
# drop pages whose text comes back empty (filter(None, ...) removes "" entries).
pdf_texts = list(
    filter(None, (page.extract_text().strip() for page in reader.pages))
)

# Show the first non-empty page as a quick sanity check
print(pdf_texts[0])

In [None]:
import os

# This notebook is in the eval folder.  Change to the root folder.
# Only move up when we are still inside "eval": if the working directory was
# already changed earlier in the session, this cell is a no-op instead of
# moving a second level up.
if os.path.basename(os.getcwd()) == "eval":
    os.chdir("..")
os.getcwd()  # To verify the current working directory

In [None]:
# Load docs
# Read in the markdown files in the Obsidian vault directory
import os

from src.ingest_service import IngestService
from src.doc_stats import DocStats
# The directory containing the knowledge documents used by the AI to do the analysis on the soil tests.
# The SOIL_KNOWLEDGE_DIR environment variable overrides the location so the notebook can run on
# other machines; when it is unset, the original local path is used unchanged.
soil_knowledge_directory = os.environ.get(
    "SOIL_KNOWLEDGE_DIR",
    r"G:\My Drive\Audios_To_Knowledge\knowledge\AskGrowBuddy\AskGrowBuddy\Knowledge\soil_test_knowlege",
)
# Load the documents
ingest_service = IngestService()
loaded_documents = ingest_service.load_obsidian_notes(soil_knowledge_directory)
# Show some summary stats about the documents

DocStats.print_llama_index_docs_summary_stats(loaded_documents)

In [5]:
from rich import print

You can view the PDF in your browser [here](./microsoft_annual_report_2022.pdf) if you would like.

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter


In [None]:
# Join all pages into one string (blank line between pages) so the splitter sees the whole report
full_text = '\n\n'.join(pdf_texts)

# Character-based splitter: chunks of up to 1000 characters with no overlap;
# separators are listed from coarsest (blank line) to finest (empty string).
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)
character_split_texts = character_splitter.split_text(full_text)

# Inspect one chunk and report how many were produced
print(character_split_texts[10])
print(f"\nTotal chunks: {len(character_split_texts)}")

In [None]:
# Token-based splitter: 256 tokens per chunk, no overlap between chunks
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=0,
)

# Re-split every character chunk by tokens and flatten the pieces into one list
token_split_texts = [
    token_chunk
    for character_chunk in character_split_texts
    for token_chunk in token_splitter.split_text(character_chunk)
]

# Inspect one chunk and report how many were produced
print(token_split_texts[10])
print(f"\nTotal chunks: {len(token_split_texts)}")

In [None]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Build the embedding function with its default settings
embedding_function = SentenceTransformerEmbeddingFunction()

# Embed a single chunk (passed as a one-item list) and display the result
sample_chunk = token_split_texts[10]
print(embedding_function([sample_chunk]))

In [None]:
chroma_client = chromadb.Client()

# get_or_create_collection (instead of create_collection) lets this cell be
# re-run in the same kernel: if the collection already exists it is reused
# rather than raising an "already exists" error.
chroma_collection = chroma_client.get_or_create_collection(
    "microsoft_annual_report_2022",
    embedding_function=embedding_function,
    metadata={"hnsw:space": "cosine"},  # use cosine distance for the index
)

# Use each chunk's position in the list as its (string) id
ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)
chroma_collection.count()

In [None]:
query = "What was the total revenue?"

# Ask the collection for the 3 closest chunks, returning their text and distances
results = chroma_collection.query(
    query_texts=[query],
    n_results=3,
    include=['documents', 'distances'],
)

# Only one query text was sent, so take the first (and only) result list of each kind
retrieved_documents, distances = results['documents'][0], results['distances'][0]

for document, distance in zip(retrieved_documents, distances):
    print(f"Document: {document}")
    print(f"Distance: {distance}")
    print()

In [27]:
from sentence_transformers import SentenceTransformer, SimilarityFunction

model = SentenceTransformer(
    "multi-qa-mpnet-base-cos-v1",
    similarity_fn_name=SimilarityFunction.COSINE,
)

# Two groups of sentences: each sentence in the first group is scored
# against each sentence in the second group.
sentences1 = [
    "The new movie is awesome",
    "The cat sits outside",
    "A man is playing guitar",
]
sentences2 = [
    "The dog plays in the garden",
    "The new movie is so great",
    "A woman watches TV",
]

# Embed both groups
embeddings1, embeddings2 = model.encode(sentences1), model.encode(sentences2)

# Pairwise similarity matrix: row i belongs to sentences1[i], column j to sentences2[j]
similarities = model.similarity(embeddings1, embeddings2)

# Print each sentence of the first group followed by its score against every
# sentence of the second group
for sentence1, similarity_row in zip(sentences1, similarities):
    print(sentence1)
    for sentence2, score in zip(sentences2, similarity_row):
        print(f" - {sentence2: <30}: {score:.4f}")

2024-10-23 17:16:33,791 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu - c:\Users\happy\Documents\Projects\askgrowbuddy\.venv\Lib\site-packages\sentence_transformers\SentenceTransformer.py:208
2024-10-23 17:16:33,792 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: multi-qa-mpnet-base-cos-v1 - c:\Users\happy\Documents\Projects\askgrowbuddy\.venv\Lib\site-packages\sentence_transformers\SentenceTransformer.py:216


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model used to embed the query in this cell.
# NOTE(review): this should be the same model that produced the embeddings
# stored in the collection - confirm, and swap the name if you use another one.
model = SentenceTransformer("all-MiniLM-L6-v2")

def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        ratio is undefined (0/0), so ``0.0`` is returned instead of NaN.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-length vector has no direction; avoid the 0/0 division
    # (which would yield NaN plus a RuntimeWarning).
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

query = "What was the total revenue?"

# Embed the query; encode() is given a one-item list, so keep its only row
query_embedding = model.encode([query])[0]

# Retrieve the 3 closest chunks, asking for their text and stored embeddings
results = chroma_collection.query(
    query_texts=[query],
    n_results=3,
    include=['documents', 'embeddings'],
)

# Only one query text was sent, so take the first (and only) result list of each kind
retrieved_documents, retrieved_embeddings = results['documents'][0], results['embeddings'][0]

# Score every retrieved embedding against the query embedding
similarities = []
for doc_embedding in retrieved_embeddings:
    similarities.append(cosine_similarity(query_embedding, doc_embedding))

# Show each document next to its cosine similarity
for document, similarity in zip(retrieved_documents, similarities):
    print(f"Document: {document}")
    print(f"Cosine Similarity: {similarity:.4f}")
    print()

In [None]:
import os
import openai
from openai import OpenAI

from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv()
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Fails with KeyError if OPENAI_API_KEY is not set in the environment
openai.api_key = os.environ['OPENAI_API_KEY']

openai_client = OpenAI()

In [None]:
def rag(query, retrieved_documents, model="gpt-3.5-turbo"):
    """Answer a question with a chat model, using only the retrieved text.

    Args:
        query: The user's question.
        retrieved_documents: Iterable of text chunks; they are joined with
            blank lines and passed to the model as the available information.
        model: Name of the chat model passed to the completions endpoint.

    Returns:
        The message content of the first choice in the completion response.
    """
    information = "\n\n".join(retrieved_documents)

    # Adjacent string literals are concatenated with no separator, so the
    # first sentence must end with an explicit space; otherwise the prompt
    # reads "...annual report.You will be shown...".
    system_prompt = (
        "You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report. "
        "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"},
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    return response.choices[0].message.content

In [None]:
import textwrap

output = rag(query=query, retrieved_documents=retrieved_documents)

# word_wrap is not defined or imported anywhere in this notebook, so wrap with
# the standard library instead. Each line is wrapped separately so that any
# line breaks in the model's answer are preserved.
wrapped_output = "\n".join(
    textwrap.fill(line, width=72) for line in output.splitlines()
)
print(wrapped_output)