# Lab 1 - Overview of embeddings-based retrieval

Welcome! Here are a few notes about the Chroma course notebooks.
 - A number of warnings pop up when running the notebooks. These are normal and can be ignored.
 - Some operations, such as calling an LLM or an operation using generated data, return unpredictable results, so your notebook outputs may differ from the video.
  
Enjoy the course!

https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/lesson/2/overview-of-embeddings-based-retrieval

In [1]:
import os
import openai
import sys
import chromadb

from openai import OpenAI
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from chromadb.config import Settings
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from dotenv import load_dotenv, find_dotenv
from pypdf import PdfReader
from helper_utils_02 import word_wrap

# Add the directory two levels up to the module search path.
# NOTE(review): this runs after the imports above, so it cannot help resolve any
# of them (e.g. helper_utils_02) — confirm whether it is still needed or whether
# it should move ahead of those imports.
sys.path.append('../..')

In [2]:
# Passed to chromadb's Settings when the client is created.
is_persistent = True

# Directory in which the database files are stored.
persist_dir = './persist_dir/'

# SentenceTransformersTokenTextSplitter hyper-parameters.
# The SentenceTransformers embedding model has a context window of at most
# 256 tokens; anything beyond that limit is truncated.
tokens_per_chunk = 256
# Overlap between neighbouring chunks: 20% of the chunk size, truncated to an int.
tokens_chunk_overlap = int(0.2 * tokens_per_chunk)

In [None]:
# The model is spelled out for clarity, although "all-MiniLM-L6-v2" is already
# the default model of SentenceTransformerEmbeddingFunction.
embedding_function = SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
reader = PdfReader("microsoft_annual_report_2022.pdf")

# Extract the text of every page, strip leading/trailing whitespace, and keep
# only the pages that are not empty afterwards.
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]

# Preview the first non-empty page (word_wrap is called with n_chars=90).
print(word_wrap(pdf_texts[0], n_chars=90))

In [None]:
# Inspect what was loaded: the object types, the number of non-empty pages,
# and the raw (unwrapped) text of the first page.
print(
    f'type(reader): {type(reader)}',
    f'type(pdf_texts): {type(pdf_texts)}',
    f'len(pdf_texts): {len(pdf_texts)}',
    f'\npdf page 1: {pdf_texts[0]}',
    sep='\n',
)

You can view the pdf in your browser [here](./microsoft_annual_report_2022.pdf) if you would like. 


Print the first and last 200 characters of page 2.

In [None]:
# Head and tail (200 characters each) of the second non-empty page,
# separated by a '....' marker line.
print(
    word_wrap(pdf_texts[1][:200]),
    '....',
    word_wrap(pdf_texts[1][-200:]),
    sep='\n',
)

https://dev.to/eteimz/understanding-langchains-recursivecharactertextsplitter-2846

In [None]:
# Create the RecursiveCharacterTextSplitter.
character_splitter = RecursiveCharacterTextSplitter(
    # Delimiters used to split the text; they are tried in the order given
    # in order to satisfy chunk_size.
    # The usual trailing "" separator (an empty string, which effectively
    # splits on every character) is deliberately left out so that text is
    # never split in the middle of a word:
    #   separators=["\n\n", "\n", ". ", " ", ""]
    separators=["\n\n", "\n", ". ", " "],
    chunk_size=1000,
    chunk_overlap=0,
)

# Concatenate the PDF pages using "\n\n" (two newlines) as the delimiter and
# split the combined text into character-based chunks.
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

# Show chunk 10 both word-wrapped and raw, followed by its length.
print(word_wrap(character_split_texts[10]))
print(f"\n{character_split_texts[10]}")
# The label now names index 10, matching the chunk whose length is reported
# (it previously said index 0).
print(f"\nlen(character_split_texts[10]): {len(character_split_texts[10])}")
print(f"Total chunks: {len(character_split_texts)}")

#### Splitting text into tokens is the first step of any RAG system

In [None]:
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=tokens_per_chunk,
    chunk_overlap=tokens_chunk_overlap,
)

# Re-split every character-based chunk by token count and gather all the
# resulting pieces in one flat list.
token_split_texts = []
for text in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(text))

print(word_wrap(token_split_texts[10]))
print(f"\nTotal token_split_texts chunks: {len(token_split_texts)}")

#### Example of embedding text. Whether the input is a single word or a whole chunk of text, the resulting embedding vector has the same number of dimensions.

In [None]:
# Embed a single word and a whole chunk, one input at a time, and report the
# dimension of each resulting vector.
_texts = [['hello'], [token_split_texts[10]]]
for _text in _texts:
    emb_text = _text
    # Each input list holds exactly one text, so only the first vector is taken.
    emb_vector = embedding_function(emb_text)[0]
    emb_vector_dim = len(emb_vector)

    print(
        f'text to be embedded: {_text}',
        f'embedding vector: {emb_vector}',
        f'embedding vector dimension: {emb_vector_dim}\n',
        sep='\n',
    )

In [8]:
# Chroma client configured from the notebook's persistence settings.
client = chromadb.Client(
    Settings(
        is_persistent=is_persistent,
        persist_directory=persist_dir,
    )
)

# Open the collection, creating it if it does not exist yet, and attach the
# embedding function to it.
collection_name = "microsoft_annual_report_2022"
collection = client.get_or_create_collection(
    name=collection_name,
    embedding_function=embedding_function,
)

In [20]:
# Store every token-level chunk in the collection, using the chunk's position
# in token_split_texts (as a string) as its id.
# upsert() is used instead of add() because the collection is persistent:
# re-running this cell then updates the existing ids in place instead of trying
# to insert the same ids a second time.
ids = [str(i) for i in range(len(token_split_texts))]
collection.upsert(ids=ids, documents=token_split_texts)

In [9]:
# Retrieve every record stored in the collection (get() is called without ids
# or filters).
documents = collection.get()

In [None]:
# Record the top-level keys of the get() result, then print each key with its value.
doc_keys = list(documents.keys())
for k, v in documents.items():
    print(k, v)

print(f'\ndocuments keys: {doc_keys}')

In [None]:
# Print each field of the get() result on its own, followed by a separator line.
# If looking up or formatting a field raises TypeError, 'TypeError' is printed
# instead and the loop carries on with the next key.
for key in doc_keys:
    try:
        print(f"documents[{key}]")
        print(f"{key}: {documents[key]}")
    except TypeError:
        print('TypeError')
    print('-------')

In [None]:
# Take the 'documents' field out of the get() result and display it as the
# cell's output.
my_documents = documents['documents']
my_documents

In [None]:
# Fetch the records with ids '0' and '1'.
# The collection object in this notebook is named `collection`; the previous
# name `chroma_collection` is never defined and raised NameError.
first_document = collection.get(ids=['0', '1'])

# Pull the texts and ids out of the result.
document_text = first_document['documents']  # texts of the fetched records
document_id = first_document['ids']  # ids of the fetched records

# Printed once; the earlier copy-pasted second print block repeated the same output.
print("Document Text:", document_text)
print("Document ID:", document_id)

In [None]:
# Fetch the records with ids '0' and '1'.
# Uses `collection`, the name the collection object actually has in this
# notebook; `chroma_collection` is never defined and raised NameError.
first_document = collection.get(ids=['0', '1'])

# Pull the texts and ids out of the result.
document_text = first_document['documents']  # texts of the fetched records
document_id = first_document['ids']  # ids of the fetched records

print("Document Text:", document_text)
print("Document ID:", document_id)

In [None]:
# Print the second fetched text (index 1 of document_text) after passing it
# through word_wrap.
print(word_wrap(document_text[1]))

In [None]:
queries = ["What was the total revenue?", "What was the operating margin?"]

# Request 3 results per query. results['documents'][i] is the result list for
# queries[i].
# Uses `collection`, the name the collection object actually has in this
# notebook; `chroma_collection` is never defined and raised NameError.
results = collection.query(query_texts=queries, n_results=3)

In [None]:
# For every query, print each retrieved document (word-wrapped with n_chars=90)
# under a repeat of the query text.
for i, query in enumerate(queries):
    retrieved_documents = results['documents'][i]
    for j, document in enumerate(retrieved_documents):
        print(f"query: {query}")
        print(f"document {j}: {word_wrap(document, n_chars=90)}")
        print('\n')

In [15]:

# Read the local dotenv file. The path is built with os.path.join rather than
# the literal '.env\my_api_key.env': '\m' is an invalid escape sequence in a
# normal string literal (Python warns about it), and a hard-coded backslash
# only works as a path separator on Windows.
_ = load_dotenv(find_dotenv(os.path.join('.env', 'my_api_key.env')))

# Raises KeyError if OPENAI_API_KEY is not set in the environment.
openai.api_key = os.environ['OPENAI_API_KEY']

openai_client = OpenAI()

In [16]:
def rag(query, retrieved_documents, model="gpt-3.5-turbo", client=None):
    """Answer ``query`` with a chat model, using the retrieved documents as context.

    Args:
        query: The user's question.
        retrieved_documents: Iterable of document strings. They are joined with
            blank lines and sent to the model as the "Information" section.
        model: Name of the chat-completions model to call.
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
            When omitted, the module-level ``openai_client`` is used.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    if client is None:
        client = openai_client

    information = "\n\n".join(retrieved_documents)

    # The two sentences are separate string literals; the trailing space after
    # "annual report." keeps them from running together in the final prompt.
    system_prompt = (
        "You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report. "
        "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    return response.choices[0].message.content

In [None]:
# Answer the last query using the documents retrieved for that same query.
# Both are selected explicitly from `queries` and `results` so this cell does
# not depend on variables left over from an earlier loop.
query = queries[-1]
retrieved_documents = results['documents'][-1]
output = rag(query=query, retrieved_documents=retrieved_documents)

print(word_wrap(output))

In [None]:
# Answer every query with the documents retrieved for that particular query.
# results['documents'] holds one result list per entry in `queries`, so the two
# are walked in step; previously every query was answered with the same
# left-over `retrieved_documents` list.
for query, retrieved_documents in zip(queries, results['documents']):
    output = rag(query=query, retrieved_documents=retrieved_documents)
    print(f'query: {query}')
    for i, retrieved_document in enumerate(retrieved_documents):
        print(f'retrieved document {i}: {retrieved_document}')
    print(f'output: {word_wrap(output)}\n')

In [None]:
# Display the current value of retrieved_documents as the cell's output.
# NOTE(review): this name is assigned inside earlier loops, so what is shown
# depends on which of those cells ran last.
retrieved_documents

In [None]:
# Ask both questions in a single request.
# rag() formats `query` into the prompt as text, so the questions are joined
# into one string instead of passing the list itself (which would be rendered
# as a Python list repr). The context is the union of the documents retrieved
# for every query, de-duplicated while keeping first-seen order, rather than
# only the documents of whichever query was processed last.
combined_query = " ".join(queries)
combined_documents = list(dict.fromkeys(
    document
    for documents_for_query in results['documents']
    for document in documents_for_query
))
output = rag(query=combined_query, retrieved_documents=combined_documents)

print(queries)
print(word_wrap(output))

In [None]:
# Answer the first query with the documents retrieved for the first query.
# results['documents'][0] is used instead of the left-over
# `retrieved_documents`, which holds the results of a different query.
output = rag(query=queries[0], retrieved_documents=results['documents'][0])

print(queries[0])
print(word_wrap(output))

In [None]:
# Answer the second query with the documents retrieved for the second query.
# The result list is indexed explicitly so the cell does not depend on the
# value `retrieved_documents` happens to hold from an earlier loop.
output = rag(query=queries[1], retrieved_documents=results['documents'][1])

print(queries[1])
print(word_wrap(output))