In [1]:
import os
import warnings
from openai import OpenAI

# Define OpenAI API_KEY
with open("/home/savitha07/.env") as env:
    for line in env:
        key, value = line.strip().split('=')
        os.environ[key] = value

client = OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY'),
)

os.environ["TAVILY_API_KEY"] = os.environ.get('OPENAI_API_KEY')

warnings.filterwarnings("ignore")

In [2]:
# 1. Load PDF

from langchain.document_loaders import PyPDFLoader

loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader(
      "docs/sample1.pdf"),
    PyPDFLoader(
      "docs/sample2.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [3]:
# 2. Document splitting

# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter


text_splitter = CharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)


len(splits)

4

In [4]:
# 3. Create an index for each chunk by embeddings

from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

In [5]:
import numpy as np

In [6]:
np.dot(embedding1, embedding2)

0.9630350414845885

In [7]:
np.dot(embedding1, embedding3)

0.7701147991091322

In [8]:
np.dot(embedding2, embedding3)

0.7591130000177126

In [9]:
# ! pip install chromadb

In [10]:
# 4. Vectorstores

from langchain_community.vectorstores import Chroma


In [11]:

persist_directory = 'docs/chroma/'

In [12]:

! rm -rf ./docs/chroma 

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [13]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

print(vectordb._collection.count())

In [None]:
# Similarity Search

question = "is there an email i can ask for help"

docs = vectordb.similarity_search(question,k=3)

len(docs)

docs[0].page_content


vectordb.persist()

In [None]:
# 6. Edge Case - Failure modes

# Diversity

question = "what did they say about matlab?"

docs = vectordb.similarity_search(question,k=5)

In [None]:
docs[0]

In [None]:
docs[1]

In [None]:
# Specificity

question = "what did they say about regression \
  in the third lecture?"


In [None]:

docs = vectordb.similarity_search(question,k=5)


In [None]:
for doc in docs:
    print(doc.metadata)

In [None]:
print(docs[4].page_content)