In [21]:
import os
import warnings
from openai import OpenAI

# Define OpenAI API_KEY
def load_env_file(path):
    """Copy ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, ``#`` comment lines and lines without an ``=`` are skipped.
    Only the first ``=`` separates key from value, so values may themselves
    contain ``=`` (common in base64-style tokens). Whitespace around the key
    and the value is removed, as is one pair of matching surrounding quotes
    on the value.

    Args:
        path: Location of the env file.

    Returns:
        The list of variable names that were set, in file order. Values are
        deliberately not returned so they cannot end up in a cell output.

    Raises:
        OSError: If the file exists but cannot be opened.
    """
    loaded_keys = []
    with open(path) as env:
        for raw_line in env:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            key, value = key.strip(), value.strip()
            if not key:
                continue
            # Drop one pair of matching quotes: KEY="value" or KEY='value'.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os.environ[key] = value
            loaded_keys.append(key)
    return loaded_keys


# NOTE(review): the default is an absolute, user-specific path; set the
# DOTENV_PATH environment variable to point somewhere else.
env_file_path = os.environ.get("DOTENV_PATH", "/home/savitha07/.env")

# A missing file is not fatal: the variables may already be present in the
# process environment.
if os.path.isfile(env_file_path):
    load_env_file(env_file_path)

# OpenAI client; the key is read from the process environment
# (os.environ.get returns None when OPENAI_API_KEY is not set).
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

def ensure_tavily_key(environ=None):
    """Make sure ``TAVILY_API_KEY`` is populated without clobbering a real key.

    An existing, non-empty ``TAVILY_API_KEY`` is left untouched. Only when it
    is missing is the value of ``OPENAI_API_KEY`` copied into it, which is
    what the notebook previously did unconditionally. If neither variable is
    set, nothing is written (the old code raised ``TypeError`` in that case).

    Args:
        environ: Mapping to update; defaults to ``os.environ``.

    Returns:
        True if ``TAVILY_API_KEY`` is set when the function returns.
    """
    if environ is None:
        environ = os.environ
    if environ.get("TAVILY_API_KEY"):
        return True
    # NOTE(review): an OpenAI key is presumably not a valid Tavily credential;
    # the fallback is kept only for backward compatibility — confirm whether
    # a dedicated TAVILY_API_KEY should be required instead.
    fallback = environ.get('OPENAI_API_KEY')
    if fallback is None:
        return False
    environ["TAVILY_API_KEY"] = fallback
    return True


ensure_tavily_key()

warnings.filterwarnings("ignore")


In [22]:
# Load the PDF file

from langchain.document_loaders import PyPDFLoader

# Source PDFs, in load order. Lecture01 is listed twice on purpose to
# simulate messy, duplicated data.
pdf_paths = [
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture01.pdf",
    "docs/MachineLearning-Lecture02.pdf",
    "docs/MachineLearning-Lecture03.pdf",
]
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Concatenate whatever each loader returns into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [23]:
# 2. Split the content to create chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters passed to the splitter: chunk size 1500 with an
# overlap of 150 between chunks.
splitter_options = {"chunk_size": 1500, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [24]:
# Run the configured splitter over every loaded document; the resulting
# chunks are kept in `splits` for embedding and indexing below.
splits = text_splitter.split_documents(docs)


In [25]:
# Number of chunks produced by the split.
len(splits)

228

In [26]:
# 3. Create an index for each chunk by embeddings

from langchain_openai import OpenAIEmbeddings

# Embedding client built with default settings (no arguments are passed).
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embedding = OpenAIEmbeddings()

In [27]:
# Three probe sentences: the first two share a meaning, the third is unrelated.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [28]:
# Embed the three probe sentences, in order, with the same embedding client.
embedding1, embedding2, embedding3 = [
    embedding.embed_query(text)
    for text in (sentence1, sentence2, sentence3)
]

In [29]:
import numpy as np

In [30]:
# Dot product of the embeddings of sentence1 and sentence2 (dogs / canines).
# NOTE(review): this equals cosine similarity only if the vectors are
# unit-length — confirm for the embedding model in use.
np.dot(embedding1, embedding2)

0.9630350414845885

In [31]:
# Dot product of sentence1 (dogs) and sentence3 (weather); the recorded value
# is lower than for the dogs / canines pair.
np.dot(embedding1, embedding3)

0.7701147991091322

In [32]:
# Dot product of sentence2 (canines) and sentence3 (weather).
np.dot(embedding2, embedding3)

0.7591130000177126

In [33]:
# 4. Vectorstores

# ! pip install chromadb

In [34]:
import chromadb
from langchain_community.vectorstores import Chroma


In [35]:
# Filesystem location reserved for the Chroma index
# (presumably where the vector store is meant to be persisted — confirm).
persist_directory = 'docs/chroma/'


In [36]:
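# Optional cleanup: uncomment the next line to delete the ./docs/chroma directory before rebuilding the vector store.
# get_ipython().system('rm -rf ./docs/chroma')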

In [37]:
# Embed every chunk and index it in a Chroma vector store.
# persist_directory is passed so the store is tied to the on-disk location
# defined earlier; without it the later vectordb.persist() call has no
# target directory to write to.
# NOTE(review): re-running this cell against an existing directory presumably
# appends the same chunks again — clear the directory first if a clean index
# is wanted.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory,
)


In [38]:
# Number of entries in the underlying collection (read through the private
# `_collection` attribute).
# NOTE(review): the recorded output (236) is larger than len(splits) (228);
# possibly entries left over from an earlier run — confirm.
print(vectordb._collection.count())

236


In [40]:
# 5. Similarity Search

# Natural-language query used for the first similarity search.
question = "is there an email i can ask for help"


In [41]:
# Retrieve 3 results for the question.
# NOTE: this rebinds `docs`, which until now held the pages loaded from the
# PDFs; re-running the split cell after this point would operate on the
# search results instead.
docs = vectordb.similarity_search(question,k=3)

In [42]:
# Number of results returned by the search (k=3 was requested).
len(docs)

3

In [43]:
# Text of the first returned result.
docs[0].page_content

"cs229-qa@cs.stanford.edu. This goes to an acc ount that's read by all the TAs and me. So \nrather than sending us email individually, if you send email to this account, it will \nactually let us get back to you maximally quickly with answers to your questions.  \nIf you're asking questions about homework probl ems, please say in the subject line which \nassignment and which question the email refers to, since that will also help us to route \nyour question to the appropriate TA or to me  appropriately and get the response back to \nyou quickly.  \nLet's see. Skipping ahead — let's see — for homework, one midterm, one open and term \nproject. Notice on the honor code. So one thi ng that I think will help you to succeed and \ndo well in this class and even help you to enjoy this cla ss more is if you form a study \ngroup.  \nSo start looking around where you' re sitting now or at the end of class today, mingle a \nlittle bit and get to know your classmates. I strongly encourage you to f

In [44]:
# Ask the vector store to persist its collection.
# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call — confirm against the installed version.
vectordb.persist()

In [45]:
# 6. Edge Case - Failure modes

# Query used to demonstrate the failure modes below.
question = "what did they say about matlab?"

In [46]:
# Retrieve 5 results for the MATLAB question (rebinds `docs` again).
docs = vectordb.similarity_search(question,k=5)

In [47]:
# 6.1 Edge Case 1 - Failure modes: Diversity

# First result for the MATLAB question.
docs[0]

Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to  learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your  own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of  this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\'s free, and for the purposes of  this class,

In [48]:
# Second result: its recorded content is identical to docs[0], a consequence
# of Lecture01 having been loaded twice, so the results lack diversity.
docs[1]

Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to  learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your  own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of  this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\'s free, and for the purposes of  this class,

In [49]:
# 6.2 Edge Case 2 - Failure modes: Specificity

# Query that names a specific lecture. The literal keeps three spaces between
# "regression" and "in", exactly as the original backslash continuation
# produced (NOTE(review): the extra spaces look accidental — confirm).
question = (
    "what did they say about regression "
    "  in the third lecture?"
)

In [50]:
# Retrieve 5 results for the lecture-specific question; no metadata filter is
# applied, so results may come from any indexed lecture.
docs = vectordb.similarity_search(question,k=5)

In [51]:
# Show where each result came from. The recorded output lists sources from
# Lecture01 and Lecture02 as well as Lecture03, even though the question asks
# about the third lecture only.
for doc in docs:
    print(doc.metadata)

{'page': 8, 'source': 'docs/MachineLearning-Lecture01.pdf'}
{'page': 8, 'source': 'docs/MachineLearning-Lecture02.pdf'}
{'page': 8, 'source': 'docs/MachineLearning-Lecture01.pdf'}
{'page': 8, 'source': 'docs/MachineLearning-Lecture03.pdf'}
{'page': 8, 'source': 'docs/MachineLearning-Lecture01.pdf'}


In [52]:
# Print the text of the fifth returned result (index 4); the recorded output
# is about discussion sections rather than regression.
print(docs[4].page_content)

statistics for a while or maybe algebra, we'll go over those in the discussion sections as a 
refresher for those of you that want one.  
Later in this quarter, we'll also use the disc ussion sections to go over extensions for the 
material that I'm teaching in the main lectur es. So machine learning is a huge field, and 
there are a few extensions that we really want  to teach but didn't have time in the main 
lectures for.
