In [3]:
# Install langchain (loader, text splitter, embeddings and vector store are imported from it below)
# and pypdf (presumably the PDF backend used by PyPDFLoader — confirm).
# NOTE(review): versions are unpinned; the `langchain.*` import paths used in this notebook
# may differ between releases — consider pinning.
%pip install langchain
%pip install pypdf


Note: you may need to restart the kernel to use updated packages.


In [4]:
# Load all the pages from the PDF file into `raw_pages` using PyPDFLoader.
# The path is relative, so the notebook must be run from the directory that contains ./documents.

from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(
    "./documents/Computer Systems A Programmer’s Perspective Third Edition by Randal E. Bryant, David R. O’Hallaron.pdf"
)
raw_pages = loader.load()

In [5]:
# raw_pages holds all the loaded pages as a sequence; each entry also carries its metadata.

print(len(raw_pages))
raw_page = raw_pages[0]
# print(raw_page.page_content[:300])  # uncomment to preview the first page's text
raw_page.metadata

1122


{'source': './documents/Computer Systems A Programmer’s Perspective Third Edition by Randal E. Bryant, David R. O’Hallaron.pdf',
 'page': 0}

In [6]:
# DOCUMENT SPLITTING
# Configure the recursive character splitter with the chunk size and the
# overlap between neighbouring chunks.

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

chunk_size, chunk_overlap = 1000, 200
r_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

In [7]:
# Split the loaded pages into chunks, then print three of them
# (indices 30, 31 and 32) to check what the splitter produced.
pages = r_splitter.split_documents(raw_pages)
for idx in range(30, 33):
    print(pages[idx])


page_content='Contents 17\n11\nNetwork Programming 953\n11.1 The Client-Server Programming Model 954\n11.2 Networks 955\n11.3 The Global IP Internet 960\n11.3.1 IP Addresses 96111.3.2 Internet Domain Names 96311.3.3 Internet Connections 965\n11.4 The Sockets Interface 96811.4.1 Socket Address Structures 96911.4.2 The\nsocket Function 970\n11.4.3 The connect Function 970\n11.4.4 The bind Function 971\n11.4.5 The listen Function 971\n11.4.6 The accept Function 972\n11.4.7 Host and Service Conversion 97311.4.8 Helper Functions for the Sockets Interface 97811.4.9 Example Echo Client and Server 980\n11.5 Web Servers 98411.5.1 Web Basics 98411.5.2 Web Content 98511.5.3 HTTP Transactions 98611.5.4 Serving Dynamic Content 989\n11.6 Putting It Together: The Tiny Web Server 992\n11.7 Summary 1000\nBibliographic Notes 1001Homework Problems 1001Solutions to Practice Problems 1002\n12\nConcurrent Programming 1007\n12.1 Concurrent Programming with Processes 1009' metadata={'source': './documents/Com

Testing out embeddings

In [8]:
import numpy as np
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client built with the library defaults.
# NOTE(review): no credentials are passed here, so the API key presumably comes from
# the environment (e.g. OPENAI_API_KEY) — confirm.
embedding = OpenAIEmbeddings()

In [9]:
# tiktoken is installed here and imported in the next cell.
# NOTE(review): presumably required by the OpenAI embedding calls for token counting — confirm.
%pip install tiktoken

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [10]:
import tiktoken

In [11]:
# Two closely related self-introductions plus one unrelated sentence; their
# embeddings are compared pairwise in the cells that follow.
sentence1, sentence2, sentence3 = (
    "My name is Saransh Sinha, I am a software engineer",
    "My name is Shashwat Sinha, I am a student",
    "the weather is ugly outside",
)

In [12]:
# Embed the three sentences in order, one embed_query call per sentence.
embedding1, embedding2, embedding3 = (
    embedding.embed_query(text) for text in (sentence1, sentence2, sentence3)
)

In [13]:
# Similarity of the two self-introduction sentences (dot product of their embeddings).
# NOTE(review): a dot product equals cosine similarity only if the vectors are unit-length — confirm for this model.
np.dot(embedding1, embedding2)

0.9169101714100671

In [14]:
# Self-introduction (sentence1) vs. the unrelated weather sentence (sentence3).
np.dot(embedding1, embedding3)

0.722529012713944

In [15]:
# Self-introduction (sentence2) vs. the unrelated weather sentence (sentence3).
np.dot(embedding2, embedding3)

0.7193759409167937

Now we will convert the PDF chunks to embeddings and store them in a vector store.

In [None]:
# Install chromadb, presumably the backend for the Chroma vector store imported in the next cell — confirm.
%pip install chromadb

In [17]:
from langchain.vectorstores import Chroma

In [18]:
# Directory passed to Chroma as the location for its persisted database files.
persist_directory = 'docs/chroma/'

In [19]:
# Remove old database files, if any, so the store is rebuilt from scratch.
# The path comes from `persist_directory` instead of a second hard-coded copy,
# so the directory that is wiped is always the one Chroma is told to write to.
# shutil is used instead of `rm` so the cell does not depend on a Unix shell;
# ignore_errors=True keeps the "missing directory is fine" behaviour of `rm -rf`.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [20]:
# Build the Chroma store from the split chunks with the embedding client,
# using `persist_directory` as its on-disk location.
vectordb = Chroma.from_documents(
    pages, embedding=embedding, persist_directory=persist_directory
)

In [30]:
# `pages` (built by r_splitter.split_documents(raw_pages)) corresponds to `splits`
# (built by text_splitter.split_documents(docs)) in the dlai example.
len(pages)

3197

In [41]:
# Free-text query used for the similarity search.
question = "strong versus weak definition"

In [42]:
# Ask the vector store for the k=3 entries it ranks as most similar to the question.
docs = vectordb.similarity_search(question,k=3)

In [37]:
# Check how many results came back (k=3 was requested).
len(docs)

3

In [45]:
# Print every retrieved document, each followed by a blank line.
# (Use docs[0].page_content to look at the text of the first result only.)
for doc in docs:
    print(doc, end="\n\n")

page_content='1056 Chapter 12 Concurrent Programming\nWeak scaling is often a truer measure than strong scaling because it more\naccurately reﬂects our desire to use bigger machines to do more work. This is\nparticularly true for scientiﬁc codes, where the problem size can be easily increasedand where bigger problem sizes translate directly to better predictions of nature.However, there exist applications whose sizes are not so easily increased, and forthese applications strong scaling is more appropriate. For example, the amount ofwork performed by real-time signal-processing applications is often determinedby the properties of the physical sensors that are generating the signals. Changingthe total amount of work requires using different physical sensors, which might notbe feasible or necessary. For these applications, we typically want to use parallelismto accomplish a ﬁxed amount of work as quickly as possible.\nPractice Problem 12.11 (solution page 1074)' metadata={'page': 1056, 's

In [44]:
# Ask the store to persist its contents.
# NOTE(review): in newer Chroma/LangChain releases persistence may be automatic and this
# call deprecated — confirm for the installed version.
vectordb.persist()

In [46]:
# Number of entries in the underlying collection; it should match len(pages).
# NOTE(review): `_collection` is a private attribute and may change between library versions.
print(vectordb._collection.count())

3197


In [47]:
# Display the store object's repr.
vectordb

<langchain.vectorstores.chroma.Chroma at 0x7f7ed93472e0>