In [1]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
# from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [22]:
# Download documents from U.S. Census Bureau to local directory.
os.makedirs("python", exist_ok=True)
files = [
    # "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
    "https://ia600200.us.archive.org/35/items/eric-matthes-python-crash-course-no-starch-press-2023/Eric%20Matthes%20-%20Python%20Crash%20Course-No%20Starch%20Press%20%282023%29.pdf"
]
for url in files:
    file_path = os.path.join("python", url.rpartition("/")[2])
    urlretrieve(url, file_path)
    print(file_path)

python\Eric%20Matthes%20-%20Python%20Crash%20Course-No%20Starch%20Press%20%282023%29.pdf


In [7]:
loader = PyPDFDirectoryLoader("C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\psaundary_4f587fcb-dff7-4abe-803a-38d46fb0e265_github")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]


Document(metadata={'source': 'C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\psaundary_4f587fcb-dff7-4abe-803a-38d46fb0e265_github\\Git - Giant Undo Button.pdf', 'page': 0}, page_content='ĢİǺŇȚ\xa0ŲŇĐǾ\xa0BŲȚȚǾŇ')

In [24]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

# print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
# print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')Before split, there were 63 documents loaded, with average characters equal to 3830.After split, there were 296 documents (chunks), with average characters equal to 864 (average chunk length))

In [25]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

In [14]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-7.50839487e-02 -1.18847508e-02 -3.14887278e-02  2.94038784e-02
  5.03487065e-02  5.62426560e-02 -1.69078279e-02  3.46887745e-02
 -9.79061723e-02 -2.52804179e-02  7.62901306e-02  5.73797636e-02
 -2.42905449e-02 -3.06788404e-02  6.20474527e-03  4.02186960e-02
 -8.71851016e-03 -8.25359765e-03 -3.58134247e-02  3.61376256e-02
 -4.81583290e-02  4.19040211e-02 -3.68844494e-02 -5.38902432e-02
  1.65312029e-02  1.20014157e-02 -1.46691436e-02  2.12824587e-02
 -5.34791723e-02 -1.49481416e-01  2.32284260e-03  3.20234261e-02
 -5.21530211e-02 -2.28869896e-02  2.32763048e-02  2.14958973e-02
 -1.39637236e-02  7.53694922e-02  5.07269576e-02  5.50113022e-02
 -3.30469720e-02  1.79150887e-02 -2.08278317e-02  1.26921711e-03
 -2.61849370e-02  2.41151988e-03 -1.92517284e-02  3.06517747e-03
  1.50286616e-03 -5.14435619e-02  3.78552452e-02 -1.38906296e-02
  4.22979146e-02  6.64815083e-02  6.62789270e-02 -4.27095555e-02
  9.69377160e-03 -3.63818258e-02 -4.47576381e-02  3

In [26]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [34]:
query = """what is Django Shell"""  
         # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
# print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
# print(relevant_documents[0].page_content)
for i,a in enumerate(relevant_documents):
    print(f"Answer number {i} : {a.page_content} \n \n")

Answer number 0 : ers have practiced specific ways of keeping their weight over their 
feet whenever possible.
These three entries will give us something to work with as we continue 
to develop Learning Log.
The Django Shell
Now that we’ve entered some data, we can examine it programmatically 
through an interactive terminal session. This interactive environment is 
called the Django shell, and it’s a great environment for testing and trouble-
shooting your project. Here’s an example of an interactive shell session:
(ll_env)learning_log$ python manage.py shell
1 >>> from learning_logs.models import Topic
>>> Topic.objects.all()
<QuerySet [<Topic: Chess>, <Topic: Rock Climbing>]> 
 

Answer number 1 : request. The shell is really useful for making sure your code retrieves the 
data you want it to. If your code works as you expect it to in the shell, it 
should also work properly in the files within your project. If your code gen-
erates errors or doesn’t retrieve the data you expect it 