In [1]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
# from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [22]:
# Download documents from U.S. Census Bureau to local directory.
os.makedirs("python", exist_ok=True)
files = [
    # "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
    "https://ia600200.us.archive.org/35/items/eric-matthes-python-crash-course-no-starch-press-2023/Eric%20Matthes%20-%20Python%20Crash%20Course-No%20Starch%20Press%20%282023%29.pdf"
]
for url in files:
    file_path = os.path.join("python", url.rpartition("/")[2])
    urlretrieve(url, file_path)
    print(file_path)

python\Eric%20Matthes%20-%20Python%20Crash%20Course-No%20Starch%20Press%20%282023%29.pdf


In [2]:
loader = PyPDFDirectoryLoader("C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\pysaundary_1592d66a-05a3-4703-bfb5-1477e1f86800_git")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]


Document(metadata={'source': 'C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\pysaundary_1592d66a-05a3-4703-bfb5-1477e1f86800_git\\Git Internals.pdf', 'page': 0}, page_content='by Scott Chacon\n$9\nSource code control and beyond\nGit Internals')

In [3]:
docs_before_split

[Document(metadata={'source': 'C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\pysaundary_1592d66a-05a3-4703-bfb5-1477e1f86800_git\\Git Internals.pdf', 'page': 0}, page_content='by Scott Chacon\n$9\nSource code control and beyond\nGit Internals'),
 Document(metadata={'source': 'C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\pysaundary_1592d66a-05a3-4703-bfb5-1477e1f86800_git\\Git Internals.pdf', 'page': 1}, page_content='2\nGit Internals\n©2008 Scott Chacon\nEvery effort was made to provide accurate information in this document. \nHowever, neither Scott Chacon nor Topfunky Corporation shall have any \nliability for any errors in the code or descriptions presented in this book.\nThis document is available for US$9 at PeepCode.com (http://peepcode.com). \nGroup discounts and site licenses can also be purchased by sending email \nto peepcode@topfunky.com.\nother peepcode products\nRSpec (•\t http://peepcode.com/products/rspec-basics) – A th

In [4]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

# print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
# print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')Before split, there were 63 documents loaded, with average characters equal to 3830.After split, there were 296 documents (chunks), with average characters equal to 864 (average chunk length))

In [7]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

huggingface_embeddings

  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='BAAI/bge-small-en-v1.5', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [8]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-3.09737306e-02 -3.60131450e-02 -2.02799626e-02 -2.37482786e-02
  3.31209749e-02 -1.29411612e-02 -1.34010697e-02  4.19268478e-03
 -1.36038158e-02 -2.09906679e-02 -7.74440961e-03  7.64717534e-02
  1.22649530e-02  2.42868019e-03  6.24539703e-02  1.18708964e-02
  6.56883174e-04  2.36558430e-02  3.41411456e-02  1.13739092e-02
  3.17354612e-02 -3.63937914e-02  7.44705601e-03 -3.69760841e-02
 -2.89258659e-02  3.36940438e-02 -5.09569496e-02 -7.86973089e-02
 -8.90183300e-02 -1.84417024e-01 -4.67542261e-02  2.77667632e-03
  3.45572382e-02 -8.99881311e-03  9.02059022e-03  2.13622693e-02
  1.03797764e-02 -1.69113483e-02 -1.57875549e-02  6.69238567e-02
  5.62849566e-02  2.72508860e-02 -8.52409564e-03 -3.24059534e-03
 -4.46709096e-02 -5.36640882e-02  3.05774044e-02  5.06087160e-03
 -7.82724842e-02  4.44196500e-02 -5.85860712e-03  1.54695641e-02
 -5.06787263e-02  4.03074473e-02 -1.43928500e-03  1.70750320e-02
  7.08394796e-02  9.47464854e-02 -1.73961241e-02 -7

In [9]:
sample_embedding

array([-3.09737306e-02, -3.60131450e-02, -2.02799626e-02, -2.37482786e-02,
        3.31209749e-02, -1.29411612e-02, -1.34010697e-02,  4.19268478e-03,
       -1.36038158e-02, -2.09906679e-02, -7.74440961e-03,  7.64717534e-02,
        1.22649530e-02,  2.42868019e-03,  6.24539703e-02,  1.18708964e-02,
        6.56883174e-04,  2.36558430e-02,  3.41411456e-02,  1.13739092e-02,
        3.17354612e-02, -3.63937914e-02,  7.44705601e-03, -3.69760841e-02,
       -2.89258659e-02,  3.36940438e-02, -5.09569496e-02, -7.86973089e-02,
       -8.90183300e-02, -1.84417024e-01, -4.67542261e-02,  2.77667632e-03,
        3.45572382e-02, -8.99881311e-03,  9.02059022e-03,  2.13622693e-02,
        1.03797764e-02, -1.69113483e-02, -1.57875549e-02,  6.69238567e-02,
        5.62849566e-02,  2.72508860e-02, -8.52409564e-03, -3.24059534e-03,
       -4.46709096e-02, -5.36640882e-02,  3.05774044e-02,  5.06087160e-03,
       -7.82724842e-02,  4.44196500e-02, -5.85860712e-03,  1.54695641e-02,
       -5.06787263e-02,  

In [14]:
a = docs_after_split[0].to_json()
a
Doc

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'source': 'C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\pysaundary_1592d66a-05a3-4703-bfb5-1477e1f86800_git\\Git Internals.pdf',
   'page': 0},
  'page_content': 'by Scott Chacon\n$9\nSource code control and beyond\nGit Internals',
  'type': 'Document'}}

In [17]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

vectorstore.serialize_to_bytes()



In [None]:
query = """How to fetch branch"""  
         # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
# print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
# print(relevant_documents[0].page_content)
for i,a in enumerate(relevant_documents):
    print(f"Answer number {i} : {a.page_content} \n \n")

Answer number 0 : fetch = +refs/heads/qa*:refs/remotes/origin/qa*
However, you can use namespacing to accomplish something like that. If you have
a QA team that pushes a series of branches, and you want to get the master branch and
any of the QA team’s branches but nothing else, you can use a conﬁg section like this:
[remote "origin"]
url = git@github.com:schacon/simplegit-progit.git
fetch = +refs/heads/master:refs/remotes/origin/master
fetch = +refs/heads/qa/*:refs/remotes/origin/qa/*
If you have a complex workﬂow process that has a QA team pushing branches, de-
velopers pushing branches, and integration teams pushing and collaborating on remote
branches, you can namespace them easily this way. 
 

Answer number 1 : This does a one-time pull and doesn’t save the URL as a remote reference:
$ git pull git://github.com/onetimeguy/project.git
From git://github.com/onetimeguy/project
* branch HEAD -> FETCH_HEAD
Merge made by recursive.
5.3.4 Determining What Is Introduced
Now you have a to

In [13]:
anwsers = {}
for i,a in enumerate(relevant_documents):
    # print(f"Answer number {i} : {a.page_content} \n \n")
    anwsers[i]=a.page_content

print(anwsers)

{0: 'fetch = +refs/heads/qa*:refs/remotes/origin/qa*\nHowever, you can use namespacing to accomplish something like that. If you have\na QA team that pushes a series of branches, and you want to get the master branch and\nany of the QA team’s branches but nothing else, you can use a conﬁg section like this:\n[remote "origin"]\nurl = git@github.com:schacon/simplegit-progit.git\nfetch = +refs/heads/master:refs/remotes/origin/master\nfetch = +refs/heads/qa/*:refs/remotes/origin/qa/*\nIf you have a complex workﬂow process that has a QA team pushing branches, de-\nvelopers pushing branches, and integration teams pushing and collaborating on remote\nbranches, you can namespace them easily this way.', 1: 'This does a one-time pull and doesn’t save the URL as a remote reference:\n$ git pull git://github.com/onetimeguy/project.git\nFrom git://github.com/onetimeguy/project\n* branch HEAD -> FETCH_HEAD\nMerge made by recursive.\n5.3.4 Determining What Is Introduced\nNow you have a topic branch th

In [14]:
ans = {i:a.page_content  for i,a in enumerate(relevant_documents) }

In [15]:
ans

{0: 'fetch = +refs/heads/qa*:refs/remotes/origin/qa*\nHowever, you can use namespacing to accomplish something like that. If you have\na QA team that pushes a series of branches, and you want to get the master branch and\nany of the QA team’s branches but nothing else, you can use a conﬁg section like this:\n[remote "origin"]\nurl = git@github.com:schacon/simplegit-progit.git\nfetch = +refs/heads/master:refs/remotes/origin/master\nfetch = +refs/heads/qa/*:refs/remotes/origin/qa/*\nIf you have a complex workﬂow process that has a QA team pushing branches, de-\nvelopers pushing branches, and integration teams pushing and collaborating on remote\nbranches, you can namespace them easily this way.',
 1: 'This does a one-time pull and doesn’t save the URL as a remote reference:\n$ git pull git://github.com/onetimeguy/project.git\nFrom git://github.com/onetimeguy/project\n* branch HEAD -> FETCH_HEAD\nMerge made by recursive.\n5.3.4 Determining What Is Introduced\nNow you have a topic branch t