In [1]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
# from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [22]:
# Download documents from U.S. Census Bureau to local directory.
os.makedirs("python", exist_ok=True)
files = [
    # "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    # "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
    "https://ia600200.us.archive.org/35/items/eric-matthes-python-crash-course-no-starch-press-2023/Eric%20Matthes%20-%20Python%20Crash%20Course-No%20Starch%20Press%20%282023%29.pdf"
]
for url in files:
    file_path = os.path.join("python", url.rpartition("/")[2])
    urlretrieve(url, file_path)
    print(file_path)

python\Eric%20Matthes%20-%20Python%20Crash%20Course-No%20Starch%20Press%20%282023%29.pdf


In [5]:
loader = PyPDFDirectoryLoader("C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\psaundary_0da70738-7510-4809-a5d2-ee22d43f78f7_github")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]


Document(metadata={'source': 'C:\\Users\\pysau\\OneDrive\\Documents\\GitHub\\jkTechAssignment\\media\\psaundary_0da70738-7510-4809-a5d2-ee22d43f78f7_github\\Git - Giant Undo Button.pdf', 'page': 0}, page_content='ĢİǺŇȚ\xa0ŲŇĐǾ\xa0BŲȚȚǾŇ')

In [6]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

# print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
# print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')Before split, there were 63 documents loaded, with average characters equal to 3830.After split, there were 296 documents (chunks), with average characters equal to 864 (average chunk length))

In [7]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [ 9.45379306e-03  3.40652801e-02 -8.87223752e-04  4.57740873e-02
  5.24118841e-02  3.09098735e-02  1.87078528e-02  2.71418355e-02
  2.15778444e-02  1.66033916e-02  4.27022502e-02 -3.07556819e-02
  3.73598039e-02 -2.22873744e-02  3.41020932e-05  3.06538269e-02
  1.81604512e-02  3.30316201e-02  5.48799057e-03  4.86590341e-02
  2.48664040e-02 -2.36055367e-02  4.83363383e-02 -2.34760568e-02
  1.22057488e-02  5.75912232e-03  1.43930549e-02  2.59865019e-02
 -1.11157289e-02 -2.01219618e-01  7.88473524e-03 -2.47552153e-02
  6.90990686e-02 -4.87190485e-02  1.14937872e-02  4.40839306e-02
  2.88460981e-02 -2.57098489e-03 -1.48328079e-03  7.39263818e-02
  5.73802032e-02  6.40018657e-02 -3.87344770e-02 -4.24978882e-03
  4.82909456e-02  4.89551164e-02  2.17431020e-02 -5.66363260e-02
  7.24157412e-03 -1.38047934e-02  3.27429175e-02 -1.36124194e-02
  4.45400700e-02  4.16527651e-02  1.71112437e-02 -4.10470665e-02
  4.41569388e-02 -3.63703892e-02 -4.19905130e-03 -5

In [11]:
sample_embedding

array([ 9.45379306e-03,  3.40652801e-02, -8.87223752e-04,  4.57740873e-02,
        5.24118841e-02,  3.09098735e-02,  1.87078528e-02,  2.71418355e-02,
        2.15778444e-02,  1.66033916e-02,  4.27022502e-02, -3.07556819e-02,
        3.73598039e-02, -2.22873744e-02,  3.41020932e-05,  3.06538269e-02,
        1.81604512e-02,  3.30316201e-02,  5.48799057e-03,  4.86590341e-02,
        2.48664040e-02, -2.36055367e-02,  4.83363383e-02, -2.34760568e-02,
        1.22057488e-02,  5.75912232e-03,  1.43930549e-02,  2.59865019e-02,
       -1.11157289e-02, -2.01219618e-01,  7.88473524e-03, -2.47552153e-02,
        6.90990686e-02, -4.87190485e-02,  1.14937872e-02,  4.40839306e-02,
        2.88460981e-02, -2.57098489e-03, -1.48328079e-03,  7.39263818e-02,
        5.73802032e-02,  6.40018657e-02, -3.87344770e-02, -4.24978882e-03,
        4.82909456e-02,  4.89551164e-02,  2.17431020e-02, -5.66363260e-02,
        7.24157412e-03, -1.38047934e-02,  3.27429175e-02, -1.36124194e-02,
        4.45400700e-02,  

In [9]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [None]:
query = """How to fetch branch"""  
         # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
# print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
# print(relevant_documents[0].page_content)
for i,a in enumerate(relevant_documents):
    print(f"Answer number {i} : {a.page_content} \n \n")

Answer number 0 : fetch = +refs/heads/qa*:refs/remotes/origin/qa*
However, you can use namespacing to accomplish something like that. If you have
a QA team that pushes a series of branches, and you want to get the master branch and
any of the QA team’s branches but nothing else, you can use a conﬁg section like this:
[remote "origin"]
url = git@github.com:schacon/simplegit-progit.git
fetch = +refs/heads/master:refs/remotes/origin/master
fetch = +refs/heads/qa/*:refs/remotes/origin/qa/*
If you have a complex workﬂow process that has a QA team pushing branches, de-
velopers pushing branches, and integration teams pushing and collaborating on remote
branches, you can namespace them easily this way. 
 

Answer number 1 : This does a one-time pull and doesn’t save the URL as a remote reference:
$ git pull git://github.com/onetimeguy/project.git
From git://github.com/onetimeguy/project
* branch HEAD -> FETCH_HEAD
Merge made by recursive.
5.3.4 Determining What Is Introduced
Now you have a to

In [13]:
anwsers = {}
for i,a in enumerate(relevant_documents):
    # print(f"Answer number {i} : {a.page_content} \n \n")
    anwsers[i]=a.page_content

print(anwsers)

{0: 'fetch = +refs/heads/qa*:refs/remotes/origin/qa*\nHowever, you can use namespacing to accomplish something like that. If you have\na QA team that pushes a series of branches, and you want to get the master branch and\nany of the QA team’s branches but nothing else, you can use a conﬁg section like this:\n[remote "origin"]\nurl = git@github.com:schacon/simplegit-progit.git\nfetch = +refs/heads/master:refs/remotes/origin/master\nfetch = +refs/heads/qa/*:refs/remotes/origin/qa/*\nIf you have a complex workﬂow process that has a QA team pushing branches, de-\nvelopers pushing branches, and integration teams pushing and collaborating on remote\nbranches, you can namespace them easily this way.', 1: 'This does a one-time pull and doesn’t save the URL as a remote reference:\n$ git pull git://github.com/onetimeguy/project.git\nFrom git://github.com/onetimeguy/project\n* branch HEAD -> FETCH_HEAD\nMerge made by recursive.\n5.3.4 Determining What Is Introduced\nNow you have a topic branch th

In [14]:
ans = {i:a.page_content  for i,a in enumerate(relevant_documents) }

In [15]:
ans

{0: 'fetch = +refs/heads/qa*:refs/remotes/origin/qa*\nHowever, you can use namespacing to accomplish something like that. If you have\na QA team that pushes a series of branches, and you want to get the master branch and\nany of the QA team’s branches but nothing else, you can use a conﬁg section like this:\n[remote "origin"]\nurl = git@github.com:schacon/simplegit-progit.git\nfetch = +refs/heads/master:refs/remotes/origin/master\nfetch = +refs/heads/qa/*:refs/remotes/origin/qa/*\nIf you have a complex workﬂow process that has a QA team pushing branches, de-\nvelopers pushing branches, and integration teams pushing and collaborating on remote\nbranches, you can namespace them easily this way.',
 1: 'This does a one-time pull and doesn’t save the URL as a remote reference:\n$ git pull git://github.com/onetimeguy/project.git\nFrom git://github.com/onetimeguy/project\n* branch HEAD -> FETCH_HEAD\nMerge made by recursive.\n5.3.4 Determining What Is Introduced\nNow you have a topic branch t