In [21]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import pandas as pd


### 1. Loading the data

In [6]:
loader = PyPDFLoader("yolo.pdf")
documents = loader.load_and_split()

In [11]:
print(f'Documents length -- {len(documents)}')
print(documents[0].page_content)

Documents length -- 16
You Only Look Once:
Uniﬁed, Real-Time Object Detection
Joseph Redmon∗, Santosh Divvala∗†, Ross Girshick¶, Ali Farhadi∗†
University of Washington∗, Allen Institute for AI†, Facebook AI Research¶
http://pjreddie.com/yolo/
Abstract
We present YOLO, a new approach to object detection.
Prior work on object detection repurposes classiﬁers to per-
form detection. Instead, we frame object detection as a re-
gression problem to spatially separated bounding boxes and
associated class probabilities. A single neural network pre-
dicts bounding boxes and class probabilities directly from
full images in one evaluation. Since the whole detection
pipeline is a single network, it can be optimized end-to-end
directly on detection performance.
Our uniﬁed architecture is extremely fast. Our base
YOLO model processes images in real-time at 45 frames
per second. A smaller version of the network, Fast YOLO,
processes an astounding 155 frames per second while
still achieving double the 

### 2. Splitting the documents

In [12]:
split_text = RecursiveCharacterTextSplitter(chunk_size =512, chunk_overlap = 50)
texts = split_text.split_documents(documents=documents)

In [15]:
print(texts[0].page_content)

You Only Look Once:
Uniﬁed, Real-Time Object Detection
Joseph Redmon∗, Santosh Divvala∗†, Ross Girshick¶, Ali Farhadi∗†
University of Washington∗, Allen Institute for AI†, Facebook AI Research¶
http://pjreddie.com/yolo/
Abstract
We present YOLO, a new approach to object detection.
Prior work on object detection repurposes classiﬁers to per-
form detection. Instead, we frame object detection as a re-
gression problem to spatially separated bounding boxes and


### 3. Embedding the documents

In [16]:
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2', 
    model_kwargs={'device':'cpu'}
    )

  from .autonotebook import tqdm as notebook_tqdm


### 4. Creating vector db index

In [17]:
faiss_index = FAISS.from_documents(texts, embeddings)
faiss_index_name = 'faiss_index_updated'

faiss_index.save_local(faiss_index_name) # a directory shall be created in the name of <faiss_index_name> and couple of files are created

### 5. Loading the vector db from local

In [19]:
loaded_vector_db_from_local = FAISS.load_local('./faiss_index_updated', embeddings)

### 6. Viewing the vector db

In [18]:
def convert_vectordb_to_df(vectorDb):
    vector_dict = vectorDb.docstore._dict
    data_rows = []

    for k in vector_dict.keys():
        doc_name = vector_dict[k].metadata['source'].split('/')[-1]
        page_number = vector_dict[k].metadata['page'] + 1
        content =  vector_dict[k].page_content
        data_rows.append({"chunk_id": k, "document": doc_name, "page": page_number, "content":content})

    vector_df = pd.DataFrame(data_rows)
    print(vector_df)
    return vector_df

In [22]:
convert_vectordb_to_df(loaded_vector_db_from_local)

                                chunk_id  document  page  \
0   75337173-28cd-44ca-b57f-6b38c307a6e1  yolo.pdf     1   
1   ec546c79-8340-445f-ad01-b42d1db9e5b3  yolo.pdf     1   
2   2ce49afc-d253-43fc-95e0-f80e6ba3d2a1  yolo.pdf     1   
3   71bccb92-f842-44f7-b49c-efa8dc0ace50  yolo.pdf     1   
4   d0d2901e-8048-46c3-be4a-84ffe589717b  yolo.pdf     1   
..                                   ...       ...   ...   
94  b3b09bf4-aba4-4fef-bbbb-d42389b344ce  yolo.pdf     9   
95  5d01ef25-bbad-46d9-b0e2-917796c7d3d4  yolo.pdf     9   
96  c0fe7989-80c7-4519-99f9-7f45325d1a7f  yolo.pdf    10   
97  51341021-c471-48ae-b2b8-29dcb72486d9  yolo.pdf    10   
98  cd987d5a-e5c6-4708-a1b4-9b9202156266  yolo.pdf    10   

                                              content  
0   You Only Look Once:\nUniﬁed, Real-Time Object ...  
1   associated class probabilities. A single neura...  
2   still achieving double the mAP of other real-t...  
3   jects are in the image, where they are, and ho...  

Unnamed: 0,chunk_id,document,page,content
0,75337173-28cd-44ca-b57f-6b38c307a6e1,yolo.pdf,1,"You Only Look Once:\nUniﬁed, Real-Time Object ..."
1,ec546c79-8340-445f-ad01-b42d1db9e5b3,yolo.pdf,1,associated class probabilities. A single neura...
2,2ce49afc-d253-43fc-95e0-f80e6ba3d2a1,yolo.pdf,1,still achieving double the mAP of other real-t...
3,71bccb92-f842-44f7-b49c-efa8dc0ace50,yolo.pdf,1,"jects are in the image, where they are, and ho..."
4,d0d2901e-8048-46c3-be4a-84ffe589717b,yolo.pdf,1,"for general purpose, responsive robotic system..."
...,...,...,...,...
94,b3b09bf4-aba4-4fef-bbbb-d42389b344ce,yolo.pdf,9,"5\n[28] S. Ren, K. He, R. Girshick, and J. Sun..."
95,5d01ef25-bbad-46d9-b0e2-917796c7d3d4,yolo.pdf,9,Recognition Challenge. International Journal o...
96,c0fe7989-80c7-4519-99f9-7f45325d1a7f,yolo.pdf,10,[33] Z. Shen and X. Xue. Do more dropouts in p...
97,51341021-c471-48ae-b2b8-29dcb72486d9,yolo.pdf,10,4\n[36] P. Viola and M. Jones. Robust real-tim...


### 7. Adding a new document

In [26]:
# Loading new file - selenium_documentation_0.pdf
loader = PyPDFLoader("selenium_documentation_0.pdf")
documents_new_doc = loader.load_and_split()

# Splitting the documents
split_text = RecursiveCharacterTextSplitter(chunk_size =512, chunk_overlap = 50)
texts_new_docs = split_text.split_documents(documents=documents_new_doc)

# Embedding the documents : using the same embedding used above

# embeddings = HuggingFaceEmbeddings(
#     model_name='sentence-transformers/all-MiniLM-L6-v2', 
#     model_kwargs={'device':'cpu'}
#     )

faiss_index_new = FAISS.from_documents(texts_new_docs, embeddings)

In [27]:
# Loading the original vector db stored in local
loaded_vector_db_from_local = FAISS.load_local('./faiss_index_updated', embeddings)

# Merging the new vector db with the old one
loaded_vector_db_from_local.merge_from(faiss_index_new)

# Saving the merged vector db
loaded_vector_db_from_local.save_local('./faiss_index_updated')

In [28]:
convert_vectordb_to_df(loaded_vector_db_from_local) ## both documents are present in this Vector DB

                                 chunk_id                      document  page  \
0    75337173-28cd-44ca-b57f-6b38c307a6e1                      yolo.pdf     1   
1    ec546c79-8340-445f-ad01-b42d1db9e5b3                      yolo.pdf     1   
2    2ce49afc-d253-43fc-95e0-f80e6ba3d2a1                      yolo.pdf     1   
3    71bccb92-f842-44f7-b49c-efa8dc0ace50                      yolo.pdf     1   
4    d0d2901e-8048-46c3-be4a-84ffe589717b                      yolo.pdf     1   
..                                    ...                           ...   ...   
844  b3cf81c2-4d76-4ea0-ae05-9efb6b0d7665  selenium_documentation_0.pdf   200   
845  ace550c0-1f11-443c-be9c-f802e7317829  selenium_documentation_0.pdf   200   
846  f6045114-dbad-404b-8b55-8cebcd8bbab2  selenium_documentation_0.pdf   200   
847  df3a6312-dc71-46ba-b728-c5e2c1979b56  selenium_documentation_0.pdf   200   
848  f03ed8b3-7dc5-44ab-8b65-22c6f2a8ee79  selenium_documentation_0.pdf   201   

                           

Unnamed: 0,chunk_id,document,page,content
0,75337173-28cd-44ca-b57f-6b38c307a6e1,yolo.pdf,1,"You Only Look Once:\nUniﬁed, Real-Time Object ..."
1,ec546c79-8340-445f-ad01-b42d1db9e5b3,yolo.pdf,1,associated class probabilities. A single neura...
2,2ce49afc-d253-43fc-95e0-f80e6ba3d2a1,yolo.pdf,1,still achieving double the mAP of other real-t...
3,71bccb92-f842-44f7-b49c-efa8dc0ace50,yolo.pdf,1,"jects are in the image, where they are, and ho..."
4,d0d2901e-8048-46c3-be4a-84ffe589717b,yolo.pdf,1,"for general purpose, responsive robotic system..."
...,...,...,...,...
844,b3cf81c2-4d76-4ea0-ae05-9efb6b0d7665,selenium_documentation_0.pdf,200,WebDriverBackedSelenium and use a Sizzle locat...
845,ace550c0-1f11-443c-be9c-f802e7317829,selenium_documentation_0.pdf,200,is no longer possible. How can you tell if you...
846,f6045114-dbad-404b-8b55-8cebcd8bbab2,selenium_documentation_0.pdf,200,"or “document” directly.\nAlternatively, you mi..."
847,df3a6312-dc71-46ba-b728-c5e2c1979b56,selenium_documentation_0.pdf,200,"""return arguments[0].tagName"" , element);\nNot..."
