# Incremental Vector Storage and Embedding Modulation

## Installing Dependencies

In [55]:
!pip install langchain faiss-cpu pypdf GitPython openpyxl sentence-transformers transformers llama-cpp-python 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [1]:
from langchain.embeddings import (
    LlamaCppEmbeddings, 
    HuggingFaceEmbeddings, 
    SentenceTransformerEmbeddings
)

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import (
    PyPDFLoader,
    DataFrameLoader,
    GitLoader
  )

import pandas as pd
import nbformat
from nbconvert import PythonExporter
import os

## Text Extraction Function

In [2]:
def get_text_splits(text_file):
  """Read a plain-text file and break its contents into chunks.

  Args:
    text_file: Path of the text file to read.

  Returns:
    The list of chunks produced by a RecursiveCharacterTextSplitter
    configured for 150-character chunks with a 15-character overlap.
  """
  splitter = RecursiveCharacterTextSplitter(length_function=len,
                                            chunk_overlap=15,
                                            chunk_size=150)
  with open(text_file,'r') as handle:
    raw_text = handle.read()
  return splitter.split_text(raw_text)

## PDF Text Extraction Function

In [3]:
# Exploratory cell: load one sample PDF to inspect what the loader returns.
# NOTE(review): the path below is an absolute, machine-specific location;
# it must be adjusted before this cell can run anywhere else.
loader = PyPDFLoader("/Users/apple/Documents/TestingEmbbeding/9780723434177.pdf")
pages = loader.load_and_split()

In [4]:
len(pages)

18

In [5]:
pages[0].page_content


'Objectives\nIn this chapter you will learn to:\n•Describe the anatomical position.\n•Describe the anatomical planes.\n•Deﬁne the anatomical terms used in anatomy and clinical practice.\n•Describe the terms of movement, including those of the thumb.\n•Understand the structure of bone.\n•List the factors that contribute to joint stability.\n•Describe the classiﬁcation of muscles according to their actions.\n•Describe the organization and function of muscle.\n•Draw a diagram of the components of a spinal nerve.\n•Describe the layers of a blood vessel wall.\n•Describe factors causing lymphatic ﬂuid movement and functions of lymph.\n•Outline the layout of the gastrointestinal system and general functions.\n•Outline the layout of the urinary system and general functions.Basic concepts of anatomy 1\n3Such anatomical planes are frequently used in\ncomputer tomography (CT) scans and magneticresonance imaging (MRI), to visualize muscle, bone,lung and other soft tissues as well as pathologies, f

In [7]:

def get_pdf_splits(pdf_file):
  """Load a PDF and return its text as one flat list of chunks.

  Args:
    pdf_file: Path of the PDF file to load.

  Returns:
    list: Text chunks (850 characters, 200 overlap) gathered from every
    document the loader returns, in the order the loader returned them.
  """
  splitter = RecursiveCharacterTextSplitter(length_function=len,
                                            chunk_overlap=200,
                                            chunk_size=850)
  # load_and_split() gives a list of documents; the plain text held in each
  # one's page_content is what gets re-split to the chunk size above.
  page_docs = PyPDFLoader(pdf_file).load_and_split()
  return [chunk
          for page_doc in page_docs
          for chunk in splitter.split_text(page_doc.page_content)]

## Processing All PDF Files in a Directory

In [67]:
import os
from PyPDF2 import PdfFileReader



import os
from PyPDF2 import PdfReader

def custom_text_splitter(text, chunk_size=850, chunk_overlap=200):
    """Split ``text`` into fixed-size character windows that overlap.

    Args:
        text (str): Text to split.
        chunk_size (int): Maximum number of characters per chunk. Must be
            positive.
        chunk_overlap (int): Number of characters shared by consecutive
            chunks. Must satisfy ``0 <= chunk_overlap < chunk_size`` so that
            every iteration moves forward.

    Returns:
        list[str]: The chunks in order. Empty input yields an empty list; the
        final chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` would prevent the
            window from advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    splits = []
    text_length = len(text)
    step = chunk_size - chunk_overlap
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)
        splits.append(text[start:end])
        # Stop once the window has reached the end of the text. Stepping back
        # by the overlap here would leave `start` below the text length
        # forever and the loop would never terminate.
        if end == text_length:
            break
        start += step

    return splits

def get_pdf_splits_direc(directory):
    """Collect text chunks from every PDF file directly inside a directory.

    Each page's extracted text is passed to ``custom_text_splitter`` with its
    default chunk size and overlap. Sub-directories are not descended into.

    Args:
        directory (str): Path to the directory containing PDF files.

    Returns:
        list: Text chunks from all PDFs, ordered by file name and then by
        page. Empty when the directory holds no PDF files.
    """
    doc_list = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # output reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)

        # Accept any letter case for the extension (".pdf", ".PDF") and skip
        # directories that merely have a PDF-looking name.
        if not filename.lower().endswith('.pdf') or not os.path.isfile(file_path):
            continue

        pdf_reader = PdfReader(file_path)
        for page in pdf_reader.pages:
            page_content = page.extract_text()
            # Pages with no extractable text contribute no chunks.
            if not page_content:
                continue
            doc_list.extend(custom_text_splitter(page_content))

    return doc_list


## Excel Text Extraction Function

In [9]:
def get_excel_splits(excel_file,target_col,sheet_name):
  """Read one Excel sheet and load it through DataFrameLoader.

  Args:
    excel_file: Excel input handed to pandas.read_excel (openpyxl engine).
    target_col: Name of the column used as the page content.
    sheet_name: Sheet to read from the workbook.

  Returns:
    Whatever DataFrameLoader.load() returns for the sheet's data frame.
  """
  sheet_frame = pd.read_excel(io=excel_file,
                              sheet_name=sheet_name,
                              engine='openpyxl')
  return DataFrameLoader(sheet_frame,
                         page_content_column=target_col).load()
     


## Comma-Separated Values Text Extraction Function

In [10]:
def get_csv_splits(csv_file):
  """Load a CSV file through CSVLoader.

  No additional text splitting is applied here; the loader's output is
  returned unchanged.

  Args:
    csv_file: Path of the CSV file to load.

  Returns:
    The documents returned by CSVLoader.load().
  """
  return CSVLoader(csv_file).load()

## IPYNB Text Extraction Function

In [11]:
def get_ipynb_splits(notebook):
  """Convert a notebook to a Python script and split the script into chunks.

  Args:
    notebook: Path of the .ipynb file to read.

  Returns:
    The list of chunks (150 characters, 15 overlap) produced by splitting
    the exported Python source.
  """
  # Notebook files are JSON; read them as UTF-8 instead of relying on the
  # platform's default encoding.
  with open(notebook, encoding='utf-8') as fh:
    nb = nbformat.reads(fh.read(), nbformat.NO_CONVERT)

  # from_notebook_node returns a pair; only the first item (the exported
  # script text) is needed here.
  source, _ = PythonExporter().from_notebook_node(nb)

  textSplit = RecursiveCharacterTextSplitter(chunk_size=150,
                                             chunk_overlap=15,
                                             length_function=len)
  doc_list = textSplit.split_text(source)
  return doc_list

## GitHub File Extraction Function

In [12]:
def get_git_files(repo_link, folder_path, file_ext):
  """Load files with a given extension from a Git repository and chunk them.

  Args:
    repo_link: Clone URL passed to GitLoader.
    folder_path: Local repository path passed to GitLoader.
    file_ext: Suffix a file path must end with to be loaded (e.g. ".py").

  Returns:
    list: Chunks (150 characters, 15 overlap) from every loaded file, in
    the order the loader returned the files.
  """
  def _has_wanted_ext(file_path):
    # Keep only files whose path ends with the requested suffix.
    return file_path.endswith(file_ext)

  # Each loaded file arrives as its own document.
  repo_docs = GitLoader(clone_url=repo_link,
                        repo_path=folder_path,
                        file_filter=_has_wanted_ext).load()

  splitter = RecursiveCharacterTextSplitter(length_function=len,
                                            chunk_overlap=15,
                                            chunk_size=150)
  chunks = []
  for repo_doc in repo_docs:
    chunks += splitter.split_text(repo_doc.page_content)
  return chunks

## Incrementing an Existing Vector Store

In [11]:

def embed_index(doc_list, embed_fn, index_store):
  """Embed ``doc_list`` and persist it in the FAISS index at ``index_store``.

  If ``index_store`` already exists on disk, the stored index is loaded, the
  new embeddings are merged into it, and the result is saved back to the same
  location. Otherwise a new index is saved there.

  Args:
    doc_list: Non-empty list of either plain strings or document objects.
    embed_fn: Embedding object, initialised on a local or online model, that
      is passed to FAISS.
    index_store: Folder path of the saved index.

  Returns:
    None. Progress messages are printed.

  Raises:
    ValueError: If ``doc_list`` is empty.
  """
  if not doc_list:
    raise ValueError("doc_list is empty; nothing to embed")

  # Choose the constructor from the type of the first item. Catching every
  # exception to detect the type would hide genuine embedding failures and
  # retry the whole job with the wrong constructor.
  if isinstance(doc_list[0], str):
    faiss_db = FAISS.from_texts(doc_list,
                              embed_fn)
  else:
    faiss_db = FAISS.from_documents(doc_list,
                              embed_fn)

  if os.path.exists(index_store):
    # NOTE(review): loading a saved index is believed to deserialize pickled
    # data; only point index_store at folders you trust — confirm for the
    # installed langchain version.
    local_db = FAISS.load_local(index_store,embed_fn)
    #merging the new embedding with the existing index store
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
  else:
    faiss_db.save_local(folder_path=index_store)
    print("New store created...")

In [14]:

def get_docs_length(index_path, embed_fn):
  """Return how many entries the saved FAISS index's docstore holds.

  Args:
    index_path: Folder path of the saved index.
    embed_fn: Embedding object passed to FAISS.load_local.

  Returns:
    int: Number of entries in the loaded index's docstore mapping.
  """
  stored_index = FAISS.load_local(index_path,
                                  embeddings=embed_fn)
  # The mapping has as many values as it has keys, so its own length is
  # the count of stored entries.
  return len(stored_index.docstore._dict)

In [8]:
# Try out the functions above with an open-source embedding model.
# `embeddings` is the embedding object passed as embed_fn in the cells below.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [38]:
documents = get_pdf_splits("/Users/apple/Downloads/MBBS FirstYear/vishram-singh-textbook-of-clinical-neuroanatomy.pdf")

In [17]:
BharatAI_documents = get_pdf_splits("/Users/apple/Desktop/Protoype/finalppt.pdf")

In [18]:
# NOTE(review): BDC_documents is not assigned in any visible cell, so this
# line raises NameError on a fresh kernel — confirm which variable was meant.
len(BDC_documents)

587

In [19]:
len(BharatAI_documents)

12

In [39]:
embed_index(doc_list=documents,embed_fn=embeddings,index_store='new_index')

Merge completed
Updated index saved


In [21]:
embed_index(doc_list=BharatAI_documents,embed_fn=embeddings,index_store='new_index')

Merge completed
Updated index saved


In [40]:
get_docs_length(index_path='new_index',embed_fn=embeddings)


18858

In [17]:

# Reload the saved index from disk; answer_question() further down reads
# this module-level `test_index`.
test_index = FAISS.load_local("new_index",embeddings)

In [97]:
 result = test_index.similarity_search("what is myocartila infarction")
 result

[Document(page_content="Nervous  System  I 165 \n- Mononeuropathy:  Usually  one  neuron  is affected  and most \ncommon  cause  is ischaemia  due  to pressure.  The  resultant \ndysfunction  depends  on site and  degree  of injury. \n• Bell's  palsy  is the compression  of a facial  nerve  in or just outside \nstylomastoid  foramen  due to inflammation  and oedema  of the nerve. \nThis causes  paralysis  of facial  muscles  and loss  of facial  expression \non the affected  side  (Fig.  7.24). \n• Acute  idiopathic  inflammatory  polyneuropathy  (Guillain-Barre \nsyndrome)  is a sudden,  acute  and progressive  bilateral  ascending \nparalysis  which  starts  at the lower  limb  and then  spreads  to arms, \ntrunks  and  cranial  nerves.  It is characterized  by widespread \ninflammation  with  some  demyelination  of spinal  and cranial  nerves \nand the spinal  ganglia.", metadata={}),
 Document(page_content="Nervous  System  I 165 \n- Mononeuropathy:  Usually  one  neuron  is affec

In [98]:
def answer_question(query):
    """Answer ``query`` from the documents most similar to it in the index.

    Reads two module-level names: ``test_index`` (the loaded FAISS index) and
    ``loaded_chain``.
    NOTE(review): ``loaded_chain`` is not defined in any visible cell — it
    must already exist in the kernel before this function is called.

    Args:
        query: The question text.

    Returns:
        The value returned by ``loaded_chain.run`` for the retrieved documents.
    """
    # Retrieve the context documents first, then hand them to the chain.
    similar_docs = test_index.similarity_search(query)
    return loaded_chain.run(question=query, input_documents=similar_docs)
    

In [95]:
result = answer_question("what is myocartila infarction tell in detail")
print(result)


 Myocardial infarction is a medical term for a heart attack. It is a condition in which one of the coronary arteries becomes blocked, preventing oxygen-rich blood from reaching the heart muscle. The lack of oxygen causes necrosis (death) of the cardiac muscle, leading to chest pain, not relieved by rest, referred to the left arm, chest, and neighbouring areas.
