In [None]:
!pip install langchain faiss-cpu pypdf GitPython openpyxl sentence-transformers transformers llama-cpp-python PyPDF2 python-dotenv streamlit

### This notebook shows how multiple file types can be loaded into an open-source FAISS index

The embeddings are again produced by open-source sentence-transformers encoders.

**All Files are Converted to Text**

1) PDF File

### Libraries Involved

langchain

faiss-cpu

pypdf

GitPython

openpyxl

sentence-transformers

In [None]:
from langchain.embeddings import (
    LlamaCppEmbeddings,
    HuggingFaceEmbeddings,
    SentenceTransformerEmbeddings,
    HuggingFaceInstructEmbeddings
)

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import (
    PyPDFLoader,
    DataFrameLoader,
    GitLoader
  )
import pandas as pd
import nbformat
from nbconvert import PythonExporter
import os

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub


In [None]:
def get_text_splits(text_file, chunk_size=1000, chunk_overlap=150, encoding=None):
  """Read a text file and split its contents into overlapping chunks.

  Args:
    text_file: Path of the text file to read.
    chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``;
      lengths are measured with ``len``.
    chunk_overlap: ``chunk_overlap`` handed to the splitter.
    encoding: Encoding passed to ``open``. ``None`` (the default) keeps the
      platform default encoding, i.e. the behaviour this function had
      before the parameter existed. Pass ``"utf-8"`` for portable reads.

  Returns:
    The list produced by ``split_text`` for the file's full contents.
  """
  with open(text_file, 'r', encoding=encoding) as txt:
    data = txt.read()

  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap,
                                            length_function=len)
  return splitter.split_text(data)

In [None]:
def get_pdf_splits(pdf_file, chunk_size=1000, chunk_overlap=200):
  """Load a PDF and return its text as a flat list of chunks.

  Args:
    pdf_file: Path of the PDF handed to ``PyPDFLoader``.
    chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``;
      lengths are measured with ``len``.
    chunk_overlap: ``chunk_overlap`` handed to the splitter.

  Returns:
    A single list holding the ``split_text`` results for every loaded
    document, in loading order.
  """
  loader = PyPDFLoader(pdf_file)
  # NOTE(review): load_and_split() presumably already chunks the pages with
  # the loader's default splitter before the re-split below -- confirm
  # whether loader.load() (one document per page) was the intended call.
  pages = loader.load_and_split()

  splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                            chunk_overlap=chunk_overlap,
                                            length_function=len)
  # Flatten the per-document chunk lists into one list.
  return [chunk
          for pg in pages
          for chunk in splitter.split_text(pg.page_content)]

In [None]:
def embed_index(doc_list, embed_fn, index_store):
  """Embed ``doc_list`` and save it to, or merge it into, a local FAISS index.

  Args:
    doc_list: Items to embed. If every item is a ``str`` the index is built
      with ``FAISS.from_texts``; otherwise the items are treated as
      documents and passed to ``FAISS.from_documents``.
    embed_fn: Embedding object passed to FAISS both for building the new
      index and for loading an existing one.
    index_store: Folder path of the on-disk index. If the path exists, the
      stored index is loaded, the new embeddings are merged into it and it
      is saved back; otherwise the new index is saved there.

  Raises:
    Whatever FAISS or the embedding function raises. Failures while
    building the index are no longer swallowed and retried with a
    different builder.
  """
  # Pick the builder from the input type. The previous try/except around
  # from_documents hid genuine embedding/model errors behind a second,
  # unrelated from_texts attempt.
  if all(isinstance(item, str) for item in doc_list):
    faiss_db = FAISS.from_texts(doc_list, embed_fn)
  else:
    faiss_db = FAISS.from_documents(doc_list, embed_fn)

  if os.path.exists(index_store):
    # load_local is explicitly told to allow deserialization it considers
    # dangerous -- only point index_store at folders you trust.
    local_db = FAISS.load_local(index_store, embed_fn, allow_dangerous_deserialization=True)
    # Merge the new embeddings into the existing index store.
    local_db.merge_from(faiss_db)
    print("Merge completed")
    local_db.save_local(index_store)
    print("Updated index saved")
  else:
    faiss_db.save_local(folder_path=index_store)
    print("New store created...")


In [None]:
def get_docs_length(index_path, embed_fn):
  """Return the number of entries in the docstore of a saved FAISS index.

  Args:
    index_path: Folder path of the saved index to load.
    embed_fn: Embedding object passed to ``FAISS.load_local``.

  Returns:
    int: Size of the loaded index's ``docstore._dict`` mapping.
  """
  loaded = FAISS.load_local(index_path,
                            embeddings=embed_fn,
                            allow_dangerous_deserialization=True)
  # _dict is a private attribute of the docstore; presumably it maps
  # document ids to documents -- verify if the docstore type changes.
  return len(loaded.docstore._dict)

In [None]:
# #You can change the embedding here
# embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl")

In [None]:
# pdf_docs = get_pdf_splits("/content/Full book Rich Dad Poor Dad What the Rich Teach Their Kids About Money.pdf")

# embed_index(doc_list=pdf_docs,
#             embed_fn=embeddings,
#             index_store='Rich_dad')

In [None]:
# pdf_docs = get_pdf_splits("/content/Human_nature.pdf")

# embed_index(doc_list=pdf_docs,
#             embed_fn=embeddings,
#             index_store='Human_Nature')

In [None]:
# Chunk the PDF, then embed the chunks into the '48_laws' index folder
# (embed_index creates the folder if missing, otherwise merges into it).
pdf_docs = get_pdf_splits("/content/48_laws.pdf")

# NOTE(review): an instructor model is loaded through HuggingFaceEmbeddings
# rather than HuggingFaceInstructEmbeddings -- confirm this is intentional
# and matches whatever later queries the index.
embed_index(
    doc_list=pdf_docs,
    embed_fn=HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl"),
    index_store='48_laws',
)

In [None]:
# Print the response of ipv4.icanhazip.com to the cell output (quiet mode,
# body written to stdout). Presumably this is the machine's public IPv4
# address, which localtunnel asks for as the tunnel password -- confirm.
!wget -q -O - ipv4.icanhazip.com


In [None]:
# Start the Streamlit app appV2.py in the background (&) and open a
# localtunnel on port 8501 (presumably the port Streamlit serves on -- confirm).
!streamlit run appV2.py & npx localtunnel --port 8501