## OpenAI vs. Local Embeddings
A performance comparison of two embedding options:
- OpenAI's Embedding Model
- InstructorEmbedding (https://huggingface.co/hkunlp/instructor-xl)

In [None]:
!pip -q install langchain openai tiktoken chromadb pypdf sentence_transformers InstructorEmbedding faiss-cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.5/415.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m271.1/271.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m64.9 MB/s[0m eta

In [None]:
import os
from getpass import getpass

# Secrets must never be stored in the notebook itself. Reuse a key that is
# already exported in the environment; otherwise prompt for it without echoing
# the value into the cell output.
# NOTE(review): any key that was ever hard-coded in this cell should be treated
# as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

In [None]:
# InstructorEmbedding
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


In [None]:
# OpenAI Embedding
from langchain.embeddings import OpenAIEmbeddings

### Load Multiple Files from a Directory

In [None]:
# Mount Google Drive inside the Colab VM and remember the Drive root folder.
from google.colab import drive

_gdrive_mount = "/content/gdrive"
drive.mount(_gdrive_mount, force_remount=True)
root_dir = f"{_gdrive_mount}/My Drive"

Mounted at /content/gdrive


In [None]:
# Load the PDFs matching "./*.pdf" under <root_dir>/Documents/ with PyPDFLoader.
# (An earlier draft used a single-file variant: TextLoader('single_text_file.txt').)
pdf_dir = f'{root_dir}/Documents/'
loader = DirectoryLoader(pdf_dir, glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [None]:
# documents

### Divide and Conquer

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200)
# before they are embedded.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(documents)

In [None]:
# Display the first chunk to sanity-check the split.
texts[0]

Document(page_content='1501 Copyright Small Claims15.1401 Unauthorized use of pre-1972 sound recordings\xa01 14.1301 Protection of Original Designs13.1201 Copyright Protection and Management Systems12.1101 Sound Recordings and Music Videos11.1001 Digital Audio Recording Devices and Media10.901 Protection of Semiconductor Chip Products9.801 Proceedings by Copyright Royalty Judges8.701 Copyright Office7.601 Importation and Exportation6.501 Copyright Infringement and Remedies5.401 Copyright Notice, Deposit, and Registration4.301 Duration of Copyright3.201 Copyright Ownership and Transfer2.101 Subject Matter and Scope of Copyright1.Sec. Chap.(Release Point 118-13)\nTITLE 17—COPYRIGHTS\n\xa0\xa0\xa0\xa0This title was enacted by act July 30, 1947, ch. 391, 61 Stat. 652, and was revised in its entirety\nby\xa0\xa0\xa0\xa0 Pub. L. 94–553, title I, §101, Oct. 19, 1976, 90 Stat. 2541\n\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\n\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\nEDITORIAL NOTES\nAMENDMENTS\n2020—Pub. L. 11

In [None]:
# Number of chunks produced by the splitter.
len(texts)

2079

### Get Embeddings for OUR Documents

In [None]:
!pip install faiss-cpu



In [None]:
import pickle
import faiss
from langchain.vectorstores import FAISS

In [None]:
def store_embeddings(docs, embeddings, sotre_name, path):
    """Build a FAISS index from ``docs`` and pickle it to disk.

    The index is written to ``{path}/faiss_{sotre_name}.pkl``.

    Args:
        docs: Documents handed to ``FAISS.from_documents``.
        embeddings: Embedding object handed to ``FAISS.from_documents``.
        sotre_name: Store name used in the file name. The misspelling is kept
            on purpose because existing calls pass it by keyword.
        path: Directory to write into; it is created when missing.

    Returns:
        None.
    """
    # Create the target folder up front: without it open() raises
    # FileNotFoundError, and failing before the embedding pass is cheaper
    # than failing after it.
    if path:
        os.makedirs(path, exist_ok=True)

    vector_store = FAISS.from_documents(docs, embeddings)

    with open(f"{path}/faiss_{sotre_name}.pkl", "wb") as f:
        pickle.dump(vector_store, f)

In [None]:
def load_embeddings(sotre_name, path):
    """Read back the object pickled at ``{path}/faiss_{sotre_name}.pkl``.

    Args:
        sotre_name: Store name used in the file name (spelling kept as in the
            signature).
        path: Directory that contains the pickle file.

    Returns:
        Whatever object was pickled into that file.

    Note:
        ``pickle.load`` can execute arbitrary code contained in the file, so
        only load files you created yourself or otherwise trust.
    """
    pickle_file = f"{path}/faiss_{sotre_name}.pkl"
    with open(pickle_file, "rb") as source:
        return pickle.load(source)

### HF Instructor Embeddings

In [None]:
import torch
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Use the GPU when the runtime has one, otherwise fall back to the CPU so the
# cell does not fail on a CPU-only runtime.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs={"device": embedding_device},
)

Downloading (…)7f436/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)0daf57f436/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

Downloading (…)af57f436/config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)7f436/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading (…)f57f436/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
# Folder under root_dir used as `path` by the (currently commented-out)
# store_embeddings / load_embeddings calls below.
Embedding_store_path = f"{root_dir}/Embedding_store"

In [None]:
# store_embeddings(texts,
#                  instructor_embeddings,
#                  sotre_name='instructEmbeddings',
#                  path=Embedding_store_path)

In [None]:
# db_instructEmbedd = load_embeddings(sotre_name='instructEmbeddings',
#                                     path=Embedding_store_path)

In [None]:
# Embed every chunk with the Instructor model and build an in-memory FAISS index.
# NOTE(review): the recorded output of this cell is a RuntimeError; confirm the
# cause before relying on the cells that use db_instructEmbedd.
db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)

RuntimeError: ignored

In [None]:
# Expose the FAISS index as a retriever configured with k=3.
top_k = 3
retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": top_k})

In [None]:
# Display the retriever's configured search type.
retriever.search_type

In [None]:
# Display the search keyword arguments the retriever was created with.
retriever.search_kwargs

In [None]:
# Run one sample question through the retriever.
query = "How do I get a loan to start my own business?"
docs = retriever.get_relevant_documents(query)

In [None]:
# Display the first document returned for the query.
docs[0]