In [1]:
!pip -q install langchain openai tiktoken chromadb pypdf sentence_transformers InstructorEmbedding faiss-cpu

In [29]:
import os
from getpass import getpass

# Credentials are never written into the notebook. A value already present in the
# environment is kept; otherwise it is requested interactively without echoing.
# Pressing Enter at the prompt leaves the variable set to an empty string.
for _secret_name in ("OPENAI_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

In [27]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.llms import HuggingFaceHub

In [4]:
# InstructorEmbedding 
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# OpenAI Embedding
from langchain.embeddings import OpenAIEmbeddings

In [6]:
# Base folder of the project data; the document and embedding-store paths are built from it.
root_dir = "./files"

In [7]:
# Read the PDF files matching the glob under the Documents folder, using PyPDFLoader for each file.
loader = DirectoryLoader(
    f'{root_dir}/Documents/',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

In [8]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200)
# so that each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [9]:
# Show the first chunk produced by the splitter.
texts[0]

Document(page_content='Pierre Bourdieu on Marriage Strategies\nSource: Population and Development Review, Vol. 28, No. 3 (Sep., 2002), pp. 549-558 \nPublished by: Population Council\nStable URL: http://www.jstor.org/stable/3092841 .\nAccessed: 06/02/2015 14:53\nYour use of the JSTOR archive indicates your acceptance of the Terms & Conditions of Use, available at .\nhttp://www.jstor.org/page/info/about/policies/terms.jsp\n .\nJSTOR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of\ncontent in a trusted digital archive. We use information technology and tools to increase productivity and facilitate new forms\nof scholarship. For more information about JSTOR, please contact support@jstor.org.\n .\nPopulation Council is collaborating with JSTOR to digitize, preserve and extend access to Population and\nDevelopment Review.\nhttp://www.jstor.org \nThis content downloaded from 128.235.251.160 on Fri, 6 Feb 2015 14:53:22 PM

In [10]:
# Number of chunks produced by the splitter.
len(texts)

45

In [11]:
import pickle
import faiss
from langchain.vectorstores import FAISS

In [12]:
def store_embeddings(docs, embeddings, sotre_name, path):
    """Build a FAISS vector store from ``docs`` and pickle it to disk.

    The store is written to ``<path>/faiss_<sotre_name>.pkl``.

    Args:
        docs: Documents passed to ``FAISS.from_documents``.
        embeddings: Embedding model passed to ``FAISS.from_documents``.
        sotre_name: Name inserted into the pickle file name. The spelling is
            kept as-is so that existing keyword callers keep working.
        path: Directory that receives the pickle file. It is created when it
            does not exist yet.

    Returns:
        None.
    """
    import os  # local import so the function does not rely on another cell having run

    vectorStore = FAISS.from_documents(docs, embeddings)

    # Without this, open() raises FileNotFoundError for a directory that was never created.
    if path:
        os.makedirs(path, exist_ok=True)

    with open(f"{path}/faiss_{sotre_name}.pkl", "wb") as f:
        pickle.dump(vectorStore, f)

In [13]:
def load_embeddings(sotre_name, path):
    """Return the object unpickled from ``<path>/faiss_<sotre_name>.pkl``.

    Args:
        sotre_name: Name that was inserted into the pickle file name.
        path: Directory containing the pickle file.

    Returns:
        Whatever object the pickle file contains.
    """
    pickle_file = f"{path}/faiss_{sotre_name}.pkl"
    # NOTE: unpickling can execute arbitrary code; only load files this project wrote itself.
    with open(pickle_file, "rb") as handle:
        return pickle.load(handle)

In [14]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
import torch

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs={"device": embedding_device},
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [15]:
# Folder for persisted embedding stores.
# NOTE(review): not referenced by the visible cells; presumably meant as the `path`
# argument of store_embeddings / load_embeddings — confirm.
Embedding_store_path = f"{root_dir}/Embedding_store"

In [16]:
# Build a FAISS vector store from the chunks, embedded with the Instructor model.
db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)

In [17]:
# Retriever over the FAISS store; k=3 is passed as a search keyword argument.
retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 3})

In [18]:
# Check which search type the retriever uses.
retriever.search_type

'similarity'

In [19]:
# Check the search keyword arguments stored on the retriever.
retriever.search_kwargs

{'k': 3}

In [22]:
# Retrieval on its own (no LLM involved): fetch the chunks returned for a sample question.
docs = retriever.get_relevant_documents("whom should a modest girl marry?")

In [23]:
# Show the first retrieved chunk.
docs[0]

Document(page_content='sister (or sisters) was quite different from that of one who had a brother (or \nbrothers). All our informants have indicated that the adot of girls was al- \nmost always higher than that of boys, so that their chances of marriage were \ngreater. The reason, as we have seen, was that families had no choice but to \nmarry off these useless mouths, and as quickly as possible. \nIn the case of younger sons the strategies could be more complex, the \nfirst reason being that an abundant, even superabundant, supply of labor \nwill create a great desire for land that can only be beneficial to the patri- \nmony. It follows that families were less anxious to marry off a younger son \n(except, perhaps, the first of the younger sons in an important family) than \na younger daughter or even an eldest son. One way would be to marry him \nto an heiress, a course that was very normal and best suited to his own \ninterests, though not necessarily to the interest of the lineage: 

In [35]:
# Question-answering chain: chunks fetched by `retriever` are handed to the
# google/flan-t5-xxl model on the HuggingFace Hub using the "stuff" chain type.
# The source documents are returned together with the answer.
qa_chain_instrucEmbed = RetrievalQA.from_chain_type(
    llm=HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 1000},
    ),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [36]:
# Helpers that print the chain's answer followed by the source files it was drawn from

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    Each line of the input (split on ``'\\n'``) is wrapped separately with
    ``textwrap.fill``; the wrapped pieces are then joined with ``'\\n'`` again.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the answer of a chain response, then the source of each document.

    Args:
        llm_response: Mapping with a ``'result'`` string and a
            ``'source_documents'`` iterable whose items expose
            ``metadata['source']``.

    Returns:
        None. Everything is written to standard output.
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    print(answer)
    print('\nSources:')
    for doc in llm_response["source_documents"]:
        origin = doc.metadata['source']
        print(origin)

In [43]:
# Question put to the retrieval-QA chain.
query = 'whom should important family son marry?'

print('-------------------Instructor Embeddings------------------\n')
# Run the chain on the question, then print the answer followed by its sources.
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

younger daughter of an important family

Sources:
files\Documents\On marriage strategies.pdf
files\Documents\On marriage strategies.pdf
files\Documents\On marriage strategies.pdf
