In [None]:
!pip install h5py
!pip install typing-extensions
!pip install wheel

In [None]:
!pip -q install chromadb openai tiktoken pypdf
!pip install -U langchain-community langchain_openai

In [None]:
!pip show chromadb

In [None]:
import os

# `openai` was referenced here without ever being imported, which raises
# NameError on a fresh kernel (Restart & Run All).
import openai

# Take the key from the environment so it is never stored in the notebook.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [33]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain import embeddings

In [21]:
# Load the source PDF into LangChain documents.
loader = PyPDFLoader("./spaul/Deep-Learning-with-PyTorch.pdf")
document = loader.load()

In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into chunks of at most 1000 characters, with
# 20 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
text = text_splitter.split_documents(document)

In [None]:
# Display the second chunk as a quick sanity check of the split.
text[1]

Create the database

In [35]:
# Embed every chunk with OpenAI embeddings and build a Chroma store from them,
# using the local `db` directory as its persistence location.
persist_directory = 'db'
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=text,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [44]:
# Persist the db to disk
# NOTE(review): newer Chroma releases appear to persist automatically, which
# would make this call redundant - confirm against the installed version.
vectordb.persist()
# Drop the in-memory handle so the next cell demonstrates a reload from disk.
vectordb = None

In [None]:
# Now the persisted database can be loaded from disk.
# The same `persist_directory` and `embedding` objects used at creation time
# are passed back in.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [46]:
# Wrap the vector store in a retriever and run a sample query against it.
retriever = vectordb.as_retriever()
# `get_relevant_documents` is deprecated; `invoke` is the supported retriever
# entry point and returns the same list of matching documents.
docs = retriever.invoke("What is natural language processing?")

In [None]:
# Display the retriever's configured search type.
retriever.search_type

Extract the OpenAI API response using LangChain

In [None]:
from langchain.chains import RetrievalQA

# Build the OpenAI completion model with its default settings; leaving `llm`
# as the final expression displays its configuration.
llm = OpenAI()
llm

In [None]:
# Create the chain to answer questions.
# Reuse the `llm` built in the previous cell rather than constructing a second
# OpenAI() instance, so any change made to `llm` is the one the chain uses.
# chain_type="stuff" places all retrieved chunks into a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer, then the source of each supporting document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and a
            'source_documents' entry (an iterable of objects whose
            ``metadata`` mapping contains a 'source' key).
    """
    answer = llm_response['result']
    print(answer)
    print('\n\nSources:')
    supporting_docs = llm_response["source_documents"]
    for doc in supporting_docs:
        print(doc.metadata['source'])
In [None]:
query = "What is natural language processing?"
# Calling the chain object directly (`qa_chain(query)`) is deprecated; `invoke`
# is the supported entry point. With return_source_documents=True the response
# dict carries the 'result' and 'source_documents' keys that
# process_llm_response reads.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response)

Delete the database

In [None]:
# Archive the on-disk database first; the steps below remove it.
!zip -r db.zip ./db

# To clean up, delete the collection
vectordb.delete_collection()
# NOTE(review): newer Chroma releases appear to persist automatically, which
# would make this call redundant - confirm against the installed version.
vectordb.persist()

# Delete the directory
!rm -rf db/

Reload the database

In [None]:
# Unpack the archive created before cleanup to restore the database files.
!unzip db.zip