In [2]:

#openai.api_key = os.getenv("OPENAI_API_KEY")

# Load

Specify a `DocumentLoader` to load your unstructured data as `Documents`.

In [3]:
from langchain.document_loaders import TextLoader

# Read the source text file and load it as Documents.
loader = TextLoader(file_path="australia.txt")
data = loader.load()

To load multiple documents from different sources, combine several loaders with `MergedDataLoader`:

```python
from langchain.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.document_loaders.merge import MergedDataLoader

loader_web = WebBaseLoader(
    "https://github.com/basecamp/handbook/blob/master/37signals-is-you.md"
)
loader_pdf = PyPDFLoader("../MachineLearning-Lecture01.pdf")

loader_all = MergedDataLoader(loaders=[loader_web, loader_pdf])
docs_all = loader_all.load()
```

Alternatively, load every file in a directory; see https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory

# Split

Split the Document into chunks for embedding and vector storage.

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks; chunk_size=500 is the value used in prototyping.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(documents=data)

# Store

To look up our document splits later, we first need to store them somewhere searchable. The most common way to do this is to embed the contents of each document, then store the embedding and the document together in a vector store, with the embedding used to index the document.

In [5]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embed every split with OpenAIEmbeddings and index the results in a Chroma vector store.
# NOTE: the recorded output of this cell shows Chroma raising RuntimeError when the
# system sqlite3 is older than 3.35.0; upgrade sqlite3 before running this cell.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=OpenAIEmbeddings(),
)

RuntimeError: Your system has an unsupported version of sqlite3. Chroma requires sqlite3 >= 3.35.0.
Please visit https://docs.trychroma.com/troubleshooting#sqlite to learn how to upgrade.

# Retrieve

Retrieve relevant splits for any question using similarity search.

In [None]:
# Run a similarity search for the question and show how many splits came back.
question = "Do I need a visa?"
docs = vectorstore.similarity_search(query=question)
len(docs)

# Generate

Distill the retrieved documents into an answer using an LLM/Chat model

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI  # was used below without being imported
from langchain.prompts import PromptTemplate

# Prompt that tells the model to answer only from the supplied context and to
# reply "I don't know" otherwise. {context} and {question} are the template's
# two placeholders.
template = (
    "Answer the question based on the context below, and if the question "
    "can't be answered based on the context, say \"I don't know\"\n\n"
    "Context: {context}\n\n"
    "---\n\n"
    "Question: {question}\n"
    "Answer:"
)
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# ChatOpenAI calls the chat-completions endpoint, so it needs a chat model.
# The previous value, "text-davinci-003", is a completion-only model and is
# rejected by that endpoint. temperature=0 is kept from the original call.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Build a retrieval QA chain that uses the vector store as its retriever and
# the custom prompt defined above.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# Ask the question defined in the Retrieve section and display the answer text.
result = qa_chain({"query": question})
result["result"]