In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# Uses the `google.colab` package, so this cell only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Smoke test: prints a fixed string to confirm the cell above finished and the kernel responds.
print("OK!")

In [3]:
!pip -q install chromadb openai langchain tiktoken

In [None]:
!pip show chromadb

In [5]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [None]:
!unzip -q new_articles.zip -d new_articles

## Setting up Environment

In [7]:
import os
from getpass import getpass

# Never hardcode the key in the notebook, and never overwrite a real key with
# an empty string: reuse the value already present in the environment, and
# otherwise prompt for it without echoing it into the cell output.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("Enter your OpenAI API key: ")

## Import some libraries

In [None]:
!pip install -U langchain-community


In [9]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

## Load data

In [10]:
# Load every top-level .txt file from the directory the archive was extracted
# into. The path is relative (matching the `unzip -d new_articles` step) rather
# than a hardcoded absolute Colab path, so both cells agree on the location.
loader = DirectoryLoader("new_articles/", glob="./*.txt", loader_cls=TextLoader)

In [11]:
# Run the loader; `document` holds everything it loaded (one entry per matched file,
# presumably LangChain Document objects -- confirm against the loader's API).
document = loader.load()

In [None]:
# Preview only the first few loaded documents; displaying the full list
# floods the output with the entire corpus.
document[:3]

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [14]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text = text_splitter.split_documents(document)

In [None]:
# Preview only the first few chunks; the full list is large and the
# following cells already inspect its length and individual entries.
text[:3]

In [None]:
# Number of chunks produced by the splitter.
len(text)

In [None]:
# Inspect the second chunk (index 1).
text[1]

In [None]:
# Inspect the third chunk (index 2) for comparison with the previous one.
text[2]

## Creating DB

In [20]:
from langchain import embeddings

# Directory (relative to the working directory) where Chroma stores its files.
persist_directory = 'db'

# Embedding model used both to index the chunks and, later, to embed queries.
embedding = OpenAIEmbeddings()

# Embed every chunk and build the vector store backed by `persist_directory`.
vectordb = Chroma.from_documents(
    documents=text,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Persist the DB to disk, then drop the in-memory handle so the store
# has to be reopened from `persist_directory` before it can be used again.
vectordb.persist()
vectordb = None

In [None]:
# Reopen the persisted database from disk; it needs the same embedding
# function that was used to build it, and can then be used as normal.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

## Make a retriever

In [23]:
# Wrap the vector store in a retriever using its default search settings.
retriever = vectordb.as_retriever()

In [None]:
# Fetch the chunks most relevant to the question. `get_relevant_documents` is
# deprecated in LangChain; `invoke` is the supported call and returns the same
# list of documents.
docs = retriever.invoke("How much money did Microsoft raise?")

In [None]:
# How many documents the retriever returned for the query.
len(docs)

In [None]:
# Display the retrieved documents.
docs

In [27]:
# Rebuild the retriever, passing k=2 to the underlying search
# (presumably limiting each query to the 2 best matches -- confirm in the Chroma docs).
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
# Show which search type the retriever is configured with.
retriever.search_type

In [None]:
# Show the search keyword arguments currently set on the retriever.
retriever.search_kwargs

## Make a chain

In [30]:
from langchain.chains import RetrievalQA

In [None]:
# Instantiate the OpenAI LLM wrapper with its default settings.
llm=OpenAI()

In [None]:
# Display the LLM object's repr to inspect its configuration.
llm

In [33]:
# Create the chain to answer questions. It reuses the `llm` instance created
# above instead of constructing a second, identical OpenAI client.
# return_source_documents=True makes the response include the retrieved
# documents alongside the answer, so the sources can be cited.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [34]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer text) and,
            optionally, a 'source_documents' entry holding an iterable of
            objects that expose a ``metadata`` dict.

    Returns:
        None. Output is written to stdout.

    Raises:
        KeyError: If 'result' is missing from ``llm_response``.

    A response without 'source_documents' prints an empty source list, and a
    document whose metadata has no 'source' entry is listed as
    '<unknown source>' instead of raising KeyError.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # `or []` also covers an explicit None stored under the key.
    for source in llm_response.get("source_documents") or []:
        print(source.metadata.get('source', '<unknown source>'))

In [None]:
# full example
query = "How much money did Microsoft raise?"
# Calling the chain object directly is deprecated in LangChain; `invoke` with
# the chain's "query" input returns the same response dict.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response)

In [None]:
# break it down
query = "What is the news about Pando?"
# Calling the chain object directly is deprecated in LangChain; `invoke` with
# the chain's "query" input returns the same response dict.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response)

## Deleting the DB


In [None]:
# Archive the persisted ./db directory (recursively) into db.zip so it can be restored later.
!zip -r db.zip ./db

In [38]:
# To clean up, delete the collection from the vector store,
# then persist so the deletion is written to disk.
vectordb.delete_collection()
vectordb.persist()

# Delete the on-disk database directory itself.
!rm -rf db/

## Starting again: reloading the DB

In [None]:
!unzip db.zip