# Building the documents database
#### Using OpenAI to generate embeddings
- See https://platform.openai.com/docs/guides/embeddings 
#### Using Chroma to store embeddings
- See https://docs.trychroma.com/getting-started
#### Using LangChain to manage embeddings, vectorstore and LLMs
- See https://docs.langchain.com/docs/use-cases/qa-docs 

#### Install Chroma (if not installed yet)

In [3]:
! pip install chromadb



#### Install PyPDF (if not installed yet)

In [1]:
! pip install pypdf

Collecting pypdf
  Downloading pypdf-3.9.0-py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.5/249.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.9.0


#### Install tiktoken (if not installed yet)

In [157]:
! pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2023.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (769 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769.7/769.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: regex, tiktoken
Successfully installed regex-2023.5.5 tiktoken-0.4.0


#### Libs

In [180]:
# Vector database - Chroma
import chromadb
from langchain.vectorstores import Chroma

# Question and Answering Chain
from langchain.chains import RetrievalQA

# OpenAI LLM
import openai
from langchain.llms import OpenAI

# OpenAI Embeddings
from langchain.embeddings import OpenAIEmbeddings

# Document loaders
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader

# Text splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Vector index
from langchain.indexes import VectorstoreIndexCreator

# My utils
from rautils import read_file as rf # Read plain text files

# Other utils
import os

#### OpenAI setup

In [154]:
# Read the OpenAI API key from a local text file using the rautils helper,
# so the key itself never appears in the notebook.
# NOTE(review): presumably read_file returns the file content as a string;
# confirm whether it strips a trailing newline.
oai_key_filename = 'ram_openai_apikey.txt'
openai_api_key = rf.read_file(oai_key_filename)

In [155]:
# Make the key available both through the environment variable and through
# the openai module attribute.
os.environ["OPENAI_API_KEY"] = openai_api_key
openai.api_key = openai_api_key

#### Chroma setup

In [145]:
# Create a Chroma client with its default settings (no arguments passed).
chroma_client = chromadb.Client()

In [146]:
# Fetch the collection if it already exists, otherwise create it, so this cell
# can be re-run on a live kernel without failing on the duplicate name.
# No embedding function is passed, so Chroma falls back to its default one
# (see the message printed below).
# NOTE(review): this collection is not referenced by the other cells shown in
# this notebook; the persisted store further down is built through LangChain.
my_docs_collection = chroma_client.get_or_create_collection(name="my_docs_collection")

No embedding_function provided, using default embedding function: DefaultEmbeddingFunction https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2


#### Documents loading

In [147]:
# Directory loader pointed at the local "docs/" folder (relative path).
loader = PyPDFDirectoryLoader("docs/")

In [148]:
# Load the documents; `docs` is indexable and each item exposes `page_content`
# (used by the check in the next cell).
docs = loader.load()

In [149]:
# Sanity check: show the first 100 characters of the first loaded document.
docs[0].page_content[:100]

'Perla: A Conversational Agent  for Depression Screen-\ning in Digital Ecosystems . Design, Implementa'

#### Create and test Index

In [158]:
# Build a vector-store index straight from the loader, using the
# VectorstoreIndexCreator defaults (no arguments passed).
index = VectorstoreIndexCreator().from_loaders([loader]) 

In [163]:
# Keep a handle to the index's underlying vector store, and a retriever
# built on top of that same store.
vector_store = index.vectorstore
vector_store_retriever = vector_store.as_retriever()

In [159]:
# Smoke-test the index with a simple factual question; the answer string
# is displayed as the cell output.
query = "What is Perla?"
index.query(query)

' Perla is a conversational agent designed to perform a structured interview based on the PHQ-9 questionnaire to effectively estimate the presence of depression symptoms for Spanish speaking population.'

In [162]:
# Second test question, asked against the same index.
query = "Is Perla a good alternative to test for depression in teenagers?"
index.query(query)

' No, Perla is designed to test for depression in Spanish-speaking adults. It is not designed to test for depression in teenagers.'

In [165]:
# Same kind of question, but through query_with_sources instead of query.
# NOTE: in the recorded run this call kept retrying on an OpenAI
# RateLimitError (quota exceeded) and was interrupted manually, so no
# answer is shown below.
query = "What's the impact of depression in Spanish population?"
index.query_with_sources(query)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..


KeyboardInterrupt: 

#### Persisting the index

In [167]:
# Directory (relative to the notebook) where the embeddings are stored
persist_directory = 'vector_db'

In [170]:
# Turn the loaded documents into text chunks (chunk_size=1000, no overlap
# between consecutive chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(docs)

In [172]:
# Number of chunks produced by the splitter.
len(texts)

50

In [178]:
# Create the OpenAI embeddings object with default settings; it is passed to
# Chroma both when building the store and when reloading it from disk.
embedding = OpenAIEmbeddings()

In [181]:
# Build the local vector database from the text chunks, embedding them with
# `embedding` and targeting `persist_directory` for on-disk storage.
# NOTE: the recorded run hit OpenAI connection/rate-limit errors here and was
# interrupted manually.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: You exceeded your current quota, please check your plan and billing details..


KeyboardInterrupt: 

In [None]:
# Force saving to disk, then drop the in-memory reference so that the next
# cell has to rebuild `vectordb` from the persisted directory.
vectordb.persist()
vectordb = None 

In [None]:
# Retrieve from disk: reopen the persisted store with the same embedding
# function that was used to build it.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

# Build the question-answering chain with RetrievalQA, which is the chain class
# imported at the top of this notebook. The previously used VectorDBQA name was
# never imported, so this cell failed with a NameError.
# RetrievalQA takes a retriever rather than the vector store itself.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
)

In [None]:
# Use the chain for qa:
query = "What's the percentage of females in this study?"
qa.run(query)

#### Local db cleanup

In [None]:
# To clean up, delete the collection held by the vector store and then
# persist, so the deletion is also written to the on-disk store.
vectordb.delete_collection()
vectordb.persist()

# Alternatively, remove the whole persist directory from the shell
# (replace <dir-name> with the value of `persist_directory`):
# !rm -rf <dir-name>/