### Setting up langchain and chromadb

In [None]:
# Run the repository's setup.sh via the notebook shell escape.
# NOTE(review): presumably installs the packages imported below -- confirm in setup.sh.
!bash setup.sh

In [None]:
import os
import shutil

import dotenv
from langchain_openai import ChatOpenAI
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import (
     PromptTemplate,
     SystemMessagePromptTemplate,
     HumanMessagePromptTemplate,
     ChatPromptTemplate,
)
from langchain_core.output_parsers import StrOutputParser
from langchain.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.schema.runnable import RunnablePassthrough
from langchain.docstore.document import Document
import chromadb
from chromadb.config import Settings
from lib import id_maker, add_data_to_db
# Load variables from a .env file into the process environment before
# reading any settings.
dotenv.load_dotenv()

# API key for the OpenAI clients; None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Directory used by Chroma for persistence; defaults to ./CHROMA_PATH.
CHROMA_PATH = os.environ.get("CHROMA_PATH", "./CHROMA_PATH")
# Name of the Chroma collection used throughout this notebook.
COLLECTION_NAME = "collection1"


### Create Chroma database

##### Clear the database -- note that you will have to restart the session

In [None]:

# Pages to ingest into the collection.
urls = ["https://en.wikipedia.org/wiki/Napoleon"]

# ID source handed to add_data_to_db.
# NOTE(review): presumably a callable producing unique document ids,
# starting from 0 -- confirm against lib.id_maker.
unique_id = id_maker(0).f

# Persistent Chroma collection backed by OpenAI embeddings.
vector_store = Chroma(
    embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
    persist_directory=CHROMA_PATH,
    collection_name=COLLECTION_NAME,
)

# Ingest the pages into the collection (implemented in the local lib module).
add_data_to_db(urls, vector_store, unique_id)



### Check if vector store insertions are working

In [None]:

# vector_store = Chroma(
#     collection_name=COLLECTION_NAME,
#     persist_directory=CHROMA_PATH,
#     embedding_function=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
#     )

# retriever  = vector_store.as_retriever(search_kwargs={"k": 10})
# query="who was Napoleon?"

# results = retriever.invoke(query)

# # Print the results
# print(f"Number of documents retrieved: {len(results)}")
# for doc in results:
#     print(f"Content: {doc.page_content}")
#     print(f"Source: {doc.metadata['url']}")

### Create retrieval chain

In [None]:
# System instructions for the model; {context} is replaced with the
# retrieved document text when the prompt is rendered.
system_template_str = """Your job is to to answer questions using
 context provided to answer questions.
 Be as detailed as possible, but don't make up any information
 that's not from the context. If the answer does not directly follow from context,
 say
 you don't know.  Please state the url from which this information was extracted.
 The url is in the metadata for each document in the context supplied for the prompt.

{context}
"""

# One template per chat role: the system message carries the instructions
# plus context, the human message carries the question verbatim.
system_prompt = SystemMessagePromptTemplate.from_template(system_template_str)
human_prompt = HumanMessagePromptTemplate.from_template("{question}")

# Combined chat prompt expecting the "context" and "question" variables.
messages = [system_prompt, human_prompt]
prompt_template = ChatPromptTemplate.from_messages(messages)

In [None]:
def create_prompt(context, question):
    """Render the module-level ``prompt_template`` with the given values.

    Args:
        context: Text substituted for ``{context}`` in the system message.
        question: Text substituted for ``{question}`` in the human message.

    Returns:
        The result of ``prompt_template.format`` for these two values.
    """
    values = {"context": context, "question": question}
    return prompt_template.format(**values)

# Define the main chain
def chain(question, retriever=None):
    """Answer *question* using documents retrieved from the vector store.

    Args:
        question: Natural-language question to answer.
        retriever: Optional object whose ``invoke(question)`` returns
            documents exposing ``page_content`` and a ``"url"`` metadata
            entry. When omitted, a retriever is built from the module-level
            ``vector_store`` with the library's default search settings.

    Returns:
        The chat model's response to the rendered prompt.

    Raises:
        KeyError: If a retrieved document has no ``"url"`` metadata entry.
    """
    # No active cell defines a module-level ``retriever`` (its only
    # assignment is commented out), so build one here instead of failing
    # with NameError on the first call.
    if retriever is None:
        retriever = vector_store.as_retriever()

    # Retrieve documents and flatten them into one context string, tagging
    # each chunk with its source url so the model can cite it.
    docs = retriever.invoke(question)
    context = " ".join(
        f"{doc.page_content} url:{doc.metadata['url']}\n" for doc in docs
    )

    # NOTE(review): create_prompt uses ChatPromptTemplate.format, which
    # returns one flattened string, so the system instructions reach the
    # model inside a single user message rather than as a system message.
    # Consider format_messages if distinct roles are wanted.
    prompt = create_prompt(context, question)
    print(prompt)

    # temperature=0 keeps the answer as deterministic as the API allows.
    chat_model = ChatOpenAI(
        model="gpt-3.5-turbo-0125",
        temperature=0,
        openai_api_key=OPENAI_API_KEY,
    )
    return chat_model.invoke(prompt)

In [None]:
# End-to-end check: ask one question through the retrieval chain and show
# the model's response.
result = chain(question="Who was Napoleon's daughter?")
print(result)