# ChatBot AI Agent with LangChain and OpenAI

### Install

In [15]:
# !python3 -m pip install openai # from terminal or jupyter notebook
# !python3 -m pip install langchain --user

### Data Load

In [2]:
# https://rajpurkar.github.io/SQuAD-explorer/

from datasets import load_dataset

# Load the SQuAD training split and keep a single row per distinct
# context passage (the first occurrence is the one retained).
data = load_dataset('squad', split='train')
df = data.to_pandas().drop_duplicates(subset='context', keep='first')

Found cached dataset squad (/home/mohsin/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [35]:
# df = data.to_pandas()

In [9]:
df.iloc[0]['context']

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [10]:
df.iloc[0]['question']

'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'

In [13]:
df.iloc[0]['answers']

{'text': array(['Saint Bernadette Soubirous'], dtype=object),
 'answer_start': array([515], dtype=int32)}

In [31]:
# sum(df['context'].duplicated())

68708

In [36]:
# df.drop_duplicates(subset='context', keep='first', inplace=True)

In [18]:
df.shape

(18891, 5)

In [19]:
df.head(2)

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
5,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,"{'text': ['September 1876'], 'answer_start': [..."


### Embedding API

In [3]:
# Open AI
import getpass
import os

import openai

# The key is read from the OPENAI_API_KEY environment variable so it is never
# saved in the notebook; when the variable is unset or empty, prompt for it
# (the typed value is not echoed).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [4]:
# Embedding model name reused by later cells.
MODEL = "text-embedding-ada-002"
openai.api_key = OPENAI_API_KEY

# Send one short sentence to the embeddings endpoint and keep the raw response.
res = openai.Embedding.create(engine=MODEL, input="I love openai")

In [49]:
# Pull the embedding of the first item out of the response held in `res`.
emb_vector  = res["data"][0]["embedding"] # parsing embedded vectors

In [5]:
def get_embedding(text, model):
    """Return the embedding of ``text`` from ``openai.Embedding.create``.

    Every newline in ``text`` is replaced by a single space before the
    request is made. ``model`` is forwarded as the ``engine`` argument.
    The value returned is the ``"embedding"`` entry of the first item in
    the response's ``"data"`` list.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(engine=model, input=single_line)
    first_item = response["data"][0]
    return first_item["embedding"]

In [6]:
vec = get_embedding("I am trying a new text \n And see what happens", MODEL)

In [7]:
len(vec) # number of components in the embedding returned by get_embedding

1536

### Vector DB Setup

In [8]:
# db of 1536 dimension
import getpass
import os

import pinecone

# Credentials come from the environment so they are never saved in the
# notebook; when a variable is unset or empty, prompt for the value instead.
API_KEY = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
ENV = os.environ.get("PINECONE_ENVIRONMENT") or input("Pinecone environment: ")

pinecone.init(api_key = API_KEY, environment = ENV)
# One-time index creation (left disabled once the index exists):
# pinecone.create_index("ai-agent", dimension=1536, metric='dotproduct')
index = pinecone.Index("ai-agent")

In [None]:
# index.delete(delete_all=True)


### Indexing

In [92]:
# Draw a fixed-seed sample of 10,000 rows from df to index.
df_sample = df.sample(10000, random_state=45)
# Number of rows embedded and upserted per iteration of the indexing loop.
batch_size = 20 # free tier limit 20 RPM

In [None]:
# embedding function from OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = "text-embedding-ada-002"

# LangChain embedding wrapper; later cells call embed.embed_documents and
# embed.embed_query on it.
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [93]:
from tqdm.auto import tqdm
import time

In [94]:
%%time
for start in tqdm(range(0, len(df_sample), batch_size)):
    # Slice out the next batch; the last slice may be shorter than batch_size.
    stop = min(start + batch_size, len(df_sample))
    batch = df_sample.iloc[start:stop]

    docs = batch['context'].tolist()  # pd.Series to python list
    ids = batch['id'].tolist()

    # Metadata stored next to each vector. The passage is written under
    # "text" because the LangChain vector store in this notebook is built
    # with "text" as its text key; the original "titile"/"context" keys did
    # not match what that store reads back.
    meta_data = [{"title": title, "text": doc}
                 for title, doc in zip(batch['title'].tolist(), docs)]

    # embedding
#     emb_vectors = [get_embedding(doc, MODEL) for doc in docs]
    emb_vectors = embed.embed_documents(docs) # list of list

    # upsert: one (id, vector, metadata) tuple per row, materialised as a
    # list so the client receives a concrete, re-iterable sequence
    to_upsert = list(zip(ids, emb_vectors, meta_data))
    index.upsert(vectors=to_upsert)

    time.sleep(20) # pause between batches (original note: 8s for 50 data points)

    


  0%|          | 0/500 [00:00<?, ?it/s]

In [88]:
df.shape[0]/3600 # 5 hrs to load , free tier will take 15hrs
# 14000 records/dollar

5.2475

### Using

In [11]:
def get_embedding2(text):
    """Embed ``text`` using the fixed ``text-embedding-ada-002`` engine.

    Newlines in ``text`` are replaced by spaces, the result is sent to
    ``openai.Embedding.create``, and the ``"embedding"`` entry of the first
    item in the response's ``"data"`` list is returned.
    """
    flattened = " ".join(text.split("\n"))
    request = {"input": flattened, "engine": "text-embedding-ada-002"}
    return openai.Embedding.create(**request)["data"][0]["embedding"]

In [46]:
# LangChain vector store definition
from langchain.vectorstores import Pinecone

# Alternative that embeds queries with the raw OpenAI helper instead:
# vectorstore = Pinecone(index, get_embedding2, "text")
# NOTE(review): "text" is presumably the metadata field LangChain reads the
# document text from -- confirm it matches the key written during indexing.
vectorstore = Pinecone(index, embed.embed_query, "text")

In [47]:
query = "When was university of notredame establish"

In [33]:
openai.api_key = OPENAI_API_KEY

In [48]:
# pure semantic, non generative, non agent based
vectorstore.similarity_search(query, k=3)

[Document(page_content="In 1919 Father James Burns became president of Notre Dame, and in three years he produced an academic revolution that brought the school up to national standards by adopting the elective system and moving away from the university's traditional scholastic and classical emphasis. By contrast, the Jesuit colleges, bastions of academic conservatism, were reluctant to move to a system of electives. Their graduates were shut out of Harvard Law School for that reason. Notre Dame continued to grow over the years, adding more colleges, programs, and sports teams. By 1921, with the addition of the College of Commerce, Notre Dame had grown from a small college to a university with five colleges and a professional law school. The university continued to expand and add new residence halls and buildings with each subsequent president.", metadata={'title': 'University_of_Notre_Dame'}),
 Document(page_content="After the Fall of Rome, the Catholic Church became the sole preserve

In [44]:
# embed.embed_query("embedding single document")

# embed.embed_documents(["first doc", "second doc"])

### Define QA Agent

In [55]:
from langchain.chains import RetrievalQA
from langchain.chains.conversation.memory import (
    ConversationBufferWindowMemory,
)
from langchain.chat_models import ChatOpenAI

# OpenAI LLM (temperature 0.0)
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

# conversational memory, exposed to the agent under the 'chat_history' key
conv_mem = ConversationBufferWindowMemory(
    k=5,
    memory_key='chat_history',
    return_messages=True,
)

# retrieval qa: "stuff" chain over the Pinecone-backed retriever
qa = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)


# https://python.langchain.com/en/latest/modules/chains/index_examples/question_answering.html
# https://docs.langchain.com/docs/components/chains/index_related_chains

### Invoking Retrieval QA

In [57]:
query = "When was university of notredame establish"
qa.run(query) # retrieving the info

'The University of Notre Dame was established in 1842.'

In [58]:
query = "who established the university of notredame"
qa.run(query)

'The University of Notre Dame was established by Father Edward Sorin, a French priest of the Congregation of Holy Cross, in 1842.'

In [59]:
from langchain.agents import Tool

# Single tool handed to the agent: it forwards the tool input to the
# RetrievalQA chain (`qa.run`). The description is the text the agent is
# given for deciding when to call the tool, so its spelling matters
# ("knwowledge" -> "knowledge").
tools = [
    Tool(
        name = 'Knowledge Base',
        func = qa.run,
        description = ('use this when answering based on knowledge')
    )
]

In [70]:
from langchain.agents import AgentType, initialize_agent

# Conversational ReAct chat agent wired to the tool list, the chat LLM and
# the windowed conversation memory defined in earlier cells.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    memory=conv_mem,
    max_iterations=3,
    early_stopping_method='generate',
    verbose=True,
)

In [71]:
agent("when was university of notredame established") # chat gpt kind



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Knowledge Base",
    "action_input": "University of Notre Dame establishment date"
}[0m
Observation: [36;1m[1;3mThe University of Notre Dame was founded on November 26, 1842.[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The University of Notre Dame was founded on November 26, 1842."
}[0m

[1m> Finished chain.[0m


{'input': 'when was university of notredame established',
 'chat_history': [HumanMessage(content='what is marian place', additional_kwargs={}, example=False),
  AIMessage(content="The Challenger Deep is a small slot-shaped valley in the floor of the Mariana Trench, which is the deepest part of the world's oceans. It reaches a maximum-known depth of 10,984 meters (36,037 feet) (± 25 meters [82 feet]) at the Challenger Deep.", additional_kwargs={}, example=False)],
 'output': 'The University of Notre Dame was founded on November 26, 1842.'}

In [72]:
agent("who founded the university")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The University of Notre Dame was founded by Father Edward Sorin, a French priest of the Congregation of Holy Cross, in 1842."
}[0m

[1m> Finished chain.[0m


{'input': 'who founded the university',
 'chat_history': [HumanMessage(content='what is marian place', additional_kwargs={}, example=False),
  AIMessage(content="The Challenger Deep is a small slot-shaped valley in the floor of the Mariana Trench, which is the deepest part of the world's oceans. It reaches a maximum-known depth of 10,984 meters (36,037 feet) (± 25 meters [82 feet]) at the Challenger Deep.", additional_kwargs={}, example=False),
  HumanMessage(content='when was university of notredame established', additional_kwargs={}, example=False),
  AIMessage(content='The University of Notre Dame was founded on November 26, 1842.', additional_kwargs={}, example=False)],
 'output': 'The University of Notre Dame was founded by Father Edward Sorin, a French priest of the Congregation of Holy Cross, in 1842.'}

In [73]:
agent("20+6")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "26"
}[0m

[1m> Finished chain.[0m


{'input': '20+6',
 'chat_history': [HumanMessage(content='what is marian place', additional_kwargs={}, example=False),
  AIMessage(content="The Challenger Deep is a small slot-shaped valley in the floor of the Mariana Trench, which is the deepest part of the world's oceans. It reaches a maximum-known depth of 10,984 meters (36,037 feet) (± 25 meters [82 feet]) at the Challenger Deep.", additional_kwargs={}, example=False),
  HumanMessage(content='when was university of notredame established', additional_kwargs={}, example=False),
  AIMessage(content='The University of Notre Dame was founded on November 26, 1842.', additional_kwargs={}, example=False),
  HumanMessage(content='who founded the university', additional_kwargs={}, example=False),
  AIMessage(content='The University of Notre Dame was founded by Father Edward Sorin, a French priest of the Congregation of Holy Cross, in 1842.', additional_kwargs={}, example=False)],
 'output': '26'}

#### Note on the Rate Limit

Rate Limit: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb

Retry Options: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb

### Further Reading

https://arxiv.org/abs/2005.11401 

https://platform.openai.com/docs/models/gpt-3-5