# ChatBot AI Agent with LangChain and OpenAI

### Install

In [3]:
# !python3 -m pip install openai==1.55.2 # from terminal or jupyter notebook
# !python3 -m pip install langchain==0.3.9 --user # latest as of Nov 2024
# pip install langchain-openai==0.2.10
# pip install -U langchain-pinecone==0.2.0 

### Data Load

In [1]:
import os
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
# Load environment variables in a file called .env
# Print the key prefixes to help with any debugging

load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")

OpenAI API Key exists and begins sk-proj-


In [1]:
# https://rajpurkar.github.io/SQuAD-explorer/

from datasets import load_dataset 

data = load_dataset('squad', split='train')
df = data.to_pandas()
df.drop_duplicates(subset='context', keep='first', inplace=True)
print(df.shape)
df.head(2)

  from .autonotebook import tqdm as notebook_tqdm


(18891, 5)


Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
5,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,"{'text': ['September 1876'], 'answer_start': [..."


In [2]:
# df = data.to_pandas()
# df.iloc[0]['context']
# df.iloc[0]['question']
# df.iloc[0]['answers']
# sum(df['context'].duplicated())

### Embedding API

In [3]:
# This block is older code using openai==0.28. 
# You can still use it but make sure to install : pip install openai==0.28

# openai.api_key = OPENAI_API_KEY
# MODEL  = "text-embedding-ada-002"

# res = openai.Embedding.create(input = "I love openai", engine = MODEL) 
# embedding = response['data'][0]['embedding']
# print(embedding)

In [3]:
# This block uses lates (Nov 2024) version of openai : # openai.__version__  #'1.55.2'
from openai import OpenAI
import os

MODEL = "text-embedding-ada-002"

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", openai_api_key)) # first one will work if set as env var, second one explicit one

response = client.embeddings.create(
    input="I love openai",
    model=MODEL
)

# usage
embedding  = (response.data)[0].embedding # new code for v-1.55.2
# print(embedding)


In [7]:
# helper function
def get_embedding(text, model):
    text = text.replace("\n", " ")
    res = client.embeddings.create(input = text, model = model) # engine replaced by model in v-1.55.2
    return (response.data)[0].embedding

vec = get_embedding("I am trying a new text \n And see what happens", MODEL)
print(f"required vector dim: {len(vec)}") # dimension of the index"

required vector dim: 1536


### Vector DB Setup

In [None]:
pc_api_key = os.getenv('PINECONE_API_KEY')

In [None]:
# db of 1536 dimension

from pinecone import Pinecone, ServerlessSpec
# API_KEY = "YOUR API KEY"
pc = Pinecone(api_key = pc_api_key)

# pc.create_index("ai-agent", dimension=1536, metric='dotproduct',
#                      spec=ServerlessSpec(cloud="aws", region="us-east-1"))
index = pc.Index("ai-agent")
# index.delete(delete_all=True)

### Indexing

In [9]:
df_sample = df.sample(1000, random_state=45)
batch_size = 100 # free tier limit 20 RPM in 2023 now 3000

In [9]:
# embedding function from OpenAI, old code  won't work anymore. instead use langchain-openai as shown in next cell
# from langchain.embeddings.openai import OpenAIEmbeddings

# model_name = "text-embedding-ada-002"

# embed = OpenAIEmbeddings(
#     model = model_name,
#     openai_api_key= OPENAI_API_KEY)

In [10]:
# https://pypi.org/project/langchain-openai/
# pip install langchain-openai==0.2.10

from langchain_openai import OpenAIEmbeddings 

MODEL = "text-embedding-ada-002"
embed = OpenAIEmbeddings(
    model = MODEL,
    openai_api_key= OPENAI_API_KEY)

# Usage:
# doc1 = "Hello how are you"
# doc2 = "Hello everyone!"
# embed.embed_query(doc) # single doc
# embed.embed_documents([doc1, doc2])  # output will be list of list

In [43]:
from tqdm.auto import tqdm
import time

In [44]:
%%time
for i in tqdm(range(0, len(df_sample), batch_size)):
    i_end = min(i+batch_size, len(df_sample))
#     print(i, i_end)
    batch = df_sample.iloc[i:i_end]
    meta_data = [{"titile" : row['title'], 
              "context": row['context']} 
             for i, row in batch.iterrows()]
    
    # embedding  
    docs = batch['context'].tolist()  # pd.Series to python list
#     emb_vectors = [get_embedding(doc, MODEL) for doc in docs] 
    emb_vectors = embed.embed_documents(docs) # list of list

    ids = batch['id'].tolist()
    
    # upsert
    to_upsert = zip(ids, emb_vectors, meta_data)    
    index.upsert(vectors=to_upsert)
    
    # time.sleep(20) # 8s for 50 data points, this was needed when free tier had rate limit to 20RPM, no need anymore

    
# df.shape[0]/3600 # 5 hrs to load , free tier will take 15hrs
# # 14000 records/dollar

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.66s/it]

CPU times: total: 3.48 s
Wall time: 26.6 s





### Using

In [12]:
def get_embedding2(text):
    text = text.replace("\n", " ")
    res = client.embeddings.create(input = text, 
                                  model = "text-embedding-ada-002")
    return (res.data)[0].embedding

# get_embedding2("tabula rasa")

In [69]:
# Updated code with new libraries and classes
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings 

MODEL = "text-embedding-ada-002"

# Initialize the vector store with the correct embedding method
embeddings = OpenAIEmbeddings(model=MODEL, api_key=OPENAI_API_KEY)


vectorstore = PineconeVectorStore(index, embeddings, "context", pinecone_api_key= API_KEY) # df['context'] column is the actual text field to search from

# Perform the similarity search, pure semantic, nothing genrative
query = "destruction of US fifth fleet"
results = vectorstore.similarity_search(query, k=2)


In [67]:
results

[Document(id='573228eab9d445190005e86f', metadata={'titile': 'Pacific_War'}, page_content="It was imperative for Japanese commanders to hold Saipan. The only way to do this was to destroy the U.S. Fifth Fleet, which had 15 fleet carriers and 956 planes, 7 battleships, 28 submarines, and 69 destroyers, as well as several light and heavy cruisers. Vice Admiral Jisaburo Ozawa attacked with nine-tenths of Japan's fighting fleet, which included nine carriers with 473 planes, 5 battleships, several cruisers, and 28 destroyers. Ozawa's pilots were outnumbered 2:1 and their aircraft were becoming or were already obsolete. The Japanese had considerable antiaircraft defenses but lacked proximity fuzes or good radar. With the odds against him, Ozawa devised an appropriate strategy. His planes had greater range because they were not weighed down with protective armor; they could attack at about 480 km (300 mi)[citation needed], and could search a radius of 900 km[citation needed] (560 mi). U.S. Na

In [44]:
# embed.embed_query("embedding single document")

# embed.embed_documents(["first doc", "second doc"])

### Define QA Agent

In [70]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory \
import ConversationBufferWindowMemory

from langchain.chains import RetrievalQA

# OpenAI LLM
llm = ChatOpenAI(openai_api_key = OPENAI_API_KEY,
                model_name = 'gpt-3.5-turbo',
                temperature = 0.0)

# conversational memory
conv_mem = ConversationBufferWindowMemory(
    memory_key = 'chat_history',
    k = 5,
    return_messages =True)

# retrieval qa
qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = vectorstore.as_retriever())


# https://python.langchain.com/en/latest/modules/chains/index_examples/question_answering.html
# https://docs.langchain.com/docs/components/chains/index_related_chains

### Invoking Retrieval QA

In [71]:
query = "Which year university of notredame was established"
qa.invoke(query) # retrieving the info

{'query': 'Which year university of notredame was established',
 'result': 'The University of Notre Dame du Lac was established in 1842.'}

In [72]:
query = "who established the university of notredame"
qa.invoke(query)

{'query': 'who established the university of notredame',
 'result': 'The University of Notre Dame du Lac was established by Father Edward Sorin, who was a priest of the Congregation of Holy Cross. He founded the university in 1842.'}

In [73]:
from langchain.agents import Tool

tools = [
    Tool(
    name = 'Knowledge Base',
    func = qa.invoke,
    description = ('use this when answering based on knwowledge')
    )
]

In [74]:
from langchain.agents import initialize_agent
from langchain.agents import AgentType

agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conv_mem 
)

In [75]:
agent("when was university of notredame established") # chat gpt kind



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "University of Notre Dame establishment date"
}
```[0m
Observation: [36;1m[1;3m{'query': 'University of Notre Dame establishment date', 'result': 'The University of Notre Dame du Lac was founded on November 26, 1842.'}[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The University of Notre Dame du Lac was founded on November 26, 1842."
}
```[0m

[1m> Finished chain.[0m


{'input': 'when was university of notredame established',
 'chat_history': [],
 'output': 'The University of Notre Dame du Lac was founded on November 26, 1842.'}

In [76]:
agent("who founded the university")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "University of Notre Dame du Lac was founded by Rev. Edward Sorin, C.S.C., who was a priest of the Congregation of Holy Cross."
}
```[0m
Observation: [36;1m[1;3m{'query': 'University of Notre Dame du Lac was founded by Rev. Edward Sorin, C.S.C., who was a priest of the Congregation of Holy Cross.', 'result': 'Yes, that is correct. Rev. Edward Sorin, C.S.C., a priest of the Congregation of Holy Cross, founded the University of Notre Dame du Lac in 1842.'}[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Rev. Edward Sorin, C.S.C., a priest of the Congregation of Holy Cross, founded the University of Notre Dame du Lac in 1842."
}
```[0m

[1m> Finished chain.[0m


{'input': 'who founded the university',
 'chat_history': [HumanMessage(content='when was university of notredame established', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The University of Notre Dame du Lac was founded on November 26, 1842.', additional_kwargs={}, response_metadata={})],
 'output': 'Rev. Edward Sorin, C.S.C., a priest of the Congregation of Holy Cross, founded the University of Notre Dame du Lac in 1842.'}

In [77]:
agent("20+6")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "26"
}
```[0m

[1m> Finished chain.[0m


{'input': '20+6',
 'chat_history': [HumanMessage(content='when was university of notredame established', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The University of Notre Dame du Lac was founded on November 26, 1842.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='who founded the university', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Rev. Edward Sorin, C.S.C., a priest of the Congregation of Holy Cross, founded the University of Notre Dame du Lac in 1842.', additional_kwargs={}, response_metadata={})],
 'output': '26'}

#### Note on the Rate Limit

Rate Limit: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb

Retry Options: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb

### Further Reading

https://arxiv.org/abs/2005.11401 

https://platform.openai.com/docs/models/gpt-3-5