# CHATBOT AGENT using LANGCHAIN and OPENAI

# Install libraries

In [12]:
#!python3 -m pip install openai==1.55.2
#!python3 -m pip install langchain==0.3.9 --user 
#! pip install langchain-openai==0.2.10
#! pip install -U langchain-pinecone==0.2.0 
#(Some older versions are used due to compatibility issues)

# Load Data

In [8]:
# Create openai key and pinecone key
# The keys are read from the environment so that real secrets never have to be
# saved inside the notebook. The placeholder strings stay as fallbacks, so the
# "paste your key here" workflow still works when no variable is set.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "Your_openai_secret_key")
MY_KEY = os.environ.get("PINECONE_API_KEY", "Your_pinecone_key")

In [10]:
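# https://rajpurkar.github.io/SQuAD-explorer/
# We will be using the SQuAD dataset. Note: the 'squad' dataset id loaded below
# is SQuAD v1.1; SQuAD 2.0 is published under the id 'squad_v2'.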

In [13]:
# Data tooling: pandas for tabular work, Hugging Face `datasets` for the download
import pandas as pd
from datasets import load_dataset

# Fetch only the training split of the dataset
data = load_dataset('squad', split='train')

# Turn the Hugging Face dataset into a pandas DataFrame
df = data.to_pandas()

In [14]:
# The same context appears in several rows, so keep only the first row
# for each distinct `context` value.
df = df.drop_duplicates(subset='context', keep='first')

# Embedding via API
## convert documents to vectors

In [17]:
# OpenAI produces the embeddings; a small helper function wraps the call
# so that each text does not have to be embedded by hand.

import os
from openai import OpenAI

# Embedding model, chosen because it costs fewer credits than other text embedders
MODEL = "text-embedding-ada-002"

# Build the API client. An OPENAI_API_KEY environment variable, when set,
# takes precedence over the key assigned earlier in the notebook.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", OPENAI_API_KEY),
)

# helper function for embeddings
def get_embedding(text, model):
    """Embed a single text with the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(model=model, input=single_line)
    first_item = response.data[0]
    return first_item.embedding
# check the dimension of embedded vector as the same dimension will be used
# for Pinecone index creation
# here dimension=1536



In [23]:
# LangChain ships its own embeddings wrapper; it is used here for better
# compatibility between OpenAI and the other LangChain components.

from langchain_openai import OpenAIEmbeddings

MODEL = "text-embedding-ada-002"

# LangChain embedding object backed by the same OpenAI model
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=MODEL)

# Vector DB setup
## Pinecone setup

In [18]:
from pinecone import Pinecone, ServerlessSpec

# Connect to Pinecone with the key defined earlier in the notebook
pc = Pinecone(api_key=MY_KEY)

# Single source of truth for the index name used below
INDEX_NAME = "ai-agent"

# Create the index only when it is missing: create_index is rejected for a
# name that already exists, which made this cell fail on every re-run.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        INDEX_NAME,
        dimension=1536,       # must equal the embedding size (1536 here)
        metric='dotproduct',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # region="us-east-1" for free usage
    )

# Initiate the index handle used for upserts and queries
index = pc.Index(INDEX_NAME)

# Indexing/Upserting

In [22]:
# create a df sample due to rate limit on openai

df_sample = df.sample(1000, random_state=45)
batch_size = 100

# prepare ids, embeddings, metadata for upserting
from tqdm.auto import tqdm #for large datasets
import time #optional


for i in tqdm(range(0, len(df_sample), batch_size)):
    i_end = min(i + batch_size, len(df_sample))
    batch = df_sample.iloc[i:i_end]

    # One metadata dict per row holding the title and the passage text.
    # Built column-wise instead of with iterrows(), which also avoids reusing
    # the loop index name `i` inside a comprehension.
    meta_data = batch[['title', 'context']].to_dict('records')

    # Embed the passages of this batch (pd.Series -> python list -> list of vectors).
    # The LangChain embeddings are used; the get_embedding helper would work too:
    #     emb_vectors = [get_embedding(doc, MODEL) for doc in docs]
    docs = batch['context'].tolist()
    emb_vectors = embed.embed_documents(docs)

    ids = batch['id'].tolist()

    # Materialise the (id, vector, metadata) tuples as a list: the client
    # documents a list of vectors, and a bare zip iterator can be consumed only once.
    to_upsert = list(zip(ids, emb_vectors, meta_data))
    index.upsert(vectors=to_upsert)

  0%|          | 0/10 [00:00<?, ?it/s]

# Querying the db

## Semantic Search type

In [25]:
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

MODEL = "text-embedding-ada-002"

# Embedding object the vector store uses to embed incoming queries
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model=MODEL)

# Wrap the Pinecone index as a LangChain vector store.
# text_key="context": the 'context' field is the actual text to search from.
vectorstore = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    text_key="context",
    pinecone_api_key=MY_KEY,
)

# Pure semantic similarity search, nothing generative: fetch the 2 closest documents
query = "destruction of US fifth fleet"
results = vectorstore.similarity_search(query, k=2)


In [2]:
#results 
# Here it is working as semantic search agent which gives the full documents
# as context

# Retrieval Augmented Generation

In [35]:
# Now lets use langchain chat models for RAG type answers

In [41]:
# ChatOpenAI is imported from langchain_openai (the package this notebook
# installs and already uses for embeddings). The old langchain.chat_models
# path is a deprecated shim that needs the separate langchain-community package.
from langchain_openai import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory

from langchain.chains import RetrievalQA

# OpenAI chat model
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',  # for convenience, other versions can be used too
    temperature=0.0,             # keep sampling as deterministic as possible
)

# Conversational memory: keeps a window of the last k exchanges so that
# follow-up questions can be answered based on previous context.
conv_mem = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True,
)

# Retrieval QA chain: retrieved documents are "stuffed" into a single prompt
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)


In [42]:
# Ask the RetrievalQA chain a question; the captured output below shows the
# returned dict with the 'query' and 'result' keys.
query = "Which year university of notredame was established"
qa.invoke(query) # retrieve the supporting documents and generate the answer

{'query': 'Which year university of notredame was established',
 'result': 'The University of Notre Dame du Lac was established in 1842.'}

# CHATBOT AGENT

In [40]:
# Initialize tools to be used by agent
from langchain.agents import Tool

tools = [
    Tool(
    name = 'Knowledge Base',
    func = qa.invoke,
    description = ('use this when answering based on knowledge')
    )
]

In [37]:
#Initialize agent
from langchain.agents import initialize_agent
from langchain.agents import AgentType

agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conv_mem 
)

In [45]:
#agent("when was university of notredame established") # chat gpt kind

In [47]:
#agent("who founded the university") 
# answers about Notre Dame even though it is not named in the question,
# because the conversation memory (memory_key='chat_history') carries the earlier context

In [46]:
#agent("20+6")