In [None]:
# Dependencies (install once, then restart the kernel)
#%pip install -U langchain --user
#%pip install openai==0.28  # pinned: this notebook calls the legacy openai.Embedding API, which openai>=1.0 removed

# Chatbot agent using OpenAI and LangChain

### Data Loading

In [None]:
from datasets import load_dataset

# Load the 'train' split of the 'squad' dataset.
dataset = load_dataset('squad', split='train')

In [None]:
import pandas as pd
# Convert the loaded dataset to a pandas DataFrame for inspection, de-duplication and sampling.
df = dataset.to_pandas()

In [None]:
df.head(2)  # preview the first two rows

In [None]:
df.iloc[0]['answers']  # 'answers' field of the first row

In [None]:
df.iloc[0]['question']  # 'question' field of the first row

In [None]:
df.iloc[0]['context']  # 'context' field of the first row (the text that gets embedded later)

In [None]:
# Number of rows whose 'context' repeats an earlier row; rerun after the next cell to confirm it is 0.
int(df['context'].duplicated().sum())

In [None]:
# Keep exactly one row (the first) per distinct 'context'.
# Rebinding the name instead of using inplace=True avoids mutating the frame
# object that earlier cells displayed; the resulting `df` has the same content.
df = df.drop_duplicates(subset='context', keep='first')

### Embedding API

In [None]:
import os

import openai

# Prefer the OPENAI_API_KEY environment variable so the secret is not stored in
# the notebook. The placeholder is only a fallback and must be replaced with a
# real key if the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'openai key')

model = 'text-embedding-ada-002' #commonly used embedding model

# Smoke-test the embeddings endpoint with a short string (legacy openai<1.0 call style).
res = openai.Embedding.create(input = 'I love OpenAI', engine = model)

In [None]:
emb_vector = res['data'][0]['embedding'] # pull the embedding vector of the first item out of the response

In [None]:
def get_embedding(text, model):
    """Return the embedding vector for ``text`` using the given OpenAI model.

    Newlines in ``text`` are replaced by single spaces before the request is
    sent. The embedding of the first item in the response is returned.
    """
    single_line = ' '.join(text.split('\n'))
    response = openai.Embedding.create(input=single_line, engine=model)
    first_item = response['data'][0]
    return first_item['embedding']

In [None]:
vec = get_embedding('Testing the embedding', model)  # sample embedding; its length sets the index dimension below

In [None]:
len(vec) # embedding length; the Pinecone index must be created with this dimension

### Setup Pinecone

In [None]:
import os

# Pinecone credentials. Read them from the environment so they are not stored
# in the notebook; the placeholders are only fallbacks and must be replaced
# with real values if the variables are not set.
api_key = os.environ.get('PINECONE_API_KEY', 'pinecone key')
env = os.environ.get('PINECONE_ENVIRONMENT', 'pinecone env')

In [None]:
from pinecone import Pinecone, PodSpec
from tqdm.autonotebook import tqdm #progress bar

index_name = 'ai-agent'
pc = Pinecone(api_key=api_key)

# Create the index only when it does not exist yet, so this cell can be re-run
# without create_index raising on an existing index.
# To start from an empty index, delete the old one in the Pinecone console first.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=len(vec),  # must equal the length of the embedding vectors
        metric='dotproduct',
        spec=PodSpec(
            environment=env,
            pod_type="Starter",
            pods=1,
        ),
    )

# Handle used by the cells below for upserts, stats and querying.
idx = pc.Index(index_name)

### Indexing

In [None]:
# Work on a fixed random sample of 2500 rows; random_state makes the sample repeatable.
df_sample = df.sample(2500, random_state=45)
batch_size = 10  # number of rows embedded and upserted per loop iteration
import time

In [None]:
#%%use to compute waiting time
# Embed the sampled contexts batch by batch with get_embedding and upsert them into the index.
for i in tqdm(range(0, len(df_sample), batch_size)):
    i_end = min(i+batch_size, len(df_sample))
    batch = df_sample.iloc[i:i_end]
    # One metadata dict per row, holding the 'title' and 'context' columns.
    metadata = batch[['title', 'context']].to_dict('records')
    #for the embedding, we use the openai api
    docs = batch['context'].tolist() #pd-series to python list
    emb_vectors = [get_embedding(doc, model) for doc in docs]

    ids = batch['id'].tolist()

    # Materialise the (id, vector, metadata) triples: a bare zip is a one-shot
    # iterator and would be empty if it were ever consumed a second time.
    to_upsert = list(zip(ids, emb_vectors, metadata))
    idx.upsert(vectors=to_upsert)

    time.sleep(15) #pause between batches so the embedding requests stay under the OpenAI rate limit
    #basically, you want an iteration of batch_size=10 to take 1 minute in total
    #subtract the wall time from 60 to find your computers time to sleep
    #ONLY WORKS FOR SMALL LOADS

In [None]:
def get_embedding2(text):
    """Return the 'text-embedding-ada-002' embedding vector for ``text``.

    Single-argument variant of the embedding helper, so it can be passed as a
    plain ``text -> vector`` callable. Newlines are replaced by spaces before
    the request is sent.
    """
    single_line = ' '.join(text.split('\n'))
    response = openai.Embedding.create(
        input=single_line,
        engine='text-embedding-ada-002',
    )
    return response['data'][0]['embedding']

In [None]:
from langchain.vectorstores import Pinecone

In [None]:
idx.describe_index_stats()  # show index statistics as a sanity check after upserting

In [None]:
# Wrap the Pinecone index as a LangChain vector store.
# The third argument is the metadata field LangChain reads as the document text.
# The indexing loops store each passage under 'context' (there is no 'text'
# field), so the key must be 'context'; with 'text' every match is skipped and
# searches come back empty.
vectorstore = Pinecone(idx, get_embedding2, 'context')

In [None]:
query = 'What does the state of obesity contribute to'  # sample question reused by the search and QA cells below

In [None]:
# Pure semantic search (no generation): ask the vector store for the 3 best matches to the query.
vectorstore.similarity_search(query, k=3)
# Results may be poor when only a small sample of contexts is indexed; index more records if needed.
# NOTE(review): if this returns nothing, check that the vector store's text key names a metadata
# field that was actually written during indexing ('title' / 'context').

In [None]:
import os

from langchain_community.embeddings import OpenAIEmbeddings
#from langchain.embeddings.openai import OpenAIEmbeddings

# Read the key from the OPENAI_API_KEY environment variable so the secret is
# not stored in the notebook; the placeholder is only a fallback and must be
# replaced with a real key if the variable is not set.
openai_key = os.environ.get('OPENAI_API_KEY', 'openai key')
model_name = 'text-embedding-ada-002'

# LangChain embedding wrapper, used instead of the hand-written helper functions.
embed = OpenAIEmbeddings(
    model = model_name,
    openai_api_key = openai_key
)

In [None]:
#embed.embed_query("embedding single query") #embedding single document

# Quick check of the wrapper: embed two short documents in one call.
embed.embed_documents(["first doc", "second doc"])

In [None]:
#Clear pinecone database before running this cell
# Same indexing pass as before, but the embeddings come from the LangChain wrapper `embed`.
for i in tqdm(range(0, len(df_sample), batch_size)):
    i_end = min(i+batch_size, len(df_sample))
    batch = df_sample.iloc[i:i_end]
    # One metadata dict per row, holding the 'title' and 'context' columns.
    metadata = batch[['title', 'context']].to_dict('records')
    #for the embedding, we use the openai api
    docs = batch['context'].tolist() #pd-series to python list
    emb_vectors = embed.embed_documents(docs) #use this instead of having to define our own embedding model
    #either one works, langchain is nice to use so we don't have to write our own functions which can contain bugs
    ids = batch['id'].tolist()

    # Materialise the (id, vector, metadata) triples: a bare zip is a one-shot
    # iterator and would be empty if it were ever consumed a second time.
    to_upsert = list(zip(ids, emb_vectors, metadata))
    idx.upsert(vectors=to_upsert)

    #time.sleep(15) #optional pause between batches to stay under the OpenAI rate limit
    #basically, you want an iteration of batch_size=10 to take 1 minute in total
    #subtract the wall time from 60 to find your computers time to sleep
    #ONLY WORKS FOR SMALL LOADS

### Define AI Agent

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [None]:
# Chat model used by both the QA chain and the agent.
# temperature 0.0 keeps the responses conservative; per the original note the
# model's training data runs up to September 2021.
llmodel = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0,
    openai_api_key=openai_key,
)

# Conversational memory: a window buffer configured with k=5, exposed to the
# agent under the 'chat_history' key as message objects.
conv_mem = ConversationBufferWindowMemory(
    k=5,
    memory_key='chat_history',
    return_messages=True,
)

In [None]:
# Retrieval QA chain: 'stuff' chain type over the chat model, with documents supplied by the vector store's retriever.
qa = RetrievalQA.from_chain_type(llm=llmodel, chain_type='stuff', retriever = vectorstore.as_retriever())

### Invoke the agent

In [None]:
qa.invoke(query)  # run the retrieval QA chain on the sample question

The key difference between using an agent (here, the retrieval QA chain) and plain semantic search is that the agent returns a generated answer directly, whereas semantic search only returns the documents in which we can look for the answer ourselves.

In [None]:
from langchain.agents import Tool

# Expose the retrieval QA chain to the agent as its single tool.
knowledge_base_tool = Tool(
    name='Knowledge Base',
    func=qa.run,
    description='Use this when answering based on knowledge',
)
tools = [knowledge_base_tool]

In [None]:
from langchain.agents import initialize_agent
from langchain.agents import AgentType

In [None]:
# Conversational ReAct agent wired to the knowledge-base tool, the chat model
# and the windowed conversation memory. It is capped at 3 iterations, with the
# 'generate' early-stopping method.
agent = initialize_agent(
    tools=tools,
    llm=llmodel,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    memory=conv_mem,
    max_iterations=3,
    early_stopping_method='generate',
    verbose=True,
)

In [None]:
agent('What does obesity contribute to')  # ask the agent a question; verbose=True prints its intermediate steps

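## Known issue: the agent may not return any response from the vector database. If retrieval comes back empty, first verify that the vector store's text key names a metadata field that was actually written during indexing (`title`, `context`). To be investigated further.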