# Custom Chatbot

The dataset for this project is the plain text of the Wikipedia article "2024", retrieved through the Wikipedia API; it lists events of 2024.

# Table Of Contents

1. [Data Wrangling](#processdata)
2. [Custom Query Completion](#querycompl)
3. [Custom Performance Demonstration](#demo)

## Data Wrangling
<a id ="processdata"></a>

In [1]:
import ast
import getpass

import numpy as np
import openai
import pandas as pd
import requests
import tiktoken
from dateutil.parser import parse
from openai.embeddings_utils import distances_from_embeddings

In [2]:
# Send every OpenAI request to the openai.vocareum.com endpoint rather than
# the library's default API base.
openai.api_base = "https://openai.vocareum.com/v1"
def get_openai_api_key():
    """Prompt (without echo) for an OpenAI API key until one is entered.

    Surrounding whitespace is stripped from the input. An empty entry prints
    a reminder and prompts again. Once a key is accepted, its last four
    characters are printed as a confirmation.

    Returns:
        str: the non-empty, stripped API key.
    """
    while True:
        api_key = getpass.getpass("Enter OpenAI API key (input hidden): ").strip()
        if api_key:
            break
        print("API key cannot be empty!")

    # Show only the tail of the key so the user can tell which one is active.
    print(f"API key configured (last 4 chars): ****{api_key[-4:]}")
    return api_key

# Prompt for the key when this cell runs and store it on the openai module.
openai.api_key = get_openai_api_key()

Enter OpenAI API key (input hidden): ········
API key configured (last 4 chars): ****1104


In [3]:
# ===============================
# Dataset & Embedding Functions
# ===============================

def load_dataset():
    """Download the Wikipedia article "2024" and return it as dated text samples.

    The plain-text extract is split into lines. Empty lines and ``==`` section
    headings are discarded. Of the remaining lines:

    * a line that already contains " – " (en dash) is kept unchanged;
    * a line that parses as a date is treated as a heading: it is not kept,
      but becomes the prefix for the lines that follow;
    * any other line is kept as ``"<prefix> – <line>"`` (the prefix is empty
      until the first date heading has been seen).

    Returns:
        pandas.DataFrame: one ``"text"`` column holding the samples, indexed
        by each line's position in the downloaded extract.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Get the Wikipedia page for "2024" since OpenAI's models stop in 2021
    resp = requests.get(
        "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exlimit=1&titles=2024&explaintext=1&formatversion=2&format=json",
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    # Fail with a clear HTTP error instead of an obscure KeyError further down.
    resp.raise_for_status()
    lines = resp.json()["query"]["pages"][0]["extract"].split("\n")

    # The samples are collected in plain lists rather than by assigning to the
    # rows yielded by DataFrame.iterrows(): pandas does not guarantee that such
    # writes reach the frame, so the date prefix could be silently dropped.
    kept_positions = []
    samples = []
    prefix = ""
    for position, line in enumerate(lines):
        # Clean up text to remove empty lines and headings
        if not line or line.startswith("=="):
            continue

        if " – " in line:
            # The line already has " – ", so it already carries its date prefix
            sample = line
        else:
            try:
                parse(line)
            except (ValueError, OverflowError):
                # Not a date: attach the most recent date heading
                sample = prefix + " – " + line
            else:
                # In some cases dates are used as headings instead of being
                # part of the text sample; remember the date and drop the line
                prefix = line
                continue

        kept_positions.append(position)
        samples.append(sample)

    return pd.DataFrame({"text": samples}, index=kept_positions)

def get_embedding_for_text(text, embedding_model_name="text-embedding-ada-002"):
    """Request an embedding for a single piece of text.

    Args:
        text: the text to embed; it is sent as a one-element input list.
        embedding_model_name: name of the OpenAI embedding model to use.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'data'`` list, or ``None`` (after printing an error) when the
        response has no ``'data'`` list.
    """
    response = openai.Embedding.create(input=[text], model=embedding_model_name)

    has_data_list = 'data' in response and isinstance(response['data'], list)
    if not has_data_list:
        print("Error: unexpected response for text:", text)
        return None

    return response['data'][0]['embedding']

def generate_embeddings(df, embedding_model_name="text-embedding-ada-002"):
    """Add an ``"embeddings"`` column to ``df`` with one embedding per row.

    Each row's ``'text'`` value is passed to ``get_embedding_for_text``.
    Progress is printed per row. If anything raises while handling a row,
    the exception is printed and ``None`` is stored for that row.

    Args:
        df: DataFrame with a ``'text'`` column; it is modified in place.
        embedding_model_name: name of the OpenAI embedding model to use.

    Returns:
        The same ``df`` object, now carrying the ``"embeddings"`` column.
    """
    vectors = []
    for idx, record in df.iterrows():
        print(f"Processing index {idx}")
        try:
            vector = get_embedding_for_text(record['text'], embedding_model_name)
        except Exception as err:
            # Keep going: one failed row must not abort the whole run.
            print(f"Exception at index {idx}: {err}")
            vector = None
        vectors.append(vector)

    df["embeddings"] = vectors
    return df

def save_embeddings(df, output_file):
    """Write ``df`` to ``output_file`` as CSV using pandas' default options.

    Args:
        df: the DataFrame to persist.
        output_file: destination passed straight to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=output_file)

def load_embeddings(file_path):
    """Load a CSV written by ``save_embeddings`` and rebuild the vectors.

    The ``"embeddings"`` column is stored in the CSV as the text of a Python
    list. Each cell is parsed with ``ast.literal_eval`` and converted to a
    NumPy array. ``literal_eval`` only accepts literals, so unlike ``eval``
    it cannot execute code placed in the file.

    Rows whose ``"embeddings"`` cell is empty (for example rows whose
    embedding could not be generated and was stored as ``None``) are dropped,
    since they carry no vector to parse.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        pandas.DataFrame: the CSV contents, minus rows without an embedding,
        with ``"embeddings"`` holding ``numpy.ndarray`` values.

    Raises:
        ValueError, SyntaxError: if an ``"embeddings"`` cell is not a valid
            Python literal.
    """
    df = pd.read_csv(file_path)

    # Empty cells are read back as NaN, which cannot be parsed as a list.
    # .copy() gives an independent frame so the assignment below is safe.
    df = df.dropna(subset=["embeddings"]).copy()

    df["embeddings"] = df["embeddings"].apply(
        lambda raw: np.array(ast.literal_eval(raw))
    )
    return df

def get_relevant_rows(question, df, embedding_model_name="text-embedding-ada-002", top_n=10):
    """Return the ``top_n`` rows of ``df`` closest to ``question``.

    The question is embedded with the given model, and the cosine distance
    between that embedding and every value in ``df['embeddings']`` is
    computed with ``distances_from_embeddings``.

    Args:
        question: the question text to embed.
        df: DataFrame with an ``'embeddings'`` column; it is not modified.
        embedding_model_name: name of the OpenAI embedding model to use.
        top_n: number of rows to return.

    Returns:
        A copy of ``df`` with an added ``'distance'`` column, limited to the
        ``top_n`` rows with the smallest distance. If the embedding response
        has no usable ``'data'`` entry, a warning is printed and
        ``df.head(top_n)`` is returned instead (without a distance column).
    """
    response = openai.Embedding.create(
        model=embedding_model_name,
        input=question
    )

    # Guard clause: without an embedding there is nothing to rank against.
    if 'data' not in response or len(response['data']) == 0:
        print(f"Warning: 'data' key not found or empty in question embedding response. Response: {response}")
        return df.head(top_n)  # Return top rows as fallback

    query_vector = response['data'][0]['embedding']

    ranked = df.copy()
    ranked['distance'] = distances_from_embeddings(
        query_vector,
        ranked['embeddings'].values,
        distance_metric="cosine",
    )
    return ranked.nsmallest(top_n, 'distance')

## Custom Query Completion
<a id ="querycompl"></a>
In the cells below, we compose a custom query using our chosen dataset and retrieve results from an OpenAI `Completion` model.

In [4]:
# ===============================
# Prompt Creation & Answering
# ===============================

def create_prompt(question, df, max_token_count=1500):
    """Build a context-grounded prompt for ``question`` from ``df['text']``.

    Texts are taken in the order they appear in ``df`` and added to the
    context while the running token count (template + question + texts
    chosen so far, counted with the ``cl100k_base`` encoding) stays within
    ``max_token_count``. Selection stops at the first text that does not fit.

    Args:
        question: the user's question.
        df: DataFrame with a ``"text"`` column of candidate context passages.
        max_token_count: token budget for the prompt.

    Returns:
        str: the template filled with the selected passages, joined by
        ``"\\n\\n###\\n\\n"``, and the question.
    """
    prompt_template = """
    Answer the question based on the context below. If the question can't be answered based on the context, say "I don't know."

    Context: {}

    ---

    Question: {}

    Answer:
    """
    encoder = tiktoken.get_encoding("cl100k_base")

    # Tokens still available once the fixed parts of the prompt are paid for.
    fixed_cost = len(encoder.encode(prompt_template)) + len(encoder.encode(question))
    remaining = max_token_count - fixed_cost

    selected = []
    for passage in df["text"].values:
        cost = len(encoder.encode(passage))
        if cost > remaining:
            break
        selected.append(passage)
        remaining -= cost

    context_block = "\n\n###\n\n".join(selected)
    return prompt_template.format(context_block, question)

def get_openai_answer(prompt, max_answer_tokens=150):
    """Send ``prompt`` to the ``gpt-3.5-turbo-instruct`` completion model.

    Args:
        prompt: the full prompt text.
        max_answer_tokens: value passed as ``max_tokens`` for the completion.

    Returns:
        str: the stripped text of the first choice. If the response has no
        choices, a warning is printed and ``"No answer found in response."``
        is returned. If anything raises, the error is printed and
        ``"An error occurred."`` is returned.
    """
    try:
        completion = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=max_answer_tokens,
        )

        has_choices = 'choices' in completion and len(completion['choices']) > 0
        if not has_choices:
            print(f"Warning: 'choices' key not found or empty in response. Response: {completion}")
            return "No answer found in response."

        return completion["choices"][0]["text"].strip()
    except Exception as exc:
        print(f"Error: {str(exc)}")
        return "An error occurred."

In [5]:
# ===============================
# Question Answering Functions
# ===============================

def answer_basic_question(question, max_answer_tokens=150):
    """Ask the completion model ``question`` directly, with no added context.

    Args:
        question: text sent unchanged as the prompt.
        max_answer_tokens: value passed as ``max_tokens`` for the completion.

    Returns:
        str: the stripped text of the first choice, or
        ``"An error occurred."`` (after printing the error) if anything raises.
    """
    request_args = dict(
        model="gpt-3.5-turbo-instruct",
        prompt=question,
        max_tokens=max_answer_tokens,
    )
    try:
        completion = openai.Completion.create(**request_args)
        first_choice = completion["choices"][0]
        return first_choice["text"].strip()
    except Exception as exc:
        print(f"Error: {str(exc)}")
        return "An error occurred."

def answer_question_with_context(question, df, max_prompt_tokens=1500, max_answer_tokens=150, top_n=10):
    """Answer ``question`` using the most relevant rows of ``df`` as context.

    Chains three steps: ``get_relevant_rows`` picks the ``top_n`` closest
    rows, ``create_prompt`` packs them into a prompt limited to
    ``max_prompt_tokens``, and ``get_openai_answer`` requests a completion
    of at most ``max_answer_tokens``.

    Returns:
        str: whatever ``get_openai_answer`` returns for the built prompt.
    """
    context_rows = get_relevant_rows(question, df, top_n=top_n)
    grounded_prompt = create_prompt(
        question,
        context_rows,
        max_token_count=max_prompt_tokens,
    )
    answer = get_openai_answer(grounded_prompt, max_answer_tokens=max_answer_tokens)
    return answer

In [7]:
# ===============================
# Main Function
# ===============================

def main():
    """Build the embedded dataset and return it as loaded back from disk.

    Loads the dataset, generates an embedding for every row, saves the
    result to ``./embeddings_with_vectors.csv``, then reloads that file.

    Returns:
        The DataFrame produced by ``load_embeddings`` for the saved CSV.
    """
    csv_path = "./embeddings_with_vectors.csv"

    # Generate embeddings and save them to a CSV file.
    embedded = generate_embeddings(load_dataset())
    save_embeddings(embedded, csv_path)

    # Hand back the copy read from disk rather than the in-memory frame.
    return load_embeddings(csv_path)

In [None]:
# ===============================
# Execution
# ===============================
# Build the dataset only when this code runs as the main module; the
# resulting frame is kept in the module-level name `data`, which the
# interactive question loop reads.
if __name__ == "__main__":
    data = main()

In [None]:
# Interactive question loop: answers each question from the embedded dataset
# held in `data` until the user types 'exit'.
while True:
    try:
        user_input = input("Ask a question (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Treat end-of-input or an interrupt at the prompt as a request to quit.
        print()
        break

    if user_input.lower() == 'exit':
        break
    if not user_input:
        # Nothing was asked; prompt again instead of issuing an API request.
        continue

    print(f"You asked: {user_input}")
    print(answer_question_with_context(user_input, data))