## Import common modules and set up the OpenAI API key

In [2]:
# Importing open AI and also initiating client with api_key
import os
import openai

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hardcoding it here, so the secret is never saved inside the notebook.
# Set OPENAI_API_KEY in the environment before starting the kernel.
client = openai.OpenAI(
  api_key=os.environ.get("OPENAI_API_KEY"),
)

## Testcases: Using original model

In [None]:
def ask_original_model(question, max_tokens=1000):
    """
    Ask the completion model `question` with no additional context.

    Returns a tuple `(prompt, response, answer)`: the prompt text that was
    sent, the raw API response object, and the stripped text of the first
    completion choice.
    """
    prompt = f"""
Question: "{question}"
Answer:
"""
    response = client.completions.create(
        model = "gpt-3.5-turbo-instruct",
        prompt = prompt,
        max_tokens = max_tokens,
    )
    return prompt, response, response.choices[0].text.strip()


print("Using original model to answer cricket player stats")
print("***************************************************")

# Testcase#1: Testing with cricket player stats from 2023 based on original model response
dhoni_question_text = "What are Dhoni stats in 2023?"
dhoni_stats_2023_question, response, dhoni_stats_2023_answer = ask_original_model(
    dhoni_question_text
)
print(f"Question: {dhoni_question_text}")
print(f"Answer: {dhoni_stats_2023_answer}\n\n")

# Testcase#2: Testing with cricket player stats from 2023 based on original model response
# NOTE: the `rahane_*` variable names are kept for backward compatibility,
# but the question they hold is about Ruturaj.
ruturaj_question_text = "How many matches Ruturaj played in 2023?"
rahane_matches_2023_question, response, rahane_matches_2023_answer = ask_original_model(
    ruturaj_question_text
)

print(f"Question: {ruturaj_question_text}")
print(f"Answer: {rahane_matches_2023_answer}\n\n")

## Data Wrangling

### Load into a DataFrame, restrict to 25 rows, and create a synthetic text column used for semantic retrieval


In [None]:
# Reading data cricket_data.csv that contains cricket player stats data imported from Kaggle
import pandas as pd

# Load the cricket player stats from the local CSV file.
df = pd.read_csv('./cricket_data.csv')

# Build the text that gets embedded: "<Year> <Player_Name>".
# Year is cast to str so it can be concatenated with the player name.
df["text"] = df["Year"].astype(str) + ' ' + df["Player_Name"]

# Restricting rows to 25 to avoid embedding API compute cost.
# The explicit copy makes the result an independent frame, so adding columns
# to it later does not trigger pandas' SettingWithCopyWarning.
top_25_rows = df.head(25).copy()
df = top_25_rows

In [None]:
# Calculate the embedding and store it in local file if we have to rerun the script:
def get_embedding(text, model="text-embedding-3-small"):
    """
    Return the embedding vector for `text` from the OpenAI embeddings API.

    Newlines in `text` are replaced with spaces before the request is sent.
    The request goes through the module-level `client`.
    """
    single_line_text = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line_text], model=model)
    first_item = response.data[0]
    return first_item.embedding
# Embed each row's text (one API request per row), write the frame to
# ./embeddings.csv, and show the frame as the cell output.
row_embeddings = df["text"].apply(get_embedding, model='text-embedding-3-small')
df["embedding"] = row_embeddings
df.to_csv("./embeddings.csv", index=False)
df

In [None]:
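# Optional: uncomment the lines below to reload previously saved embeddings
# from ./embeddings.csv instead of recomputing them with the API.
# NOTE: `eval` executes whatever is in the file; only use it on a CSV you generated yourself.
#import numpy as np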
#import pandas as pd
#import openai

#df = pd.read_csv("./embeddings.csv")
#df['embedding'] = df.embedding.apply(eval).apply(np.array)

## Retrieving rows sorted by relevance

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_rows_sorted_by_relevance(question, df):
  """
  Rank the rows of `df` by how relevant they are to `question`.

  The question is embedded with `get_embedding`, and each row's "embedding"
  value is compared to it using cosine similarity.

  Returns a copy of `df` with an added "distances" column, sorted from MOST
  to LEAST relevant. Despite its name, "distances" holds the cosine
  similarity (higher = more relevant); the column name is kept so existing
  consumers of it keep working. The input `df` is not modified.
  """

  question_embedding = get_embedding(question)

  # cosine_similarity works on 2D inputs, so give the question embedding
  # a leading axis of length 1 (a single-row matrix)
  question_embedding_reshaped = np.expand_dims(question_embedding, axis=0)

  # Work on a copy so the caller's dataframe does not gain the extra column
  df_copy = df.copy()

  # Cosine similarity between the question and each row's embedding
  df_copy["distances"] = df_copy["embedding"].apply(
      lambda x: cosine_similarity(question_embedding_reshaped, np.array([x]))[0][0]
  )

  # Higher similarity = more relevant, so sort in descending order
  return df_copy.sort_values("distances", ascending=False)


# Testing the relevance function created above with sample queries:
import numpy as np
# Two sample queries. Only the final expression of a cell is rendered,
# so the Gaikwad result is computed but not displayed.
gaikwad_ranked_rows = get_rows_sorted_by_relevance(
    "How many matches Gaikwad played in 2024", df
)
dhoni_ranked_rows = get_rows_sorted_by_relevance(
    "What are batting stats for Dhoni in 2024", df
)
dhoni_ranked_rows

### Build a prompt augmented with our custom information and answer questions

In [None]:
# Build a prompt augmented with our custom information

import tiktoken

def _format_stats_row(row):
    """Render one dataframe row as a single line of "Label: value" pairs ending in a newline."""
    labelled_columns = (
        ("Year Played", "Year"),
        ("Player Name", "Player_Name"),
        ("Matches Batted", "Matches_Batted"),
        ("Not Outs", "Not_Outs"),
        ("Runs Scored", "Runs_Scored"),
        ("Highest Score", "Highest_Score"),
        ("Batting Average", "Batting_Average"),
        ("Balls Faced", "Balls_Faced"),
        ("Batting Strike Rate", "Batting_Strike_Rate"),
        ("Centuries", "Centuries"),
        ("Half Centuries", "Half_Centuries"),
        ("Fours", "Fours"),
        ("Sixes", "Sixes"),
        ("Catches Taken", "Catches_Taken"),
        ("Stumpings", "Stumpings"),
        ("Matches Bowled", "Matches_Bowled"),
        ("Balls Bowled", "Balls_Bowled"),
        ("Runs Conceded", "Runs_Conceded"),
        ("Wickets Taken", "Wickets_Taken"),
        ("Best Bowling Match", "Best_Bowling_Match"),
        ("Bowling Average", "Bowling_Average"),
        ("Economy Rate", "Economy_Rate"),
        ("Bowling Strike Rate", "Bowling_Strike_Rate"),
        ("Four Wicket Hauls", "Four_Wicket_Hauls"),
        ("Five Wicket Hauls", "Five_Wicket_Hauls"),
    )
    return ", ".join(f"{label}: {row[column]}" for label, column in labelled_columns) + "\n"


def create_prompt(question, df, max_token_count):
    """
    Given a question and a dataframe containing rows of text and their
    embeddings, return a text prompt to send to a Completion model.

    Rows are added to the context in the order returned by
    `get_rows_sorted_by_relevance` until adding the next row would push the
    estimated token count above `max_token_count`. The estimate covers the
    prompt template, the question, each row's text, and the separators
    placed between rows.
    """
    # Tokenizer used to estimate how many tokens each piece of the prompt costs
    tokenizer = tiktoken.get_encoding("cl100k_base")

    prompt_template = """
Answer the question based on the context below.

Context: 
The below data contains cricket player batting and bowling statistics by year where they played.
Using this information answer user questions.

{}

---

Question: {}
Answer:"""

    # Rows are joined with this separator, so its tokens count toward the budget too
    separator = "\n\n###\n\n"
    separator_token_count = len(tokenizer.encode(separator))

    # Start from the fixed cost: the template plus the question
    current_token_count = len(tokenizer.encode(prompt_template)) + \
                            len(tokenizer.encode(question))

    context = []

    df_sorted = get_rows_sorted_by_relevance(question, df)
    for _, row in df_sorted.iterrows():
        text = _format_stats_row(row)

        # Cost of this row, plus one separator for every row after the first
        text_token_count = len(tokenizer.encode(text))
        if context:
            text_token_count += separator_token_count

        # Stop at the first row that would exceed the budget
        if current_token_count + text_token_count > max_token_count:
            break

        current_token_count += text_token_count
        context.append(text)

    return prompt_template.format(separator.join(context), question)

# In[ ]:
# Completion model used to answer questions from the augmented prompt
COMPLETION_MODEL_NAME = "gpt-3.5-turbo-instruct"

def answer_question(
    question, df, max_prompt_tokens=3000, max_answer_tokens=1000
):
    """
    Given a question, a dataframe containing rows of text, and a maximum
    number of desired tokens in the prompt and response, return the
    answer to the question according to an OpenAI Completion model.

    The prompt is built with `create_prompt` and sent to the model named by
    `COMPLETION_MODEL_NAME`. The returned answer is the stripped text of the
    first completion choice.

    If the completion request raises, the error is printed and an empty
    string is returned. Errors raised while building the prompt are not
    caught.
    """

    prompt = create_prompt(question, df, max_prompt_tokens)
    try:
        response = client.completions.create(
            model = COMPLETION_MODEL_NAME,
            prompt = prompt,
            max_tokens = max_answer_tokens,
        )
        return response.choices[0].text.strip()
    except Exception as e:
        # Best-effort: report the failure and fall back to an empty answer
        print(e)
        return ""

### Test with RAG Supplied data

In [None]:
# Testcases: Using RAG supplied data
print("Using RAG supplied data to answer cricket player stats")
print("******************************************************")

# Each question is defined once and used for both the request and the
# printout, so the text shown is exactly the text that was asked.

# Testcase#1: Testing with a prompt using our custom augmented information
rag_question1 = "What are Dhoni stats in 2023?"
custom_answer1 = answer_question(rag_question1, df)
print(f"Question: {rag_question1}")
print(f"Answer: {custom_answer1}\n\n")

# Testcase#2: Testing with a prompt using our custom augmented information
rag_question2 = "How many matches Ruturaj played in 2023?"
custom_answer2 = answer_question(rag_question2, df)
print(f"Question: {rag_question2}")
print(f"Answer: {custom_answer2}\n\n")