In [16]:
# %pip install python-dotenv openai pandas

from typing import Dict, List
from utilities import num_tokens_from_messages, get_embedding, get_n_nearest_neighbors, memoize_to_sqlite
from f1_utilities import wikipedia_splitter, Section
from io import StringIO

import csv
import requests
import os
import itertools
import tiktoken
import openai
import pandas as pd

from dotenv import load_dotenv

# Read environment variables from the .env file one directory above the notebook
load_dotenv(dotenv_path="../.env")

# A missing OPENAI_API_KEY raises KeyError here rather than failing later on the first API call
openai.api_key = os.environ["OPENAI_API_KEY"]

In [17]:
from typing import Optional


# Total number of tokens (prompt + response) budgeted for one chat completion
MAX_CONTEXT_WINDOW = 4_097
# Tokens kept free so the model has room to write its answer
MINIMUM_RESPONSE_SPACE = 1_000
# Whatever remains is the most the prompt (instructions + context + question) may use
MAX_PROMPT_SIZE = MAX_CONTEXT_WINDOW - MINIMUM_RESPONSE_SPACE


def ask_embedding_store(question: str, embeddings: Dict[Section, List[float]], max_documents: int) -> str:
    """
    Answer a question using the most relevant sections of the embedding store as context.

    The max_documents nearest sections to the question are retrieved, and the longest
    leading run of them (in the order returned by get_n_nearest_neighbors) whose combined
    token count fits the prompt budget is sent to the chat model together with the question.
    Fewer than max_documents sections (possibly none) are used when they do not all fit.

    :param question: The question to ask
    :param embeddings: A dictionary of Section objects to their corresponding embeddings
    :param max_documents: The maximum number of documents to use as context
    :return: GPT's response to the question given context provided in our embedding store
    """
    query_embedding = get_embedding(question)
    nearest_neighbors = get_n_nearest_neighbors(query_embedding, embeddings, max_documents)

    # Tokens used by the prompt scaffolding and the question before any context is added
    base_token_count = num_tokens_from_messages(get_messages([], question), chat_model)
    context_budget = MAX_PROMPT_SIZE - base_token_count

    # Approximate size of each candidate: only the section text is counted, not the
    # "Path: ...\nBody:\n" wrapper that get_messages adds around every section.
    token_counts = [len(enc.encode(document.text.replace("\n", " "))) for document, _ in nearest_neighbors]

    # Running totals never decrease, so the documents that fit form a prefix of the list
    running_totals = itertools.accumulate(token_counts)
    most_messages_we_can_fit = sum(1 for _ in itertools.takewhile(lambda total: total <= context_budget, running_totals))

    # Take exactly the documents that fit; slicing one past this count would add a
    # document that pushes the prompt over MAX_PROMPT_SIZE.
    context = [document for document, _ in nearest_neighbors[:most_messages_we_can_fit]]
    messages = get_messages(context, question)

    result = openai.ChatCompletion.create(model=chat_model, messages=messages)
    return result.choices[0].message["content"]

In [18]:
# Load the table of Wikipedia article slugs (the 'Link' column) for the 2022 F1 season
df = pd.read_csv(filepath_or_buffer='f1_2022.csv')
df

Unnamed: 0,Link
0,2022_Formula_One_World_Championship
1,2022_Abu_Dhabi_Grand_Prix
2,2022_Sao_Paulo_Grand_Prix
3,2022_Mexico_City_Grand_Prix
4,2022_United_States_Grand_Prix
5,2022_Japanese_Grand_Prix
6,2022_Singapore_Grand_Prix
7,2022_Italian_Grand_Prix
8,2022_Dutch_Grand_Prix
9,2022_Belgian_Grand_Prix


In [19]:
@memoize_to_sqlite("cache.db")
def wikipedia_api_fetch(article_title: str, field: str) -> str:
    """
    Fetch one field of a Wikipedia article through the MediaWiki query API.

    The request asks for the article's plain-text extract ('prop': 'extracts' with
    'explaintext'), and the requested field is read from the first page in the response.
    NOTE(review): results are presumably cached in cache.db by the memoize_to_sqlite
    decorator -- confirm against its implementation.

    :param article_title: Title of the article to look up (passed as the 'titles' parameter)
    :param field: Key of the returned page object to extract, e.g. 'extract' or 'title'
    :return: The value stored under field for the first returned page
    :raises ValueError: If the response contains no pages, or the page lacks the field
    :raises requests.RequestException: If the request times out or returns an HTTP error status
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    # Upper bound in seconds so a stalled connection cannot hang the notebook indefinitely
    request_timeout_seconds = 30

    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'titles': article_title,
        'explaintext': 1
    }

    response = requests.get(base_url, params=params, timeout=request_timeout_seconds)
    # Surface HTTP failures directly instead of a confusing JSON decoding error
    response.raise_for_status()
    data = response.json()

    pages = data.get('query', {}).get('pages')
    if not pages:
        # Covers a missing 'query'/'pages' key as well as an empty pages mapping
        raise ValueError(f"Could not find page {article_title}")

    # Only one title is requested, so the first page is the one we asked for
    page = next(iter(pages.values()))
    if field not in page:
        raise ValueError(f"Could not find {field} for page {page}")
    return page[field]


# Look up the plain-text extract and the display title of every linked article
df['Page_Content'] = df['Link'].map(lambda link: wikipedia_api_fetch(link, 'extract'))
df['Display Title'] = df['Link'].map(lambda link: wikipedia_api_fetch(link, 'title'))

# Starts empty; the sections split out of each article are collected here
sections: List[Section] = []



Cached result found for 405f7f474b123d02942f572fc195203eee8b0aeb06b01e97f784b0aeb3a4397b. Returning it.
Cached result found for 88f28c6e89ade6008a77363b97d9e3c5d1cda79e52dc954ec151056c75333f0a. Returning it.
Cached result found for c08e87b4c9af191c8f0e620381cb7e0aca0200c4f0c2741223a2966fefed8939. Returning it.
Cached result found for d78032576844b983b08afb71b060114dcdcada7f95230120d8d5657ebba669ff. Returning it.
Cached result found for b77c47593e3c2eb79e8ada66fc4ac81386b0df66d51ca527566df9eb4f052859. Returning it.
Cached result found for 096a4466eb15781953f77d3064803462c784fa507666a39731d6693fc8e80b00. Returning it.
Cached result found for 757ed0dfd5ea5fecdd1850f516c848141ea6ea1602155e3eb4b20a2cb396b341. Returning it.
Cached result found for 33f16f5c1b65e3009ae24a1c7ec0b6d91eb673e5b7ac7bffa6c89c2a9057bc99. Returning it.
Cached result found for 35ec0f1cba81ad5a4595dcd631c7308b536acfafb63547ad03a6dcf7e5065978. Returning it.
Cached result found for 4a8cc17107778e5e4d0a8414c3fc60ec99d3f683

In [20]:
# Split points to try, in order of precedence, when a section can't fit in the max length
split_point_regexes = [r'\n==\s', r'\n===\s', r'\n====\s', r'\n\n', r'\n']


# Start from an empty list so re-running this cell does not append duplicate sections
sections.clear()

# Split the content of each article in the data frame into sections
# NOTE(review): token_limit is the full context window, which is larger than MAX_PROMPT_SIZE;
# a section near that limit could never fit in a prompt -- confirm this is intended.
for index, row in df.iterrows():
    page_content = row['Page_Content']
    sections.extend(
        wikipedia_splitter(page_content, row['Display Title'], token_limit=MAX_CONTEXT_WINDOW, split_point_regexes=split_point_regexes)
    )

Section is too long: 2022 Formula One World Championship - Regulation changes, splitting
Section is too long: 2022 Formula One World Championship - Regulation changes - Technical regulations, splitting
Section is too long: 2022 Formula One World Championship - Season summary, splitting
Section is too long: 2022 Formula One World Championship - Season summary - Opening rounds, splitting
Section is too long: 2022 Formula One World Championship - Season summary - Mid-season rounds, splitting
Section is too long: 2022 Formula One World Championship - Season summary - Closing rounds, splitting
Section is too long: 2022 Japanese Grand Prix - Race, splitting
Section is too long: 2022 Belgian Grand Prix - Background, splitting
Section is too long: 2022 British Grand Prix - Race, splitting
Section is too long: 2022 British Grand Prix - Race - Race report, splitting
Section is too long: 2022 Monaco Grand Prix - Race, splitting
Section is too long: 2022 Monaco Grand Prix - Race - Race report, spl

In [21]:
# Chat model used for answering, plus the tokenizers for the embedding and chat models
chat_model = "gpt-3.5-turbo"
embedding_enc = tiktoken.encoding_for_model("text-embedding-ada-002")
enc = tiktoken.encoding_for_model(chat_model)

# Show the first section as a quick sanity check of the splitter output
print(sections[0])

# Total number of embedding-model tokens across the string form of every section
total_tokens = sum(len(embedding_enc.encode(str(section))) for section in sections)

# Embedding price used for the estimate: $0.0004 per 1000 tokens
cost = total_tokens * (0.0004 / 1000)
print(f"Estimated Cost ${cost:.2f}")

2022 Formula One World Championship - 2022 Formula One World Championship:
The 2022 FIA Formula One World Championship was a motor racing championship for Formula One cars, which was the 73rd running of the Formula One World Championship. It is recognised by the Fédération Internationale de l'Automobile (FIA), the governing body of international motorsport, as the highest class of competition for open-wheel racing cars. The championship was contested over twenty-two Grands Prix, which were held around the world, and ended earlier than in recent years to avoid overlapping with the FIFA World Cup.Drivers and teams competed for the titles of World Drivers' Champion and World Constructors' Champion, respectively. The 2022 championship saw the introduction of significant changes to the sport's technical regulations. These changes had been intended to be introduced in 2021, but were delayed until 2022 in response to the COVID-19 pandemic. Max Verstappen, who was the reigning Drivers' Champio

In [22]:
# Map every section to the embedding of its string form
embeddings: Dict[Section, List[float]] = dict(
    (section, get_embedding(str(section))) for section in sections
)

Cached result found for 6ee038f7f4ac7b92578c25607be931223f703920f4cbbdd89d8161157c47a4af. Returning it.
Cached result found for cf131c891a045fb6531696d0aa78fdf1f6d5c0707c1a00437c99db397c7c38be. Returning it.
Cached result found for fc73fe8165ff2c95416b89a21a257f57df01a48f4a06d1f1749bf9ecad0c4a8a. Returning it.
Cached result found for 8308be37c3d9f85e6ed097404a2e0212e457cb48c1e9587a8d13b251b1dc24f3. Returning it.
Cached result found for ca0d0f69e6a04ba22253713bdc0b74aef17ae8a1db108b2db4a43d8e78cc3006. Returning it.
Cached result found for c70685ff7c00c7985cfc05e8f7afbf976d4ef8c43daf3b931c629a5f876295d3. Returning it.
Cached result found for 37bb2e767965f2ed2303abe54c39e5991eeaf19d55f39b77a08d55a4c31d6585. Returning it.
Cached result found for 96f771be965d2e5efb3379ec17077e8d2aa2f3c642a023793983bf9271fe0ef7. Returning it.
Cached result found for 106ae7bbecf77226acd774590beeb81174d9b6cae458d3b7d2272b405ec51982. Returning it.
Cached result found for ce9ad853b6f057c07a39ce8a9848dc515cfc8a68

In [23]:
def get_messages(context: List[Section], question: str) -> List[Dict[str, str]]:
    """
    Build the chat messages (system instructions + user prompt) for a question.

    :param context: Sections to present to the model; each is rendered as its location
        ("Path") followed by its text ("Body"), separated from the next by a blank line
    :param question: The question the model should answer from that context
    :return: A two-element list of role/content dictionaries: the system message, then
        the user message containing the rendered context and the question
    """
    rendered_sections = []
    for section in context:
        rendered_sections.append(f"Path: {section.location}\nBody:\n{section.text}")
    context_str = "\n\n".join(rendered_sections)

    # Fixed instructions telling the model to prefer "I don't know" over guessing
    system_prompt = """
You will receive a question from the user and some context to help you answer the question.

Evaluate the context and provide an answer if you can confidently answer the question.

If you are unable to provide a confident response, kindly state that it is the case and explain the reason.

Prioritize offering an "I don't know" response over conveying potentially false information.

The user will only see your response and not the context you've been provided. Thus, respond in precise detail, directly repeating the information that you're referencing from the context.
""".strip()

    # Stripping happens after formatting, so surrounding whitespace is removed from the whole prompt
    user_prompt = f"""
Using the following information as context, I'd like you to answer a question.

{context_str}

Please answer the following question: {question}
""".strip()

    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

In [24]:
# Ask about a finishing position, using up to 5 sections as context
# NOTE(review): the recorded answer names Lewis Hamilton -- verify it against the official race classification
ask_embedding_store(
    question="Who came in 2nd at the British Grand Prix in 2022",
    embeddings=embeddings,
    max_documents=5,
)

Cached result found for e4b2aa69a4f85a6bf21b2c631af51d45cc4295b700bc5a2b9c08d240e2a9f261. Returning it.


'Lewis Hamilton came in 2nd at the British Grand Prix in 2022.'

In [25]:
# Ask about a race winner, using up to 5 sections as context
ask_embedding_store(
    question="Who won the 2022 Monaco f1 Grand Prix?",
    embeddings=embeddings,
    max_documents=5,
)

Cached result found for 65c886eedf78edddf1d72ab768cdb11fe1ed6b4db3c9f7512868580e39c193d1. Returning it.


'Pérez won the 2022 Monaco Grand Prix.'

In [26]:
# Ask an open-ended question about race events, allowing up to 10 sections as context
ask_embedding_store(
    question="What happened in the first lap of the 2022 British Grand Prix?",
    embeddings=embeddings,
    max_documents=10,
)

Cached result found for f8ff0cd014ee197988ef8c19e7539c0046fc42b228c2bed826254a697e08a935. Returning it.


"In the first lap of the 2022 British Grand Prix, there was a multi-car crash that occurred. George Russell had a bad start and was passed by Nicholas Latifi and Zhou Guanyu. Pierre Gasly attempted to pass between Zhou and Russell but made contact with Russell's left rear tyre with his right front tyre. The impact caused Russell's car to spin across the track and directly into the side of Zhou's car. This collision launched Zhou's car into the air, flipping it over the tyre wall and ultimately landing upside down. As a result of this incident, other cars behind them, including Valtteri Bottas and Alexander Albon, slowed down and got involved in a secondary collision. Race control red-flagged the race due to these crashes."

In [27]:
# Ask about a mid-field finishing position, allowing up to 10 sections as context
ask_embedding_store(
    question="Who finished 9th in the French Grand Prix in 2022?",
    embeddings=embeddings,
    max_documents=10,
)

Cached result found for 68ae105f227607f5a2f32fbc816df92992a1a888c5bb7fd311a5226aaea829fb. Returning it.


'Daniel Ricciardo finished ninth in the French Grand Prix in 2022.'

In [28]:
# Ask a season-level question, using up to 5 sections as context
ask_embedding_store(
    question="Who won the F1 drivers championship in 2022?",
    embeddings=embeddings,
    max_documents=5,
)

Cached result found for ebdb78c0f0c96823c004debcc609edb2ad8a6cace05f1d1a738e363b5bd2c4a4. Returning it.


'Max Verstappen won the F1 drivers championship in 2022.'