In [1]:
from openai import OpenAI
# Shared API client; every embedding and chat request in this notebook goes through it.
# NOTE(review): no credentials are passed here — presumably they are read from the
# environment (e.g. OPENAI_API_KEY); confirm before running on a fresh machine.
client = OpenAI()

In [2]:
# Smoke test: request an embedding for a single short string.
response = client.embeddings.create(
    input="I have a dream",
    model="text-embedding-3-small"
)

# Only one input was sent, so the first item of response.data is read for its vector.
# This prints the full list of floats.
print(response.data[0].embedding)

[0.00629584351554513, -0.03262520208954811, -0.051215626299381256, 0.009882352314889431, -0.014324812218546867, -0.026697205379605293, 0.023641245439648628, 0.06366582959890366, -0.029173098504543304, -0.036954477429389954, -0.00380933890119195, 0.01367400586605072, -0.022000081837177277, 0.026400098577141762, -0.0005526547902263701, -0.007632825989276171, -0.0263010635972023, -0.03013516031205654, 0.0352284274995327, -0.024376939982175827, 0.02156149595975876, 0.0011937343515455723, 0.0408027246594429, -0.05016867443919182, 0.04122716188430786, 0.00411705719307065, 0.037152551114559174, 0.0072366828098893166, -0.019212933257222176, 0.025098485872149467, 0.03556797653436661, -0.04733908176422119, -0.029710721224546432, -0.04682975634932518, 0.039218153804540634, 0.028975028544664383, 0.04422653093934059, 0.043575726449489594, -0.001369699602946639, 0.021165352314710617, 0.02166053093969822, 0.015562758781015873, 0.057553913444280624, -0.008920290507376194, 0.005217061378061771, 0.03098

In [3]:
import pandas as pd
import tiktoken

In [4]:
# Load the reviews, keep only the columns used below, drop incomplete rows, and
# build a single "combined" text field (title + body) that will be embedded.
input_datapath = "reviews-100.csv"
df = (
    pd.read_csv(input_datapath, index_col=0)
    .loc[:, ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
    .dropna()
    .assign(
        combined=lambda frame: (
            "Title: " + frame.Summary.str.strip() + "; Content: " + frame.Text.str.strip()
        )
    )
)
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [5]:
# Embedding configuration shared by the cells below.
embedding_model = "text-embedding-3-small"  # embedding model name
embedding_encoding = "cl100k_base"  # tiktoken encoding name, loaded with tiktoken.get_encoding
# Reviews whose token count exceeds this value are filtered out before embedding.
# NOTE(review): presumably chosen to stay under the model's input limit — confirm against the model docs.
max_tokens = 8000 

In [6]:
top_n = 100

# Keep the 2 * top_n most recent reviews (some may be removed below for being
# too long), then discard the timestamp column, which is only needed for ordering.
# The drop is chained and non-inplace so the slice returned by .tail() is never
# mutated in place.
df = df.sort_values("Time").tail(top_n * 2).drop(columns="Time")

encoding = tiktoken.get_encoding(embedding_encoding)

# Count tokens per combined text, drop anything over the limit, and keep the
# last top_n rows of what remains.
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

100

In [7]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for `text` produced by `model`.

    Newlines in `text` are replaced with spaces before the request is sent.
    The text is submitted as a one-element list and the embedding of that
    single item is returned.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding



In [9]:
import os

# Embed every combined review text (one get_embedding call per row) and store the
# vector next to the review. The column keeps the name 'ada_embedding' so the CSV
# schema is unchanged; the model itself comes from the `embedding_model` constant.
df['ada_embedding'] = df.combined.apply(lambda x: get_embedding(x, model=embedding_model))

# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
output_path = 'output/embedded_100_reviews.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

In [11]:

# NOTE: this rebinds `df`. From here on it is the Winter Olympics dataset, not the
# reviews frame built in the earlier cells.
# The loaded frame shows the columns 'Unnamed: 0', 'text' and 'embedding'
# ('Unnamed: 0' is presumably an index column saved into the CSV — confirm).
df = pd.read_csv("winter_olympics_2022.csv")
df.head(2)

Unnamed: 0,text,embedding
0,Lviv bid for the 2022 Winter Olympics\n\n{{Oly...,"[-0.005021067801862955, 0.00026050032465718687..."
1,Lviv bid for the 2022 Winter Olympics\n\n==His...,"[0.0033927420154213905, -0.007447326090186834,..."


In [12]:
import ast

# The CSV stores each embedding as the text form of a list; parse it back into a
# Python list. Only string values are parsed, so re-running this cell after the
# column has already been converted is a no-op instead of raising ValueError.
df['embedding'] = df['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
df

Unnamed: 0,text,embedding
0,Lviv bid for the 2022 Winter Olympics\n\n{{Oly...,"[-0.005021067801862955, 0.00026050032465718687..."
1,Lviv bid for the 2022 Winter Olympics\n\n==His...,"[0.0033927420154213905, -0.007447326090186834,..."
2,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.00915789045393467, -0.008366798982024193, ..."
3,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[0.0030951891094446182, -0.006064314860850573,..."
4,Lviv bid for the 2022 Winter Olympics\n\n==Ven...,"[-0.002936174161732197, -0.006185177247971296,..."
...,...,...
6054,Anaïs Chevalier-Bouchet\n\n==Personal life==\n...,"[-0.027750400826334953, 0.001746018067933619, ..."
6055,Uliana Nigmatullina\n\n{{short description|Rus...,"[-0.021714167669415474, 0.016001321375370026, ..."
6056,Uliana Nigmatullina\n\n==Biathlon results==\n\...,"[-0.029143543913960457, 0.014654331840574741, ..."
6057,Uliana Nigmatullina\n\n==Biathlon results==\n\...,"[-0.024266039952635765, 0.011665306985378265, ..."


In [18]:
from scipy import spatial

In [21]:
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100,
    model: str = "text-embedding-ada-002",
) -> tuple[list[str], list[float]]:
    """Return the texts most related to `query`, with their relatedness scores.

    Args:
        query: Text to embed and compare against every row of `df`.
        df: Frame with a "text" column and an "embedding" column holding one
            vector per row.
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where larger means more related. Defaults to
            cosine similarity (1 - cosine distance).
        top_n: Maximum number of results to return.
        model: Embedding model used for the query. It should be the same model
            that produced `df["embedding"]`, otherwise the scores are not
            comparable.

    Returns:
        A pair `(strings, relatednesses)` of equal-length lists, sorted from
        most to least related and truncated to `top_n`. Both lists are empty
        when `df` has no rows.
    """
    # Nothing to rank: return early instead of spending an API call and then
    # failing to unpack an empty zip.
    if df.empty:
        return [], []

    query_embedding_response = client.embeddings.create(
        model=model,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # Walk the two columns directly; this yields the same (text, embedding)
    # pairs as iterating rows, without building a Series per row.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    best = scored[:top_n]
    return [text for text, _ in best], [score for _, score in best]

In [25]:
# Alternative, on-topic query kept for comparison:
# strings, relatednesses = strings_ranked_by_relatedness("curling gold medal", df, top_n=5)
# Query unrelated to the dataset: show the five highest-scoring sections and their scores.
# The query text is sent exactly as written below.
strings, relatednesses = strings_ranked_by_relatedness("life is a box of cholocates", df, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    # Score to three decimals, in the form "relatedness=0.755".
    print(f"{relatedness=:.3f}")
    # Rich display of the matched section text.
    display(string)

relatedness=0.755


'Madison Chock\n\n==Chock and Bates==\n\n=== 2015–2016 season: World bronze medal ===\n\nChock/Bates won gold at the [[2015 Nebelhorn Trophy]], again an [[ISU Challenger Series]] event. At the event, they received comments that "[[Dark Eyes (Russian song)|Dark Eyes]]" was not suitable for a polka rhythm.<ref name="IN151024"/><ref name="GS151112"/> They changed the short dance music to "[[More (Theme from Mondo Cane)|More]]" and "[[Unchained Melody]]" to clarify the rhythms,<ref name="Flade151021"/> and won the gold at the [[2015 Skate America]] followed by a silver at [[2015 Cup of China]]. They then won the silver medal at the [[2015–16 Grand Prix of Figure Skating Final|2015–16 Grand Prix Final]] in [[Barcelona]], behind Canadians [[Kaitlyn Weaver|Weaver]]/[[Andrew Poje|Poje]].\n\nIn March, Chock/Bates won the bronze medal at the [[2016 World Figure Skating Championships|2016 World Championships]] in [[Boston]], having finished third behind Papadakis/Cizeron and Shibutani/Shibutani i

relatedness=0.752


'Madison Chock\n\n== Programs ==\n\n=== With Zuerlein ===\n\n{| class="wikitable" style="text-align:center"\n|-\n! Season\n! [[Short dance]]\n! [[Free dance (figure skating)|Free dance]]\n! Exhibition\n|-\n! 2010–2011 <br> <ref name="ISU-1011"/><ref name="USFS-MCGZ"/>\n| \n* [[Milord (song)|Milord]]\n* [[Padam Padam]] <br>{{small|by [[Edith Piaf]] }}\n| \n* Cabaret (soundtrack)\n| \n* Nothing Else Matters <br>{{small| performed by Santa Esmeralda }}\n* Satellite <br>{{small| by J. Moreno feat. Santana }}\n|-\n!\n! [[Original dance]]\n!\n!\n|-\n! 2009–2010 <br> <ref name="USFS-MCGZ"/><ref name="ISU-0910"/>\n| \n* Yema Ya\n* Agua Nile <br>{{small| by Afro-Cuban Folk }}\n| \n* La Vie est Belle <br>{{small| performed by [[André Rieu]] }}\n|\n|-\n! 2008–2009 <br> <ref name="USFS-MCGZ"/><ref name="ISU-0809"/>\n| \n* [[Minnie the Moocher]] <br>{{small| by [[The Dancing Fool]] }}\n| \n* [[The Phantom of the Opera (Andrew Lloyd Webber song)|The Phantom of the Opera]] <br>{{small| by [[Andrew Ll

relatedness=0.752


'Madison Chock\n\n== Personal life ==\n\nMadison La\'akea Te-Lan Hall Chock was born in [[Redondo Beach, California]].<ref name="ISU-1011"/> She went to [[Novi High School]]. She is of [[Chinese-Hawaiian]] descent on her father\'s side and European descent on her mother\'s side.<ref name="GS090308"/> La\'akea means "sacred light from heaven" and Te-Lan (德蘭) means "virtuous orchid."<ref name="GS090308"/> \n\nAfter partnering on ice for several years, Chock and Bates began a romantic relationship in 2017.<ref name="today"/><ref name="time"/> On June 11, 2022 they became engaged.<ref name=People220616/>'

relatedness=0.751


'Emma Maltais\n\n== Personal life ==\n\nMaltais studies health sciences at Ohio State University and has interned at the Ohio State Neurological Institute.'

relatedness=0.748


'Evan Bates\n\n==Chock and Bates==\n\n=== 2015–2016 season: World bronze medal ===\n\nChock/Bates won gold at the [[2015 Nebelhorn Trophy]], again an [[ISU Challenger Series]] event. At the event, they received comments that "[[Dark Eyes (Russian song)|Dark Eyes]]" was not suitable for a polka rhythm.<ref name=IN151024/><ref name=GS151112/> They changed the short dance music to "[[More (Theme from Mondo Cane)|More]]" and "[[Unchained Melody]]" to clarify the rhythms,<ref name=Flade151021/> and won the gold at the [[2015 Skate America]] followed by a silver at [[2015 Cup of China]]. They then won the silver medal at the [[2015–16 Grand Prix of Figure Skating Final|2015–16 Grand Prix Final]] in [[Barcelona]], behind Canadians [[Kaitlyn Weaver|Weaver]]/[[Andrew Poje|Poje]].\n\nIn March, Chock/Bates won the bronze medal at the [[2016 World Figure Skating Championships|2016 World Championships]] in [[Boston]], having finished third behind Papadakis/Cizeron and [[Maia Shibutani|Shibutani]]/[

In [26]:
# Chat model used by default for token counting and for answering questions.
GPT_MODEL = "gpt-3.5-turbo"


def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` occupies under the tiktoken encoding for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user prompt: instructions, retrieved article sections, then the question.

    Sections come from `strings_ranked_by_relatedness(query, df)` and are appended
    in ranked order. Before each append, the whole prompt (including the trailing
    question) is measured with `num_tokens`; the first section that would push it
    past `token_budget` stops the loop, and no later sections are tried.
    """
    ranked_texts, _ = strings_ranked_by_relatedness(query, df)
    header = 'Use the below articles on the 2022 Winter Olympics to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    footer = f"\n\nQuestion: {query}"

    prompt = header
    for text in ranked_texts:
        candidate = prompt + f'\n\nWikipedia article section:\n"""\n{text}\n"""'
        if num_tokens(candidate + footer, model=model) > token_budget:
            break
        prompt = candidate
    return prompt + footer


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with a chat model, using sections retrieved from `df` as context.

    The prompt is assembled by `query_message` within `token_budget` tokens and
    sent as the user message, after a fixed system message, with temperature 0.
    When `print_message` is true the assembled prompt is printed first.

    Note: the default for `df` is whatever the global `df` was bound to when this
    function was defined; rebinding `df` afterwards does not change the default.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You answer questions about the 2022 Winter Olympics."},
            {"role": "user", "content": prompt},
        ],
        temperature=0
    )
    return completion.choices[0].message.content



In [27]:
# End-to-end check: retrieve related sections from the default dataframe and let the chat model answer.
ask('Which athletes won the gold medal in curling at the 2022 Winter Olympics?')

"The athletes who won the gold medal in curling at the 2022 Winter Olympics were:\n\n- Men's tournament: Team Sweden, consisting of Niklas Edin, Oskar Eriksson, Rasmus Wranå, Christoffer Sundgren, and Daniel Magnusson.\n- Women's tournament: Team Great Britain, consisting of Eve Muirhead, Vicky Wright, Jennifer Dodds, Hailey Duff, and Mili Smith.\n- Mixed doubles tournament: Team Italy, consisting of Stefania Constantini and Amos Mosaner."