In [9]:
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken
import os

# Completion model used when asking for an answer to a prompt.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model used when turning text into an embedding vector.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [5]:
# Name of the environment variable that holds the OpenAI API key.
openai_key_var_name = "OPENAI_KEY"
# os.getenv returns None when the variable is unset; assigning that directly
# would discard any key the openai module is already configured with, so the
# existing value is kept as the fallback.
openai.api_key = os.getenv(openai_key_var_name) or openai.api_key

In [11]:
# Baseline: send the bare question to the completions model with no supporting context.
# The recorded answer below names a different athlete than the retrieved
# high-jump section used later in this notebook, which names Tamberi and Barshim.
prompt = "Who won the 2020 Summer Olympics men's high jump?"

# The stripped completion text is the cell's displayed value.
openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")


#(openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=[{"role": 'user', "content": prompt}])['choices'][0]['message']['content']).strip()

"Marcelo Chierighini of Brazil won the gold medal in the men's high jump at the 2020 Summer Olympics."

In [5]:
# We have hosted the processed dataset, so you can download it directly without having to recreate it.
# This dataset has already been split into sections, one row for each section of the Wikipedia page.

# Key every section row by its (title, heading) pair so it can be looked up
# with the same tuple keys used for the embeddings.
df = pd.read_csv(
    'https://cdn.openai.com/API/examples/data/olympics_sections_text.csv'
).set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")
# Fixed seed so the preview shows the same five rows on every run.
df.sample(5, random_state=0)

3964 rows in the data.


Unnamed: 0_level_0,Unnamed: 1_level_0,content,tokens
title,heading,Unnamed: 2_level_1,Unnamed: 3_level_1
Somalia at the 2020 Summer Olympics,Summary,Somalia competed at the 2020 Summer Olympics i...,69
Iran at the 2020 Summer Olympics,Men's tournament,Iran men's basketball team qualified for the O...,60
Cycling at the 2020 Summer Olympics – Women's BMX racing,Summary,The women's BMX racing competition at the 2020...,42
Sailing at the 2020 Summer Olympics – 49er,Summary,The Men's 49er was a sailing event on the Sail...,73
Afghanistan at the 2020 Summer Olympics,Shooting,Afghanistan received an invitation from the Tr...,76


In [7]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL):
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    `model` selects the embedding model and defaults to EMBEDDING_MODEL.
    Returns the "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame):
    """
    Embed the `content` of every row of `df` using get_embedding.

    Returns a dictionary that maps each row's index value to the embedding
    computed for that row. One embedding request is made per row.
    """
    embeddings_by_index = {}
    for row_index, row in df.iterrows():
        embeddings_by_index[row_index] = get_embedding(row.content)
    return embeddings_by_index

def load_embeddings(fname: str):
    """
    Read the document embeddings and their keys from a CSV.

    fname is the path (or URL) to a CSV with exactly these named columns:
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.

    Returns a dictionary mapping each (title, heading) tuple to its embedding,
    given as a list of values ordered by dimension number ("0", "1", ...),
    regardless of the order in which the columns appear in the file. If the
    same (title, heading) pair occurs more than once, the last row wins.

    Raises ValueError if there is no dimension column or if a column other
    than "title"/"heading" is not named with an integer, and KeyError if a
    dimension between 0 and the highest one found is missing.
    """
    df = pd.read_csv(fname, header=0)

    key_columns = ("title", "heading")
    max_dim = max(int(c) for c in df.columns if c not in key_columns)

    # Select the dimension columns in numeric order and convert the whole
    # block at once; this avoids a Python-level label lookup for every single
    # cell, which is what a row-by-row iterrows() loop would do.
    dim_columns = [str(i) for i in range(max_dim + 1)]
    vectors = df[dim_columns].to_numpy().tolist()

    keys = zip(df["title"], df["heading"])
    return dict(zip(keys, vectors))

In [8]:
# Load the hosted, precomputed section embeddings, keyed by (title, heading).
document_embeddings = load_embeddings("https://cdn.openai.com/API/examples/data/olympics_sections_document_embeddings.csv")

# ===== OR, uncomment the below line to recalculate the embeddings from scratch. ========

# document_embeddings = compute_doc_embeddings(df)

In [9]:
# An example embedding: take the first (key, vector) entry and print the key,
# the first five components of the vector, and the vector's total length.
example_entry = list(document_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

('2020 Summer Olympics', 'Summary') : [0.0037565305829048, -0.0061981128528714, -0.0087078781798481, -0.0071364338509738, -0.0025227521546185]... (1536 entries)


In [11]:
def vector_similarity(x, y):
    """
    Compute the dot product of two vectors as their similarity score.

    For vectors of length 1 the dot product equals the cosine similarity;
    this notebook relies on OpenAI embeddings being normalized that way.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(query, contexts):
    """
    Rank the pre-calculated document embeddings by similarity to `query`.

    The query is embedded with get_embedding and scored against every entry
    of `contexts` (a mapping of section key -> embedding vector) using
    vector_similarity.

    Returns a list of (similarity, section key) tuples, most similar first.
    """
    query_vector = get_embedding(query)

    scored_sections = []
    for section_key, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_key))

    # Tuples sort by score first; reverse=True puts the best match at the front.
    scored_sections.sort(reverse=True)
    return scored_sections

In [13]:
# Five most similar sections for a sample query, as (similarity, (title, heading)) pairs.
order_document_sections_by_query_similarity("Who won the men's high jump?", document_embeddings)[:5]

[(0.8848838116467931,
  ("Athletics at the 2020 Summer Olympics – Men's high jump", 'Summary')),
 (0.8634516122222149,
  ("Athletics at the 2020 Summer Olympics – Men's pole vault", 'Summary')),
 (0.8616689251543943,
  ("Athletics at the 2020 Summer Olympics – Men's long jump", 'Summary')),
 (0.8560916109708376,
  ("Athletics at the 2020 Summer Olympics – Men's triple jump", 'Summary')),
 (0.8469427954223732,
  ("Athletics at the 2020 Summer Olympics – Men's 110 metres hurdles",
   'Summary'))]

In [14]:

# Same check for a second query. In the recorded output the long jump section
# scores slightly higher than the high jump section, so the top hit is not
# always the intended one.
order_document_sections_by_query_similarity("Who won the women's high jump?", document_embeddings)[:5]

[(0.872634113225956,
  ("Athletics at the 2020 Summer Olympics – Women's long jump", 'Summary')),
 (0.8682291583662216,
  ("Athletics at the 2020 Summer Olympics – Women's high jump", 'Summary')),
 (0.8631917331894747,
  ("Athletics at the 2020 Summer Olympics – Women's pole vault", 'Summary')),
 (0.860951601983651,
  ("Athletics at the 2020 Summer Olympics – Women's triple jump", 'Summary')),
 (0.8581876900497667,
  ("Athletics at the 2020 Summer Olympics – Women's 100 metres hurdles",
   'Summary'))]

In [15]:
# Token budget for the context sections that construct_prompt puts into a prompt.
MAX_SECTION_LEN = 500
# Text placed in front of every context section in the prompt.
SEPARATOR = "\n* "
# NOTE(review): tiktoken appears to map text-davinci-003 to "p50k_base" rather
# than "gpt2" — confirm which encoding is intended here.
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
# Token cost of one separator; added to the running total for each section considered.
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [16]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build a completion prompt for `question` from the most relevant document sections.

    Sections are ranked with order_document_sections_by_query_similarity and
    taken in that order. Each one adds its `tokens` value plus the separator
    length to a running total, and selection stops at the first section that
    pushes the total past MAX_SECTION_LEN. Section text is looked up in `df`
    by section key, and newlines inside it are replaced with spaces.

    Prints how many sections were selected and their keys, then returns the
    instruction header, the selected sections, and the question as one string.
    """
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    context_parts = []
    selected_keys = []
    tokens_used = 0

    for _similarity, section_key in ranked_sections:
        section_row = df.loc[section_key]

        # Charge the section (and its separator) before deciding whether it fits.
        tokens_used += section_row.tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        flattened_text = section_row.content.replace("\n", " ")
        context_parts.append(SEPARATOR + flattened_text)
        selected_keys.append(str(section_key))

    # Useful diagnostic information
    print(f"Selected {len(context_parts)} document sections:")
    print("\n".join(selected_keys))

    context_block = "".join(context_parts)
    return header + context_block + "\n\n Q: " + question + "\n A:"

In [17]:
# Build the context-augmented prompt for the same question asked earlier,
# then print it for inspection.
prompt = construct_prompt(
    "Who won the 2020 Summer Olympics men's high jump?",
    document_embeddings,
    df
)

print("===\n", prompt)

Selected 2 document sections:
("Athletics at the 2020 Summer Olympics – Men's high jump", 'Summary')
("Athletics at the 2020 Summer Olympics – Men's long jump", 'Summary')
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* The men's high jump event at the 2020 Summer Olympics took place between 30 July and 1 August 2021 at the Olympic Stadium. 33 athletes from 24 nations competed; the total possible number depended on how many nations would use universality places to enter athletes in addition to the 32 qualifying through mark or ranking (no universality places were used in 2021). Italian athlete Gianmarco Tamberi along with Qatari athlete Mutaz Essa Barshim emerged as joint winners of the event following a tie between both of them as they cleared 2.37m. Both Tamberi and Barshim agreed to share the gold medal in a rare instance where the athletes of different nations h

In [18]:
# Keyword arguments shared by the completion requests made with context.
# We use temperature of 0.0 because it gives the most predictable, factual answer.
COMPLETIONS_API_PARAMS = dict(
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

In [20]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings,
    show_prompt: bool = False
) -> str:
    """
    Answer `query` using context sections retrieved from the document embeddings.

    A prompt is built with construct_prompt (optionally printed when
    `show_prompt` is true) and sent to the completions API with
    COMPLETIONS_API_PARAMS.

    Returns the text of the first choice with surrounding spaces and newlines removed.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    completion = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    answer_text = completion["choices"][0]["text"]
    return answer_text.strip(" \n")

In [21]:
# End-to-end check: answer the original question, this time with retrieved context.
answer_query_with_context("Who won the 2020 Summer Olympics men's high jump?", df, document_embeddings)

Selected 2 document sections:
("Athletics at the 2020 Summer Olympics – Men's high jump", 'Summary')
("Athletics at the 2020 Summer Olympics – Men's long jump", 'Summary')


'Gianmarco Tamberi and Mutaz Essa Barshim emerged as joint winners of the event following a tie between both of them as they cleared 2.37m. Both Tamberi and Barshim agreed to share the gold medal.'