In [193]:
# create embeddings given links
# only wikipedia for now https://github.com/openai/openai-cookbook/blob/838f000935d9df03e75e181cbcea2e306850794b/examples/fine-tuned_qa/olympics-1-collect-data.ipynb
# create a csv file with headings from the page and the content
# we will use this to create embeddings https://github.com/openai/openai-cookbook/blob/838f000935d9df03e75e181cbcea2e306850794b/examples/fine-tuned_qa/olympics-1-collect-data.ipynb

import wikipedia
import re
import pandas as pd

wiki_page = "Ingenuity (helicopter)"

def get_wiki_page(title):
    """
    Get the wikipedia page given a title
    """
    try:
        return wikipedia.page(title)
    except wikipedia.exceptions.DisambiguationError as e:
        return wikipedia.page(e.options[0])
    except wikipedia.exceptions.PageError as e:
        raise Exception("Page not found")

# we know that openai ada model costs $0.0004 / 1K tokens
# 1000 tokens ~ 750 words; there is no way to get the number of tokens from the API for 2nd gen models for now
# lets get a cost estimate
# 1 token ~ 4 characters
# anything above 8000 tokens is too long for the ada model
def token_estimate(text):
    return len(text) / 4

page = get_wiki_page(wiki_page)
print('token estimate', token_estimate(page.content))
print('cost estimate', token_estimate(page.content)/1000 * 0.0004)

token estimate 9741.5
cost estimate 0.0038966000000000005


In [285]:
discard_categories = ['See also', 'References', 'External links', 'Further reading', "Footnotes",
    "Bibliography", "Sources", "Citations", "Literature", "Footnotes", "Notes and references",
    "Photo gallery", "Works cited", "Photos", "Gallery", "Notes", "References and sources",
    "References and notes"]

def extract_sections(
    wiki_text: str,
    title: str,
) -> str:
    """
    Extract the sections of a Wikipedia page, discarding the references and other low information sections
    """
    content = re.split(r'==+ .* ==+', wiki_text)
    headings = re.findall('==+ .* ==+', wiki_text)
    
    all_content = {
        "title": [],
        "heading": [],
        "content": [],
    }
    all_content["title"].append(title)
    all_content["heading"].append("Summary")
    all_content["content"].append(content.pop(0).strip())

    for heading, cont in zip(headings, content):
        plain_heading = " ".join(heading.split(" ")[1:-1]).strip()
        if plain_heading in discard_categories:
            continue
        cont = cont.replace("\n", " ").strip()
        if cont == "":
            continue
        tokens = token_estimate(plain_heading+cont)
        if tokens > 8000:
            continue
        if tokens < 40:
            continue
        
        all_content["title"].append(title)
        all_content["heading"].append(plain_heading)
        all_content["content"].append(cont)

    return pd.DataFrame.from_dict(all_content).set_index(['title', 'heading'])

# Example page being processed into sections
df = extract_sections(page.content, page.title)

In [198]:
EMBEDDINGS_MODEL = "text-embedding-ada-002"
OPENAI_API_KEY = "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"

In [286]:
import openai

openai.api_key = OPENAI_API_KEY

def get_embedding(text: str) -> list[float]:
    result = openai.Embedding.create(
      model=EMBEDDINGS_MODEL,
      input=text
    )
    return result["data"][0]["embedding"]

df['embeddings'] = df.content.apply(lambda x: get_embedding(x))
df.to_csv(f"{page.title}_embeddings.csv")

In [287]:
import numpy as np

#load csv file and parse it to np array
df = pd.read_csv(f"{page.title}_embeddings.csv")
df['embeddings'] = df.embeddings.apply(eval).apply(np.array)

max_dim = len(df.iloc[0]['embeddings'])
document_embeddings = {(r.title, r.heading): r.embeddings for _, r in df.iterrows()}
df.set_index(['title', 'heading'], inplace=True)

In [288]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    We could use cosine similarity or dot product to calculate the similarity between vectors.
    In practice, we have found it makes little difference. 
    """
    return np.dot(np.array(x), np.array(y))

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_embedding(query)
    
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    
    return document_similarities

order_document_sections_by_query_similarity("How many flight has the mars helicopter completed?", document_embeddings)[:5]

[(0.8545141012945915,
  ('Ingenuity (helicopter)', 'Stages of creating a Martian helicopter')),
 (0.851276858931782, ('Ingenuity (helicopter)', 'Mission profile')),
 (0.8498188905970633,
  ('Ingenuity (helicopter)', 'Opposition to the helicopter')),
 (0.8494428712609733, ('Ingenuity (helicopter)', 'Operational history')),
 (0.8471530109198314, ('Ingenuity (helicopter)', 'Conceptual design'))]

In [263]:
MAX_SECTION_LEN = 500
SEPARATOR = "\n* "
separator_len = token_estimate(SEPARATOR)

In [291]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index[0], section_index[1]]
        print(document_section)
        
        # chosen_sections_len += document_section.tokens + separator_len
        # if chosen_sections_len > MAX_SECTION_LEN:
        #     break
            
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
            
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))
    
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"



In [292]:
prompt = construct_prompt(
    "How many flights has the mars helicopter flown?",
    document_embeddings,
    df
)
print("===\n", prompt)

content       After deployment, the rover drove approximatel...
embeddings    [0.0037609373684972525, -0.011912411078810692,...
Name: (Ingenuity (helicopter), Mission profile), dtype: object
content       The history of the Mars Helicopter team dates ...
embeddings    [-0.009509674273431301, -0.017775215208530426,...
Name: (Ingenuity (helicopter), Stages of creating a Martian helicopter), dtype: object
content       NASA's JPL and AeroVironment published the con...
embeddings    [0.0008890391909517348, -0.0016654065111652017...
Name: (Ingenuity (helicopter), Conceptual design), dtype: object
content       Recalling in late 2021 the ups and downs of In...
embeddings    [-0.006836581975221634, -0.017787212505936623,...
Name: (Ingenuity (helicopter), Opposition to the helicopter), dtype: object
content       Perseverance dropped the debris shield protect...
embeddings    [-0.007003068923950195, -0.005659968592226505,...
Name: (Ingenuity (helicopter), Operational history), dtype: object
co