In [172]:
"""
- Create embeddings given a list of links and upload to pinecone.
- v1: only wikipedia. Ask a question, give wikipedia titles and we will add context to the question
- v2: wikipedia + other sources (depends on the quality of scraper)
-- if the scraper is good we can create embeddings for anything on the internet given a link
-- we can also search public datasets to get more context
-- ideal UI: user posts a bunch of links, asks question, we add context based on those links and answer the question
- v3: based on the info we have on what type of context worked for what question, we can then create fine tuned models and then use those to answer questions
-- I'm thinking a marketplace where people can use fine tuned gpt's to ask about niche and specific topics like aerospace, finance, etc

Thanks to OpenAI for the very nice cookbooks and examples.
"""

# import everything we need
import pandas as pd
from dotenv import load_dotenv
import os
import openai
import numpy as np
# conda doesn't list pinecone, so install pip inside the venv and use the venv's pip to install it
import pinecone
from ipynb.fs.full.wiki_extract import wiki_extract
from tqdm.auto import tqdm

# set constants
EMBEDDINGS_MODEL = "text-embedding-ada-002"  # model name passed to openai.Embedding.create
EMBEDDINGS_DIMENSION = 1536  # dimension used when the pinecone index is created
PINECONE_BATCH_SIZE = 32  # number of rows sent per upsert call
MAX_SECTION_LEN = 500  # estimated-token budget for the context part of a prompt
SEPARATOR = "\n "  # appended after every context passage in a prompt

# load env variables
load_dotenv()

# read both keys first so that each one can be checked on its own;
# checking the `pinecone` module object itself can never detect a missing key
openai.api_key = os.environ.get('OPENAI_API_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
if openai.api_key is None:
    print("openai api key not found")
if pinecone_api_key is None:
    print("pinecone api key not found")

# setup pinecone (after the checks, so the warnings are shown even if init fails)
pinecone.init(
    api_key=pinecone_api_key,
    environment="us-west1-gcp"
)

def token_estimate(text):
    """Roughly estimate how many tokens ``text`` contains.

    Rule of thumb: 1 token ~ 4 characters (1000 tokens ~ 750 words). There is
    no way to get the number of tokens from the API for 2nd gen models for now,
    so the estimate is derived from the character count only.
    Anything above 8000 tokens is too long for the ada model.

    Returns the character count divided by four, as a float.
    """
    chars_per_token = 4
    char_count = len(text)
    return char_count / chars_per_token

def cost_estimate(tokens):
    """Estimate the embedding cost in dollars for ``tokens`` tokens.

    Uses the known price of the openai ada model: $0.0004 / 1K tokens.
    """
    dollars_per_thousand = 0.0004
    thousands_of_tokens = tokens / 1000
    return thousands_of_tokens * dollars_per_thousand

In [118]:
# make sure the 'openai' index exists (it is only created when it is missing)
INDEX_NAME = 'openai'
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(INDEX_NAME, dimension=EMBEDDINGS_DIMENSION)
# open a handle to the index; the functions below use it as a global
index = pinecone.Index(INDEX_NAME)

In [174]:
# openai and pinecone stuff

def get_embedding(text: str) -> list[float]:
    """Get the embedding for ``text`` from the OpenAI embeddings endpoint.

    Sends ``text`` as the input to ``openai.Embedding.create`` using
    ``EMBEDDINGS_MODEL`` and returns the ``embedding`` field of the first
    entry in the response's ``data`` list.
    """
    response = openai.Embedding.create(input=text, model=EMBEDDINGS_MODEL)
    first_entry = response["data"][0]
    return first_entry["embedding"]

def add_to_pinecone(df: pd.DataFrame):
    """Upsert every row of ``df`` into the pinecone index, in batches.

    ``df`` must have a 'content' column (stored as vector metadata) and an
    'embeddings' column (the vector itself). Rows are sent in batches of
    ``PINECONE_BATCH_SIZE``; the id of each vector is the row's position in
    ``df`` as a string, so upserting another frame reuses the same ids.

    Rows are selected by position (``iloc``), not by index label. A frame
    whose index has gaps (for example after rows were filtered out) is
    therefore uploaded completely and every row is paired with the right id.
    """
    total_rows = df.shape[0]
    for start in tqdm(range(0, total_rows, PINECONE_BATCH_SIZE)):
        # end position of this batch (exclusive)
        stop = min(start + PINECONE_BATCH_SIZE, total_rows)
        # positional slice: exactly stop - start rows, whatever the index labels are
        batch = df.iloc[start:stop]
        ids_batch = [str(n) for n in range(start, stop)]
        # prep metadata and vectors for the batch
        meta = [{'content': line} for line in batch['content'].tolist()]
        embeds = batch['embeddings'].tolist()
        # upsert to Pinecone as (id, vector, metadata) tuples
        index.upsert(vectors=list(zip(ids_batch, embeds, meta)))

In [170]:
def get_df_embeddings(df: pd.DataFrame, min_tokens: float = 40, max_tokens: float = 8000) -> pd.DataFrame:
    """Calculate embeddings and enforce token rules for any df.

    Run this function once your parser has created a df with columns
    'title', 'heading', 'content'.

    Parameters:
        df: frame with a 'content' column holding the text to embed.
        min_tokens: rows whose estimated token count is not above this are dropped.
        max_tokens: rows whose estimated token count is not below this are dropped.

    Returns:
        A new frame with the rows that passed the filter (original index
        labels kept) plus a 'tokens' column (from ``token_estimate``) and an
        'embeddings' column (from ``get_embedding``). The frame passed in is
        left untouched.
    """
    # assign() returns a copy, so the caller's frame does not gain a 'tokens' column
    with_tokens = df.assign(tokens=df['content'].apply(token_estimate))
    # keep rows strictly between the two bounds
    in_range = (with_tokens['tokens'] > min_tokens) & (with_tokens['tokens'] < max_tokens)
    # explicit copy: adding a column to a filtered slice raises SettingWithCopyWarning
    kept = with_tokens[in_range].copy()
    # one embedding request per remaining row
    kept['embeddings'] = kept['content'].apply(get_embedding)
    return kept

def construct_prompt(question: str) -> str:
    """
    Build a prompt for ``question`` with context pulled from the pinecone index.

    The question is embedded and the index is queried for its 5 closest
    matches. Their 'content' metadata is appended, in the order returned, under
    a "Context:" header, each followed by SEPARATOR. Every passage adds its
    estimated token count plus one (for the separator) to a running total; the
    loop stops at the first passage that pushes the total above
    MAX_SECTION_LEN, and that passage is not included. The question and an
    empty answer slot are appended at the end.
    """
    response = index.query([get_embedding(question)], top_k=5, include_metadata=True)

    used_tokens = 0
    passages = []
    for hit in response["matches"]:
        passage = hit["metadata"]["content"]
        # estimated passage length, plus one for the separator
        used_tokens += token_estimate(passage) + 1
        if used_tokens > MAX_SECTION_LEN:
            break
        passages.append(passage + SEPARATOR)

    context_block = """\n\nContext:\n""" + "".join(passages)
    return context_block + "\n Q: " + question + "\n A:"

In [153]:
# Create embeddings for a given list of wikipedia pages. Pages are not added recursively,
# since the user will most likely only give a few pages that are beyond the knowledge cutoff date.
# This is a very simple way to get embeddings, but it works for now.

wiki_titles = ["Ingenuity (helicopter)", "List of Ingenuity flights"]
sections_df = wiki_extract(wiki_titles)
# Estimate the cost from the extracted 'content' column, before get_df_embeddings drops any
# rows. Everything the estimate needs is defined in this cell, so it works on a fresh kernel.
print('upper bound cost estimate', [sum(cost_estimate(token_estimate(text)) for text in sections_df['content'])])
df = get_df_embeddings(sections_df)

upper bound cost estimate [0.0038966000000000005]


In [175]:
# Ask a question: construct_prompt looks up matching context in the pinecone index
# and the resulting prompt (context + question + empty answer slot) is printed.
query = "How many flight has the mars helicopter completed?"
print(construct_prompt(query))



Context:
The NASA helicopter Ingenuity on Mars made the first powered controlled flights by an aircraft on a planet other than Earth. Its first flight was April 19, 2021, after landing February 18 attached to the underside of the Perseverance rover. Ingenuity weighs 1.8 kilograms (4.0 lb) and is 49 cm (19 in) tall. It is powered by six lithium-ion solar-charged batteries. It was built and is operated by the Jet Propulsion Laboratory (JPL), a field center of NASA. It was designed for a 30-day demonstration period, but has operated far above expectations, making its 37th flight 607 days after its first Martian flight.
 
 Q: How many flight has the mars helicopter completed?
 A:
