In [240]:
"""
- Create embeddings given a list of links and upload to pinecone.
- v1: only wikipedia. Ask a question, give wikipedia titles and we will add context to the question
- v2: wikipedia + other sources (depends on the quality of scraper)
-- if the scraper is good we can create embeddings for anything on the internet given a link
-- we can also search public datasets to get more context
-- ideal UI: user posts a bunch of links, asks question, we add context based on those links and answer the question
- v3: based on the info we have on what type of context worked for what question, we can then create fine tuned models and then use those to answer questions
-- I'm thinking a marketplace where people can use fine tuned gpt's to ask about niche and specific topics like aerospace, finance, etc

thx to openai for vv nice cookbooks and examples.
"""

# import everything we need
import pandas as pd
from dotenv import load_dotenv
import os
import openai
import numpy as np
# conda doesn't list pinecone, so install pip inside the venv and use the venv's pip to install it
import pinecone
from tqdm.auto import tqdm

# set constants
# model name passed to openai.Embedding.create when embedding text
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# vector dimension used when the 'openai' pinecone index is created
# (presumably the output size of EMBEDDINGS_MODEL -- confirm if the model changes)
EMBEDDINGS_DIMENSION = 1536
# number of rows sent to pinecone per upsert call
PINECONE_BATCH_SIZE = 32
# budget, in estimated tokens, for the context added to a prompt
MAX_SECTION_LEN = 500
# appended after every context passage placed in the prompt
SEPARATOR = "\n "

# load env variables (OPENAI_API_KEY / PINECONE_API_KEY, e.g. from a local .env file)
load_dotenv()

# setup openai and pinecone
# Read both keys first so a missing key is reported before any client is configured.
openai.api_key = os.environ.get('OPENAI_API_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
if openai.api_key is None:
    print("openai api key not found")
# Check the key itself: the imported `pinecone` module is never None, so testing
# the module could never detect a missing key.
if pinecone_api_key is None:
    print("pinecone api key not found")
pinecone.init(
    api_key=pinecone_api_key,
    environment="us-west1-gcp"
)

def token_estimate(text):
    """
    Roughly estimate how many tokens `text` contains.

    Rule of thumb: 1 token ~ 4 characters (1000 tokens ~ 750 words). There is
    no way to get the exact token count from the API for 2nd gen models for now.
    Anything above 8000 tokens is too long for the ada model.

    Returns a float: the character count divided by 4.
    """
    chars_per_token = 4
    n_chars = len(text)
    return n_chars / chars_per_token

def cost_estimate(tokens):
    """
    Estimate the embedding cost in dollars for `tokens` tokens.

    Uses the openai ada price of $0.0004 per 1K tokens.
    """
    usd_per_thousand_tokens = 0.0004
    thousands_of_tokens = tokens / 1000
    return thousands_of_tokens * usd_per_thousand_tokens

In [241]:
# create the 'openai' index only when pinecone does not already list it
existing_indexes = pinecone.list_indexes()
if 'openai' not in existing_indexes:
    pinecone.create_index('openai', dimension=EMBEDDINGS_DIMENSION)

# handle used by the rest of the notebook for upserts and queries
index = pinecone.Index('openai')

In [242]:
# openai and pinecone stuff

# get embeddings for text
def get_embedding(text: str) -> list[float]:
    """
    Embed `text` with the OpenAI model named by EMBEDDINGS_MODEL.

    Sends a single input to openai.Embedding.create and returns the
    "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=EMBEDDINGS_MODEL)
    first_item = response["data"][0]
    return first_item["embedding"]

# add embeddings to pinecone index
def add_to_pinecone(df: pd.DataFrame):
    """
    Upsert every row of `df` into the module-level pinecone `index`, in batches
    of PINECONE_BATCH_SIZE rows.

    Args:
        df: frame with a 'content' column (stored as vector metadata) and an
            'embeddings' column (the vector values), e.g. the output of
            get_df_embeddings.

    Each vector's id is the row's *position* in `df` as a string ("0", "1", ...).
    NOTE(review): because ids restart at "0" on every call, uploading a second
    frame to the same index presumably overwrites vectors from the first one --
    confirm this is intended.
    """
    n_rows = df.shape[0]
    for start in tqdm(range(0, n_rows, PINECONE_BATCH_SIZE)):
        # set end position of batch (exclusive)
        end = min(start + PINECONE_BATCH_SIZE, n_rows)
        # Slice by position, not by label: `df.loc[start:end]` is label-based and
        # end-inclusive, so on a frame whose index has gaps (rows filtered out
        # upstream) it returned overlapping or short batches that no longer
        # lined up with the ids, and rows labelled beyond n_rows were never sent.
        batch = df.iloc[start:end]
        # one id per row in the batch
        ids_batch = [str(n) for n in range(start, end)]
        # prep metadata and upsert batch
        meta = [{'content': line} for line in batch['content'].values]
        embeds = batch['embeddings'].values
        # upsert to Pinecone as (id, vector, metadata) tuples
        index.upsert(vectors=list(zip(ids_batch, embeds, meta)))

In [243]:
# calculate embeddings and enforce token rules for any df
# run this function once your parser has created a df with a 'content' column
def get_df_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a filtered copy of `df` with 'tokens' and 'embeddings' columns added.

    Args:
        df: frame with a 'content' column of text. It is not modified.

    Returns:
        A new frame containing only the rows whose estimated token count is
        strictly between 40 and 8000, with:
          - 'tokens': the estimate from token_estimate
          - 'embeddings': the vector from get_embedding (one API call per row)
        The original index labels are kept, so the index may have gaps.
    """
    # assign() returns a new frame, so the caller's df does not gain a column
    scored = df.assign(tokens=df['content'].apply(token_estimate))
    # filter tokens by 40-8000 (both bounds exclusive)
    within_limits = (scored['tokens'] > 40) & (scored['tokens'] < 8000)
    # explicit copy so the column assignment below writes to an independent frame
    # instead of a slice (avoids pandas' SettingWithCopyWarning)
    kept = scored[within_limits].copy()
    # get embeddings
    kept['embeddings'] = kept['content'].apply(get_embedding)
    return kept

def construct_prompt(question: str) -> str:
    """
    Build a prompt for `question` with context retrieved from the pinecone index.

    The question is embedded, the 5 closest matches are fetched, and their
    stored 'content' is appended under a "Context:" header in match order until
    the running token estimate would exceed MAX_SECTION_LEN. The first match
    that does not fit ends the collection. The question follows as "Q:" with an
    empty "A:" to complete.
    """
    response = index.query([get_embedding(question)], top_k=5, include_metadata=True)

    passages = []
    used_tokens = 0
    for hit in response["matches"]:
        passage = hit["metadata"]["content"]
        # the extra 1 accounts for the separator
        used_tokens += token_estimate(passage) + 1
        if used_tokens > MAX_SECTION_LEN:
            break
        passages.append(passage + SEPARATOR)

    context = """\n\nContext:\n""" + "".join(passages)
    return context + "\n Q: " + question + "\n A:"

In [245]:
from duckduckgo_search import ddg
from bs4 import BeautifulSoup
import requests

# `re` is not imported anywhere else in the notebook; without this import the
# line below raises NameError on a fresh kernel.
import re

# non-greedy pattern matching a single HTML tag such as <p> or </div>
CLEANR = re.compile('<.*?>')

def ddg_extract(question: str) -> pd.DataFrame:
    """
    Search DuckDuckGo for `question` and collect the paragraph text of the top 3 links.

    Args:
        question: the search query.

    Returns:
        A frame with a single 'content' column holding the text of every <p>
        tag found on the fetched pages, in result order. It is empty when the
        search returns nothing or no page could be fetched.

    No date filter is applied, and nothing is uploaded to pinecone here; pass
    the result through get_df_embeddings and add_to_pinecone for that.
    """
    # get top 3 links
    results = ddg(question, region='wt-wt', safesearch='Off', max_results=3)

    paragraphs = []
    # `or []` guards against an empty / None search result
    for result in results or []:
        try:
            # timeout so one unresponsive site cannot hang the notebook
            response = requests.get(result['href'], timeout=10)
        except requests.RequestException as exc:
            # one unreachable link should not discard the text from the others
            print("skipping", result['href'], "-", exc)
            continue
        # get text from each <p> tag on the page
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs.extend(p_tag.text for p_tag in soup.find_all('p'))

    # build the frame once instead of concatenating inside the loop
    return pd.DataFrame({'content': paragraphs}, columns=['content'])


# question to answer; also used as the web search query
query = "How many flights has 787 completed?"
df = ddg_extract(query)

# cost of embedding every scraped paragraph (an upper bound, per the label printed below)
estimated_costs = df['content'].map(token_estimate).map(cost_estimate)
print("upper bound cost estimate", sum(estimated_costs))

upper bound cost estimate 0.008573899999999997


Unnamed: 0,content
0,\n
1,The Boeing 787 Dreamliner is an American wide-...
2,"At launch, Boeing targeted the 787 with 20% le..."
3,The initial 186-foot-long (57 m) 787-8 typical...
4,Early 787 operations encountered several probl...
...,...
168,Boeing plans to conduct one certification demo...
169,"Separately, the U.S. National Transportation S..."
170,The public forum will examine the design and d...
171,The FAA grounded all 50 Boeing 787s in use wor...


In [246]:
# keep only the paragraphs within the token limits and attach an embedding to each
df = get_df_embeddings(df)
# display the resulting frame
df

Unnamed: 0,content,tokens,embeddings
1,The Boeing 787 Dreamliner is an American wide-...,188.50,"[-0.0187480878084898, -0.016766581684350967, -..."
2,"At launch, Boeing targeted the 787 with 20% le...",207.00,"[-0.008956356905400753, -0.011624492704868317,..."
3,The initial 186-foot-long (57 m) 787-8 typical...,120.25,"[0.00795331783592701, -0.017392633482813835, 0..."
4,Early 787 operations encountered several probl...,188.50,"[-0.012899452820420265, -0.015989739447832108,..."
5,"During the late 1990s, Boeing considered repla...",321.00,"[0.0019640999380499125, -0.006949892267584801,..."
...,...,...,...
165,Birtel said it wasn't clear if the demonstrati...,63.50,"[-0.009498683735728264, -0.009994784370064735,..."
168,Boeing plans to conduct one certification demo...,46.75,"[-0.0043954490683972836, -0.014120709151029587..."
169,"Separately, the U.S. National Transportation S...",74.75,"[-0.00421537971124053, -0.008491461165249348, ..."
170,The public forum will examine the design and d...,53.50,"[0.025333436205983162, 0.01158954855054617, 0...."


In [247]:
# upload the embedded paragraphs to the pinecone index in batches
add_to_pinecone(df)

  0%|          | 0/5 [00:00<?, ?it/s]

In [248]:
# build the prompt for the query: retrieved context followed by the question
construct_prompt(query)

'\n\nContext:\nOn December 13, 2018, the 787th Boeing 787 was delivered to AerCap. By then the 787 had flown 300 million passengers on 1.5 million flights and opened 210 new nonstop routes.[373] The 1000th Dreamliner, a 787-10 for Singapore Airlines, made its maiden flight on April 3, 2020.[374]\n\n On November 5, 2010, it was reported that some 787 deliveries would be delayed to address problems found during flight testing.[138][139] In January 2011, the first 787 delivery was rescheduled to the third quarter of 2011 due to software and electrical updates following the in-flight fire.[140][141] By February 24, 2011, the 787 had completed 80% of the test conditions for the Rolls-Royce Trent 1000 engine and 60% of the conditions for the General Electric GEnx-1B engine.[142] In July 2011, ANA performed a week of operations testing using a 787 in Japan.[143] The test aircraft had flown 4,828 hours in 1,707 flights combined by August 15, 2011.[106] During testing, the 787 visited 14 countr