In [2]:
# wikipedia specific info

import re
from typing import Iterable

import pandas as pd
import wikipedia

# Wikipedia section headings to discard: reference lists, galleries and other
# low-information sections. Headings are compared by exact string membership,
# so each spelling variant is listed explicitly (and only once).
discard_categories = [
    'See also',
    'References',
    'External links',
    'Further reading',
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
]

# Get the wikipedia page given a title
def get_wiki_page(title):
    """Fetch the Wikipedia page for ``title``.

    If the title is ambiguous, the disambiguation options are tried in the
    order the library reports them and the first one that resolves to a page
    is returned.

    Args:
        title: Title of the page to look up.

    Returns:
        The page object returned by ``wikipedia.page``.

    Raises:
        Exception: "Page not found: <title>" when the title does not exist, or
            when it is ambiguous and none of its options resolves to a page.
            The underlying library error is attached as ``__cause__``.
    """
    lookup_errors = (
        wikipedia.exceptions.DisambiguationError,
        wikipedia.exceptions.PageError,
    )
    try:
        return wikipedia.page(title)
    except wikipedia.exceptions.DisambiguationError as e:
        # An option can itself be missing or ambiguous (and the option list can
        # be empty), so keep going until one candidate actually loads.
        for option in e.options:
            try:
                return wikipedia.page(option)
            except lookup_errors:
                continue
        raise Exception(f"Page not found: {title}") from e
    except wikipedia.exceptions.PageError as e:
        raise Exception(f"Page not found: {title}") from e

# extract the sections of a wikipedia page into a df with title, heading and content
# you can use a function similar to this for any website
def extract_page(title: str):
    """Split one Wikipedia page into a DataFrame with one row per section.

    The page text comes from ``get_wiki_page(title).content`` and is cut at
    every heading matching ``==+ .* ==+``. The text before the first heading is
    stored under the heading "summary". Newlines inside a section are replaced
    by spaces and surrounding whitespace is stripped.

    Sections whose heading is listed in ``discard_categories`` and sections
    with no text left after stripping are skipped. The summary row is always
    kept, even when it is empty.

    Args:
        title: Title of the page; it is also copied into the 'title' column.

    Returns:
        pd.DataFrame with columns 'title', 'heading' and 'content', indexed
        0..n-1 in page order.

    Raises:
        Any exception raised by ``get_wiki_page`` is propagated unchanged.
    """
    heading_pattern = r'==+ .* ==+'
    wiki_text = get_wiki_page(title).content
    # split and findall use the same pattern, so bodies[i + 1] is the text that
    # follows headings[i]; bodies[0] is the text before the first heading.
    bodies = re.split(heading_pattern, wiki_text)
    headings = re.findall(heading_pattern, wiki_text)

    # first element is the summary
    records = [{
        'title': title,
        'heading': "summary",
        'content': bodies[0].replace("\n", " ").strip()
    }]

    # add the rest of the sections
    for heading, cont in zip(headings, bodies[1:]):
        # drop the leading and trailing "==" markers, keep the words in between
        plain_heading = " ".join(heading.split(" ")[1:-1]).strip()
        # discarding the references and other low information sections
        if plain_heading in discard_categories:
            continue
        cont = cont.replace("\n", " ").strip()
        if cont == "":
            continue
        records.append({
            'title': title,
            'heading': plain_heading,
            'content': cont
        })

    # Build the frame once from all rows instead of concatenating inside the
    # loop, which copies the whole frame for every section.
    return pd.DataFrame(records, columns=['title', 'heading', 'content'])

# extract info for all pages in a list of titles and return a dataframe
def wiki_extract(
    titles: Iterable[str],
) -> pd.DataFrame:
    """Extract the sections of several Wikipedia pages into one DataFrame.

    Args:
        titles: Page titles to extract, processed in the given order. A single
            string is treated as one title rather than iterated character by
            character.

    Returns:
        pd.DataFrame holding the rows returned by ``extract_page`` for every
        title, stacked in input order and re-indexed 0..n-1. When ``titles`` is
        empty, an empty frame with the columns 'title', 'heading' and 'content'
        is returned.

    Raises:
        Any exception raised by ``extract_page`` is propagated unchanged.
    """
    if isinstance(titles, str):
        titles = [titles]

    frames = [extract_page(title) for title in titles]
    if not frames:
        # pd.concat rejects an empty list; keep the documented column layout
        return pd.DataFrame(columns=['title', 'heading', 'content'])

    # One concat over all pages instead of growing the frame once per title.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Create embeddings for a given list of Wikipedia pages. Linked pages are not added recursively, since the user will most likely supply only a few pages that are beyond the knowledge cutoff date.
# This is a very simple way to get embeddings, but it works for now.

# input = ["Ingenuity (helicopter)", "List of Ingenuity flights"]
# print('upper bound cost estimate', [sum([cost_estimate(token_estimate(wiki.content)) for wiki in wiki_pages])])
# df = wiki_extract(input)
# df = get_df_embeddings(df)