In [None]:
import requests
import csv
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
from google.colab import drive


In [None]:
def get_html(url):
    """Open ``url`` with ``urlopen`` and hand back the resulting response object."""
    response = urlopen(url)
    return response
def get_soup(html):
    """Parse ``html`` with BeautifulSoup using the ``html.parser`` backend."""
    parser_name = 'html.parser'
    return BeautifulSoup(html, parser_name)


def get_wikipedia_tables(url):
    """Fetch the page at ``url`` and return every ``<table>`` element in it.

    Args:
        url: Address of the page to download.

    Returns:
        The result of ``soup.find_all('table')`` for the parsed page.
    """
    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as BeautifulSoup has consumed it, instead of being left open.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, 'html.parser')
    return soup.find_all('table')

In [None]:
def extract_person_data(cells):
    """Pull the person's name and link target out of one table row.

    Args:
        cells: The cells of one row; only the first one is read.

    Returns:
        ``(name, url)`` where ``name`` is the stripped text of the first cell
        and ``url`` is the ``href`` of its first anchor, or ``''`` when the
        cell has no anchor or the anchor carries no ``href``. Rows with fewer
        than two cells yield ``(None, None)``.
    """
    if len(cells) <= 1:
        return None, None

    person = cells[0]
    name = person.text.strip()
    link = person.a
    # ``.get`` instead of ``['href']``: an anchor without an href attribute
    # would otherwise raise KeyError and abort the whole scrape.
    url = link.get('href', '') if link else ''
    return name, url

In [None]:
def get_wikidata_id(text_id):
    """Look up the Wikidata item ID linked to an English Wikipedia page.

    Args:
        text_id: Page title as it appears in the article URL. It is placed
            into the query string unchanged.

    Returns:
        The first non-empty ``wikibase_item`` page property found, or
        ``None`` when the lookup fails or no returned page carries one.
    """
    api_url = f'https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles={text_id}&format=json'
    headers = {'Accept': 'application/json'}

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(api_url, headers=headers, timeout=30).json()
        pages = (response.get('query') or {}).get('pages', {})
        for page in pages.values():
            wikibase_item = page.get('pageprops', {}).get('wikibase_item', '')
            if wikibase_item:
                return wikibase_item
    except (requests.RequestException, ValueError, AttributeError) as error:
        # Still a best-effort lookup (the caller treats None as "not found"),
        # but the failure is reported, and KeyboardInterrupt and unrelated
        # programming errors are no longer swallowed by a bare ``except``.
        print(f'Wikidata ID lookup failed for {text_id}: {error}')

    return None


In [None]:


def is_dinner_party_member(api_response, wikibase):
    """Report whether an entity's claims point at item ``Q2915473``.

    Reads the ``P361`` and ``P1299`` claim lists of entity ``wikibase`` from
    ``api_response`` and returns ``True`` if any of them has ``Q2915473`` as
    its main value, ``False`` otherwise (including when the entity or the
    claims are missing).
    """
    entity = api_response.get('entities', {}).get(wikibase, {})
    claims = entity.get('claims', {})
    candidates = claims.get('P361', []) + claims.get('P1299', [])

    return any(
        claim.get('mainsnak', {}).get('datavalue', {}).get('value', {}).get('id') == 'Q2915473'
        for claim in candidates
    )

In [None]:
def process_tables(tables, manual_check=None, csv_file='Dinner_Party.csv'):
    """Process every table and stream the results into a CSV file.

    Args:
        tables: Iterable of table elements, each handed to
            ``process_table_rows``.
        manual_check: Optional list that collects entries needing manual
            review. A fresh list is used when omitted.
        csv_file: Path of the CSV file to write.

    Returns:
        The ``data_urls`` list that was passed to ``process_table_rows``.
    """
    data_urls = []
    wikidata_ids = []
    if manual_check is None:
        manual_check = []

    # newline='' is what the csv module expects of files it writes to, and an
    # explicit UTF-8 encoding keeps non-ASCII names from depending on the
    # platform's default codec.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['name', 'url', 'status'])
        writer.writeheader()

        for table in tables:
            # process_table_rows takes five arguments; the previous call
            # omitted manual_check and raised TypeError on the first table.
            process_table_rows(table, writer, data_urls, wikidata_ids, manual_check)

    return data_urls


In [None]:

def process_table_rows(table, writer, data_urls, wikidata_ids, manual_check):
    """Check every person row of one table against Wikidata.

    For each row after the first, the person's link is resolved to a Wikidata
    item with ``get_wikidata_id``, the item's entity data is downloaded and
    checked with ``is_dinner_party_member``, and the outcome is written
    through ``writer`` and printed.

    Args:
        table: Table element exposing ``find_all('tr')``.
        writer: Object with a ``writerow`` method accepting a dict with
            ``name``, ``url`` and ``status`` keys.
        data_urls: List that receives every dict written to ``writer``.
        wikidata_ids: List that receives each Wikidata item ID that was
            successfully checked.
        manual_check: List that receives ``{'name', 'url'}`` dicts for people
            that could not be resolved or checked automatically.
    """
    rows = table.find_all('tr')
    for index, row in enumerate(rows):
        if index == 0:  # Skip the header row
            continue
        rows_left = len(rows) - index - 1

        name, url = extract_person_data(row.find_all('td'))
        if not name:
            print(f"Error encountered. {rows_left} items to review.")
            continue

        # A named row without a link has nothing to look up; it falls through
        # to the manual-check branch instead of being dropped silently.
        wikibase_item = get_wikidata_id(url.split("/")[-1]) if url else None

        api_response = None
        if wikibase_item:
            data_json_html = f'https://www.wikidata.org/w/api.php?action=wbgetentities&ids={wikibase_item}&format=json'
            try:
                # The timeout and the except clause keep one failed request
                # from hanging or aborting the remaining rows.
                api_response = requests.get(data_json_html, timeout=30).json()
            except (requests.RequestException, ValueError) as error:
                print(f'Wikidata request failed for {name}: {error}')

        if api_response is None:
            status = f'Error encountered with {name}. Check manually. {rows_left} items to review.'
            manual_check.append({'name': name, 'url': url})
            print(status)
            continue

        wikidata_ids.append(wikibase_item)
        data_html = f'https://www.wikidata.org/wiki/{wikibase_item}'
        if is_dinner_party_member(api_response, wikibase_item):
            status = f'{name} is a Dinner Party member. {rows_left} item{"s" if rows_left != 1 else ""} to add'
        else:
            status = f'{name} needs Dinner Party added to their Wikidata. {rows_left} items to add'

        record = {'name': name, 'url': data_html, 'status': status}
        writer.writerow(record)
        # Keep the row in memory as well, so callers that build a DataFrame
        # from data_urls no longer receive an empty list.
        data_urls.append(record)
        print(status)

In [None]:
def save_to_csv(data_urls, csv_path, drive_path='/content/drive/My Drive/output.csv'):
    """Write the collected rows to a local CSV and, optionally, to Google Drive.

    Args:
        data_urls: Row records providing ``name``, ``url`` and ``status``.
        csv_path: Destination of the local CSV file.
        drive_path: Destination inside the mounted Google Drive. Pass ``None``
            to skip mounting Drive and write only the local file.
    """
    header = ['name', 'url', 'status']
    df = pd.DataFrame(data_urls, columns=header)
    df.to_csv(csv_path, encoding='utf-8-sig', index=False)

    if drive_path is None:
        return

    drive.mount('/content/drive', force_remount=True)
    # index=False keeps the Drive copy identical to the local file; it used to
    # gain an extra, unnamed index column.
    df.to_csv(drive_path, encoding='utf-8-sig', index=False)




In [None]:
def read_csv(csv_path):
    """Load the CSV file at ``csv_path`` into a pandas DataFrame."""
    frame = pd.read_csv(csv_path)
    return frame


In [None]:
def save_output_csv(df, output_csv_path):
    """Write ``df`` to ``output_csv_path`` as ``utf-8-sig`` text without the index column."""
    export_options = {'encoding': 'utf-8-sig', 'index': False}
    df.to_csv(output_csv_path, **export_options)



In [None]:
def main():
    """Scrape the Heritage Floor list and report each person's Wikidata status.

    Downloads the Wikipedia list page, runs ``process_table_rows`` over every
    table while streaming results into ``Dinner_Party.csv``, hands the
    collected rows to ``save_to_csv``, and finally prints the entries that
    need a manual check.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_women_in_the_Heritage_Floor'
    # get_wikipedia_tables downloads and parses the page itself, so the page
    # is fetched once here rather than a second time via get_html/get_soup,
    # whose results were never used.
    tables = get_wikipedia_tables(url)
    data_urls = []
    manual_check = []
    wikidata_ids = []
    csv_file = 'Dinner_Party.csv'

    # newline='' is what the csv module expects of files it writes to; the
    # explicit encoding keeps non-ASCII names independent of platform defaults.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['name', 'url', 'status'])
        writer.writeheader()

        for table in tables:
            process_table_rows(table, writer, data_urls, wikidata_ids, manual_check)

    # save_to_csv rewrites csv_file from data_urls. Skip it when nothing was
    # collected so an empty list cannot replace the rows written above with a
    # header-only file.
    if data_urls:
        save_to_csv(data_urls, csv_file)

    # Print manual check items
    for item in manual_check:
        print(f"Manual check required for: {item['name']} - URL: {item['url']}")


In [None]:
# Entry point: run the scraper when this cell is executed or the file is run
# as a script; main() is not called when the code is imported as a module.
if __name__ == "__main__":
    main()


Abella of Salerno is a Dinner Party member. 995 items to add
Abigail is a Dinner Party member. 994 items to add
Abigail Adams is a Dinner Party member. 993 items to add
Error encountered with Adela of Blois. Check manually. 992 items to review.
Error encountered with Adela Zamudio-Ribero. Check manually. 991 items to review.
Adelaide is a Dinner Party member. 990 items to add
Adelaide Labille-Guiard is a Dinner Party member. 989 items to add
Adelaide of Susa is a Dinner Party member. 988 items to add
Adelberger is a Dinner Party member. 987 items to add
Adelheid Popp is a Dinner Party member. 986 items to add
Eudocia is a Dinner Party member. 985 items to add
Eudoxia is a Dinner Party member. 984 items to add
Error encountered with Aemilia. Check manually. 983 items to review.
Æthelburg needs Dinner Party added to their Wikidata. 982 items to add
Ethelberga needs Dinner Party added to their Wikidata. 981 items to add
Æthelflæd is a Dinner Party member. 980 items to add
Agatha needs Din