This is for collecting deletion cases that occurred in 2003 and 2004. We'll do it roughly by year.

In [40]:
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import wikifunctions as wf
import re, os
from pathlib import Path
import requests

# 2003

In [41]:
year_page = "Wikipedia:Archived_articles_for_deletion_discussions/2003"

In [42]:
archive_home = wf.get_page_raw_content(year_page)
soup = BeautifulSoup( archive_home , features="html.parser")

a_tags = soup.find_all("a")
a_tags = [a for a in a_tags if a.has_attr('href')]

links = [
    a for a in a_tags
    if (
        a['href'].startswith("/wiki/Wikipedia:Articles_for_deletion/")
        or a['href'].startswith("/wiki/Wikipedia:Votes_for_deletion/")
    )
]

In [43]:
links[1]['href'].replace("/wiki/", "")

'Wikipedia:Articles_for_deletion/List_of_interesting_or_unusual_place_names/2003_discussion'

In [178]:
def get_earliest_page_revision(page_title, endpoint='en.wikipedia.org/w/api.php', redirects=1):
    """Takes Wikipedia page title and returns a DataFrame of revisions
    
    page_title - a string with the title of the page on Wikipedia
    endpoint - a string that points to the web address of the API.
        This defaults to the English Wikipedia endpoint: 'en.wikipedia.org/w/api.php'
        Changing the two letter language code will return a different language edition
        The Wikia endpoints are slightly different, e.g. 'starwars.wikia.com/api.php'
    redirects - a Boolean value for whether to follow redirects to another page
        
    Returns:
    df - a pandas DataFrame where each row is a revision and columns correspond
         to meta-data such as parentid, revid, sha1, size, timestamp, and user name
    """
    
    # A container to store all the revisions
    revision_list = list()
    
    # Set up the query
    query_url = "https://{0}".format(endpoint)
    query_params = {}
    query_params['action'] = 'query'
    query_params['titles'] = page_title
    query_params['prop'] = 'revisions'
    query_params['rvprop'] = 'timestamp' #userid
    query_params['rvlimit'] = 1
    query_params['rvdir'] = 'newer'
    query_params['format'] = 'json'
    query_params['redirects'] = redirects
    query_params['formatversion'] = 2
    
    # Make the query
    json_response = requests.get(url = query_url, params = query_params).json()
    revision_list = list()
    revision_list += wf.response_to_revisions(json_response)

    # Convert to a DataFrame
    df = pd.DataFrame(revision_list)

    # Add in some helpful fields to the DataFrame
    final_title = json_response['query']['pages'][0]['title']
    df['page'] = final_title
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].apply(lambda x:x.date())
    
    return df

In [69]:
output = []
# log_link	case_title	case_discussion_url	multiple_noms	year	month	day

for l in links: 
    # extract the href from the a_tag
    case_discussion_url = l['href'].replace("/wiki/", "")
    # remove the Wikipedia:*for deletion/ from the href
    case_title = re.sub(r'Wikipedia:(Articles_for_deletion|Votes_for_deletion)/', '', case_discussion_url)

    if case_discussion_url == "Wikipedia:Articles_for_deletion/Slogan_%27Jesus_is_Lord%27":
        # This is a special case where the title has an apostrophe in it
        # and the API does not like it.
        case_discussion_url = "Wikipedia:Articles_for_deletion/Slogan_\'Jesus_is_Lord\'"

    _revisions_df = get_earliest_page_revision(case_discussion_url)

    # get the value of the 'timestamp' column in the df, which only has one row. this value is the earliest revision date
    earliest_revision_date = _revisions_df['timestamp'].iloc[0]

    # get year, month, day from the earliest revision date
    year = earliest_revision_date.year 
    month = earliest_revision_date.month
    day = earliest_revision_date.day
    print(case_discussion_url, year, month, day)

    content = wf.get_page_raw_content(case_discussion_url)
    c = BeautifulSoup(content, features="html.parser")
    block = c.find(string=lambda text: text and "AfDs for this article:" in text)
    if block:
        multiple_noms = True
    else:
        multiple_noms = False

    _temp = [year_page, case_title, case_discussion_url, multiple_noms, year, month, day]

    output.append(_temp)

              timestamp
0  2003-06-28T20:03:30Z
Wikipedia:Articles_for_deletion/Wikicide 2003 6 28
              timestamp
0  2008-11-11T14:14:34Z
Wikipedia:Articles_for_deletion/List_of_interesting_or_unusual_place_names/2003_discussion 2008 11 11
              timestamp
0  2004-08-31T12:15:12Z
Wikipedia:Votes_for_deletion/Reciprocal_System_of_Theory 2004 8 31
              timestamp
0  2024-12-02T11:59:58Z
Wikipedia:Articles_for_deletion/AKFD 2024 12 2
              timestamp
0  2003-08-31T21:29:01Z
Wikipedia:Articles_for_deletion/List_of_heterosexuals 2003 8 31
              timestamp
0  2003-10-24T01:44:57Z
Wikipedia:Articles_for_deletion/List_of_black_people/white_people 2003 10 24
              timestamp
0  2003-09-09T22:44:01Z
Wikipedia:Articles_for_deletion/Md._Ahiduzzaman_Liton 2003 9 9
              timestamp
0  2003-09-20T18:13:24Z
Wikipedia:Articles_for_deletion/Misconceptions_and_disputed_facts 2003 9 20
              timestamp
0  2003-09-23T01:30:06Z
Wikipedia:Articles_fo

In [67]:
_df_2003 = pd.DataFrame(output, columns=['log_link', 'case_title', 'case_discussion_url', 'multiple_noms', 'year', 'month', 'day'])

In [71]:
_df_2003.sort_values(by=['month', 'day'], inplace=True)

In [73]:
_df_2003[_df_2003['month'] == 11]

Unnamed: 0,log_link,case_title,case_discussion_url,multiple_noms,year,month,day
20,Wikipedia:Archived_articles_for_deletion_discu...,Bill_Gates/Criminal_record,Wikipedia:Articles_for_deletion/Bill_Gates/Cri...,False,2003,11,1
19,Wikipedia:Archived_articles_for_deletion_discu...,Prime_Minister_of_the_United_States,Wikipedia:Articles_for_deletion/Prime_Minister...,False,2003,11,3
21,Wikipedia:Archived_articles_for_deletion_discu...,Silesian_language,Wikipedia:Articles_for_deletion/Silesian_language,False,2003,11,4
1,Wikipedia:Archived_articles_for_deletion_discu...,List_of_interesting_or_unusual_place_names/200...,Wikipedia:Articles_for_deletion/List_of_intere...,False,2008,11,11
22,Wikipedia:Archived_articles_for_deletion_discu...,Easter_Bradford,Wikipedia:Articles_for_deletion/Easter_Bradford,False,2003,11,12
23,Wikipedia:Archived_articles_for_deletion_discu...,Reptilian_humanoid,Wikipedia:Articles_for_deletion/Reptilian_huma...,False,2003,11,18
24,Wikipedia:Archived_articles_for_deletion_discu...,Michal_Arkusz,Wikipedia:Articles_for_deletion/Michal_Arkusz,False,2003,11,24
25,Wikipedia:Archived_articles_for_deletion_discu...,Gregor_Brand,Wikipedia:Articles_for_deletion/Gregor_Brand,False,2003,11,27
26,Wikipedia:Archived_articles_for_deletion_discu...,Sunset_High_School,Wikipedia:Articles_for_deletion/Sunset_High_Sc...,False,2003,11,27
27,Wikipedia:Archived_articles_for_deletion_discu...,French_alphabet,Wikipedia:Articles_for_deletion/French_alphabet,False,2003,11,30


In [76]:
months = [1,2,3,4,5,6,7,8,9,10,11,12]

for m in months:
    #print(_df_2003[_df_2003['month'] == m])
    _df_2003[_df_2003['month'] == m].to_csv(f"./deletion_cases/deletion_cases_2003_{m:02d}_uncleaned.tsv", sep='\t', index=False,header=True)
    print(f"Saved data/2003_{m:02d}.csv")

Saved data/2003_01.csv
Saved data/2003_02.csv
Saved data/2003_03.csv
Saved data/2003_04.csv
Saved data/2003_05.csv
Saved data/2003_06.csv
Saved data/2003_07.csv
Saved data/2003_08.csv
Saved data/2003_09.csv
Saved data/2003_10.csv
Saved data/2003_11.csv
Saved data/2003_12.csv


# 2004

In [81]:
year_page = "Wikipedia:Archived_articles_for_deletion_discussions/2004"

In [82]:
archive_home = wf.get_page_raw_content(year_page)
soup = BeautifulSoup( archive_home , features="html.parser")

a_tags = soup.find_all("a")
a_tags = [a for a in a_tags if a.has_attr('href')]

### first we will deal with the december 2004 daily log page

In [134]:
def extract_date_link(text):
    match = re.search(r'Log/(\d{4})_(\w+)_(\d{1,2})', text)
    if match:
        year = int(match.group(1))
        month = match.group(2)
        # convert month to two digit number
        month_dict = {
            'January': 1, 'February': 2, 'March': 3, 'April': 4,
            'May': 5, 'June': 6, 'July': 7, 'August': 8,
            'September': 9, 'October': 10, 'November': 11, 'December': 12
        }
        month = month_dict.get(month, 0)
        if month == 0:
            print("Invalid month name: {}".format(month))
            return [0, 0, 0]
        day = int(match.group(3))
        return [year, month, day]
    else:
        print("We couldn't find a date for {}".format(text))
        return [0, 0, 0]

def get_deletion_cases(log_page_link, case_list_output):
    # open log page and soup it
    title = log_page_link[6:]
    print(title)
    soup = BeautifulSoup( wf.get_page_raw_content(title), features="html.parser")

    # get all the deletion cases for that day
    cases = soup.find_all("div", class_="mw-heading mw-heading3")
    year, month, day = extract_date_link(log_page_link)

    for c in cases:
        case_title = c.find("h3").get_text()
        case_discussion_url = f"Wikipedia:Articles_for_deletion/{case_title}"

        # look for multiple nominations noted = "AfDs for this article:"
        block = c.find(string=lambda text: text and "AfDs for this article:" in text)
        if block:
            multiple_noms = True
        else:
            multiple_noms = False
        
        

        formatted_case = [log_page_link, case_title, case_discussion_url, multiple_noms, year, month, day]
        #print(formatted_case)

        case_list_output.append(formatted_case)

In [135]:
daily_links_december_2004 = [a['href'] for a in a_tags if (a['href'].startswith("/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_"))]

In [137]:
daily_links_december_2004

['/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_31',
 '/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_30',
 '/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_29',
 '/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_28',
 '/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_27',
 '/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_26',
 '/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_25']

In [138]:
cases_2004 = []
for day in daily_links_december_2004:
    get_deletion_cases(day, cases_2004)

Wikipedia:Articles_for_deletion/Log/2004_December_31
Wikipedia:Articles_for_deletion/Log/2004_December_30
Wikipedia:Articles_for_deletion/Log/2004_December_29
Wikipedia:Articles_for_deletion/Log/2004_December_28
Wikipedia:Articles_for_deletion/Log/2004_December_27
Wikipedia:Articles_for_deletion/Log/2004_December_26
Wikipedia:Articles_for_deletion/Log/2004_December_25


In [167]:
cases_2004[:2]

[['/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_31',
  'Freemasonic Order of the Golden Centurium',
  'Wikipedia:Articles_for_deletion/Freemasonic Order of the Golden Centurium',
  False,
  2004,
  12,
  31],
 ['/wiki/Wikipedia:Articles_for_deletion/Log/2004_December_31',
  'Schrift',
  'Wikipedia:Articles_for_deletion/Schrift',
  False,
  2004,
  12,
  31]]

### now we'll deal with the archived delete debates, in chunks of months

In [None]:
archived_links = [a for a in a_tags if (a['href'].startswith("/wiki/Wikipedia:Archived_delete_debates/"))]

In [176]:
get_earliest_page_revision("Wikipedia:Articles_for_deletion/Sister_Imelda_D\'Agostino", [])

dia:Articles_for_deletion/Sister_Imelda_D'Agostino
We couldn't find a date for Wikipedia:Articles_for_deletion/Sister_Imelda_D'Agostino


In [195]:
archived_links

[<a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/December_2004" title="Wikipedia:Archived delete debates/December 2004">Wikipedia:Archived delete debates/December 2004</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/November_2004" title="Wikipedia:Archived delete debates/November 2004">Wikipedia:Archived delete debates/November 2004</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/October_2004" title="Wikipedia:Archived delete debates/October 2004">Wikipedia:Archived delete debates/October 2004</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/September_2004" title="Wikipedia:Archived delete debates/September 2004">Wikipedia:Archived delete debates/September 2004</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/August_2004" title="Wikipedia:Archived delete debates/August 2004">Wikipedia:Archived delete debates/August 2004</a>,
 <a class="mw-redirect" href="/wiki/

In [None]:
# a function that takes a URL and returns the text of the URL string with any ASCII characters converted to regular characters
def get_url_text(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve {url}")
        return None

In [200]:
a_tags[0]["title"]

'Wikipedia:Votes for deletion/Simpson Gene'

In [201]:
get_earliest_page_revision(a_tags[0]["title"])

Unnamed: 0,timestamp,page,date
0,2004-11-13 05:41:49+00:00,Wikipedia:Articles for deletion/Simpson Gene,2004-11-13


In [212]:
archived_links

[<a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/December_2004" title="Wikipedia:Archived delete debates/December 2004">Wikipedia:Archived delete debates/December 2004</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/November_2004" title="Wikipedia:Archived delete debates/November 2004">Wikipedia:Archived delete debates/November 2004</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/October_2004" title="Wikipedia:Archived delete debates/October 2004">Wikipedia:Archived delete debates/October 2004</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/September_2004" title="Wikipedia:Archived delete debates/September 2004">Wikipedia:Archived delete debates/September 2004</a>,
 <a class="mw-redirect" href="/wiki/Wikipedia:Archived_delete_debates/August_2004" title="Wikipedia:Archived delete debates/August 2004">Wikipedia:Archived delete debates/August 2004</a>,
 <a class="mw-redirect" href="/wiki/

Tragically this cell below takes about an hour to run.

In [231]:
_output = []

for a in archived_links:
    link = a['href'].replace("/wiki/", "")

    # go to link and get soup
    content = wf.get_page_raw_content(link)
    c = BeautifulSoup(content, features="html.parser")

    # get all <a> tags that are nested within <ul> and <li> tags
    li_tags = c.find_all("li")

    a_tags = [l.a for l in li_tags if l.a and l.a.has_attr('href')]
    try:
        titles = [a['title'] for a in a_tags]
    except Exception as e:
        a_tags = [a for a in a_tags if a.has_attr('title')]
        titles = [a['title'] for a in a_tags]

    for title in titles: 
        case_discussion_url = title

        if "(page does not exist)" in case_discussion_url:
            # This is a special case where the title is not a valid page
            continue
        if "deletion/Log/2004" in case_discussion_url:
            # This is a log page, not a case discussion page
            continue
        if "Neilie Casey" in case_discussion_url:
            # This is a special case where the title is not a valid page
            continue

        case_title = re.sub(r'Wikipedia:(Articles_for_deletion|Votes_for_deletion)/', '', case_discussion_url)
        print(link, "|", case_discussion_url)

        _revisions_df = get_earliest_page_revision(case_discussion_url)
        earliest_revision_date = _revisions_df['timestamp'].iloc[0]

        # get year, month, day from the earliest revision date
        year = earliest_revision_date.year 
        month = earliest_revision_date.month
        day = earliest_revision_date.day
        print(" > ", year, month, day)

        content = wf.get_page_raw_content(case_discussion_url)
        c = BeautifulSoup(content, features="html.parser")
        block = c.find(string=lambda text: text and "AfDs for this article:" in text)
        if block:
            multiple_noms = True
        else:
            multiple_noms = False

        _temp = [year_page, case_title, case_discussion_url, multiple_noms, year, month, day]

        _output.append(_temp)

Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/Simpson Gene
 >  2004 11 13
Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/Together We Are Strong
 >  2004 11 13
Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/Dan Evehema
 >  2004 11 14
Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/How to learn a language
 >  2004 11 15
Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/Soda Club at Kulturbrauerei
 >  2004 11 16
Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/Bud smiley
 >  2004 11 19
Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/SMSAH
 >  2004 11 19
Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/Dan Saul
 >  2004 11 19
Wikipedia:Archived_delete_debates/December_2004 | Wikipedia:Votes for deletion/Entrepreneurial culture
 >  2004 11 19
Wiki

In [232]:
len(_output)

3943

In [234]:
len(cases_2004)

303

In [235]:
# add _output to cases_2004
cases_2004 += _output
# make into df 
_df_2004 = pd.DataFrame(cases_2004, columns=['log_link', 'case_title', 'case_discussion_url', 'multiple_noms', 'year', 'month', 'day'])
_df_2004.sort_values(by=['month', 'day'], inplace=True)
print(len(_df_2004))

# drop duplicate rows, keeping only the first occurrence
_df_2004.drop_duplicates(subset=['case_discussion_url'], keep='first', inplace=True)
print(len(_df_2004))

4246
4129


In [245]:
_df_2004.tail(5)

Unnamed: 0,log_link,case_title,case_discussion_url,multiple_noms,year,month,day
43,/wiki/Wikipedia:Articles_for_deletion/Log/2004...,The Dork Cheese,Wikipedia:Articles_for_deletion/The Dork Cheese,False,2004,12,31
44,/wiki/Wikipedia:Articles_for_deletion/Log/2004...,Exploratoria,Wikipedia:Articles_for_deletion/Exploratoria,False,2004,12,31
45,/wiki/Wikipedia:Articles_for_deletion/Log/2004...,Bancroft Clan,Wikipedia:Articles_for_deletion/Bancroft Clan,False,2004,12,31
46,/wiki/Wikipedia:Articles_for_deletion/Log/2004...,Wikipedia:WikiWhacking,Wikipedia:Articles_for_deletion/Wikipedia:Wiki...,False,2004,12,31
4087,Wikipedia:Archived_articles_for_deletion_discu...,Wilfredo G. Santa,Wikipedia:Articles for deletion/Wilfredo G. Santa,False,2003,12,31


In [244]:
# change case_title so that if it starts with "Wikipedia:Articles for deletion/" of "Wikipedia: Votes for Deletion" it is removed
_df_2004['case_title'] = _df_2004['case_title'].apply(lambda x: re.sub(r'Wikipedia:(Articles for deletion|Votes for deletion)/', '', x))

In [246]:
_df_2004

Unnamed: 0,log_link,case_title,case_discussion_url,multiple_noms,year,month,day
1120,Wikipedia:Archived_articles_for_deletion_discu...,Anasteemaphilia,Wikipedia:Articles for deletion/Anasteemaphilia,False,2006,1,6
4070,Wikipedia:Archived_articles_for_deletion_discu...,Talk:Indo-European Dravidian words,Talk:Indo-European Dravidian words,False,2004,1,6
4071,Wikipedia:Archived_articles_for_deletion_discu...,List of people known as war heroes,Wikipedia:Articles for deletion/List of people...,False,2004,1,7
4076,Wikipedia:Archived_articles_for_deletion_discu...,Initial-stress-derived noun,Wikipedia:Articles for deletion/Initial-stress...,False,2004,1,12
4077,Wikipedia:Archived_articles_for_deletion_discu...,Morissettian irony,Wikipedia:Articles for deletion/Morissettian i...,False,2004,1,12
...,...,...,...,...,...,...,...
43,/wiki/Wikipedia:Articles_for_deletion/Log/2004...,The Dork Cheese,Wikipedia:Articles_for_deletion/The Dork Cheese,False,2004,12,31
44,/wiki/Wikipedia:Articles_for_deletion/Log/2004...,Exploratoria,Wikipedia:Articles_for_deletion/Exploratoria,False,2004,12,31
45,/wiki/Wikipedia:Articles_for_deletion/Log/2004...,Bancroft Clan,Wikipedia:Articles_for_deletion/Bancroft Clan,False,2004,12,31
46,/wiki/Wikipedia:Articles_for_deletion/Log/2004...,Wikipedia:WikiWhacking,Wikipedia:Articles_for_deletion/Wikipedia:Wiki...,False,2004,12,31


In [249]:
months = [1,2,3,4,5,6,7,8,9,10,11,12]

for m in months:
    #print(_df_2004[_df_2004['month'] == m].head(5))
    _df_2004[_df_2004['month'] == m].to_csv(f"./deletion_cases/deletion_cases_2004_{m:02d}_uncleaned.tsv", sep='\t', index=False,header=True)
    print(f"Saved data/2004_{m:02d}.csv")

Saved data/2004_01.csv
Saved data/2004_02.csv
Saved data/2004_03.csv
Saved data/2004_04.csv
Saved data/2004_05.csv
Saved data/2004_06.csv
Saved data/2004_07.csv
Saved data/2004_08.csv
Saved data/2004_09.csv
Saved data/2004_10.csv
Saved data/2004_11.csv
Saved data/2004_12.csv


In [161]:
a_tags[0]['href']

'/wiki/Wikipedia:Votes_for_deletion/Simpson_Gene'