In [2]:
!pip install mwparserfromhell



In [3]:
import json
import re
import mwparserfromhell
import pandas as pd
import urllib.parse # urllib.parse.quote() is used to percent-encode article titles
import requests

In [4]:
# Load data from archives JSON file, which contains URLs of all the pages we want to check
archives = []
with open('meta_talk_archives.json', 'r') as file:
    for rec in file:
        archives.append(json.loads(rec))

In [5]:
# Filter out false positives
i = 0
filtered_in = []
filtered_out = []

for rec in archives:
    title = rec['page']['title']
    # Anything that doesn't follow one of these patterns is a false positive
    if re.search('^List of articles every Wikipedia should have$', title) == None and re.search('^List of articles every Wikipedia should have/Archives/200[4-9]+$', title) == None and re.search('^List of articles every Wikipedia should have/Archives/201[0-9]+$', title) == None and re.search('^List of articles every Wikipedia should have/Archives/2020$', title) == None:
        print(i, title, "BAD")
        filtered_out.append(rec)
    else:
        print(i, title, "GOOD")
        filtered_in.append(rec)
    i += 1

0 List of articles every Wikipedia should have GOOD
1 List of articles every Wikipedia should have/Version 1.1 BAD
2 List of articles every Wikipedia should have/Archives/2009 GOOD
3 List of articles every Wikipedia should have/Archives/2008 GOOD
4 List of articles every Wikipedia should have/Archives/2007 GOOD
5 List of articles every Wikipedia should have/Archives/2004 GOOD
6 List of articles every Wikipedia should have/Archives/2005 GOOD
7 List of articles every Wikipedia should have/Archives/2006 GOOD
8 List of articles every Wikipedia should have/Archives BAD
9 List of articles every Wikipedia should have/Archives/2010 GOOD
10 List of articles every Wikipedia should have/Expanded BAD
11 List of articles every Wikipedia should have/Archives/2011 GOOD
12 List of articles every Wikipedia should have/Archives/2012 GOOD
13 List of articles every Wikipedia should have/Expanded/People BAD
14 List of articles every Wikipedia should have/Archives/2013 GOOD
15 List of articles every Wikiped

In [6]:
filtered_in[0].keys()
filtered_in[0]

{'bytes': 61405,
 'comment': '/* A new property on Wikidata for these articles? */',
 'deleted': {'comment': False, 'text': False, 'user': False},
 'format': 'text/x-wiki',
 'id': 21739619,
 'minor': True,
 'model': 'wikitext',
 'page': {'id': 7075,
  'namespace': 1,
  'restrictions': [],
  'title': 'List of articles every Wikipedia should have'},
 'parent_id': 21738705,
 'sha1': '17l7csz0h5xcsf0li6rbuswj3i2e5mm',
 'text': '{| class="messagebox standard-talk"\n|-\n||\'\'\'See the [[Talk:List of articles all languages should have/Removed|list of removed entries]] for articles that were listed in the past or are still under consideration.\'\'\'\n|}\n{{User:MiszaBot/config\n|archive = Talk:List of articles every Wikipedia should have/Archives/%(year)d\n|algo = old(180d)\n|minthreadstoarchive = 1\n|minthreadsleft = 2\n}}\n{{Archive box|search=yes|\n{{hlist\n|[[Talk:List of articles every Wikipedia should have/Archives/2004 |2004]]\n|[[Talk:List of articles every Wikipedia should have/Archi

In [7]:
# page
# section number
# section name
# order (indicates order of sentences within a section)
# user
# timestamp
# content


# mwparserfromhell's get_sections method doesn't work as expected, so writing my own based on regex and filter_headings()
def get_sections(text):
    
    # Start by labeling headings based on whether it is the highest level heading, or a subheading
    # If a "===" heading is preceded by a "==" heading somewhere, it is a subheading
    headings = mwparserfromhell.parse(text).filter_headings()
    print(headings)
    flag_sub = False # Indicates whether we have arrived at a "==" heading yet.
    highest = [] # List indicating type of each heading; true = heading, false = subheading
    toplevel = 6 # Variable to keep track of top level header
    for heading in headings:
        level = heading.level # Number of consecutive "=" in heading
        print(level)
        toplevel = min(level, toplevel) # top level heading found so far
        if level > toplevel:
            if flag_sub:
                highest.append(False)
            else:
                highest.append(True)
        if level <= toplevel:
            highest.append(True)
            flag_sub = True
    
    new_headings = [] # List of top-level headings because we don't care about subheadings
    assert len(highest) == len(headings)
    for i in range(0, len(headings)):
        if highest[i]:
            new_headings.append(headings[i])
    
    heading_positions = []
    latest = 0 # Avoid duplicates by making sure we always look for next header after previous one
    for heading in new_headings:
        match = text.find(str(heading), latest, len(text))
        latest = match + 1
        heading_positions.append(match)
    
    sections = []
    for i in range(0, len(heading_positions)):
        
        current_heading = new_headings[i]
        
        # # Store linked article names so we can use ORES' topic classifier on headings
        # articles = []
        # articles_unformatted = current_heading.title.filter_wikilinks()
        # for article in articles_unformatted:
        #     a = re.sub('\|.*', '', str(article)).strip()
        #     a = re.sub('[\[\]]', '', a).strip()
        #     articles.append(a)
        
        if i+1 < len(heading_positions):
            section = text[heading_positions[i]:heading_positions[i+1]]
            section = re.sub('^.*\n+', '', section)
        else:
            section = text[heading_positions[i]:]
            section = re.sub('^.*\n+', '', section)
            
       
        
        sections.append({'heading': new_headings[i], 'content': section})
    return sections

In [8]:
# Function to split up discussion sections based on user who wrote each portion
def section_by_users(section):
    section = str(section) # Just in case section passed in is Text object instead of string
    section = re.sub('\{\{archive top.*\}\}', '', section)
    section = re.sub('\{\{archive bottom.*\}\}', '', section)
    
    # Remove style tags
    section = section = re.sub('<[^>]*>', '', section)
    section = re.sub('<\/[^>]*>', '', section)
    
    # Keep a list for each piece of data
    # Appending to lists is substantially faster than appending rows to a DataFrame
    user_list = []
    content_list = []
    timestamp_list = []
    
    # Scanning through text, so keep track of where we are
    current_i = 0
    
    # Match to the part of text that cites the user.
    # Don't match user subpage links, which are similarly structured
    # Everything before is content, stuff immediately after is user + timestamp info
    user_match = re.finditer(r'\[\[User[^/\n]*?\]\][^\n]*?(?:\(UTC\)|\n)', str(section))
    
    # Do this for each user matched to
    for match in user_match:
        # Extract string that was matched
        match_content = section[match.start():match.end()].strip()
        
        # Extract username from matched string
        user = re.sub('\|.*\]\].*', '', match_content)
        user = re.sub('\]\].*', '', user)
        user = re.sub('\[\[User.*:', '', user)
        user = re.sub('\#.*', '', user).strip()
        
        # Extract timestamp from matched string
        timestamp_match = re.search('[0-2][0-9]:[0-5][0-9].*', match_content)
        if timestamp_match:
            timestamp = match_content[timestamp_match.start():timestamp_match.end()].strip()
        else:
            timestamp = ''
        # Extract discussion content
        content = mwparserfromhell.parse(section[current_i:match.start()].strip()).strip_code()
        
        # Next section starts after the end of current section
        current_i = match.end()+1
        
        # Append all our info to lists
        user_list.append(user)
        timestamp_list.append(timestamp)
        content_list.append(content)
    
    print(user_list)
    return user_list, timestamp_list, content_list


In [9]:
# Function to convert text to a data frame that can include metadata
# Order column is used to indicate the order of content within each section.
def sections_to_df(pagename, sections):
    
    # Create our DataFrame for storing all discussion data/metadata
    df = pd.DataFrame(columns=['Page', 'Section number', 'Section name', 'Order', 'User', 'Timestamp', 'Content'])
    
    # Create a bunch of lists and concatenate into df at the end
    # Much faster than DataFrame.append()
    Page = []
    Section_number = []
    Section_name = []
    Order = []
    User = []
    Timestamp = []
    Content = []
 
    
    s = 0
    for section in sections:
        section_name = str(mwparserfromhell.parse(section['heading']).strip_code())
        user, timestamp, content = section_by_users(section['content'])
        l = len(user)
        page = [pagename] * l
        section_number = [s] * l
        section_name = [section_name] * l
        order = range(0, l)
   
        
        Page.extend(page)
        Section_number.extend(section_number)
        Section_name.extend(section_name)
        Order.extend(order)
        User.extend(user)
        Timestamp.extend(timestamp)
        Content.extend(content)
       
        s = s + 1
            
    zipped = list(zip(Page, Section_number, Section_name, Order, User, Timestamp, Content))        
    df = pd.DataFrame(zipped, columns = ['Page', 'Section_number', 'Section_name', 'Order', 'User', 'Timestamp', 'Content'])
    return df

In [10]:
# Iterate through all our pages using the functions defined above
df = pd.DataFrame(columns = ['Page', 'Section_number', 'Section_name', 'Order', 'User', 'Timestamp', 'Content'])
for page in filtered_in:
    pagename = page['page']['title'].replace(" ", "_")
    sections = get_sections(page['text'])
    newdf = sections_to_df(pagename, sections)
    df = df.append(newdf)

['== Going to make some changes ==', "== Cynnig i ddiddymu'r prosiect ==", '== Swap: Remove Hebrew, Add Meditation ==', '== COVID-19 pandemic ==', '== Additions and removals ==', '== Some proposals ==', "===Swap St. Peter's Basilica for Shia Islam===", '===Swap Chekov for <s>Louis Pasteur</s> Hygiene===', '===Swap Ovid for Protestantism===', '===Swap Marlene Dietrich for Eastern Orthodox Church===', '===Swap Dvořák for Adult===', '===Swap Tchaikovsky for Adolescence===', '===Swap Mahler for Information Age===', '===Swap Rubens for Bow and arrow===', '== Swap Tim Berners-Lee for Mental health ==', '== Remove State (polity), Add Country ==', '== Industry  industry (Q8148) ==', '== Archiving ==', '== Swap: Remove Vatican City, Add New religious movements ==', '== Creating a better list ==', '===Possible removals===', '===Possible additions===', '===Discussion===', '== Swap Chekov for Sappho ==', '== Adding Black Death, removing Ethanol ==', '== Atacama Desert ==', '== Swap: Remove Umm Kul

In [11]:
# Add some columns for iterative coding tasks
df['Page'] = 'https://meta.wikimedia.org/wiki/Talk:' + df['Page'].astype(str)

In [12]:
df.iloc[1]['Content']

"Edited: 06:26, 28 January 2020 (UTC)\n  About the changes, Franklin D. Roosevelt is one of the most important president in the history of US. Shaka is an important king in Africa but it is not more important than others African kings of the period (Usman dan Fodio, Omar Saidou Tall...). Regarding Ayn Rand, she's an infuential essayist in the US but her theory (laissez-faire, importance of individual rights...) was better described by others philosophers or economists. And Sartre developped original theories (existentialism...) was a key figure of the philosophy of the 20th century. Best regards, --"

In [13]:
df['Page']

0     https://meta.wikimedia.org/wiki/Talk:List_of_a...
1     https://meta.wikimedia.org/wiki/Talk:List_of_a...
2     https://meta.wikimedia.org/wiki/Talk:List_of_a...
3     https://meta.wikimedia.org/wiki/Talk:List_of_a...
4     https://meta.wikimedia.org/wiki/Talk:List_of_a...
                            ...                        
61    https://meta.wikimedia.org/wiki/Talk:List_of_a...
62    https://meta.wikimedia.org/wiki/Talk:List_of_a...
63    https://meta.wikimedia.org/wiki/Talk:List_of_a...
64    https://meta.wikimedia.org/wiki/Talk:List_of_a...
65    https://meta.wikimedia.org/wiki/Talk:List_of_a...
Name: Page, Length: 2436, dtype: object

In [14]:
# Alphabetize and check manually that all pages were included.

# Function passed to .sort() so we can sort records by title
def get_title(record):
    return record['page']['title']

filtered_in.sort(key=get_title)

i = 0
for rec in filtered_in:
    print(i, rec['page']['title'])
    i += 1

0 List of articles every Wikipedia should have
1 List of articles every Wikipedia should have/Archives/2004
2 List of articles every Wikipedia should have/Archives/2005
3 List of articles every Wikipedia should have/Archives/2006
4 List of articles every Wikipedia should have/Archives/2007
5 List of articles every Wikipedia should have/Archives/2008
6 List of articles every Wikipedia should have/Archives/2009
7 List of articles every Wikipedia should have/Archives/2010
8 List of articles every Wikipedia should have/Archives/2011
9 List of articles every Wikipedia should have/Archives/2012
10 List of articles every Wikipedia should have/Archives/2013
11 List of articles every Wikipedia should have/Archives/2014
12 List of articles every Wikipedia should have/Archives/2015
13 List of articles every Wikipedia should have/Archives/2016
14 List of articles every Wikipedia should have/Archives/2017
15 List of articles every Wikipedia should have/Archives/2018
16 List of articles every Wikipe

In [15]:
# Export data frame of full data to tsv
df.to_csv('meta_talk_pages.tsv', sep="\t", index=False)

In [16]:
# Keep only meaningful discussion content, not votes and quick expressions of agreement
df_filtered = df[df.Content.str.len() > 100]
df_filtered.describe()

Unnamed: 0,Page,Section_number,Section_name,Order,User,Timestamp,Content
count,1783,1783,1783,1783,1783,1783.0,1783
unique,18,73,426,52,314,1706.0,1780
top,https://meta.wikimedia.org/wiki/Talk:List_of_a...,7,"replacing some artists, architects and musicians",0,Yerpo,,I have just read the latest result of List of ...
freq,258,134,41,410,196,8.0,2


In [17]:
# Export data frame of "meaningful" discussion data to tsv
# Meaningful discussion consists of rows with content at least 100 characters long
# This eliminates rows where the person just wrote a single word such as "Support" or "Oppose"
df_filtered.to_csv('meta_talk_pages_long_only.tsv', sep="\t", index=False)