In [None]:
# Base name of the revisions file to process: '<filename>.csv' is read, and the
# '<filename>_temp.csv', '<filename>_missing.csv' and '<filename>_complete.csv' outputs are written.
# Update filename for each new file from revisions1 to revisions27
filename = 'revisions1'

In [None]:
# Import packages
import pandas as pd

# Read the raw revisions file for this run
revisions = pd.read_csv(f"{filename}.csv")

# Build the working frame: give the page columns explicit names and make the
# revision ID an integer, which is the form pywikibot is given later on
revisions_df = (
    pd.DataFrame(revisions)
    .rename(columns={"id": "page_id", "title": "page_title"})
    .astype({"rev_id": int})
)

revisions_df

In [None]:
# Load IP address list
import netaddr
import csv

# Load the IP address ranges as a dataframe. Every cell stays a string, exactly
# as csv.reader yields it. The file is opened in a ``with`` block so the handle
# is closed as soon as all rows have been read (newline='' is what the csv
# module expects of the file object it is given).
with open('ip_list.csv', newline='') as ip_database:
    ip_reader = csv.reader(ip_database)
    ip_df = pd.DataFrame(ip_reader)

# Set the column names: the first row of the file holds the header, so promote
# it to the column index and then drop it from the data rows
ip_df.columns = ip_df.iloc[0]
ip_df = ip_df.drop(ip_df.index[0])

# Define a function to split the IP addresses (stored as strings) into a list
def split(s):
    """Split ``s`` on every comma that is not enclosed in square brackets.

    Commas inside ``[...]`` are kept as part of the surrounding piece, so a
    bracketed pair stays together as one element. Pieces are returned verbatim
    (no whitespace or quote stripping).

    If the brackets in ``s`` are unbalanced, the text after the last top-level
    comma is not returned.
    """
    pieces = []
    depth = 0      # current nesting level of square brackets
    start = 0      # position where the piece being scanned begins
    for pos, ch in enumerate(s):
        if ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
        elif ch == "," and depth == 0:
            pieces.append(s[start:pos])
            start = pos + 1
    # The final piece has no comma after it; emit it only when all brackets closed
    if depth == 0:
        pieces.append(s[start:])
    return pieces

# Create a list of IP_ranges: one sub-list of raw string entries per row of ip_df.
# The first and last character of each cell are dropped before splitting
# (presumably the brackets enclosing the whole list -- verify against ip_list.csv).
ip_list_bunched = [split(cell[1:-1]) for cell in ip_df['ip_range']]

# Convert every raw string entry to netaddr.IPNetwork objects
# (explicit start/end ranges are converted to CIDR blocks)
for i, raw_entries in enumerate(ip_list_bunched):
    networks = []
    for entry in raw_entries:
        if '[' in entry:
            # A bracketed entry holds two quoted addresses: the start and the end
            # of a range. Remove whitespace, brackets and the surrounding quotes.
            bounds = entry.split()
            startip = bounds[0].strip('[], ')[1:-1]
            endip = bounds[1].strip('[], ')[1:-1]

            # iprange_to_cidrs returns a list of CIDR blocks that together cover
            # the range. Keep all of them: taking only the first element would
            # silently drop the rest of any range that is not exactly one block.
            networks.extend(netaddr.iprange_to_cidrs(startip, endip))
        else:
            # Strip the leading and trailing inverted commas
            networks.append(netaddr.IPNetwork(entry.strip('\'\" ')))

    # Replace the raw strings for this row with the parsed networks
    ip_list_bunched[i] = networks

# Convert each sub-list of ipnetworks to a set (one netaddr.IPSet per sub-list,
# in the same order as ip_list_bunched)
ip_set_bunched = [netaddr.IPSet(networks) for networks in ip_list_bunched]

In [None]:
# Load missing title and pageids; load previous revision id number
import concurrent
import concurrent.futures
import threading

import numpy as np
import pywikibot
import requests
from tqdm import tqdm

# MediaWiki Action API endpoint. HTTPS is used directly: the retry adapter below
# is mounted on this URL prefix, so it would not apply to a request that started
# on http:// and was redirected to https://.
URL = "https://en.wikipedia.org/w/api.php"

# Set up requests: one shared session whose API calls retry up to 10 times
S = requests.Session()
S.mount(URL, requests.adapters.HTTPAdapter(max_retries = 10))

# Set up pywikibot
site = pywikibot.Site("en", "wikipedia")

# Progress counter; run_iter advances it by one for each row it finishes
bar = tqdm()

# Guards every read and write of the shared ``revisions_df``. run_iter is executed
# by several worker threads at once, and pandas objects are not safe to mutate
# concurrently (the first write to a new column restructures the whole frame).
_revisions_df_lock = threading.Lock()


def _get_cell(index, column):
    """Read one cell of ``revisions_df`` while holding the lock."""
    with _revisions_df_lock:
        return revisions_df.loc[index, column]


def _set_cell(index, column, value):
    """Write one cell of ``revisions_df`` (creating the column if needed) while holding the lock."""
    with _revisions_df_lock:
        revisions_df.loc[index, column] = value


def _query_revision(rev_id, **page_selector):
    """Query the API for revision ``rev_id`` and its diff against the previous revision.

    ``page_selector`` identifies the page, as either ``titles=...`` or ``pageids=...``.
    Returns the decoded JSON response.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvlimit": "1",
        "rvdir": "newer",
        "rvstartid": str(rev_id),
        "rvdiffto" : "prev",
        "formatversion": "2",
        "format": "json"
    }
    params.update(page_selector)
    return S.get(url=URL, params=params).json()


def run_iter(index, row):
    """Complete one row of ``revisions_df`` in place.

    For the row labelled ``index`` this fills in, as needed: ``page_id`` (when it
    is -1), ``rev_id_prev``, ``page_title`` (when it is not a string),
    ``new_text``, ``old_text``, and ``country`` / ``org`` when the editor's IP
    falls inside one of the sets in ``ip_set_bunched``.

    Parameters
    ----------
    index : row label in ``revisions_df``; negative values are skipped.
    row : the row's values; ``page_id``, ``page_title``, ``rev_id`` and ``ip`` are read.

    Returns None. Network, API or pywikibot errors that are not handled below
    propagate to the caller (the Future, when run in a thread pool).
    """
    if index < 0:
        return

    # Replace page id if missing
    if row['page_id'] == -1:
        DATA = _query_revision(row['rev_id'], titles=row['page_title'])
        _set_cell(index, 'page_id', DATA['query']['pages'][0]['pageid'])

    # Find previous revision id
    DATA = _query_revision(row['rev_id'], pageids=_get_cell(index, 'page_id'))

    try:
        rev_id_prev = int(DATA['query']['pages'][0]['revisions'][0]['diff']['from'])
    except (KeyError, IndexError, TypeError, ValueError):
        # The response carries no usable "diff from" id; 0 marks "no previous revision"
        rev_id_prev = 0
    else:
        _set_cell(index, 'rev_id_prev', rev_id_prev)

    # Replace title if missing
    if not isinstance(_get_cell(index, 'page_title'), str):
        _set_cell(index, 'page_title', DATA['query']['pages'][0]['title'])

    # Import document text before and after revision
    try:
        page = pywikibot.Page(site, _get_cell(index, 'page_title'))
        new_text = page.getOldVersion(oldid = row['rev_id'])
    except Exception: # To accommodate pages whose title has changed
        page = pywikibot.Page(site, DATA['query']['pages'][0]['title'])
        new_text = page.getOldVersion(oldid = row['rev_id'])
    _set_cell(index, 'new_text', new_text)

    if rev_id_prev == 0:
        old_text = ''
    else:
        old_text = page.getOldVersion(oldid = rev_id_prev)
    _set_cell(index, 'old_text', old_text)

    # Import country names: the first IP set containing the editor's address wins.
    # The address is parsed once here rather than once per candidate set.
    editor_ip = netaddr.IPNetwork(str(row['ip']))
    for i, ip_set in enumerate(ip_set_bunched):
        if editor_ip in ip_set:
            # Columns 0 and 1 of ip_df are read as country and organisation
            _set_cell(index, 'country', ip_df.iloc[i, 0])
            _set_cell(index, 'org', ip_df.iloc[i, 1])
            break

    bar.update(1)

# Number of worker threads, and also the number of rows submitted per batch
num_concurrent = 20

# (index, exception) for every row whose worker raised. Without this, errors
# stored on the futures would never be looked at and rows would fail silently.
failed_rows = []
futures = {}

# The ``with`` block shuts the pool down even if submitting or waiting raises
with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent) as pool:
    # Process the frame in consecutive batches of num_concurrent rows and wait
    # for each batch to finish before submitting the next one
    for _batch_no, batch in revisions_df.groupby(np.arange(len(revisions_df))//num_concurrent):
        futures = {pool.submit(run_iter, index, row): index for index, row in batch.iterrows()}
        concurrent.futures.wait(futures)
        for future, index in futures.items():
            error = future.exception()
            if error is not None:
                failed_rows.append((index, error))
bar.close()

if failed_rows:
    print(f"{len(failed_rows)} of {len(revisions_df)} rows raised an error; first failure: {failed_rows[0]!r}")

# Save the intermediate result, including rows that could not be completed
revisions_df.to_csv(filename + '_temp.csv', index=False)

In [None]:
# Clean missing rows: set aside every row that lacks the old or the new text
has_old_text = revisions_df['old_text'].notna()
missing_old = revisions_df[~has_old_text]
revisions_2 = revisions_df[has_old_text]

has_new_text = revisions_2['new_text'].notna()
missing_new = revisions_2[~has_new_text]
revisions_3 = revisions_2[has_new_text]

# Save the set-aside rows (missing old text first, then missing new text)
missing_text = pd.concat([missing_old, missing_new])
missing_text.to_csv(filename + '_missing.csv', index=False)

In [None]:
# Fix missing country and org names: separate the rows lacking a country, then,
# among the remainder, the rows lacking an org
country_known = revisions_3['country'].notna()
missing_country = revisions_3[~country_known]
revisions_4 = revisions_3[country_known]

org_known = revisions_4['org'].notna()
missing_org = revisions_4[~org_known]
revisions_5 = revisions_4[org_known]

# Rows that need another lookup (missing country first, then missing org)
missing_author = pd.concat([missing_country, missing_org])

# Add country and org names back to missing rows
from tqdm import tqdm 

# total= lets tqdm show a percentage and ETA; iterrows() on its own has no length
for index, row in tqdm(missing_author.iterrows(), total=len(missing_author)):
    # Parse the editor's IP once per row rather than once per candidate set
    editor_ip = netaddr.IPNetwork(str(row['ip']))
    for i, ip_set in enumerate(ip_set_bunched):
        if editor_ip in ip_set:
            # Columns 0 and 1 of ip_df are read as country and organisation
            missing_author.loc[index, 'country'] = ip_df.iloc[i, 0]
            missing_author.loc[index, 'org'] = ip_df.iloc[i, 1]
            break

# Recombine the already-complete rows with the re-checked ones and save
revisions_6 = pd.concat([revisions_5, missing_author], axis=0)
revisions_6.to_csv(filename + '_complete.csv', index=False)