Get interlanguage link information for all of our rule pages and export it as TSV

In [1]:
import argparse
import re, random, urllib, simplejson, copy, itertools
import urllib.parse, urllib.request, urllib.error
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
sb.set_style('whitegrid')

import requests, json, time, os
from pathlib import Path

import networkx as nx
from operator import itemgetter
from collections import Counter
from bs4 import BeautifulSoup

import wikifunctions as wf

from functools import partial

In [2]:
# Shared lookup data used by the rest of the notebook.
# The bare string below is only a reference table of MediaWiki namespace IDs;
# the rows prefixed with "-" are the ones whose IDs do not appear in
# `namespaces` further down.
"""
Subject Talk    Name        Description
0       1       Main        Articles, lists, & redirects
2       3       User        User pages & sandboxes
4       5       Wikipedia   Policy, essay, & processes
-6       7       File        Media file descriptions
-8       9       MediaWiki   Auto-generated pages
10      11      Template    Infoboxes, nav boxes
-12      13      Help        Software help
14      15      Category    Categorized pages
100     101     Portal      Topics & WikiProjects
"""
# Language codes, one per wiki edition handled in this notebook.
langs = [
    "de",
    "en",
    "es",
    "fr",
    "ja",
]
# Namespace IDs of interest, listed as (subject, talk) pairs.
namespaces = [
    0, 1,      # Main
    2, 3,      # User
    4, 5,      # Wikipedia
    10, 11,    # Template
    14, 15,    # Category
    100, 101,  # Portal
]

## Load data
(1) Source data (lists of rules)

(2) Revision history tsvs

In [3]:
# Lists of rule pages: one headerless TSV per language edition, read from the
# current working directory.
# Fix: the German file used to be assigned to `rules_df_es` (and was then
# overwritten by the Spanish file), so `rules_df_de` never existed and the
# first entry of `rules_dfs` was the Spanish frame.
rules_df_de = pd.read_csv(Path(os.getcwd()) / "dewiki.tsv", sep="\t", header=None)
rules_df_en = pd.read_csv(Path(os.getcwd()) / "enwiki.tsv", sep="\t", header=None)
rules_df_es = pd.read_csv(Path(os.getcwd()) / "eswiki.tsv", sep="\t", header=None)
rules_df_fr = pd.read_csv(Path(os.getcwd()) / "frwiki.tsv", sep="\t", header=None)
rules_df_ja = pd.read_csv(Path(os.getcwd()) / "jawiki.tsv", sep="\t", header=None)
# Ordered de, en, es, fr, ja.
rules_dfs = [rules_df_de, rules_df_en, rules_df_es, rules_df_fr, rules_df_ja]

In [5]:
# Revision histories of the rule pages, one TSV per language edition, stored
# under output_rulepagerevs/2020-07-31 in the working directory.
rev_path = Path(os.getcwd()) / "output_rulepagerevs" / "2020-07-31"

# Every file is read with the same options; the dict keys mirror the
# per-language variable names bound right after.
rev_dfs_dict = {
    frame_name: pd.read_csv(
        rev_path / tsv_name,
        sep='\t',
        header=0,
        encoding='utf8',
        parse_dates=['date', 'timestamp'],
    )
    for frame_name, tsv_name in (
        ("rev_df_de", "de_revisions.tsv"),
        ("rev_df_en", "en_revisions.tsv"),
        ("rev_df_es", "es_revisions.tsv"),
        ("rev_df_fr", "fr_revisions.tsv"),
        ("rev_df_ja", "ja_revisions.tsv"),
    )
}

rev_df_de = rev_dfs_dict["rev_df_de"]
rev_df_en = rev_dfs_dict["rev_df_en"]
rev_df_es = rev_dfs_dict["rev_df_es"]
rev_df_fr = rev_dfs_dict["rev_df_fr"]
rev_df_ja = rev_dfs_dict["rev_df_ja"]

# The list holds the very same DataFrame objects as the dict, ordered de, en, es, fr, ja.
rev_dfs = [rev_df_de, rev_df_en, rev_df_es, rev_df_fr, rev_df_ja]

In [6]:
# Revision histories of the talk pages, one TSV per language edition, read
# from the same folder as the page revisions (`rev_path`).
# low_memory=False asks pandas to parse each file in a single pass instead of
# in chunks (see the pandas read_csv documentation).
talkrev_df_de, talkrev_df_en, talkrev_df_es, talkrev_df_fr, talkrev_df_ja = [
    pd.read_csv(
        rev_path / talk_tsv,
        sep='\t',
        header=0,
        encoding='utf8',
        parse_dates=['date', 'timestamp'],
        low_memory=False,
    )
    for talk_tsv in (
        "de_revisions_talk.tsv",
        "en_revisions_talk.tsv",
        "es_revisions_talk.tsv",
        "fr_revisions_talk.tsv",
        "ja_revisions_talk.tsv",
    )
]

In [7]:
# Drop repeated revisions (rows sharing a 'revid') from every page and
# talk-page frame of every language edition.
# The frames are modified in place, so any other reference to the same
# DataFrame objects sees the de-duplicated rows as well.
for _frame in (
    rev_df_de,
    rev_df_en,
    rev_df_es,
    rev_df_fr,
    rev_df_ja,
    talkrev_df_de,
    talkrev_df_en,
    talkrev_df_es,
    talkrev_df_fr,
    talkrev_df_ja,
):
    _frame.drop_duplicates(subset=['revid'], inplace=True)
del _frame  # do not leave the loop variable behind in the notebook namespace

## get interlanguage links

In [None]:
def get_ills(pagetitle, endpoint):
    """Return the interlanguage links of a page (not implemented yet).

    This is still a stub: neither ``pagetitle`` nor ``endpoint`` is read,
    and a new, empty list is returned on every call.

    NOTE(review): presumably ``endpoint`` is meant to be a wiki API URL and
    ``pagetitle`` the title to look up -- confirm when implementing.
    """
    return []

## Parse revision histories to find when the interlanguage links were added

In [None]:
# get page revisions

In [None]:
# parse page revisions 