Get interlanguage link information for all of our rule pages and export as tsv

In [1]:
import argparse
import re, random, urllib, simplejson, copy, itertools
import urllib.parse, urllib.request, urllib.error
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
sb.set_style('whitegrid')

import requests, json, time, os
from pathlib import Path

from operator import itemgetter
from collections import Counter
from bs4 import BeautifulSoup

import wikifunctions as wf

In [2]:
# Some helpful structures
"""
Subject Talk    Name        Description
0       1       Main        Articles, lists, & redirects
2       3       User        User pages & sandboxes
4       5       Wikipedia   Policy, essay, & processes
-6       7       File        Media file descriptions
-8       9       MediaWiki   Auto-generated pages
10      11      Template    Infoboxes, nav boxes
-12      13      Help        Software help
14      15      Category    Categorized pages
100     101     Portal      Topics & WikiProjects
"""
langs = ["de","en","es","fr","ja"]
namespaces = [0, 1, 2, 3, 4, 5, 10, 11, 14, 15, 100, 101]

## Load data
(1) Source data (lists of rules)

(2) Revision history tsvs

In [3]:
# list of rules
rules_df_de = pd.read_csv(Path(os.getcwd()) / "dewiki.tsv",sep="\t",header=None)
rules_df_de = rules_df_de.rename(columns={0: "NA", 1: "links",2:"titles"})
rules_df_en = pd.read_csv(Path(os.getcwd()) / "enwiki.tsv",sep="\t",header=None)
rules_df_en = rules_df_en.rename(columns={0: "NA", 1: "links", 2: "titles", 3:"shortcuts"})
rules_df_es = pd.read_csv(Path(os.getcwd()) / "eswiki.tsv",sep="\t",header=None)
rules_df_es = rules_df_es.rename(columns={0: "NA", 1: "links", 2: "titles", 3:"shortcuts"})
rules_df_fr = pd.read_csv(Path(os.getcwd()) / "frwiki.tsv",sep="\t",header=None)
rules_df_fr = rules_df_fr.rename(columns={0: "NA", 1: "links", 2: "titles", 3:"shortcuts"})
rules_df_ja = pd.read_csv(Path(os.getcwd()) / "jawiki.tsv",sep="\t",header=None)
rules_df_ja = rules_df_ja.rename(columns={0: "NA", 1: "links", 2: "titles", 3:"shortcuts"})

rules_dfs = [rules_df_de,rules_df_en,rules_df_es,rules_df_fr,rules_df_ja]
rules_dfs_dict = {'de':rules_df_de,'en':rules_df_en,'es':rules_df_es,'fr':rules_df_fr,'ja':rules_df_ja}

In [4]:
rules_df_de

Unnamed: 0,NA,links,titles
0,0,https://de.wikipedia.org/wiki/Wikipedia:Richtl...,Wikipedia:Richtlinien
1,1,https://de.wikipedia.org/wiki/Wikipedia:Grundp...,Wikipedia:Grundprinzipien
2,2,https://de.wikipedia.org/wiki/Wikipedia:Was_Wi...,Wikipedia:Was Wikipedia nicht ist
3,3,https://de.wikipedia.org/wiki/Wikipedia:Wikipe...,Wikipedia:Wikipedia ist kein Wörterbuch
4,4,https://de.wikipedia.org/wiki/Wikipedia:Keine_...,Wikipedia:Keine Theoriefindung
...,...,...,...
117,119,https://de.wikipedia.org/wiki/Wikipedia:Namens...,Wikipedia:Namenskonventionen/Medizin
118,120,https://de.wikipedia.org/wiki/Portal:Luftfahrt...,Portal:Luftfahrt/Namenskonventionen
119,121,https://de.wikipedia.org/wiki/Portal:Antarktis...,Portal:Antarktis/Konventionen
120,122,https://de.wikipedia.org/wiki/Wikipedia:Namens...,Wikipedia:Namenskonventionen/Usbekisch


'''
### page revision histories
rev_path = Path(os.getcwd()) / "output_rulepagerevs" / "2020-07-31"

rev_df_de = pd.read_csv( rev_path / "de_revisions.tsv",sep='\t',header=0,encoding='utf8',parse_dates=['date','timestamp'])
rev_df_en = pd.read_csv( rev_path / "en_revisions.tsv",sep='\t',header=0,encoding='utf8',parse_dates=['date','timestamp'])
rev_df_es = pd.read_csv( rev_path / "es_revisions.tsv",sep='\t',header=0,encoding='utf8',parse_dates=['date','timestamp'])
rev_df_fr = pd.read_csv( rev_path / "fr_revisions.tsv",sep='\t',header=0,encoding='utf8',parse_dates=['date','timestamp'])
rev_df_ja = pd.read_csv( rev_path / "ja_revisions.tsv",sep='\t',header=0,encoding='utf8',parse_dates=['date','timestamp'])

rev_dfs = [rev_df_de,rev_df_en,rev_df_es,rev_df_fr,rev_df_ja]
rev_dfs_dict = { "rev_df_de":rev_df_de, "rev_df_en":rev_df_en, "rev_df_es":rev_df_es, "rev_df_fr":rev_df_fr, "rev_df_ja":rev_df_ja}
'''

'''
### get rid of duplicate revisions for each language edition
rev_df_de.drop_duplicates(subset=['revid'],inplace=True)
rev_df_en.drop_duplicates(subset=['revid'],inplace=True)
rev_df_es.drop_duplicates(subset=['revid'],inplace=True)
rev_df_fr.drop_duplicates(subset=['revid'],inplace=True)
rev_df_ja.drop_duplicates(subset=['revid'],inplace=True)
'''

## get interlanguage links

In [5]:
def get_interlanguage_links(page_title, endpoint='en.wikipedia.org/w/api.php', redirects=1, multicore_dict=None):
    """The function accepts a page_title and returns a dictionary containing 
    the title of the page in its other languages
       
    page_title - a string with the title of the page on Wikipedia
    endpoint - a string that points to the web address of the API.
        This defaults to the English Wikipedia endpoint: 'en.wikipedia.org/w/api.php'
        Changing the two letter language code will return a different language edition
        The Wikia endpoints are slightly different, e.g. 'starwars.wikia.com/api.php'
    redirects - 1 or 0 for whether to follow page redirects, defaults to 1
       
    Returns:
    langlink_dict - a dictionary keyed by lang codes and page title as values
    """
    
    #query_string = "https://{1}.wikipedia.org/w/api.php?action=query&format=json&prop=langlinks&formatversion=2&titles={0}&llprop=autonym|langname&lllimit=500".format(page_title,lang)
    query_url = "https://{0}".format(endpoint)
    query_params = {}
    query_params['action'] = 'query'
    query_params['prop'] = 'langlinks'
    query_params['titles'] = page_title
    query_params['redirects'] = redirects
    query_params['llprop'] = 'autonym|langname'
    query_params['lllimit'] = 500
    query_params['format'] = 'json'
    query_params['formatversion'] = 2
    json_response = requests.get(url=query_url,params=query_params).json()
    #print(json_response)
    
    interlanguage_link_dict = dict()
    start_lang = endpoint.split('.')[0]
    #print(json_response['query']['pages'][0])
    if 'title' in json_response['query']['pages'][0]:
        final_title = json_response['query']['pages'][0]['title']
        interlanguage_link_dict[start_lang] = final_title
    else:
        final_title = page_title
        interlanguage_link_dict[start_lang] = final_title

    if 'langlinks' in json_response['query']['pages'][0]:
        langlink_dict = json_response['query']['pages'][0]['langlinks']

        for d in langlink_dict:
            lang = d['lang']
            title = d['title']
            interlanguage_link_dict[lang] = title
            
    if multicore_dict is None:
        return {final_title:interlanguage_link_dict}
    else:
        multicore_dict[final_title] = interlanguage_link_dict


In [6]:
test_ills = dict()
endpoint = "{}.wikipedia.org/w/api.php".format('de')
p = 'Wikipedia:Neutraler Standpunkt'
testtemp = get_interlanguage_links(p,endpoint)

test_ills['test'] = testtemp

In [7]:
langs = ["de","en","es","fr","ja"]
ills = {}

In [8]:
for lang in langs:
    print(lang)
    endpoint = "{}.wikipedia.org/w/api.php".format(lang)
    df = rules_dfs_dict[lang]
    pagetitles = df["titles"].tolist()

    temp = {}
    prev = ""
    for p in sorted(pagetitles):
        #print(p)

        if p == prev:
            print("\n","Error! A repeat: ",p)
        prev = p

        try:
            temp.update(get_interlanguage_links(p,endpoint))
        except KeyboardInterrupt:
            break
        except:
            print('!!! FAILED > {}'.format(p))
            pass

    ills[lang] = temp

de
en
es
fr
ja


In [9]:
def get_ills_aslist(langlinks_dict):
    '''
    Takes in the langlinks dict and returns a dictionary that has page titles as keys and a list of ILL language codes as values.
    '''
    ill_list = dict()
    for pagekey in langlinks_dict.keys():
        _temp = list()
        for langkey in langlinks_dict[pagekey].keys():
            _temp.append(langkey)
        ill_list[pagekey] = _temp
    return ill_list

In [10]:
rule_interlanguagelinks = dict()

In [11]:
for lang in langs:
    print(lang)
    rule_interlanguagelinks[lang] = get_ills_aslist(ills[lang])

de
en
es
fr
ja


In [12]:
len(rule_interlanguagelinks['de'].keys())

122

In [13]:
#ills['de']['Wikipedia:Richtlinien']

## Parse revision histories to find when the interlanguage links happened

In [14]:
#https://en.wikipedia.org/w/index.php?title=Wikipedia:Neutral_point_of_view&diff=prev&oldid=270458
def get_addedcontent(pagetitle,revid,lang):
    pagetitle = pagetitle.replace(" ","_")
    url = "https://{}.wikipedia.org/w/index.php?title={}&diff=prev&oldid={}".format(lang,pagetitle,revid)
    #if lang == 'de':
    #    url = "https://{}.wikipedia.org/w/index.php?title={}&dir=prev&oldid={}".format(lang,pagetitle,revid)
    # if en,es,ja,fr, diff=prev. if de, dir=prev
    #print(url)

    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    revadds = soup.find_all("td", class_="diff-addedline")
    revadds = [ str(r) for r in revadds if "<div>" in str(r) ]
    
    if len(revadds) != 0:
        cleaned_revadds = list()
        
        for r in revadds:
            minisoup = BeautifulSoup(r,"html.parser")
            text = minisoup.get_text()
            #print(text)
            cleaned_revadds.append(text)
        return ' '.join(cleaned_revadds)
    
    # else, there are no additions in this revision
    else:
        return None

Starting with a test version

'''# test
_testtemp = dict()
_testtemp.update(wf.get_all_page_revisions('Wikipedia:Neutral point of view',endpoint='en.wikipedia.org/w/api.php'))
revdf = pd.concat(_testtemp)
revdf = revdf.drop(columns=['sha1','anon','userhidden','suppressed','sha1hidden','user'])
revdf = revdf.sort_values(by='timestamp')

tupled_df = pd.DataFrame()
tupled_df['tuples'] = list(zip(revdf.timestamp, revdf.revid))
oldestfirst_revs = tupled_df['tuples'].tolist()

for i in oldestfirst_revs[:2]:
    timestamp,revid = i[0],i[1]
    print(timestamp,revid)

if not os.path.isdir("./ill_tester"):
    os.mkdir("./ill_tester")

tester = dict()
for r in sorted_revids[15:30]:
    print(r)
    data = get_addedcontent('Wikipedia:Neutral point of view',r,'en')
    with open('./ill_tester/tester_{}.json'.format(r),'w') as f:
        json.dump(data, f)
'''

Actual get

In [15]:
rule_interlanguagelinks['de']['Wikipedia:Richtlinien']

['de',
 'af',
 'am',
 'ar',
 'arz',
 'as',
 'av',
 'az',
 'azb',
 'ba',
 'be',
 'be-x-old',
 'bg',
 'bh',
 'bjn',
 'bn',
 'br',
 'bs',
 'ca',
 'ce',
 'ckb',
 'cs',
 'cy',
 'da',
 'el',
 'en',
 'eo',
 'es',
 'eu',
 'fa',
 'fi',
 'fr',
 'ga',
 'gl',
 'glk',
 'gor',
 'he',
 'hi',
 'hr',
 'hsb',
 'hu',
 'hy',
 'ia',
 'id',
 'inh',
 'is',
 'it',
 'ja',
 'ka',
 'kk',
 'kl',
 'kn',
 'ko',
 'ku',
 'ky',
 'lb',
 'mai',
 'min',
 'mk',
 'ml',
 'ms',
 'mzn',
 'nap',
 'ne',
 'nl',
 'nn',
 'no',
 'nv',
 'pl',
 'ps',
 'pt',
 'rmy',
 'ro',
 'ru',
 'scn',
 'sco',
 'sd',
 'sh',
 'si',
 'simple',
 'sk',
 'sl',
 'sq',
 'sr',
 'su',
 'sv',
 'ta',
 'tg',
 'th',
 'tr',
 'tt',
 'udm',
 'uk',
 'ur',
 'uz',
 'vi',
 'yi',
 'zh',
 'zh-yue']

In [31]:
def process_languagelinksdates(lang):
    # initialize output
    ill_dates = dict()

    pageslist = list(rule_interlanguagelinks[lang].keys())
    #test = ["Wikipedia:Richtlinien","Wikipedia:Grundprinzipien","Wikipedia:Was Wikipedia nicht ist"]

    for page in pageslist:
        print(page)
        found_langs = list()
        ill_dates[page] = dict()

        _temprevs = wf.get_all_page_revisions(str(page),endpoint='{}.wikipedia.org/w/api.php'.format(lang))
        _revdf = pd.concat(_temprevs).sort_values(by='timestamp')

        _revdf['tuples'] = list(zip(_revdf.timestamp, _revdf.revid))
        _tuples = _revdf['tuples'].tolist()

        # the interlanguage links that this page has
        _ills = rule_interlanguagelinks[lang][page]
        print(_ills)
        if lang in _ills:
            _ills.remove(lang)
        _ills_full = ['{}:{}'.format(key,value) for (key,value) in ills[lang][page].items()]
        print(_ills_full)
        otherlangs = ['de','en','fr','es','ja']
        # remove self lang
        otherlangs.remove(lang)

        # ill subset should the subset of ills that this page has that are in the langs we care about
        _ills_subset = ['{}:{}'.format(key,value) for (key,value) in ills[lang][page].items() if key in otherlangs] #otherlangs or _ills
        _ills_subset = set(_ills_subset)

        print(page, len(_tuples),_revdf.shape)
        #print(_ills)

        pattern = re.compile(r"\[\[[^\]\d]+?\:.+?\]\]")

        for revision in _tuples:
            currentilldateswehave = set(list(ill_dates[page].keys()))
            if _ills_subset.issubset(currentilldateswehave):
                break

            timestamp,revid = revision[0],revision[1]

            bit = get_addedcontent(page,revid,lang)
            if bit == None:
                continue
            #print(timestamp,revid)

            finds = re.findall(pattern,bit)
            if len(finds) == 0:
                continue
            else:
                for f in finds:
                    # change this to a set of langs of interest
                    if f[2:4] in _ills:
                        if f[2:-2] not in ill_dates[page].keys():
                            ill_dates[page][f[2:-2]] = timestamp
                            found_langs.append(f[2:4])                            
                        else:
                            continue
                    else:
                        continue

    return ill_dates


Running the get ill dates for reals

In [32]:
de_illdates = process_languagelinksdates('de')

к?', 'su:Wikipedia:Artikel téh naon?', 'sv:Wikipedia:Artikel', 'tg:Википедиа:Мақола', 'th:วิกิพีเดีย:บทความหมายถึงอะไร', 'tl:Wikipedia:Ano ang isang artikulo', 'tr:Vikipedi:Madde', 'tt:Википедия:Мәкалә', 'uk:Вікіпедія:Стаття', 'ur:ویکیپیڈیا:مضمون کیا ہے؟', 'uz:Vikipediya:Maqola', 'vi:Wikipedia:Bài bách khoa là gì?', 'zh:Wikipedia:什么是条目', 'zh-min-nan:Wikipedia:Bûn-chiuⁿ', 'zh-yue:Wikipedia:乜嘢係文章']
Wikipedia:Artikel 609 (609, 16)
['af', 'als', 'ar', 'as', 'ast', 'azb', 'be', 'bg', 'bh', 'bjn', 'bs', 'ca', 'ckb', 'cs', 'cv', 'cy', 'da', 'diq', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fo', 'gl', 'he', 'hi', 'hr', 'hsb', 'hu', 'ia', 'id', 'ilo', 'it', 'ja', 'jv', 'kk', 'ko', 'ksh', 'ky', 'map-bms', 'min', 'mk', 'ml', 'mr', 'mwl', 'nl', 'no', 'pa', 'pam', 'pt', 'ro', 'roa-tara', 'ru', 'sd', 'si', 'simple', 'sk', 'sl', 'sq', 'sr', 'su', 'sv', 'tg', 'th', 'tl', 'tr', 'tt', 'uk', 'ur', 'uz', 'vi', 'zh', 'zh-min-nan', 'zh-yue']
[[en:Wikipedia:What is an article]]
[[eo:Vikipedio:Kio estas artik

ConnectionError: HTTPSConnectionPool(host='de.wikipedia.org', port=443): Max retries exceeded with url: /w/index.php?title=Wikipedia:Artikel_illustrieren&diff=prev&oldid=47220746 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001A2C9E9C4F0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [None]:
with open('interlanguagelinks_dates_de.json', 'w') as f:
    json.dump(de_illdates, f)

In [None]:
es_illdates = process_languagelinksdates('es')

In [None]:
with open('interlanguagelinks_dates_es.json', 'w') as f:
    json.dump(es_illdates, f)

In [None]:
ja_illdates = process_languagelinksdates('ja')

In [None]:
with open('interlanguagelinks_dates_ja.json', 'w') as f:
    json.dump(ja_illdates, f)

In [None]:
fr_illdates = process_languagelinksdates('fr')

In [None]:
with open('interlanguagelinks_dates_fr.json', 'w') as f:
    json.dump(fr_illdates, f)

In [None]:
en_illdates = process_languagelinksdates('en')

In [None]:
with open('interlanguagelinks_dates_en.json', 'w') as f:
    json.dump(en_illdates, f)

# Load ``interlanguagelinks_dates_[lang].json`` from disk and clean
Output ``interlanguagelinks_dates_[lang]_cleaned.json``

In [None]:
de_illdates_load = json.load("interlanguagelinks_dates_de.json")
en_illdates_load = json.load("interlanguagelinks_dates_en.json")
es_illdates_load = json.load("interlanguagelinks_dates_es.json")
fr_illdates_load = json.load("interlanguagelinks_dates_fr.json")
ja_illdates_load = json.load("interlanguagelinks_dates_ja.json")

In [None]:
# clean the ill info data


In [None]:
# output the cleaned data
with open('interlanguagelinks_dates_de_cleaned.json', 'w') as f:
    json.dump(de_illdates_cleaned, f)