# Find the list of interlanguage links with missing dates

In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
import requests

In [2]:
def get_interlanguage_links(page_title, endpoint='en.wikipedia.org/w/api.php', redirects=1, multicore_dict=None, timeout=30):
    """
    FROM BRIAN KEEGAN'S SHARED WIKIFUNCTIONS.

    Asks the API at `endpoint` for the interlanguage links (prop=langlinks)
    of a single page.

    Parameters:
    page_title - title of the page to look up
    endpoint - host and path of the API, without the scheme; the text before
               the first '.' is used as the language code of the page itself
    redirects - forwarded unchanged as the `redirects` query parameter
    multicore_dict - optional dict; when given, the result is stored in it
                     under the final title and nothing is returned
    timeout - seconds to wait for the HTTP request before giving up

    Returns:
    {final_title: langlink_dict} when multicore_dict is None, otherwise None
    langlink_dict - a dictionary keyed by lang codes and page title as values
                    (it always contains the queried page under its own lang code)

    Raises:
    the exceptions of `requests` on timeout, connection failure or an HTTP
    error status; KeyError / IndexError when the response carries no
    query -> pages entry.
    """

    query_url = "https://{0}".format(endpoint)
    query_params = {
        'action': 'query',
        'prop': 'langlinks',
        'titles': page_title,
        'redirects': redirects,
        'llprop': 'autonym|langname',
        'lllimit': 500,
        'format': 'json',
        'formatversion': 2,
    }

    # A timeout keeps one stalled connection from hanging the whole crawl, and
    # raise_for_status reports an HTTP error as such instead of letting it
    # show up later as a JSON or key error.
    response = requests.get(url=query_url, params=query_params, timeout=timeout)
    response.raise_for_status()
    page = response.json()['query']['pages'][0]

    # Fall back to the requested title when the response does not name one.
    final_title = page.get('title', page_title)

    start_lang = endpoint.split('.')[0]
    interlanguage_link_dict = {start_lang: final_title}
    for link in page.get('langlinks', []):
        interlanguage_link_dict[link['lang']] = link['title']

    if multicore_dict is None:
        return {final_title: interlanguage_link_dict}
    multicore_dict[final_title] = interlanguage_link_dict

def get_ills_aslist(langlinks_dict):
    '''
    Reduces a langlinks dict to language codes only.

    langlinks_dict maps each page title to a dict keyed by language code.
    The result maps the same page titles to the list of those language
    codes, in the order the inner dict holds them.
    '''
    return {
        page_title: [lang_code for lang_code in links_by_lang.keys()]
        for page_title, links_by_lang in langlinks_dict.items()
    }


In [3]:
# load all the lists of rules
rules_df_de = pd.read_csv(Path(os.getcwd()).parent / "dewiki.tsv",sep="\t",header=None)
rules_df_en = pd.read_csv(Path(os.getcwd()).parent / "enwiki.tsv",sep="\t",header=None)
rules_df_es = pd.read_csv(Path(os.getcwd()).parent / "eswiki.tsv",sep="\t",header=None)
rules_df_fr = pd.read_csv(Path(os.getcwd()).parent / "frwiki.tsv",sep="\t",header=None)
rules_df_ja = pd.read_csv(Path(os.getcwd()).parent / "jawiki.tsv",sep="\t",header=None)

In [4]:
# Result containers, both keyed by language code and filled in by the cells below:
#   interlanguagelinks        - what get_interlanguage_links returned for each page
#   interlanguagelinks_aslist - the same pages reduced to lists of language codes
interlanguagelinks, interlanguagelinks_aslist = {}, {}

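A little bit of data wrangling: for each language edition, name the columns of its rules table and fetch the interlanguage links of every rule page from that edition's API.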

In [5]:
# de
rules_df_de = rules_df_de.rename(columns={0: "NA", 1: "links",2:"titles"})

pagetitles = rules_df_de["titles"].tolist()

endpoint = "{}.wikipedia.org/w/api.php".format('de')
temp = {}
for p in sorted(pagetitles):
    # Exactly one request per title, made inside the try so that a page that
    # fails is reported and skipped instead of aborting the whole loop.
    try:
        temp.update(get_interlanguage_links(p,endpoint))
    except KeyboardInterrupt:
        # manual interrupt: stop fetching but keep what was collected so far
        break
    except Exception:
        print('!!! FAILED > {}'.format(p))

interlanguagelinks['de'] = temp

In [6]:
# Reduce the 'de' results to the language codes each page links to.
interlanguagelinks_aslist.update(de=get_ills_aslist(interlanguagelinks['de']))

In [7]:
# en
rules_df_en = rules_df_en.rename(columns={0: "NA", 1: "links",2:"titles"})

pagetitles = rules_df_en["titles"].tolist()

endpoint = "{}.wikipedia.org/w/api.php".format('en')
temp = {}
for p in sorted(pagetitles):
    # Exactly one request per title, made inside the try so that a page that
    # fails is reported and skipped instead of aborting the whole loop.
    try:
        temp.update(get_interlanguage_links(p,endpoint))
    except KeyboardInterrupt:
        # manual interrupt: stop fetching but keep what was collected so far
        break
    except Exception:
        print('!!! FAILED > {}'.format(p))

interlanguagelinks['en'] = temp

In [8]:
# es
rules_df_es = rules_df_es.rename(columns={0: "NA", 1: "links",2:"titles"})

pagetitles = rules_df_es["titles"].tolist()

endpoint = "{}.wikipedia.org/w/api.php".format('es')
temp = {}
for p in sorted(pagetitles):
    # Exactly one request per title, made inside the try so that a page that
    # fails is reported and skipped instead of aborting the whole loop.
    try:
        temp.update(get_interlanguage_links(p,endpoint))
    except KeyboardInterrupt:
        # manual interrupt: stop fetching but keep what was collected so far
        break
    except Exception:
        print('!!! FAILED > {}'.format(p))

interlanguagelinks['es'] = temp

In [9]:
# fr
rules_df_fr = rules_df_fr.rename(columns={0: "NA", 1: "links",2:"titles"})

pagetitles = rules_df_fr["titles"].tolist()

endpoint = "{}.wikipedia.org/w/api.php".format('fr')
temp = {}
for p in sorted(pagetitles):
    # Exactly one request per title, made inside the try so that a page that
    # fails is reported and skipped instead of aborting the whole loop.
    try:
        temp.update(get_interlanguage_links(p,endpoint))
    except KeyboardInterrupt:
        # manual interrupt: stop fetching but keep what was collected so far
        break
    except Exception:
        print('!!! FAILED > {}'.format(p))

interlanguagelinks['fr'] = temp

In [10]:
# ja
rules_df_ja = rules_df_ja.rename(columns={0: "NA", 1: "links",2:"titles"})

pagetitles = rules_df_ja["titles"].tolist()

endpoint = "{}.wikipedia.org/w/api.php".format('ja')
temp = {}
for p in sorted(pagetitles):
    # Exactly one request per title, made inside the try so that a page that
    # fails is reported and skipped instead of aborting the whole loop.
    try:
        temp.update(get_interlanguage_links(p,endpoint))
    except KeyboardInterrupt:
        # manual interrupt: stop fetching but keep what was collected so far
        break
    except Exception:
        print('!!! FAILED > {}'.format(p))

interlanguagelinks['ja'] = temp

In [11]:
# Reduce the en, es, fr and ja results to language-code lists, in that order.
interlanguagelinks_aslist.update(
    (edition, get_ills_aslist(interlanguagelinks[edition]))
    for edition in ('en', 'es', 'fr', 'ja')
)

## Find missing links for the five language editions

In [12]:
# language code -> {rule page title -> list of links that have no date}
missinglinks = {}

In [13]:
# Language editions to process; the loop below visits them in this order.
langs = [
    'de',
    'en',
    'fr',
    'es',
    'ja',
]

In [14]:
# For every edition, compare the interlanguage links of each rule page with
# the keys stored for that page in the edition's dates file; links to one of
# the other editions that have no key there are recorded as missing.
for lang in langs:
    print(lang)
    missinglinks[lang] = dict()

    # every edition in `langs` except the one being processed
    otherlangs = [code for code in langs if code != lang]

    # Explicit encoding, matching the utf-8 used when the results are exported:
    # page titles can be non-ASCII and the platform default codec may differ.
    with open('interlanguagelinks_dates_{}.json'.format(lang),'r',encoding='utf-8') as f:
        data = json.load(f)

    for rule in data:
        print(rule)

        # the interlanguage links that this page has, without its own edition;
        # built as a new list so interlanguagelinks_aslist is left untouched
        _ills = [code for code in interlanguagelinks_aslist[lang][rule] if code != lang]
        print(_ills)

        # the keys we have dates for
        foundkeys = list(data[rule].keys())
        print(foundkeys)

        # links to the other editions, written as '<lang code>:<title>'
        _ills_asfullname = ['{}:{}'.format(key,value) for (key,value) in interlanguagelinks[lang][rule].items() if key in otherlangs]

        missinglinks[lang][rule] = [linkname for linkname in _ills_asfullname if linkname not in data[rule]]


pedia:Interessenkonflikt#Eigendarstellung', 'ru:Википедия:Автобиографии']
Wikipedia:自著作物の持ち込み
['ca', 'en', 'fr', 'is', 'ko', 'sr', 'th', 'vi', 'zh']
['en:Wikipedia:Donating copyrighted materials', 'is:Wikipedia:Gefa höfundaréttarvarin gögn', 'vi:Wikipedia:Hiến các tài liệu có bản quyền', 'zh:Wikipedia:捐赠版权材料', 'sr:Википедија:Донирање материјала са ауторским правима', 'ko:위키백과:자신의 저작물 기부하기']
Wikipedia:自著作物の持ち込み/削除依頼を出されたら
[]
[]
Wikipedia:色の使用
[]
[]
Wikipedia:荒らし
['af', 'als', 'ang', 'ar', 'arz', 'as', 'ast', 'az', 'azb', 'be', 'be-x-old', 'bg', 'bjn', 'bn', 'bs', 'bxr', 'ca', 'ce', 'ceb', 'chr', 'ckb', 'cs', 'da', 'de', 'diq', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'gl', 'gn', 'he', 'hi', 'hr', 'hsb', 'hu', 'hy', 'id', 'ilo', 'is', 'it', 'jv', 'ka', 'kk', 'km', 'ko', 'la', 'lb', 'lt', 'ltg', 'lv', 'mk', 'ml', 'mn', 'ms', 'nds-nl', 'ne', 'nl', 'nn', 'no', 'oc', 'pl', 'pt', 'qu', 'ro', 'ru', 'scn', 'sco', 'sd', 'se', 'sh', 'simple', 'sk', 'sl', 'sr', 'sv', 'sw', 'te',

In [15]:
# Spot check: show the missing links recorded for one rule page of the 'ja' edition.
missinglinks['ja']['Wikipedia:記事名の付け方/鉄道']

['de:Wikipedia:Namenskonventionen/Schienenfahrzeuge']

In [16]:
# Same layout as missinglinks; filled below with only the rule pages that have missing links.
missinglinks_cleaned = {}

In [17]:
# Copy into missinglinks_cleaned only the rule pages whose list of missing
# links is non-empty; missinglinks itself is not modified.
for _lang in missinglinks:
    missinglinks_cleaned[_lang] = {
        _rule: _links
        for _rule, _links in missinglinks[_lang].items()
        if len(_links) != 0
    }

In [24]:
# Print the cleaned rule -> missing links mapping of each edition, one per line.
for x, _rules_with_gaps in missinglinks_cleaned.items():
    print(_rules_with_gaps)

 'Wikipedia:Etiquette': ['fr:Wikipédia:Étiquette'], 'Wikipedia:External links': ['ja:Wikipedia:外部リンク'], 'Wikipedia:Fringe theories': ['es:Wikipedia:Teoría marginal', 'fr:Wikipédia:Théorie controversée'], 'Wikipedia:Gaming the system': ['fr:Wikipédia:Jouer avec les principes, règles et recommandations de Wikipédia', 'ja:Wikipedia:規則の悪用'], 'Wikipedia:IP block exemption': ['es:Wikipedia:Exentos de bloqueo a IP'], 'Wikipedia:Identifying reliable sources (medicine)': ['ja:Wikipedia:信頼できる情報源 (医学)'], 'Wikipedia:Image use policy': ["fr:Wikipédia:Conventions d'utilisation des images"], 'Wikipedia:Images': ['es:Wikipedia:Imágenes', 'fr:Aide:Images'], 'Wikipedia:Interface administrators': ['de:Wikipedia:Benutzeroberflächenadministratoren', 'es:Wikipedia:Administradores de interfaz', "fr:Wikipédia:Administrateur d'interface", 'ja:Wikipedia:インターフェース管理者'], 'Wikipedia:Libel': ['fr:Wikipédia:Diffamation'], 'Wikipedia:Make technical articles understandable': ['ja:Wikipedia:専門的な記事も分かり易く'], 'Wikipedia:Ma

In [26]:
#export for manual missing link finding
# One text file per edition: every rule title on its own line, its missing
# links tab-indented underneath, and a blank line closing the entry.
for _lang, _rules in missinglinks_cleaned.items():
    print(_lang)
    with open('missinglinks_{}.txt'.format(_lang),'w',encoding="utf-8") as f:
        for _rule, _links in _rules.items():
            f.write('{}\n'.format(_rule))
            f.writelines('\t{}\n'.format(_link) for _link in _links)
            f.write('\n')

de
en
fr
es
ja
