In [2]:
from bs4 import BeautifulSoup
import requests
import re, os, csv
import wikifunctions_s as wfs
import pandas as pd
import time
import json
from collections import Counter

In [3]:
langs_10 = ['en','fr','de','ja','es','ru','zh','it','pt','fa']
langs_5 = ['en','fr','de','ja','es']

Get all pages in the Wikipedia Project namespace per language edition

In [4]:
%%script false --no-raise-error
# getting the list of all pages in the wikipedia project namespace per language edition
rule_dfs_dict = {}
rule_dfs_list = []

for l in langs_10:
    _temp = wfs.get_all_pages_in_namespace(l)
    rule_dfs_dict[l] = _temp
    rule_dfs_list.append(_temp)
    print('{} has {} pages with language links in the Wikipedia Project namespace'.format(l, len(_temp)))

Couldn't find program: 'false'


In [5]:
%%script false --no-raise-error
# export each list as df -> tsv so that we don't have to get them all over again
output_directory = 'nm4_all_list'

if not os.path.isdir(output_directory):
    os.mkdir(output_directory)

for df in rule_dfs_list:
    _lang = df.lang.values.tolist()[0]
    path = './{}/{}.tsv'.format(output_directory,_lang)
    df.to_csv(path,index=False,sep='\t')

Couldn't find program: 'false'


Get all the interlanguage links for the pages in the Wikipedia project namespace

In [6]:
# now load those dfs and get all the ILLs for the pages
rule_dfs_dict = {}
rule_dfs_list = []

for l in langs_10:
    path = './nm4_all_list/{}.tsv'.format(l)
    _df = pd.read_csv(path, sep='\t',header=0)
    rule_dfs_dict[l] = _df
    rule_dfs_list.append(_df)

In [7]:
output_directory = 'nm4_all_ills'

if not os.path.isdir(output_directory):
    os.mkdir(output_directory)

This block below takes like 100+ minutes to run. Strap in and work on something else.

In [8]:
%%script false --no-raise-error
interlanguage_links = {}

# run through each language edition
for lang in langs_10:#[1:]:
    print(lang)
    start = time.time()
    interlanguage_links[lang] = {}
    # get the list of pages
    _df = rule_dfs_dict[lang]
    rule_list = _df['title'].values.tolist()

    # for each page
    for rule in rule_list:
        ill_dict = wfs.get_interlanguage_links(rule, endpoint=lang, redirects=1)
        interlanguage_links[lang][rule] = {}
        interlanguage_links[lang][rule]['count'] = len(list(ill_dict.keys()))
        interlanguage_links[lang][rule]['langs'] = list(ill_dict.keys())
        interlanguage_links[lang][rule]['links'] = ill_dict

    # export the interlanguage_links[lang]
    with open("./{}/ills_{}.json".format(output_directory,lang), "w") as outfile:
        json.dump(interlanguage_links[lang], outfile)
    
    end = time.time()
    print("{}: {}".format(lang, end - start))

Couldn't find program: 'false'


```
fr
    Wikipédia:Politique de confidentialité
    {'interwiki': [{'title': 'meta:Privacy policy/fr', 'iw': 'meta'}], 'redirects': [{'from': 'Wikipédia:Politique de confidentialité', 'to': 'meta:Privacy policy/fr', 'tointerwiki': 'meta'}]}
fr: 1433.1813595294952
de: 405.3607313632965
ja: 209.69620060920715
es: 216.3900294303894
ru: 274.83118629455566
zh: 687.3477938175201
it: 105.42117857933044
pt: 516.9702491760254
fa: 493.1874372959137
```

```
f = open("./{}/ills_en.json".format(output_directory))
en_data = json.load(f)
interlanguage_links['en'] = en_data
print(interlanguage_links.keys())
with open("./{}/ills_all.json".format(output_directory), "w") as outfile:
    json.dump(interlanguage_links, outfile)
```

Filter through the pages in the Wikipedia Project namespace by the number of interlanguage links they have

In [9]:
# load up interlanguage_links
f = open("./{}/ills_all.json".format(output_directory))
interlanguage_links = json.load(f)

In [10]:
interlanguage_links.keys(), len(interlanguage_links.keys())

(dict_keys(['fr', 'de', 'ja', 'es', 'ru', 'zh', 'it', 'pt', 'fa', 'en']), 10)

In [11]:
#flatten interlanguage_links
_data = []

for lang in interlanguage_links:
    for rule in interlanguage_links[lang]:
        _count = interlanguage_links[lang][rule]['count']
        _langs = interlanguage_links[lang][rule]['langs']
        _links = interlanguage_links[lang][rule]['links']

        _data.append([lang,rule,_count,_langs,_links])

In [12]:
df = pd.DataFrame(_data,columns=['lang','title','ill_count','ill_langs','ill_links'])

In [13]:
len(df[df.ill_count > 10])

9050

In [14]:
filtered_df = df.loc[df.ill_langs.map(set(langs_10).issubset)]
filtered_df = filtered_df.copy()
print(len(filtered_df))
filtered_df.head(3)

1633


Unnamed: 0,lang,title,ill_count,ill_langs,ill_links
11,fr,Wikipédia:Accueil de la communauté,201,"[fr, aa, ady, af, als, alt, ami, an, ang, ar, ...","{'fr': 'Wikipédia:Accueil de la communauté', '..."
13,fr,Wikipédia:Accueil principal,325,"[fr, aa, ab, ace, ady, af, ak, als, alt, am, a...","{'fr': 'Wikipédia:Accueil principal', 'aa': 'M..."
16,fr,Wikipédia:Administrateur,259,"[fr, aa, ab, ace, ady, af, ak, als, am, an, an...","{'fr': 'Wikipédia:Administrateur', 'aa': 'Wiki..."


In [15]:
for lang in langs_10:
    print(lang,len(filtered_df[filtered_df.lang==lang]))

en 641
fr 84
de 114
ja 97
es 110
ru 124
zh 131
it 78
pt 128
fa 126


In [16]:
len(
    filtered_df[filtered_df.ill_langs.map(set(['en']).issubset)]
)

1633

In [17]:
def get_en_title(row):
    links = row.ill_links
    return links['en']

In [18]:
filtered_df['en_title'] = filtered_df.apply(get_en_title,axis=1)

In [19]:
# unfiltered rules, a set of project namespace pages that are clearly in the Wikipedia: space and have an ILL to all top ten language editions
unfiltered_rules = filtered_df.drop_duplicates(subset=['en_title'], keep='last').loc[filtered_df.en_title.str.contains('Wikipedia:')==True]

In [20]:
set(unfiltered_rules.lang.values.tolist())

{'de', 'en', 'es', 'it', 'pt'}

In [21]:
unfiltered_rules.sort_values(by='ill_count',ascending=False)

Unnamed: 0,lang,title,ill_count,ill_langs,ill_links,en_title
27126,en,Wikipedia:Wheel war,259,"[en, aa, ab, ace, ady, af, ak, als, am, an, an...","{'en': 'Wikipedia:Administrators', 'aa': 'Wiki...",Wikipedia:Administrators
26939,en,Wikipedia:Village pump,245,"[en, ab, ace, ady, af, ak, als, alt, am, an, a...","{'en': 'Wikipedia:Village pump', 'ab': 'Авикип...",Wikipedia:Village pump
22017,en,Wikipedia:Sandbox,203,"[en, aa, ace, ady, af, ak, als, an, ang, ar, a...","{'en': 'Wikipedia:Sandbox', 'aa': 'Wikipedia:S...",Wikipedia:Sandbox
18437,en,Wikipedia:Community portal,201,"[en, aa, ady, af, als, alt, ami, an, ang, ar, ...","{'en': 'Wikipedia:Community portal', 'aa': 'Wi...",Wikipedia:Community portal
18234,en,Wikipedia:Bots,181,"[en, ace, af, als, am, an, ang, ar, arc, ary, ...","{'en': 'Wikipedia:Bots', 'ace': 'Wikipedia:Bot...",Wikipedia:Bots
...,...,...,...,...,...,...
22592,en,Wikipedia:Systemic bias,22,"[en, ar, bg, bn, cs, de, eo, es, fa, fr, hr, i...","{'en': 'Wikipedia:Systemic bias', 'ar': 'ويكيب...",Wikipedia:Systemic bias
27877,en,Wikipedia:WikiProject Economics,21,"[en, ar, de, es, fa, fi, fr, gl, it, ja, ko, m...","{'en': 'Wikipedia:WikiProject Economics', 'ar'...",Wikipedia:WikiProject Economics
27521,en,Wikipedia:WikiProject Basketball,19,"[en, ar, ca, de, el, es, fa, fi, fr, it, ja, k...","{'en': 'Wikipedia:WikiProject Basketball', 'ar...",Wikipedia:WikiProject Basketball
28360,en,Wikipedia:WikiProject Law,18,"[en, de, es, fa, fr, id, it, ja, ko, lt, pl, p...","{'en': 'Wikipedia:WikiProject Law', 'de': 'Wik...",Wikipedia:WikiProject Law


In [33]:
curated_rules_ills =  pd.read_csv("./../1.0/202104_rules_withillcolumns",sep='\t',header=0)
have_4_ills = curated_rules_ills.loc[curated_rules_ills.num_links ==5]
widely_shared_rules = have_4_ills.en.values.tolist()
len(widely_shared_rules)

221

In [35]:
unfiltered_rules.loc[unfiltered_rules.en_title.isin(widely_shared_rules)].sort_values(by='ill_count',ascending=False)

Unnamed: 0,lang,title,ill_count,ill_langs,ill_links,en_title
27126,en,Wikipedia:Wheel war,259,"[en, aa, ab, ace, ady, af, ak, als, am, an, an...","{'en': 'Wikipedia:Administrators', 'aa': 'Wiki...",Wikipedia:Administrators
18234,en,Wikipedia:Bots,181,"[en, ace, af, als, am, an, ang, ar, arc, ary, ...","{'en': 'Wikipedia:Bots', 'ace': 'Wikipedia:Bot...",Wikipedia:Bots
21865,en,Wikipedia:Requests for bureaucratship,146,"[en, ab, ady, an, ang, ar, ary, arz, as, av, a...","{'en': 'Wikipedia:Requests for adminship', 'ab...",Wikipedia:Requests for adminship
22561,en,Wikipedia:Stub,142,"[en, af, als, ang, ar, ary, arz, as, az, azb, ...","{'en': 'Wikipedia:Stub', 'af': 'Wikipedia:Saad...",Wikipedia:Stub
19284,en,Wikipedia:Five pillars,132,"[en, af, als, ang, ar, arz, as, ast, az, azb, ...","{'en': 'Wikipedia:Five pillars', 'af': 'Wikipe...",Wikipedia:Five pillars
18612,en,Wikipedia:Copyrights,125,"[en, ab, af, als, an, ar, ary, arz, as, ast, a...","{'en': 'Wikipedia:Copyrights', 'ab': 'Авикипед...",Wikipedia:Copyrights
20842,en,Wikipedia:Neutral point of view,121,"[en, af, am, an, ar, ary, arz, as, ast, az, az...","{'en': 'Wikipedia:Neutral point of view', 'af'...",Wikipedia:Neutral point of view
27115,en,Wikipedia:What Wikipedia is not,119,"[en, af, als, ang, ar, ary, arz, as, ast, az, ...","{'en': 'Wikipedia:What Wikipedia is not', 'af'...",Wikipedia:What Wikipedia is not
18265,en,Wikipedia:Bureaucrats,115,"[en, af, ak, an, ar, ary, arz, ast, az, azb, b...","{'en': 'Wikipedia:Bureaucrats', 'af': 'Wikiped...",Wikipedia:Bureaucrats
18748,en,Wikipedia:Disambiguation,113,"[en, af, an, ar, arz, as, ast, az, azb, ba, ba...","{'en': 'Wikipedia:Disambiguation', 'af': 'Wiki...",Wikipedia:Disambiguation
