In [None]:
from bs4 import BeautifulSoup
import requests
import re, os, csv
import wikifunctions_sohyeon as wfs
import pandas as pd
import time
import json
from collections import Counter

In [None]:
# The ten language editions covered by this notebook, as Wikipedia language codes.
langs_10 = [
    'en', 'fr', 'de', 'ja', 'es',
    'ru', 'zh', 'it', 'pt', 'fa',
]
# The first five of the editions above, as an independent list.
langs_5 = langs_10[:5]

Get all pages in the Wikipedia Project namespace per language edition

In [None]:
# Fetch, for every language edition, the pages in the Wikipedia project
# namespace. Results are kept both keyed by language code (rule_dfs_dict) and
# as a plain list in langs_10 order (rule_dfs_list).
rule_dfs_dict = {}

for l in langs_10:
    # presumably returns a DataFrame of pages for that edition -- TODO confirm against wfs
    pages = wfs.get_all_pages_in_namespace(l)
    rule_dfs_dict[l] = pages
    print(f'{l} has {len(pages)} pages with language links in the Wikipedia Project namespace')

# Same objects, same order as the dict (insertion order follows langs_10).
rule_dfs_list = list(rule_dfs_dict.values())

In [None]:
# Export each per-language page list as a TSV so the namespace crawl does not
# have to be repeated on the next run.
output_directory = './nm4/nm4_all_list'

# makedirs (unlike mkdir) also creates the missing './nm4' parent, and
# exist_ok=True makes the cell safe to re-run.
os.makedirs(output_directory, exist_ok=True)

for df in rule_dfs_list:
    # The file is named after the language code stored in the frame's first
    # row (assumes every row of a frame carries the same `lang` -- TODO confirm).
    _lang = df['lang'].iloc[0]
    path = os.path.join(output_directory, '{}.tsv'.format(_lang))
    df.to_csv(path, index=False, sep='\t')

Get all the interlanguage links for the pages in the Wikipedia project namespace

In [None]:
# Reload the per-language page lists written to ./nm4/nm4_all_list so the
# interlanguage links (ILLs) can be fetched without repeating the crawl.
rule_dfs_dict = {
    lang_code: pd.read_csv(
        './nm4/nm4_all_list/{}.tsv'.format(lang_code), sep='\t', header=0
    )
    for lang_code in langs_10
}
# Same frames as the dict values, in langs_10 order.
rule_dfs_list = list(rule_dfs_dict.values())

In [None]:
# Directory that receives the per-language interlanguage-link JSON files.
output_directory = './nm4/nm4_all_ills'

# makedirs (unlike mkdir) also creates the './nm4' parent when it is missing,
# and exist_ok=True makes the cell safe to re-run.
os.makedirs(output_directory, exist_ok=True)

The cell below takes 100+ minutes to run. Strap in and work on something else while it finishes.

In [None]:
# Collect the interlanguage links (ILLs) of every project-namespace page, one
# language edition at a time.
#
# Result shape: interlanguage_links[lang][title] = {'count', 'langs', 'links'}.
# Each edition is written to ills_<lang>.json as soon as it finishes, and the
# merged result is written to ills_all.json at the end.
#
# Because a full crawl is very slow, an edition whose JSON file already exists
# is loaded from disk instead of being re-queried. Set REFRESH_ILLS = True to
# force a fresh crawl (e.g. after the page lists have changed).
REFRESH_ILLS = False

interlanguage_links = {}

# run through each language edition
for lang in langs_10:
    print(lang)
    start = time.time()
    cache_path = os.path.join(output_directory, "ills_{}.json".format(lang))

    if os.path.isfile(cache_path) and not REFRESH_ILLS:
        # Reuse the result of an earlier run.
        with open(cache_path) as infile:
            interlanguage_links[lang] = json.load(infile)
    else:
        lang_links = {}
        # for each page title of this edition
        for rule in rule_dfs_dict[lang]['title'].values.tolist():
            # presumably a dict mapping language code -> linked page title
            # (later cells index it with 'en') -- TODO confirm against wfs
            ill_dict = wfs.get_interlanguage_links(rule, endpoint=lang, redirects=1)
            lang_links[rule] = {
                'count': len(ill_dict),
                'langs': list(ill_dict),
                'links': ill_dict,
            }
        interlanguage_links[lang] = lang_links

        # Persist this edition right away so a later failure does not lose it.
        with open(cache_path, "w") as outfile:
            json.dump(lang_links, outfile)

    end = time.time()
    print("{}: {}".format(lang, end - start))

# Write the merged file that the loading cell further down reads.
with open(os.path.join(output_directory, "ills_all.json"), "w") as outfile:
    json.dump(interlanguage_links, outfile)

Preserving the last output of running the above code. I actually did English first, and then the rest of the language editions. English took about 45-50 minutes to do.

```
fr
    Wikipédia:Politique de confidentialité
    {'interwiki': [{'title': 'meta:Privacy policy/fr', 'iw': 'meta'}], 'redirects': [{'from': 'Wikipédia:Politique de confidentialité', 'to': 'meta:Privacy policy/fr', 'tointerwiki': 'meta'}]}
fr: 1433.1813595294952
de: 405.3607313632965
ja: 209.69620060920715
es: 216.3900294303894
ru: 274.83118629455566
zh: 687.3477938175201
it: 105.42117857933044
pt: 516.9702491760254
fa: 493.1874372959137
```

```
f = open("./{}/ills_en.json".format(output_directory))
en_data = json.load(f)
interlanguage_links['en'] = en_data
print(interlanguage_links.keys())
with open("./{}/ills_all.json".format(output_directory), "w") as outfile:
    json.dump(interlanguage_links, outfile)
```

Filter through the pages in the Wikipedia Project namespace by the number of interlanguage links they have

In [None]:
# Load the merged interlanguage-link data (all language editions) from disk.
# The context manager closes the file handle once the JSON has been parsed.
with open("./{}/ills_all.json".format(output_directory)) as f:
    interlanguage_links = json.load(f)

In [None]:
# Sanity check: which language editions were loaded, and how many.
(interlanguage_links.keys(), len(interlanguage_links))

In [None]:
# Flatten the nested {lang: {title: info}} mapping into one row per page.
_data = [
    [edition, title, info['count'], info['langs'], info['links']]
    for edition, pages in interlanguage_links.items()
    for title, info in pages.items()
]

df = pd.DataFrame(
    _data,
    columns=['lang', 'title', 'ill_count', 'ill_langs', 'ill_links'],
)

# How many pages have more than ten interlanguage links?
len(df.loc[df['ill_count'] > 10])

Now we can filter the pages by the number of ILLs they have, or by which language editions they link to.

In [None]:
# Keep only the pages whose ILL language list contains all ten editions.
required_langs = set(langs_10)
has_all_langs = df['ill_langs'].map(required_langs.issubset)

# .copy() so that columns can be added later without touching `df`.
filtered_df = df.loc[has_all_langs].copy()
print(len(filtered_df))
filtered_df.head(3)

In [None]:
# Number of retained pages contributed by each language edition.
for lang in langs_10:
    pages_in_lang = filtered_df.loc[filtered_df['lang'] == lang]
    print(lang, len(pages_in_lang))

In [None]:
# Number of retained pages whose ILL language list includes 'en'.
has_en_link = filtered_df['ill_langs'].map({'en'}.issubset)
len(filtered_df.loc[has_en_link])

In [None]:
def get_en_title(row):
    """Return the entry stored under 'en' in the row's ``ill_links`` mapping.

    Raises KeyError when the mapping has no 'en' entry.
    """
    return row.ill_links['en']

In [None]:
# Add each page's 'en' link target as a new column, computed row by row.
en_titles = filtered_df.apply(get_en_title, axis='columns')
filtered_df['en_title'] = en_titles

In [None]:
# unfiltered rules, a set of project namespace pages that are clearly in the Wikipedia: space and have an ILL to all top ten language editions
unfiltered_rules = filtered_df.drop_duplicates(subset=['en_title'], keep='last').loc[filtered_df.en_title.str.contains('Wikipedia:')==True]

Calling these pages "rules" (as the variable names suggest) is a bit premature per se, because the set probably still includes other meta-project content that is not actually a rule.

In [None]:
# Which language editions do the retained pages come from?
set(unfiltered_rules['lang'].tolist())

In [None]:
# Show the retained pages, those with the most interlanguage links first.
unfiltered_rules.sort_values('ill_count', ascending=False)

In [None]:
# "202104_rules_withillcolumns" is the author's curated set of rules,
# re-structured a bit for convenience. How that file was generated is not
# recorded here; it can probably be reconstructed from older code.
curated_rules_ills = pd.read_csv("202104_rules_withillcolumns", sep='\t', header=0)

# NOTE(review): the name says "4 ills" but the filter keeps num_links == 5;
# presumably num_links counts the page itself plus four links -- TODO confirm.
have_4_ills = curated_rules_ills.loc[curated_rules_ills['num_links'] == 5]

# Values of the `en` column for those rows, as a plain list.
widely_shared_rules = have_4_ills['en'].tolist()
len(widely_shared_rules)