In [None]:
import requests
import json
import pandas as pd
from pprint import pprint

In [None]:
# Fetch the site matrix (the list of language wikis) from the MediaWiki API.
sitematrix_response = requests.get(
    "https://commons.wikimedia.org/w/api.php",
    params={"action": "sitematrix", "smtype": "language", "format": "json"},
    timeout=30,  # never hang the notebook forever on a stalled connection
)
# Fail right here on an HTTP error instead of trying to parse an error page
# as JSON further down.
sitematrix_response.raise_for_status()
all_wikis = sitematrix_response.json()
pprint(all_wikis)

In [None]:
def get_wikipedia_dbname(sites):
    """Return the ``dbname`` of the first entry in ``sites`` whose ``code`` is "wiki".

    ``sites`` is an iterable of dicts, each providing a ``"code"`` key (and a
    ``"dbname"`` key for the matching entries). Returns ``None`` when no entry
    has the code "wiki".
    """
    wiki_dbnames = [entry["dbname"] for entry in sites if entry["code"] == "wiki"]
    return wiki_dbnames[0] if wiki_dbnames else None

# Map each language code to the local name and dbname of its Wikipedia.
# The "count" key of the site matrix is skipped, as are languages for which
# get_wikipedia_dbname() finds no "wiki" site. The language code itself is
# the dictionary key, so it is not repeated inside the value.
wikipedia = {
    language["code"]: {"name": language["localname"], "dbname": dbname}
    for language, dbname in (
        (entry, get_wikipedia_dbname(entry["site"]))
        for key, entry in all_wikis["sitematrix"].items()
        if key != "count"
    )
    if dbname is not None
}

print("total number of languages for wikipedia:", len(wikipedia))

In [None]:
# Inspect the full mapping: language code -> {"name": ..., "dbname": ...}
pprint(wikipedia)

In [None]:
# Hand-picked Wikipedia language codes. Each inner list is one group of
# related language editions.
# Excluded on purpose:
#   - languages we don't know, like Lombard or Lingala
#   - dead languages, like Latin
#   - constructed languages, such as Esperanto

selected_language_codes = [
    ["ar", "ary", "arz"], # Arabic, Moroccan Arabic, Egyptian Arabic
    ["az", "azb"], # Azerbaijani, South Azerbaijani
    ["bn"], # Bangla (also called Bengali), spoken by 150 million in Bangladesh alone
    ["bg"], # Bulgarian
    ["bs"], # Bosnian
    ["ca"], # Catalan
    ["cs"], # Czech
    ["da"], # Danish
    ["de"], # German
    ["el"], # Greek
    ["en"], # English
    ["es"], # Spanish
    ["et"], # Estonian
    ["fi"], # Finnish
    ["fr"], # French
    ['fa'], # Persian
    ["ga"], # Irish
    ["hi"], # Hindi
    ["he"], # Hebrew
    ["hu"], # Hungarian
    ['hr', 'sh'], # Croatian, Serbo-Croatian
    ["hy", "hyw"], # Armenian, Western Armenian
    ["id"], # Indonesian
    ["is"], # Icelandic
    ["it"], # Italian
    ["ja"], # Japanese
    ["ko"], # Korean
    ["ku"], # Kurdish
    ["lb"], # Luxembourgish
    ["lt"], # Lithuanian
    ["ms"], # Malay, spoken by 290 million people in Brunei and in Malaysia
    # ["my"], # Burmese, 65% in Myanmar/Burma but only 33 million speakers
    ["nl"], # Dutch
    ["no"], # Norwegian
    ["pl"], # Polish
    ["pt"], # Portuguese
    ["ro"], # Romanian
    ["ru", "be", "bxr"], # Russian, Belarusian, Russia Buriat
    ["sl"], # Slovenian
    ['sk'], # Slovak
    ["sq"], # Albanian
    ["sr"], # Serbian
    ["sv"], # Swedish
    # ['tn'], # Tswana, spoken by 77% in Botswana
    ["tr"], # Turkish
    ['th'], # Thai
    ["uk"], # Ukrainian
    ["vi"], # Vietnamese
    ["zh", "gan", "cdo", "zh-classical", "zh-min-nan"], # Chinese, Gan Chinese, Min Dong Chinese, Classical Chinese, Chinese (Min Nan)
]
# Resolve every selected code to its local name, keeping the group structure.
# A code that is missing from `wikipedia` raises KeyError here.
selected_language_names = [
    [wikipedia[code]["name"] for code in codes]
    for codes in selected_language_codes
]
pprint(selected_language_names)

# Report the number of individual codes, then the number of groups in parentheses.
selected_code_count = sum(len(codes) for codes in selected_language_codes)
selected_group_count = len(selected_language_codes)
print(
    "total number of selected languages for wikipedia: %d (%d)"
    % (selected_code_count, selected_group_count)
)

In [None]:
# Check whether the wikis actually exist on the dump server.
# URL template for one wiki's dump page: the %s placeholder is filled with the
# wiki's dbname. The "20211001" path segment is hard-coded (presumably the
# dump date, 2021-10-01 -- confirm it is still published before re-running).
wikimedia_dump = "https://dumps.wikimedia.org/%s/20211001/"

def page_exists(url):
    """Tell whether ``url`` can be fetched with a GET request.

    Returns:
        True if the server answers with a non-error status, False if it
        answers 404 Not Found.

    Raises:
        requests.exceptions.HTTPError: for any 4xx/5xx status other than 404.
        requests.exceptions.RequestException: connection problems and
            timeouts propagate unchanged.
    """
    # requests.get() does not raise on HTTP error statuses by itself, so the
    # status code has to be inspected explicitly.
    response = requests.get(url, timeout=30)
    if response.status_code == 404:
        return False
    # Any other error status is unexpected: surface it instead of guessing.
    response.raise_for_status()
    return True

def _dump_page_exists(code):
    """Check the dump page of the Wikipedia registered under language ``code``."""
    return page_exists(wikimedia_dump % wikipedia[code]["dbname"])


# Same grouping as selected_language_codes, keeping only the codes whose dump
# page was found. A group may end up empty.
existing_language_codes = [
    list(filter(_dump_page_exists, codes)) for codes in selected_language_codes
]

In [None]:
# Summary: surviving codes, number of groups (in parentheses), and the total
# number of Wikipedia languages found in the site matrix.
existing_code_count = sum(len(codes) for codes in existing_language_codes)
print(
    "total number of selected languages for wikipedia: %d (%d) of %d"
    % (existing_code_count, len(existing_language_codes), len(wikipedia))
)

In [None]:
# export the list of languages we will use
selected_wikipedia = {}
for group in existing_language_codes:
    for c in group:
        selected_wikipedia[c] = {**wikipedia[c], **dict(group=group[0])}
df = pd.DataFrame.from_dict(selected_wikipedia, orient='index')
df.head()
df.to_csv("./data/languages.csv", index_label="code")