In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import bz2
import csv
import io
import json
import re
import time
import random
import requests
import datetime
from tqdm import tqdm
import json
from pathlib import Path
from pprint import pprint
from typing import List, Dict
from dateutil.relativedelta import relativedelta
import lsde2021.csv as csvutils
import lsde2021.utils as utils
import lsde2021.download as dl
import lsde2021.changepoints as cp
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
import matplotlib.pyplot as plt

In [None]:
# Memory setting shared by the executor, the driver and the driver's result buffer.
MAX_MEMORY = "30G"

# Build (or reuse) the Spark session for this notebook; console progress bars are disabled.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# Fetch the Wikimedia site matrix (restricted to language wikis) as JSON.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# makes an HTTP error surface here instead of as a confusing JSON decode failure.
sitematrix_response = requests.get(
    "https://commons.wikimedia.org/w/api.php?action=sitematrix&smtype=language&format=json",
    timeout=60,
)
sitematrix_response.raise_for_status()
all_wikis = sitematrix_response.json()
pprint(all_wikis)

In [None]:
def get_wikipedia_dbname(sites):
    """Return the ``dbname`` of the first entry in ``sites`` whose ``code`` is ``"wiki"``.

    Returns ``None`` when no entry carries that code.
    """
    wiki_dbnames = [entry["dbname"] for entry in sites if entry["code"] == "wiki"]
    return wiki_dbnames[0] if wiki_dbnames else None

# Map each language code to the local name and database name of its Wikipedia.
# The "count" key of the site matrix is skipped, as is every language for which
# get_wikipedia_dbname() finds no site with code "wiki".
wikipedia = {
    lang["code"]: {
        "name": lang["localname"],
        "dbname": get_wikipedia_dbname(lang["site"]),
    }
    for key, lang in all_wikis["sitematrix"].items()
    if key != "count" and get_wikipedia_dbname(lang["site"]) is not None
}

print("total number of languages for wikipedia:", len(wikipedia))

In [None]:
# Pretty-print the complete code -> {name, dbname} mapping for manual inspection.
pprint(wikipedia)

In [None]:
# Languages to analyse, grouped so that closely related variants share one group.
# Each inner list is a group; its first code is later used as the group's key.
#
# Excluded on purpose:
#   - languages we don't know, like Lombard or Lingala
#   - languages that are dead, like Latin
#   - languages that were never spoken, such as Esperanto
selected_language_codes = [
    ["ar", "ary", "arz"],  # Arabic, Moroccan Arabic, Egyptian Arabic
    ["az", "azb"],  # Azerbaijani, South Azerbaijani
    ["bn"],  # Bangla (also Bengali), spoken by 150 million just in Bangladesh
    ["bg"],  # Bulgarian
    ["bs"],  # Bosnian
    ["ca"],  # Catalan
    ["cs"],  # Czech
    ["da"],  # Danish
    ["de"],  # German
    ["el"],  # Greek
    ["en"],  # English
    ["es"],  # Spanish
    ["et"],  # Estonian
    ["fi"],  # Finnish
    ["fr"],  # French
    ["fa"],  # Persian
    ["ga"],  # Irish
    ["hi"],  # Hindi
    ["he"],  # Hebrew
    ["hu"],  # Hungarian
    ["hr", "sh"],  # Croatian, Serbo-Croatian
    ["hy", "hyw"],  # Armenian, Western Armenian
    ["id"],  # Indonesian
    ["is"],  # Icelandic
    ["it"],  # Italian
    ["ja"],  # Japanese
    ["ko"],  # Korean
    ["ku"],  # Kurdish
    ["lb"],  # Luxembourgish
    ["lt"],  # Lithuanian
    ["ms"],  # Malay, spoken by 290 million people in Brunei and in Malaysia
    # ["my"],  # Burmese, 65% in Myanmar/Burma but only 33 million speakers
    ["nl"],  # Dutch
    ["no"],  # Norwegian
    ["pl"],  # Polish
    ["pt"],  # Portuguese
    ["ro"],  # Romanian
    ["ru", "be", "bxr"],  # Russian, Belarusian, Russia Buriat
    ["sl"],  # Slovenian
    ["sk"],  # Slovak
    ["sq"],  # Albanian
    ["sr"],  # Serbian
    ["sv"],  # Swedish
    # ["tn"],  # Tswana, spoken by 77% in Botswana
    ["tr"],  # Turkish
    ["th"],  # Thai
    ["uk"],  # Ukrainian
    ["vi"],  # Vietnamese
    # Chinese, Gan Chinese, Min Dong Chinese, Classical Chinese, Chinese (Min Nan)
    ["zh", "gan", "cdo", "zh-classical", "zh-min-nan"],
]
# Resolve every selected code to its language name, keeping the same grouping.
selected_language_names = [
    [wikipedia[code]["name"] for code in codes]
    for codes in selected_language_codes
]
pprint(selected_language_names)

# Report the number of individual codes and, in parentheses, the number of groups.
n_selected_codes = sum(len(codes) for codes in selected_language_codes)
print(
    "total number of selected languages for wikipedia: %d (%d)"
    % (n_selected_codes, len(selected_language_codes))
)

In [None]:
# Check whether the selected wikis actually exist on the Wikimedia dump server.
# URL template for a wiki's "20211001" dump directory; the %s placeholder is
# filled with the wiki's dbname.
wikimedia_dump = "https://dumps.wikimedia.org/%s/20211001/"

def page_exists(url, timeout=30):
    """Return whether ``url`` can be fetched.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the server answers with a non-error status, False if it
        answers with 404.

    Raises:
        requests.exceptions.HTTPError: For any error status other than 404.
    """
    try:
        # requests.get() does not raise on 4xx/5xx responses by itself;
        # raise_for_status() turns an error status into the HTTPError handled below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return True
    except requests.exceptions.HTTPError as e:
        print(e)
        # The HTTP status is exposed as `status_code`; Response has no `return_code`.
        if e.response is not None and e.response.status_code == 404:
            return False
        raise

# Keep only the codes for which page_exists() accepts the dump directory URL,
# preserving the grouping of selected_language_codes.
existing_language_codes = [
    [
        code
        for code in codes
        if page_exists(wikimedia_dump % wikipedia[code]["dbname"])
    ]
    for codes in selected_language_codes
]

In [None]:
# Summary: remaining codes, remaining groups (in parentheses) and all Wikipedia languages.
n_existing_codes = sum(len(codes) for codes in existing_language_codes)
print(
    "total number of selected languages for wikipedia: %d (%d) of %d"
    % (n_existing_codes, len(existing_language_codes), len(wikipedia))
)

In [None]:
# One entry per remaining language code: its Wikipedia metadata plus the first
# code of the group it belongs to.
selected_wikipedia = {
    code: dict(wikipedia[code], group=codes[0])
    for codes in existing_language_codes
    for code in codes
}

# Go through pandas (rows indexed by language code) to build the Spark DataFrame.
languages_pdf = pd.DataFrame.from_dict(selected_wikipedia, orient="index")
df = spark.createDataFrame(languages_pdf)
df.show()

In [None]:
# Load the OxCGRT policy tracker and keep only the distinct
# (country name, ISO3 country code) pairs.
stringency = (
    spark.read.format("parquet")
    .load("../nvme/oxcgrt-covid-policy-tracker/OxCGRT_withnotes.parquet")
    .select(
        F.col("CountryName").alias("country"),
        F.col("CountryCode").alias("iso3"),
    )
    .distinct()
)
stringency.show()

In [None]:
def get_countries_handler(s):
    """Look up ``s`` in ``cp.COUNTRIES``, falling back to an empty list.

    NOTE(review): presumably ``cp.COUNTRIES`` maps a language group code to a
    list of country names -- confirm against ``lsde2021.changepoints``.
    """
    countries = cp.COUNTRIES.get(s, [])
    return countries
# Wrap the lookup as a Spark UDF that returns an array of strings.
country_udf = F.udf(get_countries_handler, T.ArrayType(T.StringType()))

# Attach the countries of each row's group, then emit one row per country.
df = (
    df
    .withColumn("countries", country_udf(F.col("group")))
    .select("name", "dbname", "group", F.explode("countries").alias("country"))
)
df.show()

In [None]:
# Attach the ISO3 code by matching on the country name. This is an inner join,
# so rows whose country has no match in `stringency` are dropped.
df = df.join(stringency, on="country", how="inner")
df.show()

In [None]:
# Country-code lookup table; only the country name and the `cca3` code
# (renamed to `iso3`) are kept.
country_codes_schema = T.StructType([
    T.StructField(column, dtype, True)
    for column, dtype in (
        ("name", T.StringType()),
        ("cca2", T.StringType()),
        ("cca3", T.StringType()),
        ("ccn3", T.IntegerType()),
    )
])

country_codes = (
    spark.read.format("csv")
    .schema(country_codes_schema)
    .options(header=True)
    .load("../nvme/country_codes_2020.csv")
    .select(F.col("name"), F.col("cca3").alias("iso3"))
)

# Population table: read the 2019 figures, attach the ISO3 code via the country
# name and keep (iso3, population_size).
population_sizes_schema = T.StructType([
    T.StructField(column, dtype, True)
    for column, dtype in (
        ("Rank", T.IntegerType()),
        ("name", T.StringType()),
        ("pop2019", T.FloatType()),
        ("pop2018", T.FloatType()),
        ("GrowthRate", T.FloatType()),
        ("area", T.IntegerType()),
        ("Density", T.FloatType()),
    )
])

population_sizes = (
    spark.read.format("csv")
    .schema(population_sizes_schema)
    .options(header=True)
    .load("../nvme/countries_by_population_2019.csv")
    .select(F.col("name"), F.col("pop2019").alias("population_size"))
    .join(country_codes, on="name", how="inner")
    .select(F.col("iso3"), F.col("population_size"))
    # NOTE(review): the factor 1_000 suggests the CSV stores thousands -- confirm.
    .withColumn(
        "population_size",
        (1_000 * F.col("population_size")).cast(T.IntegerType()),
    )
)
population_sizes.show()

In [None]:
# Attach the population size by ISO3 code. This is a left join, so rows without
# a matching population entry are kept with a null population_size.
df = df.join(population_sizes, on="iso3", how="left")
df.show()

In [None]:
# Export the list of languages we will use; "overwrite" mode replaces any
# previous export at this path.
df.write.format("parquet").mode("overwrite").save("./data/languages.parquet")

In [None]:
# Export the language/country table as a JSON file under the website's data directory.
out_path = Path("./website/src/data")
languages_countries_out_path = out_path / "languages_countries.json"
languages_countries_out_path.parent.mkdir(parents=True, exist_ok=True)

# toJSON() yields one JSON document per row; parse each one into a plain dict.
languages_json = list(map(json.loads, df.toJSON().collect()))
pprint(languages_json[:2])

with languages_countries_out_path.open("w") as f:
    json.dump(languages_json, f, indent=2, sort_keys=True, default=str)