In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import bz2
import csv
import io
import json
import re
import time
import random
import requests
import datetime
from tqdm import tqdm
import json
from pathlib import Path
from pprint import pprint
from typing import List, Dict
from dateutil.relativedelta import relativedelta
import lsde2021.csv as csvutils
import lsde2021.utils as utils
import lsde2021.download as dl
import lsde2021.changepoints as cp
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F
import matplotlib.pyplot as plt

In [2]:
MAX_MEMORY = "30G"

spark = SparkSession \
    .builder \
    .appName("parse-wikipedia-sql-dumps") \
    .config("spark.executor.memory", MAX_MEMORY) \
    .config("spark.driver.memory", MAX_MEMORY) \
    .config('spark.driver.maxResultSize', MAX_MEMORY) \
    .config('spark.ui.showConsoleProgress', 'false') \
    .getOrCreate()
sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/03 12:47:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/11/03 12:47:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/11/03 12:47:49 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
21/11/03 12:47:49 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
21/11/03 12:47:49 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [3]:
all_wikis = json.loads(
    requests.get("https://commons.wikimedia.org/w/api.php?action=sitematrix&smtype=language&format=json").content
)
pprint(all_wikis)

{'sitematrix': {'0': {'code': 'aa',
                      'dir': 'ltr',
                      'localname': 'Afar',
                      'name': 'Qafár af',
                      'site': [{'closed': '',
                                'code': 'wiki',
                                'dbname': 'aawiki',
                                'sitename': 'Wikipedia',
                                'url': 'https://aa.wikipedia.org'},
                               {'closed': '',
                                'code': 'wiktionary',
                                'dbname': 'aawiktionary',
                                'sitename': 'Wiktionary',
                                'url': 'https://aa.wiktionary.org'},
                               {'closed': '',
                                'code': 'wikibooks',
                                'dbname': 'aawikibooks',
                                'sitename': 'Wikibooks',
                                'url': 'https://aa.wikibooks.org'}]},
    

In [4]:
def get_wikipedia_dbname(sites):
    dnames = [site["dbname"] for site in sites if site["code"] == "wiki"]
    if len(dnames) > 0:
        return dnames[0]
    return None

wikipedia = {
    c["code"]: dict(
        # code=c["code"],
        name=c["localname"],
        dbname=get_wikipedia_dbname(c["site"])
    )
    for idx, c in all_wikis["sitematrix"].items() if (
        idx != "count" and get_wikipedia_dbname(c["site"]) is not None
    )
}

print("total number of languages for wikipedia:", len(wikipedia))

total number of languages for wikipedia: 325


In [5]:
pprint(wikipedia)

{'aa': {'dbname': 'aawiki', 'name': 'Afar'},
 'ab': {'dbname': 'abwiki', 'name': 'Abkhazian'},
 'ace': {'dbname': 'acewiki', 'name': 'Achinese'},
 'ady': {'dbname': 'adywiki', 'name': 'Adyghe'},
 'af': {'dbname': 'afwiki', 'name': 'Afrikaans'},
 'ak': {'dbname': 'akwiki', 'name': 'Akan'},
 'als': {'dbname': 'alswiki', 'name': 'Alemannisch'},
 'alt': {'dbname': 'altwiki', 'name': 'Southern Altai'},
 'am': {'dbname': 'amwiki', 'name': 'Amharic'},
 'ami': {'dbname': 'amiwiki', 'name': 'Amis'},
 'an': {'dbname': 'anwiki', 'name': 'Aragonese'},
 'ang': {'dbname': 'angwiki', 'name': 'Old English'},
 'ar': {'dbname': 'arwiki', 'name': 'Arabic'},
 'arc': {'dbname': 'arcwiki', 'name': 'Aramaic'},
 'ary': {'dbname': 'arywiki', 'name': 'Moroccan Arabic'},
 'arz': {'dbname': 'arzwiki', 'name': 'Egyptian Arabic'},
 'as': {'dbname': 'aswiki', 'name': 'Assamese'},
 'ast': {'dbname': 'astwiki', 'name': 'Asturian'},
 'atj': {'dbname': 'atjwiki', 'name': 'Atikamekw'},
 'av': {'dbname': 'avwiki', 'name':

In [6]:
# exclude languages we dont know, like Lombard or Lingala
# also languages that are dead, like Latin
# also languages that were never spoken, such as Esperanto

selected_language_codes = [
    ["ar", "ary", "arz"], # Arabic, Moroccan Arabic, Egyptian Arabic
    ["az", "azb"], # Azerbaijani, South Azerbaijani
    ["bn"], # Bangla (also Bengali), spoken by 150 million just in Bangladesh
    ["bg"], # Bulgarian
    ["bs"], # Bosnian
    ["ca"], # Catalan
    ["cs"], # Czech
    ["da"], # Danish
    ["de"], # German
    ["el"], # Greek
    ["en"], # English
    ["es"], # Spanish
    ["et"], # Estonian
    ["fi"], # Finnish
    ["fr"], # French
    ['fa'], # Persian
    ["ga"], # Irish
    ["hi"], # Hindi
    ["he"], # Hebrew
    ["hu"], # Hungarian
    ['hr', 'sh'], # Croatian, Serbo-Croatian
    ["hy", "hyw"], # Armenian, Western Armenian
    ["id"], # Indonesian
    ["is"], # Icelandic
    ["it"], # Italian
    ["ja"], # Japanese
    ["ko"], # Korean
    ["ku"], # Kurdish
    ["lb"], # Luxembourgish
    ["lt"], # Lithuanian
    ["ms"], # Malay, spoken by 290 million people in Brunei and in malaysia
    # ["my"], # Burmese, 65% in Myanmar/Burma but only 33 million speakers
    ["nl"], # Dutch
    ["no"], # Norwegian
    ["pl"], # Polish
    ["pt"], # Portuguese
    ["ro"], # Romanian
    ["ru", "be", "bxr"], # Russian, Belarusian, Russia Buriat
    ["sl"], # Slovenian
    ['sk'], # Slovak
    ["sq"], # Albanian
    ["sr"], # Serbian
    ["sv"], # Swedish
    # ['tn'], # Tswana, spoken by 77% in botswana
    ["tr"], # Turkish
    ['th'], # Thai
    ["uk"], # Ukrainian
    ["vi"], # Vietnamese
    ["zh", "gan", "cdo", "zh-classical", "zh-min-nan"], # Chinese, Gan Chinese, Min Dong Chinese, Classical Chinese, Chinese (Min Nan)
]
selected_language_names = [[wikipedia[c]["name"] for c in group] for group in selected_language_codes]
pprint(selected_language_names)
print("total number of selected languages for wikipedia: %d (%d)" % (
    sum([len(group) for group in selected_language_codes]), len(selected_language_codes))
)

[['Arabic', 'Moroccan Arabic', 'Egyptian Arabic'],
 ['Azerbaijani', 'South Azerbaijani'],
 ['Bangla'],
 ['Bulgarian'],
 ['Bosnian'],
 ['Catalan'],
 ['Czech'],
 ['Danish'],
 ['German'],
 ['Greek'],
 ['English'],
 ['Spanish'],
 ['Estonian'],
 ['Finnish'],
 ['French'],
 ['Persian'],
 ['Irish'],
 ['Hindi'],
 ['Hebrew'],
 ['Hungarian'],
 ['Croatian', 'Serbo-Croatian'],
 ['Armenian', 'Western Armenian'],
 ['Indonesian'],
 ['Icelandic'],
 ['Italian'],
 ['Japanese'],
 ['Korean'],
 ['Kurdish'],
 ['Luxembourgish'],
 ['Lithuanian'],
 ['Malay'],
 ['Dutch'],
 ['Norwegian'],
 ['Polish'],
 ['Portuguese'],
 ['Romanian'],
 ['Russian', 'Belarusian', 'Russia Buriat'],
 ['Slovenian'],
 ['Slovak'],
 ['Albanian'],
 ['Serbian'],
 ['Swedish'],
 ['Turkish'],
 ['Thai'],
 ['Ukrainian'],
 ['Vietnamese'],
 ['Chinese',
  'Gan Chinese',
  'Min Dong Chinese',
  'Classical Chinese',
  'Chinese (Min Nan)']]
total number of selected languages for wikipedia: 58 (47)


In [7]:
# check if the wikis actually exists
wikimedia_dump = "https://dumps.wikimedia.org/%s/20211001/"

def page_exists(url):
    try:
        _ = requests.get(url)
        return True
    except requests.exceptions.HTTPError as e:
        print(e)
        if e.response.return_code == 404:
            return False
        raise e

existing_language_codes = [
    [c for c in group if page_exists(wikimedia_dump % (wikipedia[c]["dbname"]))]
    for group in selected_language_codes
]

In [8]:
print("total number of selected languages for wikipedia: %d (%d) of %d" % (
    sum([len(group) for group in existing_language_codes]), len(existing_language_codes), len(wikipedia)
))

total number of selected languages for wikipedia: 58 (47) of 325


In [76]:
selected_wikipedia = {}
for group in existing_language_codes:
    for c in group:
        selected_wikipedia[c] = {**wikipedia[c], **dict(group=group[0])}
df = pd.DataFrame.from_dict(selected_wikipedia, orient='index')
df = spark.createDataFrame(df)
df.show()

+-----------------+-------+-----+
|             name| dbname|group|
+-----------------+-------+-----+
|           Arabic| arwiki|   ar|
|  Moroccan Arabic|arywiki|   ar|
|  Egyptian Arabic|arzwiki|   ar|
|      Azerbaijani| azwiki|   az|
|South Azerbaijani|azbwiki|   az|
|           Bangla| bnwiki|   bn|
|        Bulgarian| bgwiki|   bg|
|          Bosnian| bswiki|   bs|
|          Catalan| cawiki|   ca|
|            Czech| cswiki|   cs|
|           Danish| dawiki|   da|
|           German| dewiki|   de|
|            Greek| elwiki|   el|
|          English| enwiki|   en|
|          Spanish| eswiki|   es|
|         Estonian| etwiki|   et|
|          Finnish| fiwiki|   fi|
|           French| frwiki|   fr|
|          Persian| fawiki|   fa|
|            Irish| gawiki|   ga|
+-----------------+-------+-----+
only showing top 20 rows



In [77]:
# add the ISO3 country names for each language
stringency = spark.read.format("parquet").load(f"../nvme/oxcgrt-covid-policy-tracker/OxCGRT_withnotes.parquet")
stringency = stringency.select(F.col("CountryName").alias("country"), F.col("CountryCode").alias("iso3")).distinct()
stringency.show()

+--------------------+----+
|             country|iso3|
+--------------------+----+
|              Bhutan| BTN|
|              Zambia| ZMB|
|               Aruba| ABW|
|           Indonesia| IDN|
|                Iraq| IRQ|
|             Lesotho| LSO|
|              France| FRA|
|               Gabon| GAB|
|               Malta| MLT|
|            Cameroon| CMR|
|                Fiji| FJI|
|         Puerto Rico| PRI|
|            Zimbabwe| ZWE|
|           Australia| AUS|
|               Qatar| QAT|
|               Macao| MAC|
|           Hong Kong| HKG|
|          Azerbaijan| AZE|
|Bosnia and Herzeg...| BIH|
|            Botswana| BWA|
+--------------------+----+
only showing top 20 rows



In [78]:
def get_countries_handler(s):
    return cp.COUNTRIES.get(s, [])
country_udf = F.udf(get_countries_handler, T.ArrayType(T.StringType()))

df = df.withColumn("countries", country_udf(df['group']))
df = df.select("name", "dbname", "group", F.explode("countries").alias("country"))
df.show()

+-----------------+-------+-----+--------------------+
|             name| dbname|group|             country|
+-----------------+-------+-----+--------------------+
|           Arabic| arwiki|   ar|United Arab Emirates|
|           Arabic| arwiki|   ar|        Saudi Arabia|
|  Moroccan Arabic|arywiki|   ar|United Arab Emirates|
|  Moroccan Arabic|arywiki|   ar|        Saudi Arabia|
|  Egyptian Arabic|arzwiki|   ar|United Arab Emirates|
|  Egyptian Arabic|arzwiki|   ar|        Saudi Arabia|
|      Azerbaijani| azwiki|   az|          Azerbaijan|
|South Azerbaijani|azbwiki|   az|          Azerbaijan|
|        Bulgarian| bgwiki|   bg|            Bulgaria|
|          Bosnian| bswiki|   bs|Bosnia and Herzeg...|
|            Czech| cswiki|   cs|      Czech Republic|
|           Danish| dawiki|   da|             Denmark|
|           German| dewiki|   de|             Germany|
|           German| dewiki|   de|             Austria|
|            Greek| elwiki|   el|              Greece|
|         

In [79]:
df = df.join(stringency, on="country", how="inner")
df.show()

+--------------------+-----------------+-------+-----+----+
|             country|             name| dbname|group|iso3|
+--------------------+-----------------+-------+-----+----+
|United Arab Emirates|           Arabic| arwiki|   ar| ARE|
|        Saudi Arabia|           Arabic| arwiki|   ar| SAU|
|United Arab Emirates|  Moroccan Arabic|arywiki|   ar| ARE|
|        Saudi Arabia|  Moroccan Arabic|arywiki|   ar| SAU|
|United Arab Emirates|  Egyptian Arabic|arzwiki|   ar| ARE|
|        Saudi Arabia|  Egyptian Arabic|arzwiki|   ar| SAU|
|          Azerbaijan|      Azerbaijani| azwiki|   az| AZE|
|          Azerbaijan|South Azerbaijani|azbwiki|   az| AZE|
|            Bulgaria|        Bulgarian| bgwiki|   bg| BGR|
|Bosnia and Herzeg...|          Bosnian| bswiki|   bs| BIH|
|      Czech Republic|            Czech| cswiki|   cs| CZE|
|             Denmark|           Danish| dawiki|   da| DNK|
|             Germany|           German| dewiki|   de| DEU|
|             Austria|           German|

In [80]:
country_codes_schema = T.StructType([
    T.StructField("name", T.StringType(), True),
    T.StructField("cca2",T.StringType(), True),
    T.StructField("cca3", T.StringType(), True),
    T.StructField("ccn3", T.IntegerType(), True),
])

country_codes = spark.read.format("csv").schema(country_codes_schema).options(header=True).load("../nvme/country_codes_2020.csv")
country_codes = country_codes.select(F.col("name"), F.col("cca3").alias("iso3"))

population_sizes_schema = T.StructType([
    T.StructField("Rank", T.IntegerType(), True),
    T.StructField("name",T.StringType(), True),
    T.StructField("pop2019", T.FloatType(), True),
    T.StructField("pop2018", T.FloatType(), True),
    T.StructField("GrowthRate", T.FloatType(), True),
    T.StructField("area", T.IntegerType(), True),
    T.StructField("Density", T.FloatType(), True),
])

population_sizes = spark.read.format("csv").schema(population_sizes_schema).options(header=True).load("../nvme/countries_by_population_2019.csv")
population_sizes = population_sizes.select(F.col("name"), F.col("pop2019").alias("population_size"))
population_sizes = population_sizes.join(country_codes, on="name", how="inner")
population_sizes = population_sizes.select(F.col("iso3"), F.col("population_size"))
population_sizes = population_sizes.withColumn("population_size", (1_000 * population_sizes["population_size"]).cast(T.IntegerType()))
population_sizes.show()

# .schema(stringency_schema).options(escape="\"", multiline="true", header="true", delimiter=',') \
# df = df.groupBy("name", "dbname", "group").agg(F.collect_list("country").alias("countries"), F.collect_list("iso3").alias("iso3_countries"))
# df.show()

+----+---------------+
|iso3|population_size|
+----+---------------+
| CHN|     1433783680|
| IND|     1366417792|
| USA|      329064896|
| IDN|      270625568|
| PAK|      216565312|
| BRA|      211049536|
| NGA|      200963600|
| BGD|      163046160|
| RUS|      145872256|
| MEX|      127575528|
| JPN|      126860304|
| ETH|      112078728|
| PHL|      108116616|
| EGY|      100388072|
| VNM|       96462112|
| COD|       86790568|
| DEU|       83517048|
| TUR|       83429616|
| IRN|       82913904|
| THA|       69625576|
+----+---------------+
only showing top 20 rows



In [81]:
df = df.join(population_sizes, on="iso3", how="left")
df.show()

+----+--------------------+-----------------+-------+-----+---------------+
|iso3|             country|             name| dbname|group|population_size|
+----+--------------------+-----------------+-------+-----+---------------+
| ARE|United Arab Emirates|           Arabic| arwiki|   ar|        9770529|
| SAU|        Saudi Arabia|           Arabic| arwiki|   ar|       34268528|
| ARE|United Arab Emirates|  Moroccan Arabic|arywiki|   ar|        9770529|
| SAU|        Saudi Arabia|  Moroccan Arabic|arywiki|   ar|       34268528|
| ARE|United Arab Emirates|  Egyptian Arabic|arzwiki|   ar|        9770529|
| SAU|        Saudi Arabia|  Egyptian Arabic|arzwiki|   ar|       34268528|
| AZE|          Azerbaijan|      Azerbaijani| azwiki|   az|       10047718|
| AZE|          Azerbaijan|South Azerbaijani|azbwiki|   az|       10047718|
| BGR|            Bulgaria|        Bulgarian| bgwiki|   bg|        7000119|
| BIH|Bosnia and Herzeg...|          Bosnian| bswiki|   bs|        3301000|
| CZE|      

In [82]:
# export the list of languages we will use
df.write.format("parquet").mode("overwrite").save("./data/languages.parquet")

In [85]:
out_path = Path("./website/public/data")
languages_countries_out_path = out_path / "languages_countries.json"
languages_countries_out_path.parent.mkdir(parents=True, exist_ok=True)
languages_json = [json.loads(s) for s in df.toJSON().collect()]
pprint(languages_json[:2])

with open(languages_countries_out_path, "w") as f:
    json.dump(languages_json, f, indent=2, sort_keys=True, default=str)

[{'country': 'United Arab Emirates',
  'dbname': 'arwiki',
  'group': 'ar',
  'iso3': 'ARE',
  'name': 'Arabic',
  'population_size': 9770529},
 {'country': 'Saudi Arabia',
  'dbname': 'arwiki',
  'group': 'ar',
  'iso3': 'SAU',
  'name': 'Arabic',
  'population_size': 34268528}]
