In [None]:
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# default size (width, height) applied to every figure created below
plt.rcParams.update({"figure.figsize": (8, 3)})

In [None]:
# locations of the three global CSSE time-series CSV files
confirmed_path, deaths_path, recovered_path = map(Path, (
    "../nvme/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
    "../nvme/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
    "../nvme/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
))

# raw tables, loaded in the same order as the paths above
confirmed, deaths, recovered = map(
    pd.read_csv, (confirmed_path, deaths_path, recovered_path)
)
# language table, indexed by its "code" column
languages = pd.read_csv("./data/languages.csv", index_col="code")

def clean_df(df):
    """Return ``df`` with the four descriptive CSV headers replaced by short names.

    ``Province/State`` -> ``state``, ``Country/Region`` -> ``country``,
    ``Lat`` -> ``lat`` and ``Long`` -> ``lon``. All other columns keep their
    labels, and the frame passed in is not modified.
    """
    short_names = dict([
        ("Country/Region", "country"),
        ("Province/State", "state"),
        ("Lat", "lat"),
        ("Long", "lon"),
    ])
    renamed = df.rename(columns=short_names)
    return renamed

# give all three tables the same short column names
confirmed, deaths, recovered = map(clean_df, (confirmed, deaths, recovered))

In [None]:
# plot the data for germany to get a feel
germany = (
    confirmed.loc[confirmed["country"] == "Germany"]
    .iloc[:, 4:]  # skip the four leading descriptive columns, keep the per-date counts
    .T            # dates become the index so they can serve as the x axis
)
germany.index = pd.to_datetime(germany.index)
plt.plot(germany)
plt.title("Total confirmed COVID-19 cases in Germany")
plt.xlabel("time")
plt.ylabel("confirmed cases")
plt.tight_layout()
# savefig does not create missing directories, so make sure ./figs exists first
figure_path = Path("./figs/total_confirmed_covid_cases_germany.pdf")
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path)
plt.show()

In [None]:
# countries with covid data
# NOTE: lifts the pandas row-display limit for the rest of the session
pd.set_option("display.max_rows", None)
print(confirmed["country"].unique().size)
pprint(list(confirmed["country"].unique()))

In [None]:
# wikipedia language country codes we wish to map to
print(languages.loc[:, "name"])

In [None]:
# map countries to wikipedia language codes
# Keys are the country names exactly as they appear in the "country" column of
# the covid tables; values are wikipedia language codes. Countries that are
# commented out have no mapping and are dropped from the mapped tables.
language_mapping = {
    # 'Afghanistan',
    'Albania': "sq",
    'Algeria': "ar", # arabic
    'Andorra': "ca", # catalan
    'Angola': "pt", # portuguese
    'Antigua and Barbuda': "en",
    'Argentina': "es",
    'Armenia': "hy",
    'Australia': "en",
    'Austria': "de",
    'Azerbaijan': "az",
    'Bahamas': "en",
    'Bahrain': "ar",
    'Bangladesh': "bn", # bangla
    'Barbados': "en",
    'Belarus': "ru",
    'Belgium': "nl",
    'Belize': "en",
    'Benin': "fr", # more than 50 languages, but the official one is french
    # 'Bhutan', # 4 languages with 25% each, so no clear mapping possible
    'Bolivia': "es", # spanish is 60% (was wrongly mapped to "en")
    'Bosnia and Herzegovina': "bs",
    'Botswana': 'tn', # Tswana, spoken by 77%
    'Brazil': "pt", # portuguese (was wrongly mapped to "es")
    'Brunei': "ms", # Malay spoken by 290 million people
    'Bulgaria': "bg",
    # 'Burkina Faso', # Mossi, but cannot find it
    'Burma': "my", # burmese has only 33 million native speakers
    # 'Burundi',
    # 'Cabo Verde',
    'Cambodia': "km", # Khmer is only spoken by 16 million people
    'Cameroon': "fr", # french and english are official languages
    'Canada': "en",
    # 'Central African Republic',
    # 'Chad',
    'Chile': "es",
    'China': "zh",
    'Colombia': "es",
    # 'Comoros',
    # 'Congo (Brazzaville)',
    # 'Congo (Kinshasa)',
    'Costa Rica': "es",
    # "Cote d'Ivoire",
    'Croatia': "hr",
    'Cuba': "es",
    'Cyprus': "el",
    'Czechia': "cs",
    'Denmark': "da",
    # 'Diamond Princess',
    # 'Djibouti',
    # 'Dominica',
    'Dominican Republic': "es",
    'Ecuador': "es",
    'Egypt': "ar",
    # 'El Salvador',
    # 'Equatorial Guinea',
    # 'Eritrea',
    'Estonia': "et",
    # 'Eswatini',
    # 'Ethiopia',
    # 'Fiji',
    'Finland': "fi",
    'France': "fr",
    # 'Gabon',
    # 'Gambia',
    # 'Georgia',
    'Germany': "de",
    # 'Ghana', # official is english but they also speak french and 14 other official languages
    'Greece': "el",
    # 'Grenada',
    'Guatemala': "es",
    # 'Guinea',
    # 'Guinea-Bissau',
    # 'Guyana',
    # 'Haiti',
    # 'Holy See',
    # 'Honduras',
    'Hungary': "hu",
    'Iceland': "is",
    'India': "hi", # Hindi has the largest number of speakers
    'Indonesia': "id",
    'Iran': "fa", # Persian
    'Iraq': "ar",
    'Ireland': "ga",
    'Israel': "he", # hebrew
    'Italy': "it",
    'Jamaica': "en",
    'Japan': "ja",
    'Jordan': "ar",
    # 'Kazakhstan', # Kazakh
    # 'Kenya',
    # 'Kiribati',
    'Korea, South': "ko",
    'Kosovo': "sq",
    # 'Kuwait',
    # 'Kyrgyzstan',
    # 'Laos',
    # 'Latvia',
    'Lebanon': "ar",
    # 'Lesotho',
    'Liberia': "en",
    'Libya': "ar",
    'Liechtenstein': "de",
    'Lithuania': "lt",
    'Luxembourg': "lb",
    # 'MS Zaandam',
    # 'Madagascar',
    # 'Malawi',
    'Malaysia': "ms", # Malay just as in Brunei
    # 'Maldives',
    # 'Mali',
    # 'Malta', too small with just 500 000?
    # 'Marshall Islands',
    'Mauritania': "ar",
    # 'Mauritius',
    'Mexico': "es",
    # 'Micronesia',
    'Moldova': "ro",
    'Monaco': "fr", # official language is french
    # 'Mongolia',
    # 'Montenegro',
    'Morocco': "ar",
    # 'Mozambique',
    # 'Namibia',
    # 'Nepal',
    'Netherlands': "nl",
    'New Zealand': "en",
    # 'Nicaragua',
    # 'Niger',
    # 'Nigeria',
    # 'North Macedonia',
    'Norway': "no",
    # 'Oman',
    # 'Pakistan',
    # 'Palau',
    'Panama': "es",
    # 'Papua New Guinea',
    'Paraguay': "es",
    'Peru': "es",
    # 'Philippines',
    'Poland': "pl",
    'Portugal': "pt",
    'Qatar': "ar",
    'Romania': "ro",
    'Russia': "ru",
    # 'Rwanda',
    # 'Saint Kitts and Nevis',
    # 'Saint Lucia',
    # 'Saint Vincent and the Grenadines',
    # 'Samoa',
    # 'San Marino',
    # 'Sao Tome and Principe',
    'Saudi Arabia': "ar",
    # 'Senegal',
    'Serbia': "sr",
    # 'Seychelles',
    # 'Sierra Leone',
    # 'Singapore', # fragmented, 48.3% english, 30% mandarin, 10% malay
    'Slovakia': "sk",
    'Slovenia': "sl",
    # 'Solomon Islands',
    # 'Somalia',
    # 'South Africa',
    # 'South Sudan',
    'Spain': "es",
    # 'Sri Lanka',
    # 'Sudan',
    # 'Summer Olympics 2020',
    # 'Suriname',
    'Sweden': "sv",
    'Switzerland': "de",
    'Syria': "ar",
    'Taiwan*': "zh", # mandarin, which we map to chinese i guess
    # 'Tajikistan',
    # 'Tanzania',
    'Thailand': "th",
    # 'Timor-Leste',
    # 'Togo',
    # 'Trinidad and Tobago',
    'Tunisia': "ar",
    'Turkey': "tr",
    'US': "en",
    # 'Uganda',
    'Ukraine': "uk",
    'United Arab Emirates': "ar",
    'United Kingdom': "en",
    'Uruguay': "es",
    # 'Uzbekistan',
    # 'Vanuatu',
    # 'Venezuela',
    'Vietnam': "vi",
    # 'West Bank and Gaza',
    # 'Yemen',
    # 'Zambia',
    # 'Zimbabwe',
}

In [None]:
# show which and how many countries will be dropped
all_countries = confirmed["country"].unique().tolist()
# every country present in the data that has no entry in the language mapping
excluded_countries = sorted(set(all_countries).difference(language_mapping))
print(f"excluded countries: ({len(excluded_countries)} from {len(all_countries)})")
pprint(excluded_countries)

In [None]:
# group countries and combine their provinces and states
# add the mapped wikipedia domain code to the tables
def group_and_map_country(df, mapping=None):
    """Collapse a cleaned covid table to one row per country that has a language code.

    Rows sharing the same ``country`` are merged: ``lat`` and ``lon`` are
    averaged and every remaining (per-date) column is summed. The ``state``
    column is dropped. A ``country_code`` column holding the mapped wikipedia
    language code is inserted right after ``country``, and countries without
    a mapping are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``state``, ``country``, ``lat``, ``lon`` plus
        one numeric column per date (the layout produced by ``clean_df``).
    mapping : dict, optional
        Country name -> language code. Defaults to the notebook-level
        ``language_mapping``.

    Returns
    -------
    pandas.DataFrame
        Indexed by country (sorted), with columns ``country``,
        ``country_code``, ``lat``, ``lon`` followed by the date columns in
        their original order. Date column labels are left as the original
        strings.
    """
    if mapping is None:
        mapping = language_mapping

    # Pick the date columns by name instead of by position so the result does
    # not depend on the column order of the input.
    meta_cols = {"state", "country", "lat", "lon"}
    date_cols = [col for col in df.columns if col not in meta_cols]

    by_country = df.groupby("country")
    combined = pd.concat(
        [
            by_country[["lat", "lon"]].mean(),  # mean of lat and lon
            by_country[date_cols].sum(),        # sum the covid numbers
        ],
        axis=1,
    )

    # Keep the country name as a regular column as well as the index.
    combined.insert(0, "country", combined.index.to_numpy())
    # Unmapped countries get None here and are filtered out below.
    combined.insert(1, "country_code", combined["country"].map(mapping.get))
    return combined[combined["country_code"].notna()]

# run the grouping on a copy of each table so the cleaned frames stay untouched
mapped_confirmed, mapped_deaths, mapped_recovered = (
    group_and_map_country(frame.copy())
    for frame in (confirmed, deaths, recovered)
)

In [None]:
# preview the first rows of the grouped table
mapped_confirmed.head(n=5)

In [None]:
# check that the results are sane
# China is reported per province, so it exercises the grouping step
print("\n confirmed cases for China before:")
print(confirmed.loc[confirmed["country"].eq("China")].iloc[:, :6])

print("\n confirmed cases for China after:")
print(mapped_confirmed.loc[mapped_confirmed["country"].eq("China")].iloc[:, :6])

print("\n mapped language code for china:")
print(mapped_confirmed.loc[mapped_confirmed["country"].eq("China"), "country_code"].head())

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
import pyspark.sql.functions as F

# one memory limit shared by the executor, the driver and the driver result size
MAX_MEMORY = "60G"

spark = (
    SparkSession.builder
    .appName("CSSEGISandData-COVID-19")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()  # reuses an already running session if there is one
)
sc = spark.sparkContext

In [None]:
# write the covid data to parquet
def write_to_parquet(df, output_path: "str | Path"):
    """Write a pandas table as a parquet dataset partitioned by ``country_code``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write; must contain a ``country_code`` column.
    output_path : str or pathlib.Path
        Target location of the parquet dataset. Its parent directory is
        created if missing, and an existing dataset at this location is
        overwritten.
    """
    # The body needs Path methods; coercing here lets plain strings work too
    # (previously a ``str``, as the annotation advertised, crashed on ``.parent``).
    output_path = Path(output_path)
    spark_df = spark.createDataFrame(df)
    spark_df = spark_df.repartition(F.col("country_code"))
    output_path.parent.mkdir(parents=True, exist_ok=True)
    (
        spark_df.write
        .format("parquet")
        .partitionBy("country_code")  # one country_code=<code> directory per language
        .mode("overwrite")
        .save(str(output_path))
    )
    print(f"wrote {output_path}")
    
# each parquet dataset sits next to its source CSV, same name with a .parquet suffix
confirmed_parquet_path, deaths_parquet_path, recovered_parquet_path = (
    csv_path.with_suffix(".parquet")
    for csv_path in (confirmed_path, deaths_path, recovered_path)
)

write_to_parquet(df=mapped_confirmed, output_path=confirmed_parquet_path)
write_to_parquet(df=mapped_deaths, output_path=deaths_parquet_path)
write_to_parquet(df=mapped_recovered, output_path=recovered_parquet_path)

In [None]:
# read a parquet file back to check
# only the country_code=de partition is loaded; it holds every country mapped to "de"
reader = spark.read.option("basePath", str(confirmed_parquet_path))
german_parquet = reader.parquet(
    str(confirmed_parquet_path / "country_code=de")
).toPandas()

# keep the per-date columns only: skip the three leading columns and the last one
# (presumably country/lat/lon and the trailing partition column - verify on the loaded frame)
austria = german_parquet.loc[german_parquet["country"] == "Austria"].iloc[:, 3:-1].T
austria.index = pd.to_datetime(austria.index)

plt.plot(austria)
plt.gca().set(
    title="Total confirmed covid cases in Austria",
    xlabel="time",
    ylabel="confirmed cases",
)
plt.show()