In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import os
import tensorflow as tf
from tqdm import tqdm
import requests
import json
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_log_error

from datetime import datetime
from datetime import timedelta

from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./input/covid19countryinfo.csv
./input/states-daily.csv
./input/covid19-deepscore.csv
./input/population_data.csv
./input/full-list-total-tests-for-covid-19.csv
./input/country_codes.csv
./input/enriched_covid_19_week_2.csv
./input/covid19-global-forecasting-week-3/test.csv
./input/covid19-global-forecasting-week-3/submission.csv
./input/covid19-global-forecasting-week-3/train.csv
./input/korea/SeoulFloating.csv
./input/korea/TimeAge.csv
./input/korea/SearchTrend.csv
./input/korea/TimeProvince.csv
./input/korea/Weather.csv
./input/korea/PatientRoute.csv
./input/korea/PatientInfo.csv
./input/korea/Region.csv
./input/korea/TimeGender.csv
./input/korea/Case.csv
./input/korea/Time.csv
./input/covid19-global-forecasting-week-2/test.csv
./input/covid19-global-forecasting-week-2/submission.csv
./input/covid19-global-forecasting-week-2/train.csv
./input/covidAPI/ESP.json
./input/covidAPI/ICL.json
./input/covidAPI/CHN.json
./input/covidAPI/FRA.json
./input/covidAPI/THA.json
./input/covidAPI/DNK

# 2. Preparing the training data

In [2]:
def code_to_entity(code):
    dic = {"KOR": "South Korea", "ITA": "Italy", "FRA": "France", "DEU": "Germany", "ISL": "Iceland", "DNK": "Denmark", "THA": "Thailand", "TWN": "Taiwan"}
    return dic[code]

In [3]:
# Get API data (confirmed, deaths, recovered)
# filter by country codes
country_codes = ["KOR", "ITA", "FRA", "DEU", "ISL", "DNK", "THA", "TWN"]

for country in country_codes:
    response = requests.get('https://covidapi.info/api/v1/country/{}'.format(country))
    try:
        data = response.json()
    except:
        print("{} not found".format(country))
    with open("./input/covidAPI/{}.json".format(country), "w") as f:
        json.dump(data["result"], f, indent=4)

In [4]:
original_df = pd.read_csv("./input/full-list-total-tests-for-covid-19.csv")

# unify date format
original_df["Date"] = [datetime.strftime(datetime.strptime(date, '%b %d, %Y'), '%Y-%m-%d') for date in original_df["Date"]]
original_df = original_df.rename(columns={"Total tests": "test"})
original_df.head()

Unnamed: 0,Entity,Code,Date,test
0,Argentina,ARG,2020-04-08,13330
1,Argentina,ARG,2020-04-09,14850
2,Argentina,ARG,2020-04-10,16379
3,Argentina,ARG,2020-04-11,18027
4,Argentina,ARG,2020-04-13,19758


In [5]:
original_df = original_df.query("Code in {}".format(country_codes))
original_df.reset_index(inplace=True, drop=True)
original_df.head()

Unnamed: 0,Entity,Code,Date,test
0,Denmark,DNK,2020-03-17,7630
1,Denmark,DNK,2020-03-19,8847
2,Denmark,DNK,2020-03-20,11657
3,Denmark,DNK,2020-03-21,12351
4,Denmark,DNK,2020-03-22,12843


In [6]:
# Merge Dataframe
import json

new_df = original_df
for code in country_codes:
    with open("./input/covidAPI/{}.json".format(code), "r") as f:
        j = json.load(f)
    # for each date
    for key in j.keys():
        # for each in (confirmed, deaths, recovered)
        for k in j[key].keys():
            if ((new_df['Code'] == code) & (new_df['Date'] == key)).any():
                new_df.loc[(new_df["Code"] == code) & (new_df["Date"] == key), k] = j[key][k]
            else:
                s = pd.Series([code_to_entity(code), code, key, j[key][k]], index=["Entity", "Code", "Date", k])
                new_df = new_df.append(s,ignore_index=True)
new_df = new_df.sort_values(['Code', 'Date'])
new_df.reset_index(inplace=True, drop=True)
print(new_df.head())

    Entity Code        Date  test  confirmed  deaths  recovered
0  Germany  DEU  2020-01-22   NaN        0.0     0.0        0.0
1  Germany  DEU  2020-01-23   NaN        0.0     0.0        0.0
2  Germany  DEU  2020-01-24   NaN        0.0     0.0        0.0
3  Germany  DEU  2020-01-25   NaN        0.0     0.0        0.0
4  Germany  DEU  2020-01-26   NaN        0.0     0.0        0.0


In [7]:
# Interpolate
new_df["test"].where(new_df["confirmed"] != 0.0, 0.0, inplace=True)
# add test values to unique countries
for code in ["THA", "TWN"]:
    first_confirmed = new_df.loc[(new_df["Code"] == code) & (new_df["Date"] == "2020-01-22"), "confirmed"].values[0]
    new_df.loc[(new_df["Code"] == code) & (new_df["Date"] == "2020-01-22"), "test"] = new_df.loc[(new_df["Code"] == "KOR") & (new_df["confirmed"] == first_confirmed), "test"].values[0]

for code in country_codes:
    tmp_df = new_df[new_df["Code"] == code]
    tmp_df.interpolate(inplace=True)
    new_df[new_df["Code"]==code] = tmp_df
print(new_df.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


    Entity Code        Date  test  confirmed  deaths  recovered
0  Germany  DEU  2020-01-22   0.0        0.0     0.0        0.0
1  Germany  DEU  2020-01-23   0.0        0.0     0.0        0.0
2  Germany  DEU  2020-01-24   0.0        0.0     0.0        0.0
3  Germany  DEU  2020-01-25   0.0        0.0     0.0        0.0
4  Germany  DEU  2020-01-26   0.0        0.0     0.0        0.0


In [8]:
# Add rates
# new_df["infectious_rate"] = (new_df["test"] - new_df["negative"]) / new_df["test"]
new_df["infectious_rate"] = new_df["confirmed"] / new_df["test"]
new_df["removed_rate"] = (new_df["deaths"] + new_df["recovered"]) / new_df["test"]
new_df["susceptible_rate"] = 1.0 - (new_df["infectious_rate"] + new_df["removed_rate"])
new_df["infectious_rate_change"] = 0.0
new_df["removed_rate_change"] = 0.0
new_df["susceptible_rate_change"] = 0.0

# Drop nulls
new_df.dropna(how='any', inplace=True)
print(new_df.head())

    Entity Code        Date          test  confirmed  deaths  recovered  \
5  Germany  DEU  2020-01-27   2969.428571        1.0     0.0        0.0   
6  Germany  DEU  2020-01-28   5938.857143        4.0     0.0        0.0   
7  Germany  DEU  2020-01-29   8908.285714        4.0     0.0        0.0   
8  Germany  DEU  2020-01-30  11877.714286        4.0     0.0        0.0   
9  Germany  DEU  2020-01-31  14847.142857        5.0     0.0        0.0   

   infectious_rate  removed_rate  susceptible_rate  infectious_rate_change  \
5         0.000337           0.0          0.999663                     0.0   
6         0.000674           0.0          0.999326                     0.0   
7         0.000449           0.0          0.999551                     0.0   
8         0.000337           0.0          0.999663                     0.0   
9         0.000337           0.0          0.999663                     0.0   

   removed_rate_change  susceptible_rate_change  
5                  0.0        

In [9]:
# Smoothing
def mu_sigma_justify(Y, floor_n=1, ceil_n=1.5, mu=None, sigma=None):
    """μとσを用いた補正関数。
    Args:
        floor_n (float):
        ceil_n (float):
        mu (Optional[float]):
        sigma (Optional[float]):
    """
    mu = mu or Y.mean()
    sigma = sigma or Y.std()
    Y = Y.copy()
    Y_ = Y.copy()
    Y[abs(Y_ - mu) < floor_n * sigma] = mu  # μ±σ ⇒ μ
    Y[Y_ > mu + ceil_n * sigma] = mu + ceil_n * sigma  # μ＋3σ より大きい ⇒ μ＋3σ
    Y[Y_ < mu - ceil_n * sigma] = mu - ceil_n * sigma  # μー3σ より小さい ⇒ μー3σ
    return Y

In [10]:
def compare_smoothing(df):
    plt.plot(tmp_df["infectious_rate"].values)
    plt.plot(mu_sigma_justify(tmp_df["infectious_rate"].values))
    plt.title("Infectious Rate Smoothing Comparison")
    plt.ylabel('Infectious Rate')
    plt.xlabel('Date')
    plt.xticks(range(len(df["Date"].values)),df["Date"].values,rotation='vertical')
    plt.legend(['Before', 'After'], loc='best')
    plt.show()
    
    plt.plot(tmp_df["removed_rate"].values)
    plt.plot(mu_sigma_justify(tmp_df["removed_rate"].values))
    plt.title("Removed Rate Smoothing Comparison")
    plt.ylabel('Removed Rate')
    plt.xlabel('Date')
    plt.xticks(range(len(df["Date"].values)),df["Date"].values,rotation='vertical')
    plt.legend(['Before', 'After'], loc='best')
    plt.show()

def display_rate(tmp_df):
    plt.plot(tmp_df["infectious_noise_rate"].values)
    plt.plot(tmp_df["removed_noise_rate"].values)
#     plt.plot(tmp_df["susceptible_noise_rate"].values)
    plt.title("Rate")
    plt.ylabel('Rate')
    plt.xlabel('Date')
    plt.xticks(range(len(tmp_df.Date.values)),tmp_df.Date.values,rotation='vertical')
    plt.legend(['Infectious Rate', 'Removed Rate'], loc='best')
    plt.show()

In [11]:
# Calculate Change Rate
for code in country_codes:
    print("-----------------{}------------------".format(code))

    # normalize
    # compare_smoothing(tmp_df)
    # tmp_df["infectious_rate"] = mu_sigma_justify(tmp_df["infectious_rate"].values)
    # tmp_df["removed_rate"] = mu_sigma_justify(tmp_df["removed_rate"].values)
    # tmp_df["susceptible_rate"] = mu_sigma_justify(tmp_df["susceptible_rate"].values)

    tmp_df = new_df[new_df["Code"] == code]
    tmp_df["infectious_rate_change"] = tmp_df["infectious_rate"].pct_change()
    tmp_df["infectious_rate_change"] = tmp_df["infectious_rate_change"].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    tmp_df["removed_rate_change"] = tmp_df["removed_rate"].pct_change()
    tmp_df["removed_rate_change"] = tmp_df["removed_rate_change"].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    tmp_df["susceptible_rate_change"] = tmp_df["susceptible_rate_change"].pct_change()
    tmp_df["susceptible_rate_change"] = tmp_df["susceptible_rate_change"].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    new_df[new_df["Code"]==code] = tmp_df
    
# Save
new_df.to_csv("./input/country_data.csv")

-----------------KOR------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.ht

-----------------ITA------------------
-----------------FRA------------------
-----------------DEU------------------
-----------------ISL------------------
-----------------DNK------------------
-----------------THA------------------
-----------------TWN------------------
