![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F518134%2F485aa04e87e4e45c91815101784c6d95%2Fcorona-4930541_1280.jpg?generation=1585438527494582&alt=media)


# WEEK 4

### The Challenge

Kaggle is launching companion COVID-19 forecasting challenges to help answer a subset of the NASEM/WHO questions. While the challenge involves forecasting confirmed cases and fatalities between April 15 and May 14 by region, the primary goal isn't only to produce accurate forecasts. It's also to identify factors that appear to impact the transmission rate of COVID-19.

You are encouraged to pull in, curate and share data sources that might be helpful. If you find variables that look like they impact the transmission rate, please share your finding in a notebook.

### Data

In this challenge, you will be predicting the cumulative number of confirmed COVID19 cases in various locations across the world, as well as the number of resulting fatalities, for future dates.


### Thanks to this great notebook, which I used as a starting point:

https://www.kaggle.com/corochann/covid-19-current-situation-on-may-daily-update


I did not participate in the previous competitions, so if I have missed anything, please let me know :)

In [None]:
import os

import warnings
warnings.filterwarnings('ignore')


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder

from tqdm import tqdm

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight

from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import roc_curve, auc, accuracy_score, cohen_kappa_score
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix, mean_squared_log_error

In [None]:
## There are so many data files that it is easier to write some code to find the ones you need :D

#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        file_extension = os.path.splitext(filename)[1]
#        if file_extension == '.csv':
#            print(os.path.join(dirname, filename))

In [None]:
def display_missing(df, head=True):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by column name with two columns:
    ``Total`` (number of null cells) and ``Percent`` (null cells divided by
    the number of rows), each ordered from most to least missing.

    ``head`` is accepted for backward compatibility but is not used.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask is the row count for every column.
    row_counts = null_mask.count()

    by_count = null_counts.sort_values(ascending=False)
    by_share = (null_counts / row_counts).sort_values(ascending=False)
    return pd.concat({'Total': by_count, 'Percent': by_share}, axis=1)

In [None]:
# File names of the competition CSVs we want to locate under /kaggle/input.
needed_files = ['submission.csv', 'test.csv', 'train.csv']

# Walk the input tree and print the full path of every wanted file.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    wanted_here = [name for name in names if name in needed_files]
    for name in wanted_here:
        print(os.path.join(folder, name))

In [None]:
%%time

PATH = '/kaggle/input/covid19-global-forecasting-week-4/'

def load_ds(file_name):
    return pd.read_csv(PATH + file_name)

train = load_ds('train.csv')
test = load_ds('test.csv')
submission = load_ds('submission.csv')
#print(train.shape, test.shape)

## Add mortality-rate, geographic and demographic features to train and test.
## US rows are handled separately because they are joined on Province_State
## (state level), while every other row is joined on Country_Region.

usa_mortality_rates = pd.read_csv('/kaggle/input/coronavirus-covid19-mortality-rate-by-country/usa_covid19_mortality_rates.csv')
world_mortality_rates = pd.read_csv('/kaggle/input/coronavirus-covid19-mortality-rate-by-country/global_covid19_mortality_rates.csv')

## This dataset comes from the earlier week-2 competition and appears to hold
## most of the extra information a competition like this one needs.
## I am not sure who created it, but thank you.
## NOTE(review): it is joined on Date below; presumably it only covers the
## week-2 date range, so later rows (all of test?) may get NaN here - confirm.
enriched = pd.read_csv('/kaggle/input/covid19-enriched-dataset-week-2/enriched_covid_19_week_2.csv')

# Columns taken from the enriched dataset for the country-level join.
world_enriched_cols = ['Date', 'Country_Region', 'age_75-79', 'age_80-84', 'age_85-89', 'age_90-94',
       'age_95-99', 'age_100+', 'total_pop', 'smokers_perc', 'density',
       'urbanpop', 'hospibed', 'lung', 'femalelung', 'malelung', 'restrictions', 'quarantine', 'schools']

# Same feature columns, but keyed by Province_State for the US state-level join.
us_enriched_cols = ['Date', 'Province_State', 'age_75-79', 'age_80-84', 'age_85-89', 'age_90-94',
       'age_95-99', 'age_100+', 'total_pop', 'smokers_perc', 'density',
       'urbanpop', 'hospibed', 'lung', 'femalelung', 'malelung', 'restrictions', 'quarantine', 'schools']

# Split the US rows out of train/test; what remains is the non-US data.
us_train = train[train['Country_Region'] == 'US']
us_test = test[test['Country_Region'] == 'US']
train = train.drop(train[train['Country_Region'] == 'US'].index)
test = test.drop(test[test['Country_Region'] == 'US'].index)

# Non-US rows: left-join country mortality/coordinates, then the enriched
# features on (Date, Country_Region).
# NOTE(review): if `enriched` has several rows per (Date, Country_Region),
# e.g. one per province, these left joins will multiply rows - verify.
mort_df = world_mortality_rates[['Country', 'Mortality Rate', 'Latitude', 'Longitude']]
train = train.merge(mort_df, how='left', left_on='Country_Region', right_on='Country').drop(['Country'], axis=1)
train = train.merge(enriched[world_enriched_cols], how='left', left_on=['Date', 'Country_Region'], right_on=['Date', 'Country_Region'])
test = test.merge(mort_df, how='left', left_on='Country_Region', right_on='Country').drop(['Country'], axis=1)
test = test.merge(enriched[world_enriched_cols], how='left', left_on=['Date', 'Country_Region'], right_on=['Date', 'Country_Region'])

# US rows: the same two joins, keyed by state instead of country.
us_mort_df = usa_mortality_rates[['State', 'Mortality Rate', 'Latitude', 'Longitude']]
us_train = us_train.merge(us_mort_df, how='left', left_on='Province_State', right_on='State').drop(['State'], axis=1)
us_train = us_train.merge(enriched[us_enriched_cols], how='left', left_on=['Date', 'Province_State'], right_on=['Date', 'Province_State'])

us_test = us_test.merge(us_mort_df, how='left', left_on='Province_State', right_on='State').drop(['State'], axis=1)
us_test = us_test.merge(enriched[us_enriched_cols], how='left', left_on=['Date', 'Province_State'], right_on=['Date', 'Province_State'])

## Even more useful features (disabled):
## matching US data needs to be found first.

#useful_cols = ['Country_Region', 'Tourism','Date_FirstFatality','Date_FirstConfirmedCase', 'Mean_Age', 'Date_FirstFatality', 'Date_FirstConfirmedCase']
#cluster_data = pd.read_csv("/kaggle/input/covid19-useful-features-by-country/Countries_usefulFeatures.csv")
#train = train.merge(cluster_data[useful_cols], how='left', left_on='Country_Region', right_on='Country_Region')
#test = test.merge(cluster_data[useful_cols], how='left', left_on='Country_Region', right_on='Country_Region')

# Non-US rows without a province get the placeholder 'Unknown'.
train['Province_State'] = train['Province_State'].fillna('Unknown')
test['Province_State'] = test['Province_State'].fillna('Unknown')

# Re-assemble the full frames. The US rows are appended at the END, so the row
# order no longer matches the order of the original train.csv / test.csv.
train = pd.concat([train,us_train])
test = pd.concat([test,us_test])

# Feature columns that came from the enriched/mortality joins.
enriched_cols = ['age_75-79', 'age_80-84', 'age_85-89', 'age_90-94', 'age_95-99', 'age_100+', 'total_pop', 'smokers_perc', 'density',
       'urbanpop', 'hospibed', 'lung', 'femalelung', 'malelung', 'restrictions', 'quarantine', 'schools', 'Mortality Rate']

# Fill any remaining missing region labels (now including the US rows).
train['Country_Region'] = train['Country_Region'].fillna('Unknown')
train['Province_State'] = train['Province_State'].fillna('Unknown')
#train[enriched_cols] = train.groupby(['Country_Region', 'Province_State'])[enriched_cols].transform(lambda x: x.fillna(x.mean()))

test['Country_Region'] = test['Country_Region'].fillna('Unknown')
test['Province_State'] = test['Province_State'].fillna('Unknown')
#test[enriched_cols] = test.groupby(['Country_Region', 'Province_State'])[enriched_cols].transform(lambda x: x.fillna(x.mean()))

# concat kept the per-part indexes; rebuild a clean 0..n-1 index on both frames.
test.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)

print('Shapes: ')
print(train.shape, test.shape)

# Both frames carry a freshly rebuilt, unique index at this point, so looking
# for duplicated *index labels* would always report 0. What matters is
# whether the left joins multiplied rows, i.e. whether the same
# (country, province, date) observation now appears more than once.
_region_date_key = ['Country_Region', 'Province_State', 'Date']

print('Duplicates: ')
print(int(test.duplicated(subset=_region_date_key).sum()))
print(int(train.duplicated(subset=_region_date_key).sum()))

### missing data

In [None]:
# Regions for which the mortality/coordinate lookups have no entry.
# Key:   (column used to match the rows, region name)
# Value: (Mortality Rate, Latitude, Longitude) written into those rows.
# A triple of zeros is a placeholder where no real value was looked up.
# (Alternative that was considered: dropping the 'Diamond Princess' rows.)
_REGION_DEFAULTS = {
    ('Country_Region', 'Diamond Princess'): (0, 0, 0),
    ('Province_State', 'Guam'): (0, 13.444304, 144.793732),
    ('Province_State', 'Virgin Islands'): (0, 0, 0),
    ('Country_Region', 'Burma'): (0, 16.871311, 96.199379),
    ('Country_Region', 'Cabo Verde'): (0, 0, 0),
    ('Country_Region', 'West Bank and Gaza'): (0, 0, 0),
    ('Country_Region', 'Congo (Brazzaville)'): (0, 0, 0),
    ('Country_Region', 'Congo (Kinshasa)'): (0, 0, 0),
    ('Country_Region', 'Cote d\'Ivoire'): (0, 0, 0),
    # Latitude corrected: Czechia lies at roughly 49.8 N, not 12.8 N.
    ('Country_Region', 'Czechia'): (0, 49.817492, 15.530190),
}
_REGION_VALUE_COLS = ('Mortality Rate', 'Latitude', 'Longitude')

# Apply the same defaults to both frames. Driving this from one table keeps
# train and test in sync; the hand-written version set Burma's Longitude
# where the Diamond Princess Longitude in train was meant.
for _frame in (train, test):
    for (_key_col, _region), _values in _REGION_DEFAULTS.items():
        _rows = _frame[_key_col] == _region
        for _col, _value in zip(_REGION_VALUE_COLS, _values):
            _frame.loc[_rows, _col] = _value

print(train.shape, test.shape)

In [None]:
# Per-column count and share of missing values left in train.
display_missing(train)

In [None]:
# Show every train row that still has at least one NaN.
train[train.isna().any(axis=1)]

In [None]:
# Replace every remaining NaN with 0, in place, in both frames.
# NOTE(review): 0 is also written into coordinates and demographic columns,
# where it is indistinguishable from a real value - confirm this is intended.
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)
# Re-check: the summary should now report no missing values.
display_missing(train)

### Cleanup and feature engineering

In [None]:
#train["fatality_rate"] = train.Fatalities * 100 / train.ConfirmedCases
#train.groupby(['Country_Region', 'Province_State'])["fatality_rate"].bfill()
#test['fatality_rate'] = 0.0;

#def set_rate(grp, val):
#    grp['fatality_rate'] = val
#    return grp

## There is surely a better way to do this in pandas; let me know if you know one.
#for key, group in train.groupby(['Country_Region', 'Province_State'])['fatality_rate']:
#    val = train[((train.Country_Region == key[0]) & (train.Province_State == key[1]))]['fatality_rate'][:1].values[0]
#    test.groupby(['Country_Region', 'Province_State']).apply(lambda x: set_rate(x,val))

In [None]:
# Display the train frame after imputation.
train

In [None]:
def ds_prep(df, encoders=None):
    """Add date-derived and region-code feature columns to ``df`` in place.

    Columns written:
      * ``Days``  - days elapsed since 2020-01-01
      * ``Day``   - day of month
      * ``Month`` - month number
      * ``province_code`` / ``country_code`` - integer labels for
        ``Province_State`` / ``Country_Region``

    Parameters
    ----------
    df : DataFrame with ``Date``, ``Province_State`` and ``Country_Region``.
    encoders : optional dict mapping ``'Province_State'`` and/or
        ``'Country_Region'`` to an already fitted ``LabelEncoder``. When an
        encoder is supplied it is reused, so several frames get identical
        codes for identical regions. When omitted, a fresh encoder is fitted
        on ``df`` alone (the previous behaviour).

    Returns
    -------
    The same ``df`` object, for call chaining.
    """
    # Parse the date column once and reuse it for all three features.
    dates = pd.to_datetime(df['Date'])
    df['Days'] = (dates - pd.to_datetime('2020-01-01')).dt.days
    df['Day'] = dates.dt.day
    df['Month'] = dates.dt.month

    for source_col, code_col in (('Province_State', 'province_code'),
                                 ('Country_Region', 'country_code')):
        encoder = encoders.get(source_col) if encoders is not None else None
        if encoder is None:
            codes = LabelEncoder().fit_transform(df[source_col])
        else:
            codes = encoder.transform(df[source_col])
        df[code_col] = codes.astype(int)
    return df

# Fit each encoder on the regions of train AND test together. Fitting them
# separately only yields matching codes while both frames contain exactly the
# same set of regions; one extra region in either frame would silently shift
# every code after it.
region_encoders = {
    col: LabelEncoder().fit(pd.concat([train[col], test[col]]))
    for col in ('Province_State', 'Country_Region')
}

train = ds_prep(train, region_encoders)
train['ConfirmedCases'] = train['ConfirmedCases'].astype(int)
train['Fatalities'] = train['Fatalities'].astype(int)

test = ds_prep(test, region_encoders)

In [None]:
# Model inputs. Candidates tried and currently left out:
#   'age_75-79', 'age_80-84', 'age_85-89', 'age_90-94', 'age_95-99', 'age_100+',
#   'smokers_perc', 'urbanpop', 'hospibed', 'lung', 'femalelung', 'malelung',
#   'restrictions', 'quarantine', 'schools', 'Tourism', 'Mean_Age'
FEATURES = [
    'Days', 'Day', 'Month',
    'province_code', 'country_code',
    'Mortality Rate', 'Latitude', 'Longitude',
    'total_pop', 'density',
]

# Feature matrices and the two regression targets.
X_train, X_test = train[FEATURES], test[FEATURES]
y_confirmed, y_fatalities = train['ConfirmedCases'], train['Fatalities']

# Scaling was tried and is disabled:
#scaler = RobustScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

print(X_train.shape, y_confirmed.shape, y_fatalities.shape)
print(X_test.shape)

### LGB Model

In [None]:
#from sklearn.decomposition import PCA

#NOTIN_FEATURES = [ 'age_75-79', 'age_80-84', 'age_85-89', 'age_90-94', 'age_95-99', 'age_100+']

#pca = PCA(n_components=1)
#pca_train = pca.fit_transform(train[NOTIN_FEATURES].fillna(0))
#pca_test = pca.transform(test[NOTIN_FEATURES].fillna(0))

#X_train['pca_1'] = pca_train[:,0]
#X_test['pca_1'] = pca_test[:,0]
#FEATURES.append('pca_1')

#del pca_train, pca_test

In [None]:
def RMSLE(Y, pred):
    """Return the root mean squared logarithmic error of ``pred`` against ``Y``.

    Both inputs may be any 1-D array-like (list, numpy array, pandas Series).
    Before scoring, each input is sanitised so the logarithm is always
    defined: NaN and +/-inf become 0, and every value is clipped to the range
    [0, 1000000].

    Raises
    ------
    ValueError
        If the inputs have different shapes or are empty.
    """
    cleaned = []
    for values in (Y, pred):
        arr = np.asarray(values, dtype=float)
        arr = np.nan_to_num(arr, nan=0, posinf=0, neginf=0)
        cleaned.append(np.clip(arr, 0, 1000000))
    y_data, pred_data = cleaned

    if y_data.shape != pred_data.shape:
        raise ValueError(
            f'shape mismatch: {y_data.shape} vs {pred_data.shape}'
        )
    if y_data.size == 0:
        raise ValueError('RMSLE is undefined for empty input')

    # log1p(x) == log(x + 1), the transform used by the RMSLE definition.
    log_diff = np.log1p(pred_data) - np.log1p(y_data)
    return np.sqrt(np.mean(log_diff ** 2))

def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y``.

    Unlike ``RMSLE`` this variant does no sanitising: the inputs are scored
    exactly as given.

    Parameters
    ----------
    y, y_pred : 1-D array-likes of equal length (list, numpy array, Series).

    Raises
    ------
    ValueError
        If the inputs differ in shape, are empty, or contain a value <= -1
        (for which ``log(value + 1)`` is undefined).
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    if actual.shape != predicted.shape:
        raise ValueError(
            f'shape mismatch: {actual.shape} vs {predicted.shape}'
        )
    if actual.size == 0:
        raise ValueError('rmsle is undefined for empty input')
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError('rmsle requires every value to be greater than -1')

    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))

def train_model(X, y, params, split_size, random_seed):
    """Fit a LightGBM booster on a random train/validation split of ``X``/``y``.

    ``split_size`` is the validation fraction and ``random_seed`` seeds the
    split. The number of rounds, logging interval and early-stopping patience
    are read from the module-level constants ``NUM_BOOST_ROUND``,
    ``VERBOSE_EVAL`` and ``EARLY_STOPPING_ROUNDS``.

    Returns ``(model, evals_result)`` where ``evals_result`` is the dict that
    LightGBM fills with the validation metric history.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=split_size, random_state=random_seed
    )
    fit_set = lgb.Dataset(X_fit, y_fit)
    holdout_set = lgb.Dataset(X_holdout, y_holdout)

    history = {}
    booster = lgb.train(
        params,
        train_set=fit_set,
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=holdout_set,
        verbose_eval=VERBOSE_EVAL,
        evals_result=history,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )
    return booster, history

In [None]:
# Training-loop settings shared by the LightGBM and XGBoost helpers.
NUM_BOOST_ROUND = 2000        # upper bound on boosting rounds
EARLY_STOPPING_ROUNDS = 40    # patience passed to early stopping
VERBOSE_EVAL = 500            # logging interval passed to the trainers
RANDOM_SEED = 13

# LightGBM tree settings.
LEARNING_RATE = 0.1
MAX_DEPTH = -1
NUM_LEAVES = 200

lgb_params = dict(
    learning_rate=LEARNING_RATE,
    max_depth=MAX_DEPTH,
    num_leaves=NUM_LEAVES,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    metric='rmse',
)

#confirmed_model, confirmed_evals_result = train_model(X_train, y_confirmed, lgb_params, 0.2, RANDOM_SEED)
#fatalities_model, fatalities_evals_result = train_model(X_train, y_fatalities, lgb_params, 0.2, RANDOM_SEED)

In [None]:
#f, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize=(15, 6)) 
#lgb.plot_metric(confirmed_evals_result, metric='rmse', ax=ax1) 
#lgb.plot_metric(fatalities_evals_result, metric='rmse', ax=ax2) 
#plt.show()

### XGB Model

In [None]:
def train_xgb_model(X, y, params, split_size, random_seed):
    """Fit an XGBoost booster on a random train/validation split of ``X``/``y``.

    ``split_size`` is the validation fraction and ``random_seed`` seeds the
    split. Round count, logging interval and early-stopping patience come
    from ``NUM_BOOST_ROUND``, ``VERBOSE_EVAL`` and ``EARLY_STOPPING_ROUNDS``.

    Returns ``(model, evals_result)``; the validation set is registered under
    the name ``'eval'`` in ``evals_result``.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=split_size, random_state=random_seed
    )
    fit_matrix = xgb.DMatrix(X_fit, label=y_fit)
    holdout_matrix = xgb.DMatrix(X_holdout, label=y_holdout)

    history = {}
    booster = xgb.train(
        params,
        fit_matrix,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(holdout_matrix, 'eval')],
        evals_result=history,
        verbose_eval=VERBOSE_EVAL,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )
    return booster, history

# XGBoost settings: squared-error regression, scored with RMSE.
# 'gamma' (0.9) and 'min_child_weight' (1) were tried and are left unset.
xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.96,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.95,
    eval_metric='rmse',
)


# One booster per target, each validated on a 20% hold-out with the same seed.
confirmed_model, confirmed_evals_result = train_xgb_model(
    X_train, y_confirmed, xgb_params, split_size=0.2, random_seed=RANDOM_SEED
)
fatalities_model, fatalities_evals_result = train_xgb_model(
    X_train, y_fatalities, xgb_params, split_size=0.2, random_seed=RANDOM_SEED
)

In [None]:
# Validation RMSE per boosting round: confirmed cases (left), fatalities (right).
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for history, axis in ((confirmed_evals_result, ax1), (fatalities_evals_result, ax2)):
    lgb.plot_metric(history, metric='rmse', ax=axis)
plt.show()

In [None]:
# Feature importance of each booster: confirmed cases (left), fatalities (right).
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
for booster, axis in ((confirmed_model, ax1), (fatalities_model, ax2)):
    xgb.plot_importance(booster, ax=axis)
plt.show()

In [None]:
# Score the test features with both boosters (a fresh DMatrix per model).
yhat_confirmed, yhat_fatalities = (
    booster.predict(xgb.DMatrix(X_test))
    for booster in (confirmed_model, fatalities_model)
)

# Keep the raw predictions next to the test rows for plotting.
test['ConfirmedCases'], test['Fatalities'] = yhat_confirmed, yhat_fatalities

In [None]:
def lineplot_for_case(df, column, title, ax):
    """Draw ``df[column]`` against ``df['Date']`` as a line plot on ``ax``.

    Every fifth distinct date gets an x tick with a vertical label; the axis
    labels are ``'Date'`` and ``column`` and the axes title is ``title``.
    """
    distinct_dates = np.unique(df['Date'].values)
    tick_positions = range(0, len(distinct_dates), 5)

    # Axis cosmetics are applied first, the line is drawn last.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(list(distinct_dates[::5]), rotation='vertical')
    ax.set_xlabel('Date')
    ax.set_ylabel(column)
    ax.set_title(title)

    sns.lineplot(x=df['Date'], y=df[column], ax=ax)
    
    
def plots_for_country(country):
    """Plot the predicted confirmed cases and fatalities for ``country``.

    Selects the rows of the module-level ``test`` frame whose
    ``Country_Region`` equals ``country`` and draws two side-by-side panels.
    """
    country_rows = test[test['Country_Region'] == country]
    _, axes = plt.subplots(1, 2, figsize=(15, 5))

    panels = (
        ('ConfirmedCases', f'Confirmed cases for {country}'),
        ('Fatalities', f'Fatalities for {country}'),
    )
    for axis, (column, title) in zip(axes, panels):
        lineplot_for_case(country_rows, column, title, axis)

### Let's plot the results for some countries

In [None]:
# Forecast plots for a handful of countries.
for country in ('Italy', 'US', 'Russia', 'France', 'Australia'):
    plots_for_country(country)

### Submission

In [None]:
# The prediction arrays follow the row order of `test`, which was rebuilt
# with all US rows moved to the end, so it no longer matches the row order of
# submission.csv. Assigning the arrays positionally would attach forecasts to
# the wrong ForecastId; align them on ForecastId instead.
# (Assumes test.csv and submission.csv both carry the competition's
# ForecastId column.)
_predictions = (
    pd.DataFrame({
        'ForecastId': test['ForecastId'].to_numpy(),
        'ConfirmedCases': yhat_confirmed,
        'Fatalities': yhat_fatalities,
    })
    # Guard against rows multiplied by the feature joins.
    .drop_duplicates(subset='ForecastId')
    .set_index('ForecastId')
    .reindex(submission['ForecastId'])
)

for _target in ('ConfirmedCases', 'Fatalities'):
    # Counts cannot be negative, and the RMSLE metric is undefined below 0.
    # A ForecastId without a prediction stays NaN and makes astype(int) fail
    # loudly instead of writing a bogus value.
    submission[_target] = (
        np.round(_predictions[_target].to_numpy()).clip(min=0).astype(int)
    )

submission.to_csv('submission.csv', header=True, index=False)
submission.head(15)