In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score
from datetime import timedelta
%matplotlib inline
sns.set_theme()

In [None]:
# Exploratory Data Analysis (EDA)

In [None]:
# Read the train and test splits of the week-4 forecasting dataset
# (paths are relative to the notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/covid19-global-forecasting-week-4/train.csv',
        '../input/covid19-global-forecasting-week-4/test.csv',
    )
)
# Preview the first rows of the training data.
train.head()

In [None]:
# Count the missing (NaN) values in each column of the train set.
train.isna().sum()

In [None]:
# Count the missing (NaN) values in each column of the test set.
test.isna().sum()

In [None]:
# All missing values occur in the 'Province_State' column.
# At this stage Province_State does not look important, since the analysis mainly
# produces country-wide forecasts, so missing values are simply filled with the
# placeholder string 'NA'. Province_State can be explored further if later analysis
# shows it to be significant.
#
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a
# column selected from the frame is chained assignment, which is deprecated and does
# not update the parent frame when pandas Copy-on-Write is active.
train['Province_State'] = train['Province_State'].fillna('NA')
test['Province_State'] = test['Province_State'].fillna('NA')

# Keep only the calendar date (datetime.date objects) of each record.
train['Date'] = pd.to_datetime(train['Date']).dt.date
test['Date'] = pd.to_datetime(test['Date']).dt.date

In [None]:
# Summary statistics for every column (numeric and non-numeric) of the train set.
train.describe(include='all')

In [None]:
# Column dtypes and non-null counts of the train set.
train.info()

In [None]:
# Explore Country_Region, Date and Province_State
_country_region = train['Country_Region'].nunique()
print(f'The number of Country_Region: {_country_region}')

# Number of distinct dates and the covered date range
_date = train['Date'].nunique()
_mindate, _maxdate = min(train['Date']), max(train['Date'])
print(f'The number of Date: {_date}, from day {_mindate} to day {_maxdate}')

# Countries that have at least one row with a real (non-'NA') Province_State
_informed_P_S = train[train['Province_State'] != 'NA']['Country_Region'].nunique()
_list = train[train['Province_State'] != 'NA']['Country_Region'].unique()

print(f'There are totally {_informed_P_S} countries with Province/State recorded')
print('They are respectively: ', end='')
if len(_list) > 0:
    # Comma-separated names, closed with a full stop and no trailing newline
    print(', '.join(map(str, _list)), end='. ')

In [None]:
# Rows that carry a real province/state (i.e. not the 'NA' placeholder)
_has_province = train['Province_State'] != 'NA'

n_province = train.loc[_has_province, 'Province_State'].nunique()
print(f'There are {n_province} provinces/states recorded')
print()

# List the provinces/states of every country that records them
_groupby_Country = train[_has_province].groupby('Country_Region')
for country, info in _groupby_Country:
    _P_S = info['Province_State'].unique()
    _num = info['Province_State'].nunique()
    print(f'{_num} provinces/states in {country}: ', end='')
    if len(_P_S) > 0:
        print(', '.join(map(str, _P_S)), end='. ')
        print()
    print()

In [None]:
# Add two features, daily confirmed cases and daily fatalities, for data visualization;
# they can be removed during data analysis.
#
# The cumulative counts are differenced *within* each (Country_Region, Province_State)
# series. Differencing the whole column at once would subtract the last value of the
# previous region from the first value of the next one.
# Province_State is expected to be filled already (groupby drops NaN keys by default).
_region_keys = ['Country_Region', 'Province_State']
for _cum_col, _daily_col in (('ConfirmedCases', 'DailyConfirmedCases'),
                             ('Fatalities', 'DailyFatalities')):
    _daily = train.groupby(_region_keys)[_cum_col].diff()
    # The first row of a series has no predecessor: treat the previous total as 0,
    # so the increment equals the cumulative value itself.
    _daily = _daily.fillna(train[_cum_col])
    # Replace negative values (downward corrections of the cumulative count) with 0
    train[_daily_col] = _daily.clip(lower=0)

# Totals per date over all regions. The string 'sum' is used because passing np.sum
# as aggfunc is deprecated in recent pandas.
_pivot = pd.pivot_table(
    train,
    values=['ConfirmedCases', 'Fatalities', 'DailyConfirmedCases', 'DailyFatalities'],
    index=['Date'],
    aggfunc='sum',
)
_confirmed = _pivot.max()['ConfirmedCases']
_fatalities = _pivot.max()['Fatalities']
print(f'From {_mindate} to {_maxdate}, there are totally {_confirmed} confirmed cases and {_fatalities} fatalities')

In [None]:
# Create a new feature 'MortalityRate': Fatalities / ConfirmedCases.
# 0 / 0 (no cases yet) yields NaN, which is replaced by 0.
train['MortalityRate'] = (train['Fatalities'] / train['ConfirmedCases']).fillna(0)

# Create a new feature 'IncrementRate':
# daily confirmed cases (today) / daily confirmed cases (previous day).
# The previous day is taken within the same (Country_Region, Province_State) series,
# so the first day of a region is never divided by the last day of another region.
_prev_daily = train.groupby(['Country_Region', 'Province_State'])['DailyConfirmedCases'].shift(1)
train['IncrementRate'] = train['DailyConfirmedCases'] / _prev_daily
# If the previous day had 0 new cases the ratio is inf (or NaN for 0 / 0) rather than
# a usable number, so inf, -inf and NaN are all replaced by 0.
train['IncrementRate'] = train['IncrementRate'].replace([np.inf, -np.inf], 0).fillna(0)

# Not sure if the mortality rate or increment rate will be helpful; needs verification
# in the later assessment.
train.head()

In [None]:
# Understand the global situation: cumulative totals (solid) and daily increments
# (dashed), one panel for confirmed cases and one for fatalities.

plt.figure(figsize=(16,9))
plt.suptitle('COVID-19 Global Confirmed Cases & Fatalities', fontsize='x-large')

# (panel title, cumulative column, daily column); the column name doubles as legend label
_global_panels = (
    ('Global Confirmed Cases', 'ConfirmedCases', 'DailyConfirmedCases'),
    ('Global Fatalities', 'Fatalities', 'DailyFatalities'),
)
for _position, (_panel_title, _total_col, _daily_col) in enumerate(_global_panels, start=1):
    plt.subplot(1, 2, _position)
    line1 = plt.plot(_pivot[_total_col], label=_total_col)
    line2 = plt.plot(_pivot[_daily_col], label=_daily_col, linestyle='--')
    plt.title(_panel_title, fontsize=16)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Total Number', fontsize=14)
    # Show plain numbers on the y axis instead of scientific notation
    plt.ticklabel_format(style='plain', axis='y')
    plt.xticks(rotation=40, ha='center')
    plt.legend(fontsize=14)

In [None]:
# Visualize the world-wide average mortality rate:
# the mean of the per-row MortalityRate over all rows sharing a date.
_groupby_Date = train.groupby('Date')[['MortalityRate']].mean()

plt.figure(figsize=(16,9))
line = plt.plot(_groupby_Date['MortalityRate'], linestyle='-', label='Mortality Rate')
_axes = plt.gca()
_axes.set_title('World-Wide Average Mortality Rate', fontsize=16)
_axes.set_xlabel('Date', fontsize=14)
_axes.set_ylabel('Mortality Rate', fontsize=14)
# Plain (non-scientific) tick labels on the y axis
_axes.ticklabel_format(style='plain', axis='y')
plt.xticks(ha='center')
_axes.legend(fontsize=14)

In [None]:
# Understand each country situation

In [None]:
# Per country and date: summed counts and averaged rates over the country's rows.
_groupby_Country = train.groupby(['Country_Region', 'Date']).agg({
    'ConfirmedCases': 'sum',
    'Fatalities': 'sum',
    'MortalityRate': 'mean',
    'IncrementRate': 'mean',
})

# Keep the snapshot of the last train date, ordered by confirmed cases (largest first),
# with Date moved out of the index so only Country_Region remains as index.
_on_last_date = _groupby_Country.index.get_level_values('Date') == _maxdate
cases_by_country = (
    _groupby_Country.loc[_on_last_date]
    .sort_values(by='ConfirmedCases', ascending=False)
    .reset_index(level=['Date'])
)
cases_by_country.style.background_gradient(cmap='Reds')

In [None]:
# Horizontal bar chart of the 20 countries with the most confirmed cases
# (cases_by_country is already sorted by ConfirmedCases, descending).
reset_cases_by_country = cases_by_country.reset_index(level=0)
_top20_cases = reset_cases_by_country.head(20)

ax = _top20_cases.plot(x='Country_Region', y=['ConfirmedCases', 'Fatalities'], kind='barh', figsize=(12,8))
plt.title('First 20 Countries with Most Confirmed Cases', fontsize=16)
plt.xlabel('Number of COVID-19 Cases', fontsize=14)
plt.ylabel('Country_Region',fontsize=14)
plt.ticklabel_format(style='plain', axis='x')

# Write each value at the end of its bar; the vertical offset picks the bar
# of the pair (ConfirmedCases below, Fatalities above the tick position).
for _column, _offset in (('ConfirmedCases', -0.25), ('Fatalities', 0.1)):
    for index, data in enumerate(_top20_cases[_column]):
        plt.text(data, index + _offset, str(data), fontsize=10)

In [None]:
# Countries with Fatalities > 1500, sorted by fatalities (largest first).
# The author's note records 20 such countries in total; the count depends on the data snapshot.
cases_by_country_fatalities = (
    cases_by_country.loc[cases_by_country['Fatalities'] > 1500]
    .sort_values(by='Fatalities', ascending=False)
    .reset_index()
)
display(cases_by_country_fatalities)

In [None]:
# Keep the full time series of the countries selected in cases_by_country_fatalities,
# then reshape to long form: one row per (country, date, metric).
df_merge_by_country = pd.merge(
    _groupby_Country.reset_index(),
    cases_by_country_fatalities['Country_Region'],
    on=['Country_Region'],
    how='inner',
)
df_merge_by_country = df_merge_by_country.melt(id_vars=['Country_Region','Date'], var_name='cols', value_name='vals')

# sns.relplot creates its own figure, so no plt.figure() call precedes it
# (one would only render an additional, empty figure).
g = sns.relplot(data=df_merge_by_country, col='cols', x='Date', y='vals', hue='Country_Region', col_wrap=2, kind='line', facet_kws={'sharey': False, 'sharex': True})
g.set_axis_labels('Date', 'Total Number', fontsize=14)
g.set_xticklabels(rotation=40,ha='center')
# Leave room above the panels for the figure title
g.fig.subplots_adjust(top=0.88)

g.fig.suptitle('COVID-19 Situation in Each Country (Fatalities > 1500)')
# Panels follow the column order of the melted frame:
# ConfirmedCases, Fatalities, MortalityRate, IncrementRate.
g.fig.axes[0].ticklabel_format(style='plain', axis='y')
g.fig.axes[0].set_title('Confirmed Cases', fontsize=14)
g.fig.axes[1].set_title('Fatalities', fontsize=14)
g.fig.axes[2].set_title('Average Mortality Rate', fontsize=14)
g.fig.axes[2].set_ylabel('Mortality Rate', fontsize=14)
g.fig.axes[3].set_title('Average Increment Rate', fontsize=14)
g.fig.axes[3].set_ylabel('Increment Rate', fontsize=14)
# From axes[3] it can be seen that the feature 'Increment Rate' is not very helpful due to its high variance

In [None]:
# Show the 20 countries with the highest mortality rate on the last train date.
# Author's observation: some impoverished countries in Africa show an extremely high
# mortality rate with only a small number of confirmed cases.
max_MortalityRate = cases_by_country.sort_values(by='MortalityRate', ascending=False)
display(max_MortalityRate.head(20))

In [None]:
# A high mortality rate computed from only a few confirmed cases may not be
# representative, so the ranking is restricted to countries with ConfirmedCases > 100.
_enough_cases = cases_by_country['ConfirmedCases'] > 100
max_MortalityRate = (
    cases_by_country.loc[_enough_cases]
    .sort_values(by='MortalityRate', ascending=False)  # returns a new frame; avoids the in-place copy warning
    .reset_index()
)
display(max_MortalityRate.head(20))

In [None]:
# Horizontal bar chart of the 20 highest mortality rates, one 'autumn' colormap shade per bar.
_top20_mortality = max_MortalityRate.head(20)
palette = plt.get_cmap('autumn')
rainbow_col = [palette(step/20.0) for step in range(20)]

ax = _top20_mortality.plot(x='Country_Region', y='MortalityRate', kind='barh', figsize=(9,6), color=rainbow_col, legend=None)
plt.title('First 20 Countries with Highest Mortality Rate', fontsize=16)
plt.xlabel('Mortality Rate', fontsize=14)
plt.ylabel('Country_Region',fontsize=14)

# Annotate every bar with its rate, rounded to three decimals
for index, data in enumerate(_top20_mortality['MortalityRate']):
    plt.text(data, index-0.2, f'{data:.3f}', fontsize=12)

In [None]:
# Linear Regression Modelling

In [None]:
# Date ranges of both sets. Day indices are counted from the first train date (day 0).
train_minDate, train_maxDate = min(train['Date']), max(train['Date'])
test_minDate, test_maxDate = min(test['Date']), max(test['Date'])

# Number of days to predict, counting both the first and the last test date
pred_days = (test_maxDate - test_minDate).days + 1
# Day index of the first test date
start_days = (test_minDate - train_minDate).days

print(f'The first day (day 0) from train dataset is {train_minDate}')
print(f'The first day (day {start_days}) from test dataset is {test_minDate}')
# The test period covers day indices start_days .. start_days + pred_days - 1
print(f'The last day (day {start_days+pred_days-1}) from test dataset is {test_maxDate}')
print(f'Total prediction period is {pred_days}')

In [None]:
def runningSum(lst, num):
    """Convert `lst` into its running (cumulative) sum, offset by `num`.

    The sequence is modified in place: element 0 becomes ``lst[0] + num`` and every
    later element becomes itself plus the already-updated previous element.

    Parameters:
        lst: mutable, index-assignable sequence of numbers (e.g. list or 1-D array).
        num: value added to the first element, i.e. the starting total.

    Returns:
        The same `lst` object, after the in-place update.
    """
    for position in range(len(lst)):
        # Seed with `num` at the start, otherwise carry the previous running total
        lst[position] += num if position == 0 else lst[position - 1]
    return lst

def lr_prediction(data, bool_fit_intercept, y_test, pred_days=43): # predict the daily situation
    """Fit a straight line through `data` and evaluate it on the first `pred_days` steps.

    Element i of `data` is treated as the observation at time step i (0-based).
    The fitted line is then evaluated at steps 0 .. pred_days - 1, so the returned
    values start at the same step as the fitted data.

    Parameters:
        data: 1-D sequence of observed values.
        bool_fit_intercept: forwarded to ``LinearRegression(fit_intercept=...)``.
        y_test: not used by this function; it only occupies the third positional
            slot, so pass `pred_days` by keyword.
        pred_days: number of steps to predict (default 43).

    Returns:
        1-D array of length `pred_days` with the fitted line's values.
    """
    steps_observed = np.array(range(len(data))).reshape(-1, 1)
    values_observed = np.array(data).reshape(-1, 1)

    model = LinearRegression(fit_intercept=bool_fit_intercept)
    model.fit(steps_observed, values_observed)

    steps_to_predict = np.array(range(pred_days)).reshape(-1, 1)
    # predict() returns one column; flatten it to shape (pred_days,)
    return model.predict(steps_to_predict).reshape(pred_days,)

# Unique (Province_State, Country_Region) combinations: each one is a separate series
# to forecast. Regions without a province carry the 'NA' placeholder and are treated
# like any other region.
PS_CR = train[['Province_State', 'Country_Region']].drop_duplicates()
# Preview the combinations that the forecasting loops walk through with PS_CR.iterrows()
PS_CR.head()

In [None]:
# Forecasting

In [None]:
# Predict for Confirmed Cases
PS_list_CC = []
CR_list_CC = []
Date_list_CC = []
ypred_list_CC = []
validation_Dict_CC = {}

# The forecast dates are identical for every region, so build them once
_pred_dates_CC = list(pd.date_range(test_minDate, test_maxDate).strftime("%Y-%m-%d"))

for index, row in PS_CR.iterrows():
    # Label-based access; positional row[0] / row[1] on a labelled Series is deprecated
    _province, _country = row['Province_State'], row['Country_Region']
    train_temp = train[(train['Province_State']==_province) & (train['Country_Region']==_country)]

    # Fit on the daily increments from position start_days (the first test date) onward
    data = train_temp['DailyConfirmedCases'].iloc[start_days:]
    # pred_days goes in by keyword: the third positional parameter of lr_prediction is
    # the unused y_test, so a positional pred_days would be ignored in favour of the default.
    y_pred = lr_prediction(data, False, None, pred_days=pred_days)
    # Accumulate the predicted increments on top of the cumulative count at start_days
    y_pred = runningSum(y_pred, train_temp['ConfirmedCases'].values[start_days])

    # Validate on the forecast days that train actually covers. train may end before
    # the test period does, in which case only the overlapping days can be scored.
    y_test = train_temp['ConfirmedCases'].values[start_days: start_days+pred_days]
    _n_scored = min(len(y_test), len(y_pred))
    RMSE = np.sqrt(mean_squared_error(y_test[:_n_scored], y_pred[:_n_scored])) if _n_scored >= 1 else np.nan
    # R2 is undefined for fewer than two samples
    R2 = r2_score(y_test[:_n_scored], y_pred[:_n_scored]) if _n_scored >= 2 else np.nan
    validation_Dict_CC[(_country, _province)] = (R2, RMSE)

    PS_list_CC.extend([_province]*pred_days)
    CR_list_CC.extend([_country]*pred_days)
    Date_list_CC.extend(_pred_dates_CC)
    ypred_list_CC.extend(list(y_pred))

# One row per (region, forecast date)
results_CC = pd.DataFrame({'Province_State': PS_list_CC,
                        'Country_Region': CR_list_CC,
                        'Date': Date_list_CC,
                        'Pred_ConfirmedCases': ypred_list_CC})
results_CC['Date'] = pd.to_datetime(results_CC['Date'], format='%Y-%m-%d')
results_CC

In [None]:
# Predict for Fatalities
PS_list_F = []
CR_list_F = []
Date_list_F = []
ypred_list_F = []
validation_Dict_F = {}

# The forecast dates are identical for every region, so build them once
_pred_dates_F = list(pd.date_range(test_minDate, test_maxDate).strftime("%Y-%m-%d"))

for index, row in PS_CR.iterrows():
    # Label-based access; positional row[0] / row[1] on a labelled Series is deprecated
    _province, _country = row['Province_State'], row['Country_Region']
    train_temp = train[(train['Province_State']==_province) & (train['Country_Region']==_country)]

    # Fit on the cumulative fatalities of the first 5 days starting at position
    # start_days (the first test date); the line is fitted with an intercept.
    data = train_temp['Fatalities'].iloc[start_days: start_days+5]
    # pred_days goes in by keyword: the third positional parameter of lr_prediction is
    # the unused y_test, so a positional pred_days would be ignored in favour of the default.
    y_pred = lr_prediction(data, True, None, pred_days=pred_days)

    # Validate on the forecast days that train actually covers. train may end before
    # the test period does, in which case only the overlapping days can be scored.
    y_test = train_temp['Fatalities'].values[start_days: start_days+pred_days]
    _n_scored = min(len(y_test), len(y_pred))
    RMSE = np.sqrt(mean_squared_error(y_test[:_n_scored], y_pred[:_n_scored])) if _n_scored >= 1 else np.nan
    # R2 is undefined for fewer than two samples
    R2 = r2_score(y_test[:_n_scored], y_pred[:_n_scored]) if _n_scored >= 2 else np.nan
    validation_Dict_F[(_country, _province)] = (R2, RMSE)

    PS_list_F.extend([_province]*pred_days)
    CR_list_F.extend([_country]*pred_days)
    Date_list_F.extend(_pred_dates_F)
    ypred_list_F.extend(list(y_pred))

# One row per (region, forecast date)
results_F = pd.DataFrame({'Province_State': PS_list_F,
                        'Country_Region': CR_list_F,
                        'Date': Date_list_F,
                        'Pred_Fatalities': ypred_list_F})
results_F['Date'] = pd.to_datetime(results_F['Date'], format='%Y-%m-%d')
results_F

In [None]:
# For Submission: fill the submission template with the predictions.
submission = pd.read_csv('../input/covid19-global-forecasting-week-4/submission.csv')

# Column assignment aligns on the row index, so a row-count mismatch would silently
# leave NaNs in the submission or drop predictions. Fail loudly instead.
if not (len(results_CC) == len(results_F) == len(submission)):
    raise ValueError(
        f'Row count mismatch: submission has {len(submission)} rows, '
        f'results_CC has {len(results_CC)}, results_F has {len(results_F)}'
    )

# NOTE(review): this assumes the prediction rows are in the same order as the rows of
# the submission template (region order taken from train, then date) - confirm against test.csv.
submission['ConfirmedCases'] = results_CC['Pred_ConfirmedCases']
submission['Fatalities'] = results_F['Pred_Fatalities']
submission

In [None]:
# Write the submission to submission.csv in the working directory, without the row index.
submission.to_csv("submission.csv", index=False)

In [None]:
def plot(country, results_CC, results_F, train):
    """Plot observed against predicted confirmed cases and fatalities for one country.

    Values are summed per date over all rows of the country, so a country recorded
    by several provinces/states is drawn as one country-wide curve instead of its
    per-province series joined end to end. For a country with a single row per date
    the plotted values are the rows themselves.

    Parameters:
        country: value matched against the 'Country_Region' column of all three frames.
        results_CC: DataFrame with 'Country_Region', 'Date' and 'Pred_ConfirmedCases'.
        results_F: DataFrame with 'Country_Region', 'Date' and 'Pred_Fatalities'.
        train: DataFrame with 'Country_Region', 'Date', 'ConfirmedCases' and 'Fatalities'.

    Returns:
        None. Draws a two-panel figure on a new matplotlib figure.
    """
    def _country_totals(frame, value_cols):
        # One row per date (index): the sum over every row belonging to `country`
        return frame.loc[frame['Country_Region'] == country].groupby('Date')[value_cols].sum()

    Fact_country = _country_totals(train, ['ConfirmedCases', 'Fatalities'])
    # Build the prediction frame from fresh aggregates rather than assigning a column
    # onto a .loc slice of results_CC.
    Pred_country = _country_totals(results_CC, ['Pred_ConfirmedCases']).join(
        _country_totals(results_F, ['Pred_Fatalities']), how='outer'
    )

    plt.figure(figsize=(16,9))
    plt.suptitle(f'Comparison of Prediction and Real Condition - {country}', fontsize=20)

    # (panel title, observed column, predicted column); column names double as legend labels
    panels = (
        ('Confirmed Cases', 'ConfirmedCases', 'Pred_ConfirmedCases'),
        ('Fatalities', 'Fatalities', 'Pred_Fatalities'),
    )
    for position, (panel_title, fact_col, pred_col) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(Fact_country.index, Fact_country[fact_col], label=fact_col)
        plt.plot(Pred_country.index, Pred_country[pred_col], label=pred_col)
        plt.title(panel_title, fontsize=16)
        plt.xlabel('Date', fontsize=14)
        plt.ylabel('Total Number', fontsize=14)
        # Plain (non-scientific) tick labels on the y axis
        plt.ticklabel_format(style='plain', axis='y')
        plt.xticks(rotation=40, ha='center')
        plt.legend(fontsize=14)

In [None]:
# Compare predicted and observed confirmed cases / fatalities for Norway.
plot('Norway', results_CC, results_F, train)

In [None]:
# Compare predicted and observed confirmed cases / fatalities for Brazil.
plot('Brazil', results_CC, results_F, train)

In [None]:
# Compare predicted and observed confirmed cases / fatalities for Singapore.
plot('Singapore', results_CC, results_F, train)

In [None]:
# Compare predicted and observed confirmed cases / fatalities for Malaysia.
plot('Malaysia', results_CC, results_F, train)

In [None]:
# Compare predicted and observed confirmed cases / fatalities for Indonesia.
plot('Indonesia', results_CC, results_F, train)

In [None]:
# Modelling validation:
# collect the (R2, RMSE) pairs computed for the confirmed-cases and fatalities
# predictions into one table, one row per (Country_Region, Province_State) key.
# Both dictionaries were filled in the same region order, so their values line up.
_validation_columns = {
    'Country_Province': list(validation_Dict_CC),
    'ConfirmedCases R2 - RMSE': list(validation_Dict_CC.values()),
    'Fatalities R2 - RMSE': list(validation_Dict_F.values()),
}
validation = pd.DataFrame(_validation_columns)
validation