In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the training and test tables of the week-1 forecasting dataset.
train_data = pd.read_csv('../input/covid19-global-forecasting-week-1/train.csv')
test_data = pd.read_csv('../input/covid19-global-forecasting-week-1/test.csv')

In [None]:
# Summarise the training table (columns, dtypes, non-null counts).
train_data.info()

In [None]:
# Preview the first rows of the training table.
train_data.head()

In [None]:
import datetime

# Parse the 'YYYY-MM-DD' strings in the 'Date' column of both tables into datetimes.
for frame in (train_data, test_data):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')

In [None]:
# Average the numeric columns for each distinct fatality count.
# numeric_only=True is passed explicitly: the table still holds the text columns
# 'Province/State' and 'Country/Region', which cannot be averaged and make
# recent pandas versions raise a TypeError when they are included.
train_data.groupby('Fatalities').mean(numeric_only=True)

In [None]:
# Data pre-processing
# Hard-coded layout of the dataset; the grouping cell further down slices both
# tables positionally with these values.
train_sl = 63 # time sequence length of training data (rows per region)
test_sl =  43 # time sequence length of test data (rows per region)
num_region = 284 # number of regions
train_size = train_data.shape[0] # total number of rows in the training table

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

#######################################################################################
# Combine the two region columns into a single 'Country-Province' key.
# Only the region columns are filled: a missing province (or country) becomes the
# placeholder 'Only', while missing values in any other column are left untouched
# instead of being overwritten with a string.
region_columns = ['Country/Region', 'Province/State']
train_data[region_columns] = train_data[region_columns].fillna(value='Only')
test_data[region_columns] = test_data[region_columns].fillna(value='Only')
print(train_data.head())

train_data['region'] = train_data['Country/Region'] + '-' + train_data['Province/State']
test_data['region'] = test_data['Country/Region'] + '-' + test_data['Province/State']

# The separate columns are redundant once the combined key exists.
train_data = train_data.drop(columns=['Province/State', 'Country/Region'])
test_data = test_data.drop(columns=['Province/State', 'Country/Region'])

print(train_data.head())
#######################################################################################

#######################################################################################
# numerical data scaling use sklearn.preprocessing.StandardScaler()

#######################################################################################

#######################################################################################
# Curve fitting does not use the 'Lat' and 'Long' features, so both tables drop them.
geo_columns = ['Lat', 'Long']
train_data, test_data = (
    frame.drop(columns=geo_columns) for frame in (train_data, test_data)
)

#######################################################################################

#######################################################################################
# Standardise both target columns with sklearn.preprocessing.StandardScaler().
# Each target gets its own scaler, which is kept so that values in the scaled
# space can be transformed back afterwards.

Conf_fata = ['ConfirmedCases', 'Fatalities']

target_num = train_data.loc[:, Conf_fata]  # unscaled targets

Conf_scaling, Fata_scaling = StandardScaler(), StandardScaler()

# The scalers expect a 2-D input, so each target is passed as an (n_rows, 1) column.
conf_column = target_num[['ConfirmedCases']].to_numpy()
fata_column = target_num[['Fatalities']].to_numpy()

Conf_processed = Conf_scaling.fit_transform(conf_column)
Fata_processed = Fata_scaling.fit_transform(fata_column)

# Overwrite the raw targets with their standardised values.
train_data['ConfirmedCases'] = Conf_processed
train_data['Fatalities'] = Fata_processed

print(train_data.head())



#######################################################################################

#######################################################################################
# Replace the 'Date' column by the elapsed time since the reference date 2020-01-22.
start = pd.to_datetime('2020-01-22')
train_data['Days after Jan 22'] = train_data['Date'] - start
test_data['Days after Jan 22'] = test_data['Date'] - start

# Convert the timedeltas to a day count and divide by 4, so the stored value is in
# units of 4 days; the smaller magnitude helps prevent overflow in the exponential
# of the curve models. The day count is read with `.dt.days` rather than by casting
# to int64, so it does not depend on the integer unit pandas uses internally.
train_data['Days after Jan 22'] = train_data['Days after Jan 22'].dt.days / 4
test_data['Days after Jan 22'] = test_data['Days after Jan 22'].dt.days / 4

train_data = train_data.drop(columns=['Date'])
test_data = test_data.drop(columns=['Date'])

print(train_data.head(63))
#######################################################################################

#######################################################################################
# Alternative target scaling using a logarithm (currently disabled)

# train_data['ConfirmedCases'] = train_data['ConfirmedCases'].apply(lambda x : np.log(1+x))
# train_data['Fatalities'] = train_data['Fatalities'].apply(lambda x : np.log(1+x))

In [None]:
# Data grouping - each region is fitted and predicted on its own.
#
# Rows are selected by their 'region' label instead of by fixed-size positional
# slices, so every key is guaranteed to hold exactly the rows of that region, whatever
# the row order of the files or the number of rows per region.
train_data_by_regions = {
    region: frame for region, frame in train_data.groupby('region', sort=False)
}
test_data_by_regions = {
    region: frame for region, frame in test_data.groupby('region', sort=False)
}

# Regions are listed in the order in which they first appear in the test table, so
# per-region predictions concatenated in this order follow the test rows
# (assumes each region's test rows are contiguous - TODO confirm).
Regions = list(test_data_by_regions)

missing_regions = [region for region in Regions if region not in train_data_by_regions]
if missing_regions:
    raise ValueError(
        'regions present in the test data but absent from the training data: {}'.format(missing_regions)
    )

print(train_data_by_regions['Korea, South-Only'])

In [None]:
###########################
# models - logistic curve fitting without Lat/Long data
# We decided to discard the Lat/Long data: the model is fitted country by country, so these features carry no information.
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt

# our model; Logistic curve
def logi_curve(x, k, x_0, N, b, c):
    """Logistic curve with an added linear bias term.

    Evaluates ``N / (1 + exp(k*k*(x_0 - x))) + b*x + c``. The growth rate enters
    as ``k*k`` and is therefore never negative; the linear term ``b*x + c`` is a
    bias meant to capture growth beyond the plain sigmoid.

    Parameters
    ----------
    x : scalar or numpy array of time values.
    k : growth-rate parameter (used squared).
    x_0 : midpoint of the sigmoid.
    N : height of the sigmoid.
    b, c : slope and intercept of the linear bias.

    Returns
    -------
    The curve value(s) at ``x``.
    """
    rate = k*k
    sigmoid = N / (1 + np.exp(rate*(x_0 - x)))
    return sigmoid + b*x + c

# alternative model; Exponential curve
def exp_curve(x, k, x_0, a, b):
    """Exponential curve ``a * exp(k*(x - x_0)) + b``.

    Parameters
    ----------
    x : scalar or numpy array of time values.
    k : growth rate.
    x_0 : horizontal shift.
    a : amplitude.
    b : vertical offset.

    Returns
    -------
    The curve value(s) at ``x``.
    """
    shifted = x - x_0
    growth = np.exp(k*shifted)
    return a*growth + b

def curve_fitting(Country, Conf_or_Fata):
    """Fit a growth curve to one region's training series.

    The logistic model is tried first. If its least-squares fit fails with a
    RuntimeError, the exponential model is tried; if that fails as well, a constant
    curve at the minimum of the target is returned as exponential parameters.

    Parameters
    ----------
    Country : key into ``train_data_by_regions``.
    Conf_or_Fata : truthy to fit 'ConfirmedCases', falsy to fit 'Fatalities'.

    Returns
    -------
    (popt, logi_or_exp) : the fitted parameters and a flag that is True when they
        belong to ``logi_curve`` and False when they belong to ``exp_curve``.
    """
    region_frame = train_data_by_regions[Country]
    X = region_frame.loc[:, 'Days after Jan 22'].to_numpy()
    target = 'ConfirmedCases' if Conf_or_Fata else 'Fatalities'
    y = region_frame.loc[:, target].to_numpy()

    try:
        popt, _ = curve_fit(logi_curve, X, y, method='lm', maxfev=100000)
        return popt, True
    except RuntimeError as e:
        print(e)

    try:
        popt, _ = curve_fit(exp_curve, X, y, maxfev=5000)
    except Exception as e:
        # Catch ordinary errors only, so KeyboardInterrupt / SystemExit still stop
        # the run, and report why the fallback was needed.
        print(e)
        # a = 0 switches the exponential off; b = min(y) leaves a constant curve.
        popt = (0, 0, 0, np.amin(y))
    return popt, False

def prediction_by_countries(Country, popt, logi_or_exp):
    """Evaluate a fitted curve on one region's test time axis.

    Parameters
    ----------
    Country : key into ``test_data_by_regions``.
    popt : parameters of the fitted curve, unpacked after the time argument.
    logi_or_exp : truthy to evaluate ``logi_curve``, falsy to evaluate ``exp_curve``.

    Returns
    -------
    numpy array with one predicted value per test row of the region.
    """
    X = test_data_by_regions[Country].loc[:, 'Days after Jan 22'].to_numpy()
    model = logi_curve if logi_or_exp else exp_curve
    # Both models are written with numpy operations, so the whole time axis is
    # evaluated in a single call instead of one element at a time.
    return np.asarray(model(X, *popt))

In [None]:
# training: fit both targets for every region and keep the results per region

Conf_params_by_countries = {}
Fata_params_by_countries = {}

for region in Regions:
    # curve_fitting returns (parameters, is_logistic); the pair is stored as is.
    Conf_params_by_countries[region] = curve_fitting(region, True)
    Fata_params_by_countries[region] = curve_fitting(region, False)

In [None]:
# predicting

preds_by_countries = {}
conf_chunks = []
fata_chunks = []

for region in Regions:
    conf_param, conf_loe = Conf_params_by_countries[region]
    fata_param, fata_loe = Fata_params_by_countries[region]

    conf_pred = prediction_by_countries(region, conf_param, conf_loe)
    fata_pred = prediction_by_countries(region, fata_param, fata_loe)

    # Per-region predictions are stored in the scaled target space.
    preds_by_countries[region] = (conf_pred, fata_pred)

    conf_chunks.append(conf_pred)
    fata_chunks.append(fata_pred)

# Join all regions once, instead of re-copying a growing array in every iteration.
Conf_predictions = np.concatenate(conf_chunks) if conf_chunks else np.array([])
Fata_predictions = np.concatenate(fata_chunks) if fata_chunks else np.array([])

# inverse transform: the scalers were fitted on (n, 1) columns, so the predictions
# are passed in that shape and flattened back to 1-D afterwards.
Conf_predictions = Conf_scaling.inverse_transform(Conf_predictions.reshape(-1, 1)).ravel()
Fata_predictions = Fata_scaling.inverse_transform(Fata_predictions.reshape(-1, 1)).ravel()
# inverse transform for the (disabled) logarithmic target scaling:
# Conf_predictions = np.exp(Conf_predictions)-1
# Fata_predictions = np.exp(Fata_predictions)-1

# Counts cannot be negative; clamp negative predictions to zero.
Conf_predictions[Conf_predictions<0]=0
Fata_predictions[Fata_predictions<0]=0

In [None]:
# Evaluate the fitted models of 'Korea, South-Only' on its training time axis,
# so the fit can be compared with the training data.
korea = 'Korea, South-Only'
conf_param, conf_loe = Conf_params_by_countries[korea]
fata_param, fata_loe = Fata_params_by_countries[korea]
X = train_data_by_regions[korea].loc[:, 'Days after Jan 22'].to_numpy()

# The flag stored with each parameter set tells which curve the parameters belong to.
conf_curve = logi_curve if conf_loe else exp_curve
fata_curve = logi_curve if fata_loe else exp_curve

conf_model = np.array([conf_curve(day, *conf_param) for day in X])
fata_model = np.array([fata_curve(day, *fata_param) for day in X])

In [None]:
# plot our results.
%matplotlib inline

korea = 'Korea, South-Only'
korea_train = train_data_by_regions[korea]
korea_test = test_data_by_regions[korea]
korea_conf_pred, korea_fata_pred = preds_by_countries[korea]

# The stored time axis was divided by 4; multiply it back to plain days for plotting.
days_train = korea_train.loc[:, 'Days after Jan 22'].to_numpy() *4
days_test = korea_test.loc[:, 'Days after Jan 22'].to_numpy() *4

Confirmed_cases_train = korea_train.loc[:, 'ConfirmedCases'].to_numpy()
Confirmed_cases_trainedmodel = conf_model
Confirmed_cases_test = korea_conf_pred  # prediction

Fatalities_train = korea_train.loc[:, 'Fatalities'].to_numpy()
Fatalities_trainedmodel = fata_model
Fatalities_test = korea_fata_pred  # prediction


def _plot_fit(observed, fitted, forecast, observed_style, label_prefix, y_label):
    """Draw the training data, the fitted curve and the test predictions of one target."""
    plt.figure(figsize=(25,10))
    plt.plot(days_train, observed, observed_style, label=label_prefix + '_train')
    plt.plot(days_train, fitted, 'r--', label=label_prefix + '_trained_model')
    plt.plot(days_test, forecast, 'ro', lw=1, label=label_prefix + '_test')
    plt.xlabel('Days after Jan 22, 2020')
    plt.ylabel(y_label)
    plt.legend(loc='upper left')
    plt.show()


_plot_fit(Confirmed_cases_train, Confirmed_cases_trainedmodel, Confirmed_cases_test,
          'b-', 'Confirmed_cases', 'ConfirmedCases')
_plot_fit(Fatalities_train, Fatalities_trainedmodel, Fatalities_test,
          'g-', 'Fatalities', 'Fatalities')

In [None]:
# Take the ForecastId column of the sample submission and attach the predictions
# to it as the two target columns.
submission = pd.read_csv('../input/covid19-global-forecasting-week-1/submission.csv')
output = submission[['ForecastId']].assign(
    ConfirmedCases=Conf_predictions,
    Fatalities=Fata_predictions,
)

In [None]:
# Show the submission table; only the last expression of a cell is rendered.
output

In [None]:
# Write the submission file without the DataFrame index column.
output.to_csv('/kaggle/working/submission.csv', index=False)