## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import minimize

## Importing dataset

In [None]:
# Load the three competition files: the training history, the rows to be
# predicted, and the submission template that the predictions are written into.
train_dataset = pd.read_csv('../input/covid19-global-forecasting-week-4/train.csv')
test_dataset = pd.read_csv('../input/covid19-global-forecasting-week-4/test.csv')
submission = pd.read_csv('../input/covid19-global-forecasting-week-4/submission.csv')

In [None]:
submission.head()

## View information about the dataset.

In [None]:
train_dataset.info()

In [None]:
train_dataset.describe()

In [None]:
train_dataset.head()

In [None]:
test_dataset.head()

In [None]:
train_dataset.isna().sum()

In [None]:
test_dataset.isna().sum()

## Combine **'Province_State'** and **'Country_Region'**

In [None]:
# Replace missing provinces with '' so that joining the column onto
# 'Country_Region' does not yield NaN keys.
# The result is assigned back rather than using fillna(inplace = True) on the
# selected column: that form is chained in-place assignment, which pandas warns
# about and which no longer updates the frame once Copy-on-Write is enabled.
train_dataset['Province_State'] = train_dataset['Province_State'].fillna('')
test_dataset['Province_State'] = test_dataset['Province_State'].fillna('')

In [None]:
# Build one region key per row: "<country> <province>". pop() hands back the
# province column and removes it from the frame in the same step.
# Rows with an empty province keep the trailing space (e.g. 'Korea, South ').
train_dataset['Country_Region'] = train_dataset['Country_Region'] + ' ' + train_dataset.pop('Province_State')
test_dataset['Country_Region'] = test_dataset['Country_Region'] + ' ' + test_dataset.pop('Province_State')

In [None]:
train_dataset.head()

In [None]:
test_dataset.head()

In [None]:
# Summarise the combined region key; the 'unique' entry of the output is the
# number of distinct regions in the training data.
train_dataset['Country_Region'].describe()

In [None]:
# Distinct region keys; the submission loop fits one model pair per entry.
country_list = train_dataset['Country_Region'].unique()

In [None]:
# Distinct date strings of the training data, in order of first appearance.
train_date = train_dataset.Date.unique()
train_date

In [None]:
# Distinct date strings of the test data, in order of first appearance.
test_date = test_dataset.Date.unique()
test_date

In [None]:
# Integer day index (0, 1, 2, ...) for each distinct training date; this is
# the time axis used by the growth-curve fits.
train_days = np.arange(len(train_date))
train_days

In [None]:
train_days[train_date == '2020-04-02']

In [None]:
train_days[train_date == '2020-04-22']

In [None]:
# Day index of each test date on the same axis as train_days. The offset is
# the number of calendar days between the first training date and the first
# test date; it replaces the previously hard-coded 71 so the cell keeps working
# when the data files change.
# NOTE(review): assumes both date lists start at their earliest date and
# advance one day per entry, as the train_days indexing already does.
test_offset = (pd.Timestamp(test_date[0]) - pd.Timestamp(train_date[0])).days
test_days = np.arange(len(test_date)) + test_offset
test_days

In [None]:
# Day index of 2020-04-22 (drawn as 'train_end' in the plots) and of the first
# test date. The boolean mask returns a one-element array, so take element [0]:
# train_end is then a plain scalar like test_start, and formatting it with '%'
# no longer relies on NumPy's deprecated array-to-scalar conversion.
train_end = train_days[train_date == '2020-04-22'][0]
test_start = test_days[0]

In [None]:
train_end

In [None]:
test_start

In [None]:
# Translate every training row's date into its day index (stored as float)
# through a date -> day lookup table.
day_of_date = dict(zip(train_date, train_days))
Day = train_dataset.Date.map(day_of_date).to_numpy(dtype = float)
train_dataset['Day'] = Day

In [None]:
train_dataset.head(5)

In [None]:
# Translate every test row's date into its day index (stored as float)
# through a date -> day lookup table.
day_of_date = dict(zip(test_date, test_days))
Day = test_dataset.Date.map(day_of_date).to_numpy(dtype = float)
test_dataset['Day'] = Day

In [None]:
test_dataset.head(5)

## Top 10 countries by confirmed cases (2020-04-22)

In [None]:
# Keep only the 2020-04-22 rows and order them from most to fewest confirmed cases.
is_snapshot_day = train_dataset['Date'] == '2020-04-22'
top_comfirmedcases = train_dataset.loc[is_snapshot_day].sort_values('ConfirmedCases', ascending = False)
top_comfirmedcases.head(10)

In [None]:
def country_plot(country):
    '''
    Plot the confirmed-case and fatality time series of one region side by side.

    country: value of the combined 'Country_Region' column; the matching rows
             are read from the global train_dataset.

    Shows the figure with plt.show() and returns nothing.
    '''
    # Only the training rows are plotted, so the test set is not filtered here.
    train = train_dataset[train_dataset['Country_Region'] == country]
    x_train = train.Day.values

    # (subplot position, y values, y-axis label, extra plot keyword arguments).
    # The confirmed-cases panel keeps matplotlib's default line colour.
    panels = [(1, train.ConfirmedCases.values, 'Confirmed cases', {}),
              (2, train.Fatalities.values, 'Fatalities', {'color': 'orange'})]

    plt.figure(figsize = (15, 3))
    for position, y_values, y_label, line_style in panels:
        plt.subplot(1, 2, position)
        plt.xlabel('Days')
        plt.ylabel(y_label)
        plt.title(country)
        plt.plot(x_train, y_values, **line_style)
        plt.grid()
    plt.show()

In [None]:
# Plot the ten regions with the most confirmed cases. head(10) selects by
# position; the earlier [0:9] slice stopped one short and drew only nine.
for country in top_comfirmedcases.Country_Region.head(10).values:
    country_plot(country)

## Logistic growth model

$$f(t) = \frac{\theta_{1}}{1 + \theta_{2} e^{-\theta_{3}t}}$$

where

- $\theta_{1}$: The curve's maximum value, since $\lim_{t \to \infty} f(t) = \theta_{1}$.
- $\theta_{2}$: Controls the displacement along the x-axis; the curve reaches half of its maximum at $t = \ln(\theta_{2}) / \theta_{3}$.
- $\theta_{3}$: The logistic growth rate or steepness of the curve.

Reference: [wiki](https://en.wikipedia.org/wiki/Logistic_function)

In [None]:
def Logistic(t, theta1, theta2, theta3):
    '''
    Evaluate the logistic growth curve theta1 / (1 + theta2 * exp(-theta3 * t)).

    t:      time value(s) at which the curve is evaluated
    theta1: numerator of the curve (its limit as t grows when theta3 > 0)
    theta2: multiplier of the exponential term
    theta3: rate inside the exponent
    '''
    decay = np.exp(-theta3 * t)
    denominator = 1 + theta2 * decay
    return theta1 / denominator

In [None]:
# Three logistic curves sharing theta1 = 5 and theta3 = 1 and differing only
# in theta2, drawn on one set of axes.
x = np.linspace(-2, 5, 50)
y1 = Logistic(x, 5, 1, 1)
y2 = Logistic(x, 5, 1.5, 1)
y3 = Logistic(x, 5, 2, 1)

plt.figure(figsize = (12, 8))
for curve, name in ((y1, 'y1'), (y2, 'y2'), (y3, 'y3')):
    plt.plot(x, curve, label = name)
plt.legend()
plt.grid()
plt.show()

## Example: 'Korea, South '

In [None]:
# Select one region and pull out the arrays used by the fits in this section.
# The key ends in a space because the region key is "<country> <province>"
# and this region has an empty province.
country = 'Korea, South '
in_train = train_dataset['Country_Region'] == country
in_test = test_dataset['Country_Region'] == country
train = train_dataset[in_train]
test = test_dataset[in_test]

# Time axes of the training and test rows
x_train = train['Day'].values
x_test = test['Day'].values

# Observed series
confirmed_train = train['ConfirmedCases'].values
fatalities_train = train['Fatalities'].values

country_plot(country)

## **Least-Squares Estimation**: scipy.optimize.curve_fit

In [None]:
# Least-squares fit of the logistic curve to the confirmed cases. The search
# starts with theta1 at three times the largest observed count and
# theta2 = theta3 = 1.
initial_guess = [3 * max(confirmed_train), 1, 1]
popt_confirmed, pcov_confirmed = curve_fit(Logistic, x_train, confirmed_train,
                                           p0 = initial_guess, maxfev = 800)

In [None]:
popt_confirmed

In [None]:
pcov_confirmed

In [None]:
def curve_plot(x_train, y_train, x_test, est, title = None):
    '''
    Scatter the observed series and overlay the fitted logistic curve.

    x_train: day indices of the observations
    y_train: observed counts
    x_test:  day indices of the forecast period
    est:     the three logistic parameters (theta1, theta2, theta3)
    title:   figure title; defaults to the global `country` so existing
             calls keep their behaviour

    Vertical dotted lines mark the globals test_start and train_end.
    Shows the figure with plt.show() and returns nothing.
    '''
    if title is None:
        title = country

    # The markers may be NumPy scalars or one-element arrays; .item() turns
    # either into a plain Python number, so the '%' formatting below does not
    # depend on NumPy's deprecated implicit array-to-scalar conversion.
    test_start_day = np.asarray(test_start).item()
    train_end_day = np.asarray(train_end).item()

    plt.figure(figsize = (12, 5))
    plt.xlabel('Days')
    plt.ylabel('Cases')
    plt.title(title)
    plt.scatter(x_train, y_train, color = 'r')
    plt.plot(x_train, Logistic(x_train, *est), label = 'Fitting curve (train)')
    plt.plot(x_test, Logistic(x_test, *est), label = 'Fitting curve (test)')
    plt.axvline(x = test_start_day, color = 'r', linestyle = ':', label = 'test_start = %.f' % (test_start_day))
    plt.axvline(x = train_end_day, color = 'b', linestyle = ':', label = 'train_end = %.f' % (train_end_day))
    plt.legend()
    plt.show()

In [None]:
curve_plot(x_train = x_train, y_train = confirmed_train, x_test = x_test, est = popt_confirmed)

In [None]:
# Least-squares fit of the logistic curve to the fatalities. The search starts
# with theta1 at three times the largest observed count and
# theta2 = theta3 = 1.
initial_guess = [3 * max(fatalities_train), 1, 1]
popt_fatalities, pcov_fatalities = curve_fit(Logistic, x_train, fatalities_train,
                                             p0 = initial_guess, maxfev = 800)

In [None]:
popt_fatalities

In [None]:
pcov_fatalities

In [None]:
curve_plot(x_train = x_train, y_train = fatalities_train, x_test = x_test, est = popt_fatalities)

## Minimizing the loss function: scipy.optimize.minimize

Consider the nonlinear regression model

$$y_{i} = f(t_{i};\theta) + \varepsilon_{i},\quad  i=1, 2, ..., n$$

The function is given by

$$f(t) = \frac{\theta_{1}}{1 + \theta_{2} e^{-\theta_{3}t}}$$

where $\theta_{1} > 0$, $\theta_{2} > 0$, $\theta_{3} > 0$.

The estimator $(\hat{\theta}_{1}, \hat{\theta}_{2}, \hat{\theta}_{3})$ is obtained by minimizing the loss function

$$L(\theta) = \frac{1}{n} \sum_{i=1}^{n} [y_{i} - f(t_{i};\theta)]^{2}$$

In [None]:
# Minimize the Loss function: MSE
def growth_curve(x, y):
    '''
    Fit the logistic curve to (x, y) by minimising the mean squared error.

    The three parameters are optimised on a log scale (theta = exp(param)),
    which keeps every theta positive without explicit bounds.

    x: day indices of the observations
    y: observed counts

    Returns (est, convergence_res):
      est:             array [theta1, theta2, theta3]
      convergence_res: dict with the final 'MSE' and the optimiser's
                       'nfev', 'nit' and 'status'
    '''
    x = np.asarray(x, dtype = float)
    y = np.asarray(y, dtype = float)

    peak = np.max(y)
    if peak <= 0:
        # With no positive observation, log(3 * peak) is not finite and the
        # optimiser would start from an invalid point. The best curve is the
        # flat zero curve (theta1 = 0), so return it without optimising.
        flat_res = {'MSE': float(np.mean(y ** 2)),
                    'nfev': 0,
                    'nit': 0,
                    'status': 0}
        return np.array([0.0, 1.0, 1.0]), flat_res

    # Loss function: MSE of the curve with parameters exp(params)
    def l_fun(params):
        theta1, theta2, theta3 = np.exp(params)
        return np.mean((y - Logistic(x, theta1, theta2, theta3)) ** 2)

    # Start with theta1 at three times the largest count, theta2 = theta3 = 1
    p0 = [np.log(3 * peak), 0, 0]
    res = minimize(fun = l_fun, x0 = p0, method = 'L-BFGS-B')

    convergence_res = {'MSE': res.fun,
                       'nfev': res.nfev,
                       'nit': res.nit,
                       'status': res.status}

    # Map the optimum back from the log scale to the thetas
    est = np.exp(res.x)
    return est, convergence_res

In [None]:
# Confirmed cases
est_confirmed, convergence_res = growth_curve(x = x_train, y = confirmed_train)
convergence_res

In [None]:
curve_plot(x_train = x_train, y_train = confirmed_train, x_test = x_test, est = est_confirmed)

In [None]:
# Fatalities
est_fatalities, convergence_res = growth_curve(x = x_train, y = fatalities_train)
convergence_res

In [None]:
curve_plot(x_train = x_train, y_train = fatalities_train, x_test = x_test, est = est_fatalities)

## Submission

In [None]:
# One prediction slot per test row for each target.
n_test_rows = len(test_dataset)
confirmed_pred = np.zeros(n_test_rows)
fatalities_pred = np.zeros(n_test_rows)

In [None]:
# Fit both growth curves for every region and write the forecasts into the
# positions of that region's rows in the test set.
for country in country_list:
    # Build the test-set mask once; it is used both to select the region's
    # test rows and to place the predictions.
    in_test = (test_dataset['Country_Region'] == country).to_numpy()
    train = train_dataset[train_dataset['Country_Region'] == country]
    test = test_dataset[in_test]

    # X_train
    x_train = train.Day.values
    confirmed_train = train.ConfirmedCases.values
    fatalities_train = train.Fatalities.values

    # X_test
    x_test = test.Day.values

    # Confirmed cases
    confirmed_est, _ = growth_curve(x = x_train, y = confirmed_train)

    # Fatalities
    fatalities_est, _ = growth_curve(x = x_train, y = fatalities_train)

    # Predictions
    confirmed_pred[in_test] = Logistic(x_test, *confirmed_est)
    fatalities_pred[in_test] = Logistic(x_test, *fatalities_est)

In [None]:
# Copy the predictions into the submission template, column by column.
for column, values in (('ConfirmedCases', confirmed_pred),
                       ('Fatalities', fatalities_pred)):
    submission[column] = values

In [None]:
# Write the filled-in template to disk without the DataFrame index column.
submission.to_csv('submission.csv', index = False)