In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #for plotting graph
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.api import VAR
from matplotlib import pyplot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings("ignore")

# Any results you write to the current directory are saved as output.

In [None]:
# Load the competition's train / test / sample-submission tables.
df_train = pd.read_csv("../input/covid19-global-forecasting-week-4/train.csv")
df_test = pd.read_csv("../input/covid19-global-forecasting-week-4/test.csv")
df_sub = pd.read_csv("../input/covid19-global-forecasting-week-4/submission.csv")

# Report (rows, columns) of each table, in load order.
for loaded_frame in (df_train, df_test, df_sub):
    print(loaded_frame.shape)

In [None]:
df_train.tail(5)

In [None]:
df_test.head(5)

In [None]:
df_sub.tail(5)

### STEP 1: Clean the data
#### One problem is that Province_State contains NaN values, so we combine Province_State and Country_Region into a single attribute.

In [None]:
# Number of distinct countries and provinces in each table. dropna=False counts NaN
# as its own value, which is what len(Series.unique()) does.
for frame in (df_train, df_test):
    print(frame.Country_Region.nunique(dropna=False), frame.Province_State.nunique(dropna=False))

In [None]:
df_train.Country_Region.unique()  # It lists out the countries

In [None]:
df_train.Province_State.unique()  # It lists out provinces/states as the smaller category of country

In [None]:
# Build one region key per row: "Province , Country" when a province is given,
# otherwise just the country name.
# Series.where assigns the whole column in a single step; the previous chained
# assignment (`df.col[mask] = ...`) is not guaranteed to write through to the frame.
province_missing = df_train.Province_State.isna()
df_train["Unique_Region"] = df_train.Country_Region.where(
    province_missing,
    df_train.Province_State + " , " + df_train.Country_Region,
)
df_train.sample(5)

In [None]:
# Drop the columns that Unique_Region makes redundant. Rebinding the name (rather
# than inplace=True) together with errors="ignore" keeps this cell safe to re-run
# once the columns are already gone.
df_train = df_train.drop(columns=["Id", "Province_State", "Country_Region"], errors="ignore")

In [None]:
df_train.sample(5)

In [None]:
# Same region key as for the training table: "Province , Country" when a province
# is given, otherwise just the country name.
# Series.where assigns the whole column in a single step; the previous chained
# assignment (`df.col[mask] = ...`) is not guaranteed to write through to the frame.
province_missing = df_test.Province_State.isna()
df_test["Unique_Region"] = df_test.Country_Region.where(
    province_missing,
    df_test.Province_State + " , " + df_test.Country_Region,
)
df_test.sample(5)

In [None]:
# Drop the columns that Unique_Region makes redundant. Rebinding the name (rather
# than inplace=True) together with errors="ignore" keeps this cell safe to re-run
# once the columns are already gone.
df_test = df_test.drop(columns=["Province_State", "Country_Region"], errors="ignore")
df_test.sample(5)

### STEP 2: Glancing at the data (Time-series and Cross-sectional dimension as the panel)
#### Next, by looking at time-series dimension, visualize the world trend.

In [None]:
# Show every distinct training date (in order of first appearance) and their count.
train_dates = list(df_train.Date.unique())
print(train_dates)
print(len(train_dates))

In [None]:
# Show every distinct test date (in order of first appearance) and their count.
test_dates = list(df_test.Date.unique())
print(test_dates)
print(len(test_dates))

In [None]:
# Worldwide totals per date: add up every region's count for that date.
# A single grouped pass replaces re-filtering the whole frame once per date;
# sort=False keeps the dates in order of first appearance, like Date.unique().
# NOTE(review): the grouped sum skips missing counts instead of propagating NaN —
# assumes ConfirmedCases / Fatalities contain no NaN; confirm against the data.
world_totals = df_train.groupby("Date", sort=False)[["ConfirmedCases", "Fatalities"]].sum()
world_list_confirmed = world_totals["ConfirmedCases"].tolist()
world_list_fatality = world_totals["Fatalities"].tolist()

In [None]:
plt.plot(world_list_confirmed)

In [None]:
plt.plot(world_list_fatality)

In [None]:
# Autocorrelation of the worldwide fatality series (levels, not differences).
world_fatality_series = pd.Series(world_list_fatality)
plot_acf(world_fatality_series)
plt.show()

In [None]:
# First difference of the worldwide confirmed-case series.
plt.plot(pd.Series(world_list_confirmed).diff())

In [None]:
# First difference of the worldwide fatality series.
plt.plot(pd.Series(world_list_fatality).diff())

In [None]:
# Autocorrelation of the differenced worldwide fatality series, from position 60 onward.
fatality_daily_change = pd.Series(world_list_fatality).diff()
plot_acf(fatality_daily_change.iloc[60:])
plt.show()

In [None]:
# Partial autocorrelation of the differenced worldwide fatality series, from position 60 onward.
fatality_daily_change = pd.Series(world_list_fatality).diff()
plot_pacf(fatality_daily_change.iloc[60:])
plt.show()

#### From the first two plots, confirmed cases and fatalities both show a clear, rapidly increasing trend. The ACF plot also shows that they are not weakly stationary.
#### The differenced series also trend upward rapidly early on, but that trend disappears toward the end. Hence, we will use data from the 60th day onward, since the differenced data look stationary from then on.
#### The ACF and PACF plots suggest using AR(1). In addition, the differenced confirmed-case and fatality plots look strongly related to each other, which may suggest VAR(1).

In [None]:
# Distinct dates of each table, in order of first appearance.
date_list = list(df_train.Date.unique())
date_test = list(df_test.Date.unique())

# Show the training dates at positions 61 and 71.
for position in (61, 71):
    print(date_list[position])

#### We use the data only from 23 March to 14 April. And we will forecast from 15 April to May 14

#### After looking at time-series dimension, let's look at cross-sectional dimension. We look at min, first quartile, median, third-quartile and maximum.

In [None]:
# Cross-sectional summary of confirmed cases: for each date, the quantiles of the
# per-region counts (q00 = minimum, q10 = quantile 1.0 = maximum).
confirmed_by_date = [df_train.ConfirmedCases[df_train.Date == day] for day in date_list]

q00_confirmed = [cases.quantile(q=0) for cases in confirmed_by_date]
q25_confirmed = [cases.quantile(q=0.25) for cases in confirmed_by_date]
q50_confirmed = [cases.quantile(q=0.5) for cases in confirmed_by_date]
q75_confirmed = [cases.quantile(q=0.75) for cases in confirmed_by_date]
q10_confirmed = [cases.quantile(q=1) for cases in confirmed_by_date]

# Print positions 61..82 of each summary series.
for summary in (q00_confirmed, q25_confirmed, q50_confirmed, q75_confirmed, q10_confirmed):
    print(summary[61:83])

#### The 25th percentile is around 60, and the median is around 500.

In [None]:
# Cross-sectional summary of fatalities: for each date, the quantiles of the
# per-region counts (q00 = minimum, q10 = quantile 1.0 = maximum).
fatality_by_date = [df_train.Fatalities[df_train.Date == day] for day in date_list]

q00_fatality = [deaths.quantile(q=0) for deaths in fatality_by_date]
q25_fatality = [deaths.quantile(q=0.25) for deaths in fatality_by_date]
q50_fatality = [deaths.quantile(q=0.5) for deaths in fatality_by_date]
q75_fatality = [deaths.quantile(q=0.75) for deaths in fatality_by_date]
q10_fatality = [deaths.quantile(q=1) for deaths in fatality_by_date]

# Print positions 61..82 of each summary series.
for summary in (q00_fatality, q25_fatality, q50_fatality, q75_fatality, q10_fatality):
    print(summary[61:83])

#### The median is around 10, and the 75th percentile is around 70.

#### Regions with very few confirmed cases or fatalities should simply keep their latest value as the forecast, since the number is unlikely to change after 14 April.
#### Regions where the number of confirmed cases or fatalities is not high should use two separate AR(1) processes, since the two series are unlikely to be strongly related there.
#### Regions where both confirmed cases and fatalities are high should use a VAR(1) process, since the two series are likely to be strongly related there.

#### After glancing at the summary statistics, we used (60,10) as the first threshold of confirmed cases and fatalities, switching from plugging in fixed values to using AR(1) process. 
#### Then, the next threshold of confirmed cases and fatalities is (500, 70), switching from AR(1) to VAR(1) process

### STEP 3 (final): Learning Algorithm
#### 1. For confirmed cases <= 60 and fatalities <= 10 on 14 April, keep predicting those values.
#### 2. For confirmed cases > 60 but fatalities <= 10 on 14 April, use a simple AR(1) process on the differenced series to predict confirmed cases, and plug in the most recent value for fatalities.
#### 3. For confirmed cases <= 60 but fatalities > 10 on 14 April, use a simple AR(1) process on the differenced series to predict fatalities, and plug in the most recent value for confirmed cases.
#### 4. If both series are greater than (60, 10), but either confirmed cases <= 500 or fatalities <= 70, use two simple AR(1) processes on the differenced series to predict each variable.
#### 5. Otherwise, use a VAR(1) process on the differenced series to predict confirmed cases and fatalities simultaneously. Motivation: if the numbers of confirmed cases and fatalities are high enough, there is a strong relationship between these two variables.


#### Now, we are ready to build the model and submission

In [None]:
# Training dates at positions 71..83, and test dates from position 13 onward.
Date_submission = date_list[71:84]
Date_prediction = date_test[13:]

# Show both windows, then the number of prediction dates.
for shown in (Date_submission, Date_prediction, len(Date_prediction)):
    print(shown)

In [None]:
 def submit_from_pred(date_list, value_list, Unique_Region, test_df, submission_df, num):
     """Write one region's predictions into the submission frame, in place.

     For each (date, value) pair, the ForecastId of the matching test row
     (same "Date" and "Unique_Region") is looked up and the value is stored
     in the submission row carrying that ForecastId.

     Parameters
     ----------
     date_list : sequence
         Dates to fill, compared against test_df["Date"].
     value_list : sequence
         Predicted values, one per entry of date_list.
     Unique_Region : str
         Region key, compared against test_df["Unique_Region"].
     test_df : pandas.DataFrame
         Must provide "Date", "Unique_Region" and "ForecastId" columns.
     submission_df : pandas.DataFrame
         Modified in place; must provide "ForecastId", "ConfirmedCases" and
         "Fatalities" columns.
     num : int
         0 writes to "ConfirmedCases"; any other value writes to "Fatalities".

     Raises
     ------
     IndexError
         If no test row matches a (date, region) pair.

     If the two lists differ in length, an error message is printed and
     nothing is written.
     """
     if len(date_list) != len(value_list):
         print("Error, the length of these two lists are not equal")
         return

     target_column = "ConfirmedCases" if num == 0 else "Fatalities"
     # Filter by region once instead of re-scanning the whole test frame per date.
     region_rows = test_df[test_df["Unique_Region"] == Unique_Region]

     for date, pred in zip(date_list, value_list):
         forecast_id = region_rows.loc[region_rows["Date"] == date, "ForecastId"].iloc[0]
         # .loc writes straight into submission_df; chained assignment
         # (`df.col[mask] = value`) is not guaranteed to reach the frame.
         submission_df.loc[submission_df["ForecastId"] == forecast_id, target_column] = pred

In [None]:
def from_diff_to_var(diff_series, start_value):
    """Turn a sequence of step-to-step differences back into levels.

    Parameters
    ----------
    diff_series : iterable of numbers
        Successive differences; iterated by value, so lists, arrays and
        pandas Series with any index are all accepted.
    start_value : number
        Level immediately before the first difference.

    Returns
    -------
    list
        A new list where element i equals start_value plus the sum of the
        first i + 1 differences. The input is left unmodified.
    """
    levels = []
    running = start_value
    # Accumulate in the same left-to-right order as the original loop so
    # floating-point results are unchanged.
    for step in diff_series:
        running += step
        levels.append(running)
    return levels

In [None]:
# Per-region training window (row positions 61..83; 23 March to 14 April per the
# notebook narrative) and the thresholds chosen in STEP 2.
TRAIN_WINDOW = slice(61, 84)
FLAT_CONFIRMED_MAX = 60    # at or below: keep the latest confirmed value
FLAT_FATALITY_MAX = 10     # at or below: keep the latest fatality value
AR_CONFIRMED_MAX = 500     # at or below (with both series above the flat limits): separate AR(1)s
AR_FATALITY_MAX = 70
# Derive the horizon from the dates actually being filled, so the flat forecasts
# always have the length submit_from_pred expects.
HORIZON = len(Date_prediction)


def _ar1_level_forecast(diff_series, latest_value, steps):
    """Forecast `steps` future levels from an AR(1) fitted on first differences.

    Fits AutoReg(lags=1) on diff_series, predicts the next `steps` differences,
    accumulates them onto latest_value with from_diff_to_var, and applies
    round() to every level.
    """
    fitted = AutoReg(diff_series, lags=1).fit()
    n_obs = len(diff_series)
    diff_forecast = list(fitted.predict(start=n_obs, end=n_obs + steps - 1))
    return [round(level) for level in from_diff_to_var(diff_forecast, latest_value)]


def _var1_level_forecast(confirmed_diff, fatality_diff, latest_confirmed, latest_fatality, steps):
    """Forecast both series jointly with a VAR(1) fitted on their first differences.

    Returns (confirmed_levels, fatality_levels), each a list of `steps` values
    rebuilt from the forecast differences and passed through round().
    """
    data_VAR = pd.DataFrame({'confirmed_diff': confirmed_diff, 'fatality_diff': fatality_diff})
    fitted = VAR(data_VAR).fit(1)
    # A VAR(1) forecast only needs the last observed row as its starting point.
    forecast = fitted.forecast(y=data_VAR.values[-1:], steps=steps)
    confirmed_steps = [row[0] for row in forecast]
    fatality_steps = [row[1] for row in forecast]
    confirmed_levels = [round(level) for level in from_diff_to_var(confirmed_steps, latest_confirmed)]
    fatality_levels = [round(level) for level in from_diff_to_var(fatality_steps, latest_fatality)]
    return confirmed_levels, fatality_levels


regions = list(df_train.Unique_Region.unique())

for region in regions:
    region_history = df_train[df_train.Unique_Region == region].iloc[TRAIN_WINDOW]

    # Differencing loses the first observation, hence the [1:].
    confirmed_diff = region_history["ConfirmedCases"].diff()[1:]
    fatality_diff = region_history["Fatalities"].diff()[1:]

    latest_confirmed = region_history.iloc[-1]["ConfirmedCases"]
    latest_fatality = region_history.iloc[-1]["Fatalities"]

    confirmed_is_flat = latest_confirmed <= FLAT_CONFIRMED_MAX
    fatality_is_flat = latest_fatality <= FLAT_FATALITY_MAX

    if confirmed_is_flat or fatality_is_flat:
        # At least one series is small: hold it at its latest value and, if the
        # other one is above its flat limit, model that one with AR(1).
        if confirmed_is_flat:
            prediction_confirmed = [latest_confirmed] * HORIZON
        else:
            prediction_confirmed = _ar1_level_forecast(confirmed_diff, latest_confirmed, HORIZON)
        if fatality_is_flat:
            prediction_fatality = [latest_fatality] * HORIZON
        else:
            prediction_fatality = _ar1_level_forecast(fatality_diff, latest_fatality, HORIZON)
    elif latest_confirmed <= AR_CONFIRMED_MAX or latest_fatality <= AR_FATALITY_MAX:
        # Both series are above the flat limits but not both large: two separate AR(1) processes.
        prediction_confirmed = _ar1_level_forecast(confirmed_diff, latest_confirmed, HORIZON)
        prediction_fatality = _ar1_level_forecast(fatality_diff, latest_fatality, HORIZON)
    else:
        # Both series are large: model them jointly with VAR(1).
        prediction_confirmed, prediction_fatality = _var1_level_forecast(
            confirmed_diff, fatality_diff, latest_confirmed, latest_fatality, HORIZON
        )

    submit_from_pred(Date_prediction, prediction_confirmed, region, df_test, df_sub, 0)
    submit_from_pred(Date_prediction, prediction_fatality, region, df_test, df_sub, 1)
                                                             
                

    
    

                

In [None]:
# For the dates in Date_submission, copy each region's training values at row
# positions 71..83 into the submission frame.
regions = list(df_train.Unique_Region.unique())
for region in regions:
    observed = df_train[df_train.Unique_Region == region].iloc[71:84]
    # num = 0 targets confirmed cases, num = 1 targets fatalities.
    for column, num in (("ConfirmedCases", 0), ("Fatalities", 1)):
        submit_from_pred(Date_submission, list(observed[column]), region, df_test, df_sub, num)
        
        

In [None]:
df_sub.head(30)

In [None]:
df_sub.to_csv("submission.csv", index = False)