# Importing necessary libraries

In [None]:
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore')
import seaborn as sns
import pandas as pd
from datetime import datetime
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM, Activation, Dropout,RNN
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
plotsize = (12,5)

# Reading CSV file having Covid-19 records in India

In [None]:
base  = pd.read_csv("../input/covid19-in-india/covid_19_india.csv")

In [None]:
base

In [None]:
base.describe()

# Making sense of Data using Pandas Profiling

In [None]:
import pandas_profiling as pp
profile = pp.ProfileReport(base)
profile.to_file("output.html")

In [None]:
profile

Converting date column to a 'Datetime' object.

In [None]:
base['Date'] = pd.to_datetime(base['Date'])

Grouping data by 'Date' and summing gives the cumulative national totals for each day; differencing consecutive days then gives the daily new counts.

In [None]:
# Sum the rows recorded for each date (presumably one row per state -- confirm
# against the CSV), then difference consecutive dates to turn the running
# totals into per-day changes. The first row is NaN after diff().
# numeric_only=True keeps text columns out of the aggregation: without it,
# recent pandas versions try to sum them too and the subsequent diff() fails.
data = base.groupby(by=['Date']).sum(numeric_only=True).diff()

In [None]:
data

Replacing 'NaN' values with 0 and renaming the 'Confirmed' column to 'Cases'.

In [None]:
data.fillna(0,inplace=True)
data.rename(columns={"Confirmed":"Cases"},inplace=True)

In [None]:
data

# Plotting time series of 3 Variables.
1. Cases
2. Deaths
3. Cured

In [None]:
figure, axes = plt.subplots(3,sharex=True)
data['Cases'].plot(ax=axes[0],title='Cases',figsize=plotsize)
data['Deaths'].plot(ax=axes[1],title='Deaths',figsize=plotsize)
data['Cured'].plot(ax=axes[2],title='Cured',figsize=plotsize)

# Resampling number of cases by:
1. Weekly data
2. Monthly data

In [None]:
cases_weekly = data['Cases'].resample('W').sum()
cases_weekly.plot(title='Weekly cases')

In [None]:
cases_monthly = data['Cases'].resample('M').sum()
cases_monthly.plot(title='Monthly cases')

# Setting up helper functions for forecasting

1. get_n_last_days : Extract last n_days of a time series.
2. plot_n_last_days : Plot last n_days of a time series

In [None]:
def get_n_last_days(df, series_name, n_days):
    """Return the trailing ``n_days`` entries of column ``series_name`` of ``df``.

    arguments
    ---------
    df: container indexable by ``series_name`` (the notebook passes a DataFrame).
    series_name (string): column to read.
    n_days (int): how many entries to keep, counted from the end.
    """
    column = df[series_name]
    tail_window = slice(-n_days, None)
    return column[tail_window]

def plot_n_last_days(df, series_name, n_days):
    """Plot the trailing ``n_days`` values of ``df[series_name]`` on a new figure.

    Opens a fresh 10x5 figure, draws the values returned by
    ``get_n_last_days`` as a solid black line, and labels the plot.
    """
    plt.figure(figsize=(10, 5))

    recent_values = get_n_last_days(df, series_name, n_days)
    heading = '{0} - {1} days'.format(series_name, n_days)

    plt.plot(recent_values, 'k-')
    plt.title(heading)
    plt.xlabel('Recorded day')
    plt.ylabel('Reading')
    plt.grid(alpha=0.3)

In [None]:
plot_n_last_days(data,'Cases',200)

# Some more helper functions
1. get_keras_format_series :  Convert a series to a numpy array of shape 
    [n_samples, time_steps, features]




2. get_train_test_data : Utility processing function that splits a time series into train and test sets in a Keras-friendly format, according to a user-specified choice of shape.  
    
    arguments
    ---------
    df (dataframe): dataframe with time series columns.

    series_name (string): column name in df.

    series_days (int): total days to extract.

    input_hours (int): length of sequence input to network, in time steps (days for this dataset).

    test_hours (int): length of held-out terminal sequence, in time steps (days for this dataset).
    
    sample_gap (int): step size between start of train sequences; default 3
    
    returns
    ---------
    tuple: train_X, test_X_init, train_y, test_y     

In [None]:
def get_keras_format_series(series):
    """Copy a 2-D collection of sequences into an array of shape
    [n_samples, time_steps, 1], i.e. with a trailing single-feature axis.
    """
    samples = np.array(series)
    n_samples, n_steps = samples.shape[0], samples.shape[1]
    return samples.reshape((n_samples, n_steps, 1))



def get_train_test_data(df, series_name, series_days, input_hours, 
                        test_hours, sample_gap=3):
    """Split the tail of a series into Keras-ready training windows and a test segment.

    arguments
    ---------
    df (dataframe): dataframe with time series columns.
    series_name (string): column name in df.
    series_days (int): number of trailing entries of the column to use.
    input_hours (int): length, in time steps, of each input window.
    test_hours (int): length, in time steps, of the held-out tail.
    sample_gap (int): step between the starts of consecutive training windows.

    returns
    ---------
    tuple: train_X, test_X_init, train_y, test_y
        train_X has shape [n_windows, input_hours, 1]; train_y holds the value
        that follows each window; test_X_init is the first ``input_hours``
        values of the held-out tail and test_y is the rest of that tail.
    """
    # Work only on the trailing `series_days` values of the chosen column.
    recent_values = get_n_last_days(df, series_name, series_days).values

    # The last `test_hours` values are held out; everything before is training data.
    train_part = recent_values[:-test_hours]
    test_part = recent_values[-test_hours:]

    # Every window start leaves room for one target value after the window.
    window_starts = range(0, train_part.shape[0] - input_hours, sample_gap)
    windows = [train_part[start:start + input_hours] for start in window_starts]
    targets = [train_part[start + input_hours] for start in window_starts]

    train_X = get_keras_format_series(windows)
    train_y = np.array(targets)

    # The held-out tail seeds the forecast with its first `input_hours` values;
    # the remainder is the ground truth to forecast.
    test_X_init = test_part[:input_hours]
    test_y = test_part[input_hours:]

    return train_X, test_X_init, train_y, test_y

In [None]:
series_days = 600
input_days = 5
test_days = 10

train_X, test_X_init, train_y, test_y = \
    (get_train_test_data(data, 'Cases', series_days, 
                         input_days, test_days))

In [None]:
print('Training input shape: {}'.format(train_X.shape))
print('Training output shape: {}'.format(train_y.shape))
print('Test input shape: {}'.format(test_X_init.shape))
print('Test output shape: {}'.format(test_y.shape))

# Defining model architecture

1. LSTM

    Fit LSTM to data X_train, y_train.
    
    arguments

    X_train (array): input sequence samples for training.

    y_train (array): next step in sequence targets.

    cell_units (int): number of hidden units for LSTM cells.

    epochs (int): number of training epochs   
   


In [None]:
def fit_LSTM(X_train, y_train, epochs, cell_units=45):
    """Build and fit a four-layer stacked LSTM regressor.

    arguments
    ---------
    X_train (array): input sequence samples; ``X_train.shape[1]`` is used as
        the number of time steps and one feature per step is assumed.
    y_train (array): next-step target for each input sequence.
    epochs (int): number of training epochs.
    cell_units (int): number of hidden units in every LSTM layer; default 45.

    returns
    ---------
    The fitted keras ``Sequential`` model.
    """
    # initialize model
    regressor = Sequential()

    # First LSTM layer (declares the input shape) followed by Dropout regularisation
    regressor.add(LSTM(units=cell_units, return_sequences=True,
                       input_shape=(X_train.shape[1], 1)))
    regressor.add(Dropout(0.2))

    # Second and third LSTM layers, each followed by Dropout regularisation.
    # They keep return_sequences=True so the next LSTM still receives a sequence.
    for _ in range(2):
        regressor.add(LSTM(units=cell_units, return_sequences=True))
        regressor.add(Dropout(0.2))

    # Fourth LSTM layer emits only its final state, followed by Dropout regularisation
    regressor.add(LSTM(units=cell_units))
    regressor.add(Dropout(0.2))

    # Output layer: a single value, the next step of the series
    regressor.add(Dense(units=1))

    # define the loss function / optimization strategy, and fit
    # the model with the desired number of passes over the data (epochs)
    regressor.compile(loss='mean_squared_error', optimizer='adam')
    # Fit on the arguments, not on the notebook-level train_X / train_y globals.
    regressor.fit(X_train, y_train, epochs=epochs, batch_size=64, verbose=1)

    return regressor

In [None]:
model1 = fit_LSTM(train_X, train_y, epochs=1000)

# Making predictions.

Functions used
1. predict :  Given an input series matching the model's expected format generates model's predictions for next n_steps in the series.

2. predict_and_plot: Given an input series matching the model's expected format generates model's predictions for next n_steps in the series, and plots these predictions against the ground truth for those steps 
    
    arguments

    X_init (array): initial sequence, must match model's input shape.

    y (array): true sequence values to predict, follow X_init.

    model (keras.models.Sequential): trained neural network.

    title (string): plot title.  

In [None]:
def mse(observations, estimates):
    """Return the mean squared error between two numpy arrays.

    arguments
    ---------
    observations (numpy array): true values.
    estimates (numpy array): predicted values, same length as ``observations``.

    returns
    ---------
    The mean of the squared element-wise differences.

    raises
    ---------
    AssertionError: if either argument is not a numpy array or the lengths differ.
    """
    # check arg types
    assert isinstance(observations, np.ndarray), "'observations' must be a numpy array"
    assert isinstance(estimates, np.ndarray), "'estimates' must be a numpy array"
    # check length of arrays equal
    assert len(observations) == len(estimates), "Arrays must be of equal length"

    # Flatten both sides so that a (n,) vector and a (n, 1) column are compared
    # element by element instead of being broadcast into an (n, n) matrix.
    difference = observations.ravel() - estimates.ravel()

    # Average (not sum) the squared errors so the result really is an MSE.
    return np.mean(difference ** 2)

In [None]:
def predict(X_init, n_steps, model):
    """Roll a one-step model forward ``n_steps`` times.

    A copy of ``X_init`` is reshaped to (1, length, 1) and used as a sliding
    window: each call to ``model.predict`` yields the next value, the window
    is shifted left by one step, and the new value is written into its last
    slot. ``X_init`` itself is not modified.

    returns
    ---------
    Array of shape (n_steps, 1) holding the successive predictions.
    """
    window = X_init.copy().reshape(1, -1, 1)
    step_outputs = []

    for _step in range(n_steps):
        next_value = model.predict(window)
        step_outputs.append(next_value)

        # Drop the oldest step and append the fresh prediction at the end.
        window[:, :-1] = window[:, 1:]
        window[:, -1, :] = next_value

    return np.array(step_outputs).reshape(-1, 1)

def predict_and_plot(X_init, y, model, title):
    """Forecast ``len(y)`` steps from ``X_init`` and plot them against ``y``.

    arguments
    ---------
    X_init (array): initial sequence, must match model's input length.
    y (array): true sequence values that follow X_init.
    model: trained network exposing ``predict``.
    title (string): plot title.

    Prints the predictions and their mean squared error against ``y``.
    """
    # Use the arguments rather than the notebook-level test_X_init / test_y /
    # test_days globals, so the function works for any input it is given.
    y_preds = predict(X_init, n_steps=len(y), model=model)  # predict through length of y

    # x-axis positions: the initial series occupies steps 1..n_init and the
    # forecast horizon continues on the steps immediately after it.
    n_init = X_init.shape[0]
    start_range = range(1, n_init + 1)
    predict_range = range(n_init + 1, n_init + len(y) + 1)

    # plot the initial series, then the targets and the predictions
    plt.plot(start_range, X_init)
    plt.plot(predict_range, y, color='orange')
    plt.plot(predict_range, y_preds, color='teal', linestyle='--')

    plt.title(title)
    plt.legend(['Initial Series','Target Series','Predictions'])
    print(y_preds)

    # y is 1-D while y_preds is a column; flatten both so the errors are taken
    # element by element instead of being broadcast to a square matrix.
    errors = np.asarray(y, dtype=float).ravel() - y_preds.ravel()
    print("MSE:{}".format(np.mean(errors ** 2)))

In [None]:
predict_and_plot(test_X_init, test_y, model1,
                 'Test Data and LSTM Predictions')

The deep learning model fails to learn due to small number of training instances.

# Decomposing the Time series.

Any time series has 3 components associated with it:
1. Trend
2. Seasonality
3. Residual

Analysing Number of cases

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Expose the daily case counts under the name 'y' used by the rest of the
# notebook. Adding the column (instead of dropping the others and renaming
# by position) keeps this cell re-runnable, leaves 'Cases' available to the
# earlier cells, and avoids labelling an unrelated column as 'ds'.
data['y'] = data['Cases']

# Additive decomposition with a 7-step period (weekly cycle on daily data).
# `period` is the current keyword; the older `freq` keyword is deprecated.
ss_decomposition = seasonal_decompose(x=data['y'], model='additive', period=7)
estimated_trend = ss_decomposition.trend
estimated_seasonal = ss_decomposition.seasonal
estimated_residual = ss_decomposition.resid

In [None]:
fig, axes = plt.subplots(4, 1)
fig.set_figheight(10)
fig.set_figwidth(15)

axes[0].plot(data['y'], label='Original')
axes[0].legend(loc='upper left');

axes[1].plot(estimated_trend, label='Trend')
axes[1].legend(loc='upper left');

axes[2].plot(estimated_seasonal, label='Seasonality')
axes[2].legend(loc='upper left');

axes[3].plot(estimated_residual, label='Residuals')
axes[3].legend(loc='upper left');

### Plotting the autocorrelation function

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data['y'])

In [None]:
def run_sequence_plot(x, y, title, xlabel="time", ylabel="series"):
    """Draw ``y`` against ``x`` as a solid black line on the current axes,
    with the given title and axis labels and a faint grid.
    """
    axes = plt.gca()
    axes.plot(x, y, 'k-')
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.grid(alpha=0.3)

### Dividing the dataset into chunks to analyze data in specific time periods

In [None]:
# Cut the series into 7 consecutive chunks and compare their mean and variance.
# np.array_split tolerates a length that is not a multiple of 7 (np.split
# raises in that case); when the length divides evenly the chunks are the same.
chunks = np.array_split(data['y'].values, 7)
print("{} | {:7} | {}".format("Chunk", "Mean", "Variance"))
print("-" * 26)
for i, chunk in enumerate(chunks, 1):
    print("{:5} | {:.6} | {:.6}".format(i, np.mean(chunk), np.var(chunk)))

In [None]:
pd.Series(data['y']).hist();

### Augmented Dickey Fuller test (ADF Test) is a common statistical test used to test whether a given Time series is stationary or not. 

In [None]:
from statsmodels.tsa.stattools import adfuller
adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(data['y'])

In [None]:
print("ADF:{}".format(adf))
print("Pvalue:{}".format(pvalue))

The p-value obtained is greater than the significance level, so we cannot reject the null hypothesis. We therefore treat the time series as non-stationary.

In [None]:
print(estimated_residual)

In [None]:
adf_after, pvalue_after, usedlag_, nobs_, critical_values_, icbest_ = adfuller(estimated_residual[3:-3])
print("ADF: ", adf_after)
print("p-value: ", pvalue_after)

In [None]:
new_hetero = data['y'] + 38
run_sequence_plot(data.index, new_hetero,
                  title="Nonstationary Data w/Heteroscedasticity")

In [None]:
log_new_hetero = np.log(new_hetero)
run_sequence_plot(data.index, log_new_hetero,
                  title="Nonstationary Data w/Heteroscedasticity")

In [None]:
df_diff = data['y'].diff()
df_diff

In [None]:
run_sequence_plot(data.index,df_diff,
                  title="dataset(differenced)")

### Dividing Time series into Train and test for predictions.

We will be making predictions for 30 days

In [None]:
train = np.array(data['y'][1:-30])
test = np.array(data['y'][-30:])

In [None]:
from statsmodels.tsa.api import SimpleExpSmoothing

single = SimpleExpSmoothing(train).fit(optimized=True)
single_preds = single.forecast(len(test))
single_mse = mse(test, single_preds)
print("Predictions: ", single_preds)
print("MSE: ", single_mse)

In [None]:
plt.plot(data.index[1:-30], train, 'b--', label="train")
plt.plot(data.index[-30:], test, color='orange', linestyle="--", label="test")
plt.plot(data.index[-30:], single_preds, 'r--', label="predictions")
plt.legend(loc='upper left')
plt.title("Simple Exponential Smoothing")
plt.grid(alpha=0.3);

In [None]:
from statsmodels.tsa.api import Holt

double = Holt(train).fit(optimized=True)
double_preds = double.forecast(len(test))
double_mse = mse(test, double_preds)
print("Predictions: ", double_preds)
print("MSE: ", double_mse)

In [None]:
plt.plot(data.index[1:-30], train, 'b--', label="train")
plt.plot(data.index[-30:], test, color='orange', linestyle="--", label="test")
plt.plot(data.index[-30:], double_preds, 'r--', label="predictions")
plt.legend(loc='upper left')
plt.title("Double Exponential Smoothing")
plt.grid(alpha=0.3);

In [None]:
from statsmodels.tsa.api import ExponentialSmoothing

triple = ExponentialSmoothing(train,
                              trend="additive",
                              seasonal="additive",
                              seasonal_periods=13).fit(optimized=True)
triple_preds = triple.forecast(len(test))
triple_mse = mse(test, triple_preds)
print("Predictions: ", triple_preds)
print("MSE: ", triple_mse)

In [None]:
plt.plot(data.index[1:-30], train, 'b--', label="train")
plt.plot(data.index[-30:], test, color='orange', linestyle="--", label="test")
plt.plot(data.index[-30:], triple_preds, 'r--', label="predictions")
plt.legend(loc='upper left')
plt.title("Triple Exponential Smoothing")
plt.grid(alpha=0.3);

### Comparing the results of the 3 statistical models.

In [None]:
print("Single MSE :{}".format(single_mse))
print("Double MSE :{}".format(double_mse))
print("Triple MSE :{}".format(triple_mse))

## Please Upvote if you appreciate the work. It would be really Helpful :)