In [245]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
from datetime import timedelta
from statsmodels.tsa.vector_ar.var_model import VAR
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pickle

# Aggregating all the files to create a final dataset

In [246]:
# data=pd.concat([pd.read_csv(filename)
#            for filename in os.listdir()
#            if '.csv' in filename])
# data.head()
data = pd.read_csv('combined_dataset.csv')
population = pd.read_csv('Population_state.csv')
data=data[data['Lat'].notna()]


#### 1)Dropping rows that have lat and long =NaN(as they correspond to the cases in ships)
#### 2)replace Nan in recovered,active,mortality rate,Hospitalization Rate, people Hospitalized with "0"
#### 3) changing data type of last_update to date

In [247]:
def preprocess(data):
    values={'Recovered':0,'Active':0,'Mortality_Rate':0,'People_Hospitalized':0,'Hospitalization_Rate':0}
    data.fillna(value=values,inplace=True)
    data['Last_Update'].fillna('2020-07-22 04:34:54',inplace=True)
    data['time'] = pd.to_datetime(data['Last_Update'])
    data['dates'] = data['time'].dt.date
    data['Recovered'] = data['Recovered'].astype('int64',inplace=True)
    data['Active'] = data['Active'].astype('int64',inplace=True)
    data.drop(['FIPS','UID','Country_Region','Unnamed: 0','Lat','Long_','ISO3','Last_Update','time'],axis=1,inplace=True)
    data.rename(columns={'Province_State':'State'},inplace=True)
    print("The Data is Collected from {} to {}".format(data['dates'].min(),data['dates'].max()))
    return data

# Check for Stationarity in our data

In [55]:
# data = preprocess(data)
# #check for stationarity
# final_data = data.drop(['State','dates'],axis=1)
# final_data.index = data.dates
# final_data.head()
# #check for stationarity
# from statsmodels.tsa.vector_ar.vecm import coint_johansen
# coint_johansen(final_data,-1,1).eig
# #Since all the values are not zero, they do exhibit a linear relationship.

#### FIPS is NaN for US VIRGIN ISLAND

#### Reading the Population Data

In [248]:
def clean_population(population):
    print("Total US population for the Year 2019 = ",population['Population'][0])
    #drop the first row
    population = population[population['State']!='United States']
    return population

In [250]:
def get_state_data(data,state,population_dataset):
    state_data = data[data['State']==str(state)]
    final_state_data = state_data.join(population_dataset.set_index('State'),on='State')
    final_state_data['population_affected'] = round(final_state_data['Confirmed']/final_state_data['Population'],4)
    final_state_data['Suscpetible'] = final_state_data['Population']-final_state_data['Confirmed']
    final_data = final_state_data.drop(['State','dates','Population'],axis=1)
    print("The population of the State is : ",final_state_data['Population'].iloc[0])
    final_data.index =final_state_data.dates
    return final_data

In [28]:
def train_test(state_data):
    train = state_data[:int(0.85*(len(state_data)))].drop('Population',axis=1)
    test = state_data[int(0.85*(len(state_data))):].drop('Population',axis=1)
    return train,test

In [8]:
# train,test = train_test(state_data)
# cols =train.columns
# cols

In [251]:
def prediction(data,population):
    data = preprocess(data)
    population = clean_population(population)
    x = input("Enter the State : ")
    state_data = get_state_data(data,x,population)
    last_date = state_data.index.max()
    date_entered=input("Enter the Date in format: YYYY/MM/DD : ")
    user_date=datetime.date(datetime.strptime(date_entered, "%Y/%m/%d"))
    steps=(user_date-last_date).days
    cols =train.columns
    model = VAR(endog=state_data)
    model_fit = model.fit()
    saved_model = pickle.dumps(model_fit)
    model2 = pickle.loads(saved_model)
    prediction = model2.forecast(model_fit.y, steps=len(test))
    pred = pd.DataFrame(index=range(0,len(prediction)),columns=[cols])
    for j in range(0,12):
        for i in range(0, len(prediction)):
            pred.iloc[i][j] = prediction[i][j]
    # prediction = model_fit.forecast(model_fit.y, steps=len(test))
    # pred = pd.DataFrame(index=range(0,len(prediction)),columns=[cols])
    return pred

In [252]:
pred =prediction(data,population)

The Data is Collected from 2020-04-12 to 2020-07-22
Total US population for the Year 2019 =  328239523.0
Enter the State : Virginia
The population of the State is :  8535519.0
Enter the Date in format: YYYY/MM/DD : 2020/08/10




In [158]:
def conf_interval(pred,str(column)):
    lower = pred['column'].iloc[-1]-1.96*pred['column'].std()
    upper = pred['column'].iloc[-1]+1.96*pred['column'].std()
    return lower,upper

In [253]:
print("The Number of Confirmed Cases will be in the Following Range :  {} -  {}".format(round(pred['Confirmed'].values[-1][0] - 1.96 * pred['Confirmed'].std()),round(pred['Confirmed'].values[-1][0] + 1.96 * pred['Confirmed'].std())))
print("The Number of Deaths will be in the Following Range :  {} -  {}".format(round(pred['Deaths'].values[-1][0] - 1.96 * pred['Deaths'].std()),round(pred['Deaths'].values[-1][0] + 1.96 * pred['Deaths'].std())))
print("The Number of Recovered Cases will be in the Following Range :  {} -  {}".format(round(pred['Recovered'].values[-1][0] - 1.96 * pred['Recovered'].std()),round(pred['Recovered'].values[-1][0] + 1.96 * pred['Recovered'].std())))
print("The Number of Active Cases will be in the Following Range :  {} -  {}".format(round(pred['Active'].values[-1][0] - 1.96 * pred['Active'].std()),round(pred['Active'].values[-1][0] + 1.96 * pred['Active'].std())))
print("The Number of People Tested will be in the Following Range :  {} -  {}".format(round(pred['People_Tested'].values[-1][0] - 1.96 * pred['People_Tested'].std()),round(pred['People_Tested'].values[-1][0] + 1.96 * pred['People_Tested'].std())))
print("The Number of People that have not yet been affected will be in the Following Range :  {} -  {}".format(round(pred['Suscpetible'].values[-1][0] - 1.96 * pred['Suscpetible'].std()),round(pred['Suscpetible'].values[-1][0] + 1.96 * pred['Suscpetible'].std())))


The Number of Confirmed Cases will be in the Following Range :  Confirmed    84290.0
dtype: float64 -  Confirmed    97256.0
dtype: float64
The Number of Deaths will be in the Following Range :  Deaths    2081.0
dtype: float64 -  Deaths    2167.0
dtype: float64
The Number of Recovered Cases will be in the Following Range :  Recovered    10731.0
dtype: float64 -  Recovered    12113.0
dtype: float64
The Number of Active Cases will be in the Following Range :  Active    71350.0
dtype: float64 -  Active    82901.0
dtype: float64
The Number of People Tested will be in the Following Range :  People_Tested    1060455.0
dtype: float64 -  People_Tested    1401283.0
dtype: float64
The Number of People that have not yet been affected will be in the Following Range :  Suscpetible    8438263.0
dtype: float64 -  Suscpetible    8451229.0
dtype: float64


The Number of Recovered Cases will be in the Following Range :  Recovered    0.0
dtype: float64 -  Recovered    0.0
dtype: float64
The Number of Active Cases will be in the Following Range :  Active    481410.0
dtype: float64 -  Active    709040.0
dtype: float64
The Number of People Tested will be in the Following Range :  People_Tested    7516354.0
dtype: float64 -  People_Tested    10316853.0
dtype: float64
The Number of People that have not yet been affected will be in the Following Range :  Suscpetible    38792471.0
dtype: float64 -  Suscpetible    39022192.0
dtype: float64


In [50]:
#check rmse
for i in cols:
    print('rmse value for', i, 'is : ', math.sqrt(mean_squared_error(pred[i], test[i])))

rmse value for Confirmed is :  1561.4129529823858
rmse value for Deaths is :  48.672089094350135
rmse value for Recovered is :  59.407803254692304
rmse value for Active is :  1749.5636026782859
rmse value for Incident_Rate is :  19.20335065525268
rmse value for People_Tested is :  19530.252525592372
rmse value for People_Hospitalized is :  317.9646878578852
rmse value for Mortality_Rate is :  0.08587001386446466
rmse value for Testing_Rate is :  238.4090629137005
rmse value for Hospitalization_Rate is :  0.42053937045883666
rmse value for population_affected is :  0.00018686845716760358
rmse value for Suscpetible is :  1561.4129529084455


In [51]:
#check rmse
for i in cols:
    print('The R-Squared Value for', i, 'is : ',r2_score(test[i],pred[i]))

The R-Squared Value for Confirmed is :  0.860009997375905
The R-Squared Value for Deaths is :  0.1922653104333183
The R-Squared Value for Recovered is :  0.9813998644067488
The R-Squared Value for Active is :  0.7751672354937278
The R-Squared Value for Incident_Rate is :  0.8457321746877704
The R-Squared Value for People_Tested is :  0.9208359238331382
The R-Squared Value for People_Hospitalized is :  0.6529396330265943
The R-Squared Value for Mortality_Rate is :  0.05080406503147783
The R-Squared Value for Testing_Rate is :  0.9140555535453955
The R-Squared Value for Hospitalization_Rate is :  -8.269777683454445
The R-Squared Value for population_affected is :  0.8535232837497078
The R-Squared Value for Suscpetible is :  0.8600099973891635


In [24]:
model?