In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_model import ARIMA
from random import random

import seaborn as sns
import matplotlib.pyplot as plt
import gc
import fbprophet
from fbprophet.plot import add_changepoints_to_plot

In [None]:
# --- Training data -----------------------------------------------------------
# Only the columns the notebook actually uses are read from the training file.
features = ['Province_State','Country_Region','Date','ConfirmedCases','Fatalities']
df = pd.read_csv("/kaggle/input/covid19-global-forecasting-week-4/train.csv", usecols=features)
# Missing values (e.g. countries without a province) become a single space so
# that the string concatenation below never produces NaN.
df = df.fillna(' ')
# 'Lat' is the location key used throughout the notebook: province + country.
df['Lat'] = df['Province_State'] + df['Country_Region']
# Per-country, per-day total of confirmed cases (sums over provinces).
df['ConfirmedCases_cum'] = df.groupby(['Country_Region','Date'])['ConfirmedCases'].transform('sum')
# Per-country aggregate used later to rank countries by confirmed cases.
top10 = df.groupby(['Country_Region']).agg({ 'Country_Region':'min','ConfirmedCases': 'sum'})
countries_list = df.Lat.unique()

# --- Test data ---------------------------------------------------------------
test = pd.read_csv("/kaggle/input/covid19-global-forecasting-week-4/test.csv")
test = test.fillna(' ')
test['Lat'] = test['Province_State'] + test['Country_Region']

# --- Additional live data (source of the 'Recovered' counts) -----------------
live = pd.read_csv("/kaggle/input/corona-virus-report/covid_19_clean_complete.csv")
live = live.fillna(' ')
# The live file already has a 'Lat' (latitude) column; rename it so the name
# 'Lat' can hold the same province + country key as in df and test.
live = live.rename(columns={'Lat': 'Latitude'})
live['Lat'] = live['Province/State'] + live['Country/Region']
# Parse the live file's OWN dates. The previous code converted df['Date'],
# which assigned training-set dates to live rows by index position and made
# the date/location join in add_recoveries meaningless.
live['Date'] = pd.to_datetime(live['Date'])

**Merge with Recovery stats**

In [None]:
def add_recoveries(row):
    """Return the recovered count for one training row.

    Looks in the module-level ``live`` frame for rows whose 'Date' and 'Lat'
    both equal the values in ``row``. The 'Recovered' value of the first
    matching row is returned; 0 is returned when nothing matches.
    """
    same_day = live['Date'] == row['Date']
    same_place = live['Lat'] == row['Lat']
    matches = live[same_day & same_place]
    if matches.empty:
        return 0
    return matches['Recovered'].values[0]

# Row-wise lookup of the recovered count for every training row.
df['Recovered'] = df.apply(add_recoveries, axis=1)
# Per-country, per-day total of recoveries (sums over provinces).
recovered_by_country_day = df.groupby(['Country_Region','Date'])['Recovered']
df['Recovered_cum'] = recovered_by_country_day.transform(pd.Series.sum)
display(df.head(4))

In [None]:
# Rank countries by their aggregated confirmed cases and keep the first ten names.
ranked_countries = top10.sort_values(['ConfirmedCases'], ascending=False)
top10Countries = ranked_countries['Country_Region'].head(10).to_numpy()
display(top10Countries)

# **Visualize Country Curves Top 10 Countries by confirmed cases**

In [None]:
# Apply seaborn's default theme before drawing.
sns.set()
# Keep only the rows of the ten selected countries; the 'Lat' key is not needed for plotting.
in_top10 = df['Country_Region'].isin(top10Countries)
df_top = df.loc[in_top10].drop('Lat', axis=1)
plt.figure(figsize=(16, 6))
sns.set_style("whitegrid")
# One line per country, drawn from the raw rows (no aggregation by seaborn).
ax = sns.lineplot(
    x="Date",
    y="ConfirmedCases_cum",
    hue="Country_Region",
    estimator=None,
    lw=1,
    data=df_top,
)
# Force a draw so the tick labels exist before they are rotated.
plt.draw()
# The return value is captured only to keep it out of the cell output.
gobble = ax.set_xticklabels(ax.get_xticklabels(), rotation=75, ha='right')

#  Define Prophet Model

Assumptions

* Assuming a seasonal pattern
* TODO: check whether population density can be included
* Recoveries are added as a regressor for fatalities
* Confirmed cases are added as a regressor for fatalities
* TODO: check whether the impact of social distancing can be included as a regressor

In [None]:
def is_Off_Season(ds,start=8,end=11):
    """Tell whether the month of ``ds`` lies strictly between ``start`` and ``end``.

    Parameters
    ----------
    ds : anything accepted by ``pd.to_datetime`` (string, Timestamp, ...)
    start, end : int
        Exclusive month bounds (1-12). When ``start < end`` the window is the
        months strictly between them (defaults: September and October). When
        ``start >= end`` the window wraps around the new year, e.g.
        ``start=8, end=2`` covers September through January.

    Returns
    -------
    bool

    Notes
    -----
    The previous implementation always evaluated ``month > start or month < end``.
    With the default bounds (8, 11) that expression is true for every month,
    so the flag never distinguished any date from any other.
    """
    month = pd.to_datetime(ds).month
    if start < end:
        # Ordinary window inside one calendar year.
        return start < month < end
    # Window that wraps over December/January.
    return month > start or month < end

def applyRegressor(ds,reg,df):
    """Fetch the value of column ``reg`` for the day ``ds`` from ``df``.

    ``ds`` is formatted as 'YYYY-MM-DD' and compared against ``df['ds']``.
    The first matching row's value is returned; 0 is returned when no row
    matches that day.
    """
    day = ds.strftime("%Y-%m-%d")
    matches = df.loc[df['ds'] == day, reg]
    return 0 if matches.empty else matches.values[0]

#### Forecaster
def forecast(df,ds,y,reg1='Recovered',y_label='y',title="Projection", plot=False,flexibility=0.01,fourier_order=4,prior_scale=0.1,changepoints = [],reg2 = ""):
    """Fit a Prophet model on one series and predict 240 days past its end.

    Parameters
    ----------
    df : DataFrame holding the date column, the target and any regressors.
    ds, y : str
        Names of the date column and of the target column in ``df``.
    reg1, reg2 : str
        Names of optional extra-regressor columns. An empty string disables
        the corresponding regressor.
    y_label, title : str
        Y-axis label and title used when ``plot`` is true.
    plot : bool
        When true, draw the forecast (with changepoints) and its components.
    flexibility : float
        Passed to Prophet as ``changepoint_prior_scale``.
    fourier_order, prior_scale :
        Settings of the conditional 'off_season' seasonality (period 120).
    changepoints :
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    (DataFrame, Prophet)
        The prediction frame, with ``yhat``, ``yhat_lower`` and ``yhat_upper``
        floored at 0, and the fitted model.

    Notes
    -----
    Regressor values for future dates come from ``applyRegressor``, which
    returns 0 for any date that is absent from the history, so every
    regressor is 0 over the whole forecast horizon.
    """
    # One list drives column selection, registration and the future fill, so
    # an empty reg1 is skipped everywhere. Previously the future frame was
    # always given a reg1 column, which raised KeyError when reg1 was "".
    regressors = [name for name in (reg1, reg2) if len(name) != 0]

    df = df[[ds, y] + regressors]
    df = df.rename(columns={ds: 'ds', y: 'y'})
    prophet = fbprophet.Prophet(changepoint_prior_scale=flexibility, seasonality_mode='additive', \
            weekly_seasonality=False, \
            daily_seasonality=False )
    ## Add seasonality, active only on rows flagged by is_Off_Season
    df['off_season'] = df['ds'].apply(is_Off_Season)
    prophet.add_seasonality(name='off_season', period=120, prior_scale=prior_scale, fourier_order=fourier_order, \
                            condition_name='off_season')
    ## Add regressors
    for name in regressors:
        prophet.add_regressor(name)

    prophet.fit(df)
    # "future" rather than "forecast": do not shadow this function's own name.
    future = prophet.make_future_dataframe(periods=240, freq='D')
    future['off_season'] = future['ds'].apply(is_Off_Season)
    # Carry the observed regressor values over to the future frame.
    for name in regressors:
        future[name] = future['ds'].apply(lambda when, name=name: applyRegressor(when, name, df))
    # Predict the future
    predicted = prophet.predict(future)
    # Counts cannot be negative: floor the estimate and its interval at 0.
    for column in ("yhat", "yhat_lower", "yhat_upper"):
        predicted[column] = np.where(predicted[column]<0,0,predicted[column])
    if plot :
        fig=prophet.plot(predicted, xlabel = 'Date', ylabel = y_label)
        add_changepoints_to_plot(fig.gca(), prophet, predicted)
        prophet.plot_components(predicted)
        plt.title(title);
    return (predicted,prophet)

In [None]:
def dateplot(x, y, **kwargs):
    """Plot column ``y`` against column ``x`` of ``kwargs['data']`` on the current axes.

    The 'data' entry is removed from ``kwargs``; every remaining keyword is
    forwarded to the frame's ``plot`` call. The grid is switched off.
    """
    current_axes = plt.gca()
    frame = kwargs.pop("data")
    frame.plot(x=x, y=y, ax=current_axes, grid=False, **kwargs)

#display(df[df.Country_Region == "US"].head(10))
# Confirmed cases over time for the US rows, drawn through a seaborn facet grid.
us_rows = df[df.Country_Region == "US"]
g = sns.FacetGrid(us_rows, col="Country_Region", col_wrap=2, height=3.5, aspect=2)
g = g.map_dataframe(dateplot, "Date", "ConfirmedCases")

# **Predicted US trend**

In [None]:
## Use Prophet to project the trend for the US
is_us = df_top['Country_Region'].str.find("US") != -1
modelDs = df_top.loc[is_us, ['Date', 'ConfirmedCases_cum', 'Recovered_cum']]
forecast(
    modelDs,
    'Date',
    'ConfirmedCases_cum',
    'Recovered_cum',
    'Confirmed Cases',
    'Confirmed Cases US',
    True,
    fourier_order=5,
)
# Reclaim memory left over from the fit.
gc.collect()

# Run model on test data

In [None]:
# Collect residual garbage before the long-running loop.
gc.collect()
groupByLat = df.groupby('Lat')
cols = ['Lat','ConfirmedCase_Forecast','Fatalities_Forecast']
sub_cols= ['ForecastId','Date','ConfirmedCases','Fatalities']
# Rows are gathered in plain lists and turned into DataFrames once, after the
# loop. Growing a DataFrame with DataFrame.append inside the loop copies the
# whole frame on every call, and the method no longer exists in pandas 2.0.
submission_rows = []
result_rows = []
for i, (name, group) in enumerate(groupByLat, start=1):
    print("Forecasting for group : %s:%s"%(i,name))
    # forecast() returns (prediction frame, fitted model); only the frame is used here.
    forecast_cc = forecast(group,'Date','ConfirmedCases','Recovered',flexibility=0.35,fourier_order=5,prior_scale=0.20)
    forecast_fat = forecast(group,'Date','Fatalities','Recovered',flexibility=0.35,fourier_order=5,prior_scale=0.19)
    cc_frame = forecast_cc[0]
    fat_frame = forecast_fat[0]
    # Populate the submission rows for every test date of this location.
    for index, row in test[test['Lat']==name].iterrows():
        cc=cc_frame[cc_frame['ds']==row['Date']]["yhat"].values[0]
        f=fat_frame[fat_frame['ds']==row['Date']]["yhat"].values[0]
        submission_rows.append([int(row['ForecastId']),row['Date'],round(cc),round(f)])
    # The full prediction frames are kept per location for later inspection.
    result_rows.append({'Lat': name, 'ConfirmedCase_Forecast': cc_frame, 'Fatalities_Forecast': fat_frame})
    del forecast_cc
    del forecast_fat
    gc.collect()
submissions = pd.DataFrame(submission_rows, columns=sub_cols)
results = pd.DataFrame(result_rows, columns=cols)
del groupByLat

In [None]:
## Create the submission file
# The forecast id must be written as an integer.
submissions["ForecastId"] = submissions["ForecastId"].astype('int64')
display(submissions)
# The submission file carries no date column: drop it, then write the CSV
# without the index.
submissions = submissions.drop(columns='Date')
submissions.to_csv('submission.csv', index=False)

# Forecast for MA and NY

In [None]:
# See the forecast for Massachusetts.
# y_label and title are passed by keyword: the earlier positional call had
# them swapped, so the region name ended up on the y-axis and the bare metric
# name became the plot title.
groupByLat = df[df['Lat'].str.find("Massachusetts")!=-1].groupby('Lat')
for name, group in groupByLat:
    forecast_cc = forecast(group,'Date','ConfirmedCases','Recovered',
                           y_label='ConfirmedCases',title='ConfirmedCases Massachusetts',plot=True,
                           flexibility=0.35,fourier_order=5,prior_scale=0.20)
    forecast_fat = forecast(group,'Date','Fatalities','Recovered',
                            y_label='Fatalities',title='Fatalities Massachusetts',plot=True,
                            flexibility=0.35,fourier_order=5,prior_scale=0.20)
    

In [None]:
# See the forecast for New York.
# y_label and title are passed by keyword: the earlier positional call had
# them swapped, so the region name ended up on the y-axis and the bare metric
# name became the plot title.
groupByLat = df[df['Lat'].str.find("New York")!=-1].groupby('Lat')
for name, group in groupByLat:
    forecast_cc = forecast(group,'Date','ConfirmedCases','Recovered',
                           y_label='ConfirmedCases',title='ConfirmedCases New York',plot=True,
                           flexibility=0.35,fourier_order=6,prior_scale=0.20)
    forecast_fat = forecast(group,'Date','Fatalities','Recovered',
                            y_label='Fatalities',title='Fatalities New York',plot=True,
                            flexibility=0.35,fourier_order=6,prior_scale=0.20)

# Forecast for India

In [None]:
# India forecast.
# The leading space in " India" matches the location key of a country row
# without a province (the missing province was filled with a single space).
# y_label and title are passed by keyword: the earlier positional call had
# them swapped, so the region name ended up on the y-axis and the bare metric
# name became the plot title.
groupByLat = df[df['Lat'].str.find(" India")!=-1].groupby('Lat')
for name, group in groupByLat:
    forecast_cc = forecast(group,'Date','ConfirmedCases','Recovered',
                           y_label='ConfirmedCases',title='ConfirmedCases India',plot=True,
                           flexibility=0.25,fourier_order=5,prior_scale=0.20)
    # Fatalities additionally use the confirmed-case count as a second regressor.
    forecast_fat = forecast(group,'Date','Fatalities','Recovered',
                            y_label='Fatalities',title='Fatalities India',plot=True,
                            flexibility=0.25,fourier_order=5,prior_scale=0.20,reg2="ConfirmedCases")