In [None]:
import os
import re
import time
import warnings

warnings.filterwarnings('ignore')

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import requests
import seaborn as sns
from fbprophet import Prophet
from folium import plugins
from pandas.plotting import lag_plot
from plotly.subplots import make_subplots
from pyproj import CRS
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.api import Holt,SimpleExpSmoothing,ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose
from xgboost import XGBRegressor

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))


In [None]:
# Load the cleaned case table, parsing the 'Date' column as datetimes.
df_n = pd.read_csv("/kaggle/input/corona-virus-report/covid_19_clean_complete.csv",parse_dates=['Date'])
# Preview the first 10 rows with a red background gradient.
df_n.head(10).style.background_gradient(cmap='Reds')


In [None]:
# Summarise column dtypes and non-null counts of the loaded table.
df_n.info()

In [None]:
# Replace missing province names with an empty string. The result is assigned
# back to the column: calling fillna(inplace=True) on a column selection may
# act on a temporary object and leave df_n untouched.
df_n["Province/State"] = df_n["Province/State"].fillna("")

# Keep only the rows of the 2020-07-27 snapshot; the per-country and per-region
# totals in the following cells are computed from this frame.
df = df_n[df_n["Date"] == '2020-07-27']
df.head()

In [None]:
# Per-country totals for the snapshot date. The columns are selected with a
# list: selecting several groupby columns with a bare tuple is rejected by
# current pandas releases.
case_columns = ['Confirmed','Deaths','Recovered','Active']
df_group = df.groupby("Country/Region")[case_columns].sum().reset_index()
df_group.head()

In [None]:
pio.templates.default = 'plotly_dark'

# Country / confirmed-count pairs used for the ranking chart.
confirmed_cases = df_group[['Country/Region','Confirmed']]

# Twenty largest confirmed counts, reversed so the largest value comes last.
top_confirmed = confirmed_cases.sort_values('Confirmed',ascending=False)[:20][::-1]
fig = px.bar(
    top_confirmed,
    x="Confirmed",
    y="Country/Region",
    title="世界上累积确诊数最多的20个国家",
    text='Confirmed',
    color_discrete_sequence=px.colors.qualitative.Light24,
    height=900,
    orientation='h',
)
fig.show()

In [None]:
# Country / death-count pairs used for the ranking chart.
death_cases = df_group[['Country/Region','Deaths']]
pio.templates.default = 'plotly_dark'

# Twenty largest death counts, reversed so the largest value comes last.
top_deaths = death_cases.sort_values('Deaths',ascending=False)[:20][::-1]
fig = px.bar(
    top_deaths,
    x="Deaths",
    y="Country/Region",
    title="世界上死亡数最高的二十个国家",
    text='Deaths',
    color_discrete_sequence=px.colors.qualitative.Set1,
    height=900,
    orientation='h',
)
fig.show()

In [None]:
# Country / recovered-count pairs used for the ranking chart.
recovered_cases = df_group[['Country/Region','Recovered']]
pio.templates.default = 'plotly_dark'

# Twenty largest recovered counts, reversed so the largest value comes last.
top_recovered = recovered_cases.sort_values('Recovered',ascending=False)[:20][::-1]
fig = px.bar(
    top_recovered,
    x="Recovered",
    y="Country/Region",
    title="Top 20 Country with Highest Number of Recovered Cases",
    text='Recovered',
    color_discrete_sequence=px.colors.qualitative.Bold,
    height=900,
    orientation='h',
)
fig.show()

In [None]:
# Per-WHO-region totals for the snapshot date. The columns are selected with a
# list: selecting several groupby columns with a bare tuple is rejected by
# current pandas releases.
who_case_columns = ['Confirmed','Deaths','Recovered','Active']
df_who = df.groupby("WHO Region")[who_case_columns].sum().reset_index()
df_who.head()

In [None]:
# One two-column frame per metric, each pairing the WHO region with its total.
df_who_confirmed = df_who[['WHO Region','Confirmed']]
df_who_deaths = df_who[['WHO Region','Deaths']]
df_who_recovered = df_who[['WHO Region','Recovered']]
df_who_active= df_who[['WHO Region','Active']]

# (frame, metric column, chart title, colour palette) for each horizontal bar chart.
who_charts = [
    (df_who_confirmed, 'Confirmed', "WHO Region with Highest number of Confirmed Cases", px.colors.qualitative.Light24),
    (df_who_deaths, 'Deaths', "WHO Region with Highest number of Death Cases", px.colors.qualitative.Set1),
    (df_who_recovered, 'Recovered', "WHO Region with Highest number of Recovered Cases", px.colors.qualitative.Bold),
    (df_who_active, 'Active', "WHO Region with Highest number of Active Cases", px.colors.qualitative.Vivid),
]

# Draw the charts in the same order as the list above.
for who_frame, metric, chart_title, palette in who_charts:
    fig = px.bar(
        who_frame.sort_values(metric,ascending=False)[::-1],
        x=metric,
        y='WHO Region',
        title=chart_title,
        text=metric,
        color_discrete_sequence=palette,
        orientation='h',
    )
    fig.show()


In [None]:
# Daily world-wide totals. 'Date' is the grouping key and therefore must not be
# listed among the summed columns; reset_index() turns it back into a column.
whole = df_n.groupby('Date')[['Confirmed','Deaths','Recovered','Active']].sum().reset_index()

# (column, label, line colour) for each of the four side-by-side panels.
# The fourth panel plots the 'Active' column, so it is labelled as active cases
# rather than repeating the confirmed-cases wording.
trend_panels = [
    ('Confirmed', '总确诊数', 'blue'),
    ('Deaths', '死亡数', 'red'),
    ('Recovered', '治愈数', 'gold'),
    ('Active', '现存确诊数', 'yellow'),
]

fig = make_subplots(rows=1,cols=4,column_titles=tuple(label for _, label, _ in trend_panels))

for panel_col, (metric, label, colour) in enumerate(trend_panels, start=1):
    trace = go.Scatter(x=whole['Date'],y=whole[metric],name=label,opacity=0.9,
                       mode='lines+markers',line_color=colour)
    fig.add_trace(trace, row=1, col=panel_col)

fig.update_layout(title_text="世界范围内新冠疫情发展趋势")
fig.show()

In [None]:
# Load the worldometer table (population and cumulative totals are read from it below).
df_grouped = pd.read_csv('/kaggle/input/corona-virus-report/worldometer_data.csv')
# Preview the first rows with a red background gradient.
df_grouped.head().style.background_gradient(cmap='Reds')

In [None]:
# Per-country population and cumulative totals (list selection: a bare tuple of
# column names is rejected by current pandas releases).
group = df_grouped.groupby('Country/Region')[['Population','TotalCases','TotalDeaths','TotalRecovered']].sum().reset_index()

# Share of the population with a recorded case, in percent. A population that is
# zero after summing (for example because it was missing in the input) is masked
# so the rate becomes NaN instead of an infinite value that would top the ranking.
known_population = group['Population'].replace(0, np.nan)
group['Affected Rate'] = np.round((group['TotalCases'] / known_population) * 100, 2)

# Ten countries with the highest affected rate.
group.nlargest(10,'Affected Rate').style.background_gradient(cmap='Reds')

In [None]:
pio.templates.default = 'plotly_dark'
infection_rate = group[['Country/Region','Affected Rate']]

# The title announces the top twenty, so take exactly 20 rows (the previous
# slice [:21] plotted twenty-one); reversed so the largest value comes last.
top_infection_rate = infection_rate.sort_values('Affected Rate',ascending=False)[:20][::-1]
fig = px.bar(top_infection_rate,x="Affected Rate",y="Country/Region",
             title="全世界排名前二十人口感染率",text='Affected Rate',color_discrete_sequence=px.colors.qualitative.Set1,height=900,orientation='h')
fig.show()

In [None]:
# World map of the per-country snapshot totals.
# - color is bound to the numeric 'Confirmed' column so the shading reflects
#   case counts (it was bound to the country name, giving one arbitrary colour
#   per country).
# - featureidkey is only meaningful together with a custom geojson argument and
#   labels expects a dict, so both invalid arguments are dropped.
fig = px.choropleth(
    df_group,
    locations='Country/Region',
    locationmode='country names',
    color='Confirmed',
    hover_name="Country/Region",
    hover_data=['Confirmed','Recovered','Deaths','Active'],
    title="世界疫情播报",
)
fig.show()


In [None]:
# Load the three wide time-series tables.
confirmed = pd.read_csv('../input/novel-corona-virus-2019-dataset/time_series_covid_19_confirmed.csv')
deaths = pd.read_csv('../input/novel-corona-virus-2019-dataset/time_series_covid_19_deaths.csv')
recovered = pd.read_csv('../input/novel-corona-virus-2019-dataset/time_series_covid_19_recovered.csv')

# Missing province names become the literal 'Unknown' in every table.
for frame in (confirmed, deaths, recovered):
    frame['Province/State'] = frame['Province/State'].fillna('Unknown')

# Last column of the confirmed table and its per-country sum.
column = confirmed.columns[-1]
confirmed_per_country = confirmed.groupby('Country/Region',as_index=False)[column].sum()

# Columns from position 4 onward are treated as one column per observation date;
# the same labels are looked up in the deaths and recovered tables.
date_columns = confirmed.columns[4:]
dates = list(date_columns)
confirm = confirmed[date_columns].sum().tolist()
death = deaths[date_columns].sum().tolist()
recoveries = recovered[date_columns].sum().tolist()

# One row per observation date with the totals summed over all table rows.
data_over_time = pd.DataFrame({
    'observationDate': pd.to_datetime(pd.Series(dates)),
    'confirmCases': confirm,
    'deaths': death,
    'recoveries': recoveries,
})

# Date-indexed confirmed series, split chronologically: the first 95% of the
# rows are the training window, the remainder is the hold-out window.
daily_confirmed = data_over_time[['observationDate','confirmCases']].set_index('observationDate')
train_size = int(len(daily_confirmed) * 0.95)
train_confirmed = daily_confirmed.iloc[:train_size]
test_confirmed = daily_confirmed.iloc[train_size:]

In [None]:
# Collects one RMSE per model, in the order the model cells append to it.
# NOTE(review): re-running a model cell appends a second value — reset this list before re-running.
model_comparison = []

In [None]:
# Frame for Prophet with the date column renamed to 'ds' and the target to 'y'.
# rename() returns a new frame here; renaming in place on a column selection of
# data_over_time triggers pandas' chained-assignment warning.
prophet_daily_confirmed = data_over_time[['observationDate','confirmCases']].rename(
    columns={"observationDate": "ds", "confirmCases": "y"}
)

In [None]:
# Fit a Prophet model with default settings on the ds/y frame.
# NOTE(review): prophet_daily_confirmed is built from every row of data_over_time,
# so the rows held out in test_confirmed are part of this fit — confirm this is intended.
model_prophet = Prophet()
model_prophet.fit(prophet_daily_confirmed)

In [None]:
# Build a frame of dates with 20 additional periods and predict over it.
future = model_prophet.make_future_dataframe(periods=20)
forecast = model_prophet.predict(future)
# Show the last rows of the ds / yhat / yhat_lower / yhat_upper columns.
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# RMSE between the observed series and the first len(daily_confirmed) Prophet
# predictions (the remaining forecast rows have no observed counterpart).
# NOTE(review): this is scored on the whole observed series, whereas the other
# models in this notebook are scored on test_confirmed only — the values in
# model_comparison are therefore not measured on the same window.
n_observed = daily_confirmed['confirmCases'].shape[0]
rmse = np.sqrt(mean_squared_error(daily_confirmed['confirmCases'],
                                  forecast['yhat'].head(n_observed)))
# The value is a root mean squared error, so the message says so.
print("Root Mean Square Error for Prophet Model: ",rmse)
model_comparison.append(rmse)

In [None]:
# Plot the Prophet forecast using the model's own plotting helper.
fig1 = model_prophet.plot(forecast)

In [None]:
# Holt linear-trend model on the training window, with both smoothing
# parameters fixed at 0.38 and parameter optimisation switched off.
# NOTE(review): newer statsmodels releases name the second argument
# smoothing_trend — confirm against the installed version.
model_holt = Holt(np.asarray(train_confirmed['confirmCases'])).fit(
    smoothing_level=0.38,
    smoothing_slope=0.38,
    optimized=False,
)

In [None]:
# Forecast as many steps as there are hold-out rows and score them with RMSE.
n_holdout = len(test_confirmed)
prdeictions_holt = model_holt.forecast(n_holdout)
holt_mse = mean_squared_error(test_confirmed["confirmCases"], prdeictions_holt)
rmse = np.sqrt(holt_mse)
print("Root Mean Square Error Holt's Linear Model: ",rmse)
model_comparison.append(rmse)

In [None]:
# Training data, hold-out data and Holt predictions on one chart.
holt_series = [
    (train_confirmed.index, train_confirmed['confirmCases'], "Train Data for Confirmed Cases"),
    (test_confirmed.index, test_confirmed['confirmCases'], "Test Data for Confirmed Cases"),
    (test_confirmed.index, prdeictions_holt, "Holt Linear Predictions for Confirmed Cases"),
]

fig = go.Figure()
for series_x, series_y, series_name in holt_series:
    fig.add_trace(go.Scatter(x=series_x, y=series_y, mode='lines+markers', name=series_name))
fig.update_layout(
    title="Confirmed Cases Holt Linear Model Prediction",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
    legend=dict(x=0,y=1,traceorder="normal"),
)
fig.show()

In [None]:
# Forecast the 20 steps that follow the hold-out window.
# The model was fitted on the training window only, so the forecast has to run
# len(test_confirmed) + 20 steps ahead and the last 20 values are kept.
# (predict(start)[-1] only returns values up to the end of the fitted sample,
# so the previous loop produced the same in-sample value 20 times.)
holt_steps_ahead = len(test_confirmed['confirmCases']) + 20
Holt_model_new_prediction = np.asarray(model_holt.forecast(holt_steps_ahead))[-20:].tolist()
print(Holt_model_new_prediction)

In [None]:
# Exponential smoothing on the training window: multiplicative trend,
# additive seasonality with a period of 14 observations, parameters left to fit().
model_es = ExponentialSmoothing(
    np.asarray(train_confirmed['confirmCases']),
    seasonal_periods=14,
    trend='mul',
    seasonal='add',
).fit()

In [None]:
# Forecast as many steps as there are hold-out rows and score them with RMSE.
n_holdout_es = len(test_confirmed)
prdeictions_es = model_es.forecast(n_holdout_es)
es_mse = mean_squared_error(test_confirmed["confirmCases"], prdeictions_es)
rmse = np.sqrt(es_mse)
print("Root Mean Square Error Exponential Smoothing Model: ",rmse)
model_comparison.append(rmse)

In [None]:
# Training data, hold-out data and exponential-smoothing predictions on one chart.
es_series = [
    (train_confirmed.index, train_confirmed['confirmCases'], "Train Data for Confirmed Cases"),
    (test_confirmed.index, test_confirmed['confirmCases'], "Test Data for Confirmed Cases"),
    (test_confirmed.index, prdeictions_es, "Exponential Smoothing Predictions for Confirmed Cases"),
]

fig = go.Figure()
for series_x, series_y, series_name in es_series:
    fig.add_trace(go.Scatter(x=series_x, y=series_y, mode='lines+markers', name=series_name))
fig.update_layout(
    title="Confirmed Cases Exponential Smoothing Prediction",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
    legend=dict(x=0,y=1,traceorder="normal"),
)
fig.show()

In [None]:
# Forecast the 20 steps that follow the hold-out window.
# The model was fitted on the training window only, so the forecast has to run
# len(test_confirmed) + 20 steps ahead and the last 20 values are kept.
# (predict(start)[-1] only returns values up to the end of the fitted sample,
# so the previous loop produced the same in-sample value 20 times.)
es_steps_ahead = len(test_confirmed['confirmCases']) + 20
ES_model_new_prediction = np.asarray(model_es.forecast(es_steps_ahead))[-20:].tolist()
print(ES_model_new_prediction)

In [None]:
# Independent copy of the date / confirmed columns, so the feature columns added
# in the following cells do not modify data_over_time.
datewise_confirmed = data_over_time[['observationDate','confirmCases']].copy()

In [None]:
# Calendar features derived from the observation date.
observation_dates = datewise_confirmed['observationDate']
datewise_confirmed['month'] = observation_dates.dt.month
datewise_confirmed['day'] = observation_dates.dt.day
# Series.dt.week is no longer available in current pandas; the ISO week number
# comes from isocalendar() and is cast back to a plain int64 column.
datewise_confirmed['week'] = observation_dates.dt.isocalendar().week.astype('int64')
datewise_confirmed['quarter'] = observation_dates.dt.quarter
# Whole days elapsed since the earliest observation date.
datewise_confirmed['daysSince'] = (observation_dates - observation_dates.min()).dt.days

In [None]:
# Convert every observation date to a float timestamp via time.mktime,
# store it as 'DateTime', then drop the original datetime column.
unixtime = [
    time.mktime(stamp.timetuple())
    for stamp in datewise_confirmed['observationDate']
]
datewise_confirmed['DateTime'] = unixtime
datewise_confirmed = datewise_confirmed.drop(['observationDate'],axis=1)

In [None]:
# Check the dtypes and non-null counts of the engineered feature table.
datewise_confirmed.info()

In [None]:
# Same positional split point as the time-series models: rows before
# train_size are for training, the rest are held out.
train_confirmed_reg = datewise_confirmed[:train_size]
test_confirmed_reg = datewise_confirmed[train_size:]

# pop() removes the target column from each slice and returns it, leaving
# only the feature columns behind in the same objects.
X_train, X_test = train_confirmed_reg, test_confirmed_reg
y_train = X_train.pop('confirmCases')
y_test = X_test.pop('confirmCases')

In [None]:
# Base XGBoost regressor with default settings; used as the estimator of the grid search.
model_xgb = XGBRegressor()

In [None]:
# Hyper-parameter grid explored by the grid search
# (learning_rate is the so called `eta` value).
parameters = dict(
    learning_rate=[0.1, 0.2, 0.3],
    max_depth=[5, 6, 7],
    min_child_weight=[4, 5, 6],
    subsample=[0.6, 0.7, 0.8],
    colsample_bytree=[0.6, 0.7, 0.8],
    n_estimators=[500, 1000, 1500, 2000],
)

In [None]:
# Exhaustive search over `parameters` with 2 cross-validation folds,
# 5 parallel jobs and progress output enabled.
xgb_grid = GridSearchCV(
    estimator=model_xgb,
    param_grid=parameters,
    cv=2,
    n_jobs=5,
    verbose=True,
)

In [None]:
# Run the grid search on the training features and target.
# NOTE(review): X_train is passed with all of its columns here, while the final
# model further down is fitted on three selected columns only — confirm this is intended.
xgb_grid.fit(X_train,y_train)

In [None]:
# Display the parameter combination selected by the grid search.
xgb_grid.best_params_

In [None]:
# Final XGBoost regressor with explicitly written hyper-parameters.
# NOTE(review): these values are hard-coded rather than read from
# xgb_grid.best_params_ — confirm they match the search result.
model_xgb1 = XGBRegressor(
    colsample_bytree=0.6,
    learning_rate=0.2,
    max_depth=5,
    min_child_weight=4,
    n_estimators=2000,
    subsample=0.7,
)

In [None]:
# Fit on three selected feature columns, predict the hold-out rows and score with RMSE.
xgb_feature_columns = ['daysSince','DateTime','week']
model_xgb1.fit(X_train[xgb_feature_columns], y_train)
prdeictions_xgb = model_xgb1.predict(X_test[xgb_feature_columns])
xgb_mse = mean_squared_error(y_test, prdeictions_xgb)
rmse = np.sqrt(xgb_mse)
print("Root Mean Square Error XGBRegressor Model: ",rmse)
model_comparison.append(rmse)

In [None]:
# Training data, hold-out data and XGBRegressor predictions on one chart.
# The prediction trace and the title are labelled as XGBRegressor; they were
# previously labelled as exponential smoothing although prdeictions_xgb is plotted.
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_confirmed.index,y=train_confirmed['confirmCases'],
              mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=test_confirmed.index,y=test_confirmed['confirmCases'],
              mode='lines+markers',name="Test Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=test_confirmed.index,y=prdeictions_xgb,
              mode='lines+markers',name="XGBRegressor Predictions for Confirmed Cases"))
fig.update_layout(title="Confirmed Cases XGBRegressor Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

In [None]:
# Degree-5 polynomial feature expansion used by the regression cells below.
poly = PolynomialFeatures(degree=5)

In [None]:
# Re-create the positional train / hold-out slices of the feature table
# (same split point, train_size, as the earlier models).
train_confirmed_reg, test_confirmed_reg = datewise_confirmed[0:train_size], datewise_confirmed[train_size:len(datewise_confirmed)]

In [None]:
# Expand the two time features into polynomial terms.
# The transformer is fitted on the training rows only; the hold-out rows are
# transformed with the already-fitted object instead of being re-fitted.
poly_feature_columns = ['daysSince','DateTime']
poly_train_confirmed_reg = poly.fit_transform(train_confirmed_reg[poly_feature_columns])
poly_test_confirmed_reg = poly.transform(test_confirmed_reg[poly_feature_columns])

In [None]:
# Least-squares regression on the polynomial features.
# The `normalize` argument of LinearRegression is not accepted by current
# scikit-learn releases; feature scaling is done by an explicit StandardScaler
# step instead. The pipeline offers the same fit / predict interface that the
# following cells rely on.
model_linear = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
model_linear.fit(poly_train_confirmed_reg,y_train)

In [None]:
# Predict the hold-out rows with the polynomial regression and score with RMSE.
prediction_poly = model_linear.predict(poly_test_confirmed_reg)
poly_mse = mean_squared_error(y_test, prediction_poly)
rmse_poly = np.sqrt(poly_mse)
print("Root Mean Squared Error for Polynomial Regression: ",rmse_poly)
model_comparison.append(rmse_poly)

In [None]:
# Training data, hold-out data and polynomial-regression predictions on one chart.
# The prediction trace and the title are labelled as polynomial regression; they
# were previously labelled as exponential smoothing although prediction_poly is plotted.
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_confirmed.index,y=train_confirmed['confirmCases'],
              mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=test_confirmed.index,y=test_confirmed['confirmCases'],
              mode='lines+markers',name="Test Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=test_confirmed.index,y=prediction_poly,
              mode='lines+markers',name="Polynomial Regression Predictions for Confirmed Cases"))
fig.update_layout(title="Confirmed Cases Polynomial Regression Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()


In [None]:
# Display names of the compared models, in the order their RMSE values are
# appended to model_comparison. "Holt's" is spelled with an apostrophe, matching
# the wording of the Holt RMSE message printed earlier in the notebook.
models = ['Prophet',"Holt's Linear",'Exponential Smoothing','XGBRegression','Polynomial Regression']

In [None]:
# Two-column summary table: model display name next to its RMSE.
model_rmse = pd.DataFrame({
    'models': models,
    'RMSE': model_comparison,
})

In [None]:
# Rank the models from lowest to highest RMSE and renumber the rows.
model_rmse.sort_values('RMSE',ascending=True).reset_index(drop=True)