# Import Libraries

In [None]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import scipy as sp


import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# --- setup ---
# Use the fully qualified option name: on newer pandas the bare 'max_columns'
# pattern also matches 'styler.render.max_columns', which makes set_option
# raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 50)

from subprocess import check_output
# List the competition input directory so the available files are visible.
print(check_output(["ls", "../input/covid19-global-forecasting-week-5"]).decode("utf8"))

# Import Data

In [None]:
# Read the three competition CSV files in order: training data, test data
# and the submission file.
df, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/covid19-global-forecasting-week-5/train.csv',
        '../input/covid19-global-forecasting-week-5/test.csv',
        '../input/covid19-global-forecasting-week-5/submission.csv',
    )
)

In [None]:
# NOTE(review): this whole cell is intentionally disabled. When enabled it
# downloads five time-series CSV files from the CSSEGISandData/COVID-19
# GitHub repository into the working directory. None of the cells visible
# in this notebook section read those files.
# If re-enabled, the `%%time` cell magic must stay on the first line of the cell.
# %%time
# import requests

# for filename in ['time_series_covid19_confirmed_global.csv',
#                  'time_series_covid19_deaths_global.csv',
#                  'time_series_covid19_recovered_global.csv',
#                  'time_series_covid19_confirmed_US.csv',
#                  'time_series_covid19_deaths_US.csv']:
#     print(f'Downloading {filename}')
#     url = f'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/{filename}'
#     myfile = requests.get(url)
#     open(filename, 'wb').write(myfile.content)

In [None]:
# Column dtypes and non-null counts for the training frame, followed by a
# short preview rather than the entire frame.
df.info()
df.head()

In [None]:
# Column dtypes and non-null counts for the test frame, followed by a
# short preview rather than the entire frame.
test.info()
test.head()

# Finding missing values

In [None]:
# Per-column count of missing values, computed once per frame and then printed.
train_null_counts = df.isna().sum()
test_null_counts = test.isna().sum()

print("Train columns with null values:\n", train_null_counts)
print("-"*10)
print("Test columns with null values:\n", test_null_counts)

## A. Train data

In [None]:
# Missing-value summary for the training frame: the absolute count per column
# and the same count expressed as a percentage of the number of rows.
missing = df.isnull().sum()
missing_pourcent = missing / len(df) * 100

dic = dict([
    ('mising', missing),
    ('missing_pourcent %', missing_pourcent),
])
frame = pd.DataFrame(dic)
frame

## B. Test data

In [None]:
# Missing-value summary for the test frame: the absolute count per column
# and the same count expressed as a percentage of the test row count.
missing = test.isnull().sum()
# The denominator must be the number of rows in `test`; dividing by the
# training row count gives percentages that do not describe this frame.
missing_pourcent = missing / test.shape[0] * 100

dic = {
    'mising':missing,
    'missing_pourcent %':missing_pourcent
}
frame=pd.DataFrame(dic)
frame

# Clean Data

Fill the missing values in the text columns 'County' and 'Province_State' with the placeholder string 'Null'.

In [None]:
# Replace missing values in the text (object-dtype) columns of both frames
# with the literal string 'Null', modifying the frames in place.
d_cleaner = [df, test]

for dataset in d_cleaner:
    # Build the fill mapping from object columns only. Deriving it from
    # `dataset.dtypes.replace({'O': 'Null'})` would also map every non-object
    # column to its dtype object, so a NaN in a numeric column would be
    # filled with a dtype instead of staying missing.
    text_columns = dataset.select_dtypes(include='object').columns
    dataset.fillna({column: 'Null' for column in text_columns}, inplace=True)

# Confirm how many nulls remain in each frame.
print(df.isnull().sum())
print('-'*10)
print(test.isnull().sum())

df[:10]

To make the data easier to work with, we split each dataframe into two subsets based on the 'Target' column: one holding the confirmed-case rows and the other holding the fatality rows.

In [None]:
# Split the train and test frames by the value of the 'Target' column.
df_conf = df[df['Target'].eq('ConfirmedCases')]
df_fatal = df[df['Target'].eq('Fatalities')]
test_conf = test[test['Target'].eq('ConfirmedCases')]
test_fatal = test[test['Target'].eq('Fatalities')]

In [None]:
# First five rows of the confirmed-cases subset.
df_conf.head(n=5)

In [None]:
# First five rows of the fatalities subset.
df_fatal.head(n=5)

# Statistical Exploration and Visualization

In [None]:
## 人口分布区间频率
# Histogram of the distinct values found in the 'Population' column.
distinct_populations = df['Population'].unique()
plt.hist(distinct_populations)

In [None]:
## 确诊数分布区间频率
# Histogram of the confirmed-case target values, with bins limited to the
# interval from -1000 to 2000.
confirmed_values = df_conf['TargetValue']
plt.hist(confirmed_values, range=(-1000, 2000))

In [None]:
# Histogram of China's confirmed-case target values for rows dated strictly
# between 2020-01-23 and 2020-02-24, with bins limited to [0, 5000].
CNconfirmed = df_conf.loc[(df_conf['Country_Region'] == 'China')]

# The first positional argument of Series.hist is `by` (a grouping key), so
# passing the date mask there splits the data into True/False groups instead
# of filtering it. Apply the mask as a row filter before plotting.
# NOTE(review): the comparison is done on strings; this assumes 'Date' holds
# ISO-formatted (YYYY-MM-DD) text - confirm against the CSV.
in_date_window = (CNconfirmed['Date'] > '2020-01-23') & (CNconfirmed['Date'] < '2020-02-24')
CNconfirmed.loc[in_date_window, 'TargetValue'].hist(range=(0, 5000))

In [None]:
# use 'df_conf‘ as the training dataset of confirmed cases
fig = px.treemap(df_conf, path=['Country_Region'], values='TargetValue',width=900, height=600)
fig.update_traces(textposition='middle center', textfont_size=15)
fig.update_layout(
    title={
        'text': 'Total Share of Worldwide COVID19 Confirmed Cases',
        'y':0.92,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()

In [None]:
# Pie chart of each country's share of confirmed cases.
# `df` holds both 'ConfirmedCases' and 'Fatalities' rows, so summing its
# 'TargetValue' per country would add cases and deaths together; use the
# confirmed-cases subset instead.
# Aggregating to one row per country first keeps the figure payload small.
confirmed_by_country = (
    df_conf.groupby('Country_Region', as_index=False)['TargetValue'].sum()
)

fig = px.pie(confirmed_by_country, values='TargetValue', names='Country_Region',
             title='Share of Worldwide COVID19 Confirmed Cases by Country')
fig.update_traces(textposition='inside')
# Hide slice labels that would need a font smaller than 12 to fit.
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

## Worldwide trend

In [None]:
# Worldwide totals per date: sum 'TargetValue' over all rows sharing a date,
# separately for confirmed cases and fatalities, then join the two on 'Date'.
ww_conf = (
    df_conf.groupby('Date')[['TargetValue']]
    .sum()
    .reset_index()
    .rename(columns={'TargetValue': 'Confirmed'})
)
ww_fatal = (
    df_fatal.groupby('Date')[['TargetValue']]
    .sum()
    .reset_index()
    .rename(columns={'TargetValue': 'Fatalities'})
)
# Outer join keeps dates that appear in only one of the two frames.
ww_df = ww_conf.merge(ww_fatal, on='Date', how='outer')
ww_df

In [None]:
# Reshape to long format: one row per (Date, variable) pair so that both
# series can be drawn from a single 'value' column.
ww_melt_df = ww_df.melt(id_vars=['Date'], value_vars=['Confirmed', 'Fatalities'])
ww_melt_df

In [None]:
# Line chart of the worldwide per-date totals, one line per variable.
fig = px.line(
    ww_melt_df,
    x="Date",
    y="value",
    color='variable',
    title="Worldwide Confirmed/Death Cases Over Time",
)
fig.show()

In [None]:
# Add running totals to the worldwide frame. The frame is sorted by date
# first so that the cumulative sums accumulate in chronological order.
ww_df.sort_values(by="Date", inplace=True)
for daily_col, running_col in (('Confirmed', 'Confirmed_cumsum'),
                               ('Fatalities', 'Fatalities_cumsum')):
    ww_df[running_col] = ww_df[daily_col].cumsum()
# 'New cases' is a copy of the per-date confirmed total under a display name.
ww_df['New cases'] = ww_df['Confirmed']

ww_cumsum_melt_df = ww_df.melt(
    id_vars=['Date'],
    value_vars=['Confirmed_cumsum', 'Fatalities_cumsum', 'New cases'],
)

fig = px.line(
    ww_cumsum_melt_df,
    x="Date",
    y="value",
    color='variable',
    title="Worldwide cumsum Confirmed/Death Cases Over Time",
)
fig.show()

In [None]:
# Same worldwide cumulative chart, drawn with a logarithmic y axis.
log_chart_options = dict(
    x="Date",
    y="value",
    color='variable',
    title="Worldwide cumsum Confirmed/Death Cases Over Time (Log scale)",
    log_y=True,
)
fig = px.line(ww_cumsum_melt_df, **log_chart_options)
fig.show()

## Country-wise growth

In [None]:
# Per-country totals per date: sum 'TargetValue' over all rows sharing a
# (Date, Country_Region) pair, separately for confirmed cases and fatalities.
group_keys = ['Date', 'Country_Region']

country_conf = (
    df_conf.groupby(group_keys)[['TargetValue']]
    .sum()
    .reset_index()
    .rename(columns={'TargetValue': 'Confirmed'})
)
country_fatal = (
    df_fatal.groupby(group_keys)[['TargetValue']]
    .sum()
    .reset_index()
    .rename(columns={'TargetValue': 'Fatalities'})
)
# Outer join keeps pairs that appear in only one of the two frames.
country_df = country_conf.merge(country_fatal, on=group_keys, how='outer')
country_df

In [None]:
# Add per-country running totals. Sorting by date first makes each country's
# cumulative sum accumulate in chronological order.
country_df.sort_values(by="Date", inplace=True)
country_df_2 = country_df.groupby(['Country_Region'])
for daily_col, running_col in (('Confirmed', 'Confirmed_cumsum'),
                               ('Fatalities', 'Fatalities_cumsum')):
    country_df[running_col] = country_df_2[daily_col].cumsum()
# 'New cases' is a copy of the per-date confirmed total under a display name.
country_df['New cases'] = country_df['Confirmed']
country_df

In [None]:
# Snapshot on the latest date in the data, restricted to countries whose
# cumulative confirmed count exceeds 10000, ordered from highest to lowest.
target_date = country_df['Date'].max()

on_target_date = country_df['Date'] == target_date
above_threshold = country_df['Confirmed_cumsum'] > 10000
top_country_df = (
    country_df[on_target_date & above_threshold]
    .sort_values('Confirmed_cumsum', ascending=False)
)
# Long format: one row per (country, variable) for the grouped bar chart.
top_country_melt_df = top_country_df.melt(
    id_vars='Country_Region',
    value_vars=['Confirmed_cumsum', 'Fatalities_cumsum'],
)

In [None]:
# Horizontal grouped bar chart of cumulative confirmed cases and fatalities.
# The row order is reversed before plotting.
bars_reversed = top_country_melt_df.iloc[::-1]
fig = px.bar(
    bars_reversed,
    x='value',
    y='Country_Region',
    color='variable',
    barmode='group',
    title=f'Confirmed Cases/Deaths on {target_date}',
    text='value',
    height=1500,
    orientation='h',
)
fig.show()

In [None]:
# Cumulative confirmed cases over time for the ten countries with the highest
# cumulative confirmed count in `top_country_df`.
# Despite the `top30_` prefix in the variable names, only ten rows are taken.
ranked_by_confirmed = top_country_df.sort_values('Confirmed_cumsum', ascending=False)
top30_countries = ranked_by_confirmed.head(10)['Country_Region'].unique()

is_top_country = country_df['Country_Region'].isin(top30_countries)
top30_countries_df = country_df[is_top_country]

fig = px.line(
    top30_countries_df,
    x='Date',
    y='Confirmed_cumsum',
    color='Country_Region',
    title=f'Confirmed Cases for top 10 country as of {target_date}',
)
fig.show()

In [None]:
# Cumulative fatalities over time for the ten countries with the highest
# cumulative fatality count in `top_country_df`.
# Despite the `top30_` prefix in the variable names, only ten rows are taken.
ranked_by_fatalities = top_country_df.sort_values('Fatalities_cumsum', ascending=False)
top30_countries = ranked_by_fatalities.head(10)['Country_Region'].unique()

is_top_country = country_df['Country_Region'].isin(top30_countries)
top30_countries_df = country_df[is_top_country]

fig = px.line(
    top30_countries_df,
    x='Date',
    y='Fatalities_cumsum',
    color='Country_Region',
    title=f'Fatalities for top 10 country as of {target_date}',
)
fig.show()

In [None]:
# Per-country snapshot on `target_date` with derived columns for the maps:
# log10 of (cumulative count + 1) for confirmed cases and fatalities, and
# the ratio of cumulative fatalities to cumulative confirmed cases.
# Take an explicit copy: assigning new columns to the result of `query`
# otherwise triggers SettingWithCopyWarning, and the writes may not land
# where intended.
all_country_df = country_df.query('Date == @target_date').copy()
# The "+ 1" keeps countries with a zero count at log10(1) == 0 instead of -inf.
all_country_df['confirmed_log1p'] = np.log10(all_country_df['Confirmed_cumsum'] + 1)
all_country_df['fatalities_log1p'] = np.log10(all_country_df['Fatalities_cumsum'] + 1)
# A zero denominator becomes NaN so the rate is reported as missing rather than inf.
all_country_df['mortality_rate'] = (
    all_country_df['Fatalities_cumsum']
    / all_country_df['Confirmed_cumsum'].replace(0, np.nan)
)

In [None]:
# World map coloured by log10(cumulative confirmed cases + 1) per country.
fig = px.choropleth(all_country_df, locations="Country_Region", 
                    locationmode='country names', color="confirmed_log1p", 
                    hover_name="Country_Region", hover_data=["Confirmed_cumsum", 'Fatalities_cumsum', 'mortality_rate'],
                    range_color=[all_country_df['confirmed_log1p'].min(), all_country_df['confirmed_log1p'].max()], 
                    color_continuous_scale="peach", 
                    title='Countries with Confirmed Cases')

# Plotly Express draws continuous colour through the shared layout-level
# colour axis, so a colorbar assigned on the trace has no effect. Setting the
# ticks on layout.coloraxis.colorbar makes the raw-value labels appear.
# Each tick sits at a log10 position and is labelled with the matching power
# of ten; this is approximate because the column stores log10(count + 1).
fig.update_layout(coloraxis_colorbar=dict(
    tickvals=[0, 1, 2, 3, 4, 5, 6, 7, 8],
    ticktext=['1', '10', '100', '1000','10000', '100000', '1000000', '10000000', '100000000']))
fig.show()

In [None]:
# World map coloured by the 'fatalities_log1p' column per country, with the
# colour range spanning that column's minimum and maximum.
fatalities_color_span = [
    all_country_df['fatalities_log1p'].min(),
    all_country_df['fatalities_log1p'].max(),
]
fig = px.choropleth(
    all_country_df,
    locations="Country_Region",
    locationmode='country names',
    color="fatalities_log1p",
    hover_name="Country_Region",
    range_color=fatalities_color_span,
    hover_data=["Confirmed_cumsum", 'Fatalities_cumsum', 'mortality_rate'],
    color_continuous_scale="peach",
    title='Countries with fatalities',
)
fig.show()

In [None]:
# World map coloured by 'mortality_rate' per country. The colour range is
# fixed to [0, 0.1] instead of the column's own min/max.
mortality_map_options = dict(
    locations="Country_Region",
    locationmode='country names',
    color="mortality_rate",
    hover_name="Country_Region",
    hover_data=["Confirmed_cumsum", 'Fatalities_cumsum', 'mortality_rate'],
    # Alternative: span the data instead of using a fixed range.
    #range_color=[all_country_df['mortality_rate'].min(), all_country_df['mortality_rate'].max()],
    range_color=[0,0.1],
    color_continuous_scale="peach",
    title='Countries with mortality rate',
)
fig = px.choropleth(all_country_df, **mortality_map_options)
fig.show()