In [None]:
import os
import numpy as np
import pandas as pd
import random
import seaborn as sns

import datetime as datetime
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from contextlib import contextmanager
from time import time
from tqdm import tqdm
import lightgbm as lgbm

from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [None]:
# Read the Osaka daily summary CSV and preview its five most recent rows,
# transposed so the wide frame can be read top to bottom.
data0 = pd.read_csv("../input/covid19-osaka/osaka_summary - 0718.csv")
data0.tail(5).T

In [None]:
# Show (rows, columns) of the raw frame.
print(data0.shape)

In [None]:
# Daily positivity in percent, smoothed with a 7-row trailing mean
# (the first 6 rows are NaN because the window is not yet full).
daily_positive_rate = data0['examined positives'] * 100 / data0['examined']
data0['% positive rate 7-day'] = daily_positive_rate.rolling(window=7).mean()

# Same 7-row trailing mean for the raw count columns.
for source_col in ('examined positives', 'current positives', 'deaths'):
    data0[source_col + ' 7-day'] = data0[source_col].rolling(window=7).mean()

data0.tail(5).T

In [None]:
# Line chart: 7-day rolling mean of examined positives against date.
fig = make_subplots(specs=[[{"secondary_y": False}]])
fig.add_trace(
    go.Scatter(
        x=data0['date'],
        y=data0['examined positives 7-day'],
        name='examined positives 7-day',
    ),
    secondary_y=False,
)
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Cases", secondary_y=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    title_text="Examined Positives (rolling 7-day) in Osaka",
)
fig.show()

In [None]:
# Line chart: 7-day rolling mean of the daily positive rate (percent) against date.
fig = make_subplots(specs=[[{"secondary_y": False}]])
fig.add_trace(
    go.Scatter(
        x=data0['date'],
        y=data0['% positive rate 7-day'],
        name='% positive rate 7-day',
    ),
    secondary_y=False,
)
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="% rate", secondary_y=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    title_text="% Positive Rate (rolling 7-day) in Osaka",
)
fig.show()

In [None]:
# Line chart: 7-day rolling mean of current positives against date.
fig = make_subplots(specs=[[{"secondary_y": False}]])
fig.add_trace(
    go.Scatter(
        x=data0['date'],
        y=data0['current positives 7-day'],
        name="current positives 7-day",
    ),
    secondary_y=False,
)
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Cases", secondary_y=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    title_text="Current Positive Cases (rolling 7-day) in Osaka",
)
fig.show()

In [None]:
# Line chart: 7-day rolling mean of deaths against date.
fig = make_subplots(specs=[[{"secondary_y": False}]])
fig.add_trace(
    go.Scatter(
        x=data0['date'],
        y=data0['deaths 7-day'],
        name="deaths 7-day",
    ),
    secondary_y=False,
)
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Cases", secondary_y=False)
fig.update_layout(
    autosize=False,
    width=700,
    height=500,
    title_text="Death Cases (rolling 7-day) in Osaka",
)
fig.show()

### Which factor is most strongly related to the current number of deaths?

In [None]:
# Display every column name as a plain Python list.
data0.columns.tolist()

In [None]:
# Names of the lag columns, ordered ep-1, cp-1, ap-1, ep-2, ... for lags 1 through 40.
col1 = [
    f'{prefix}-{lag}'
    for lag in range(1, 41)
    for prefix in ('ep', 'cp', 'ap')
]
print(col1)

In [None]:
# Full column list: the existing columns followed by the lag column names.
col0 = [*data0.columns, *col1]
print(col0)

In [None]:
# Empty frame that carries the original columns plus every lag column from `col1`.
data2=pd.DataFrame(columns=col0)
# Concatenating with the empty frame adds the lag columns to the data; they start out with no values.
data3=pd.concat([data0,data2])
# Preview the last five rows, transposed.
data3[-5:].T

In [None]:
# Number of rows in the original data; later cells use it to slice `data3` back to this length.
n = data0.shape[0]
print(n)

### Add lagged candidate features: current positives, examined positives and accumulated positives from 1 to 40 days before

In [None]:
# Push each row's 'current positives' forward: row (day + lag) receives it in
# column 'cp-<lag>', so 'cp-k' on a given row is the value from k rows earlier.
# Writing to labels >= n appends new rows to data3.
for day in range(n):
    current_positives = data3['current positives'][day]
    for lag in range(1, 41):
        data3.loc[day + lag, f'cp-{lag}'] = current_positives

In [None]:
# Push each row's 'examined positives' forward: row (day + lag) receives it in
# column 'ep-<lag>', so 'ep-k' on a given row is the value from k rows earlier.
for day in range(n):
    examined_positives = data3['examined positives'][day]
    for lag in range(1, 41):
        data3.loc[day + lag, f'ep-{lag}'] = examined_positives

In [None]:
# Push each row's 'accumulated positives' forward: row (day + lag) receives it in
# column 'ap-<lag>', so 'ap-k' on a given row is the value from k rows earlier.
# NOTE(review): the label list assigned to `df_columns` spells this column
# 'acumulated positives' — confirm the CSV header matches the key used here.
for day in range(n):
    accumulated_positives = data3['accumulated positives'][day]
    for lag in range(1, 41):
        data3.loc[day + lag, f'ap-{lag}'] = accumulated_positives

In [None]:
# Last five of the first n rows (the rows appended by the lag loops are excluded), transposed.
data3.iloc[:n].tail(5).T

In [None]:
# Target: the 'deaths' column over the first n rows.
datay = data3.iloc[:n]['deaths']
# Features: every other column except the smoothed target and the date.
datax = data3.iloc[:n].drop(columns=['deaths', 'deaths 7-day', 'date'])
datax.tail(5).T

In [None]:
# Display the feature column labels before they are dropped by the NumPy conversion below.
datax.columns

In [None]:
# Rebind both names to NumPy arrays; column labels are not carried over.
datax, datay = np.array(datax), np.array(datay)

In [None]:
# Column labels for the feature array: twelve base names followed by the lag names in `col1`.
# NOTE(review): these are assigned by position to the array columns, so the
# spellings here do not have to match the CSV header — confirm count and order.
df_columns = [
    'examined',
    'examined positives',
    'acumulated positives',
    'current positives',
    'left hospital',
    'acumulated left hospital',
    'left hospital found',
    'accumulated left hospital found',
    'link unkonwn',
    '% positive rate 7-day',
    'examined positives 7-day',
    'current positives 7-day',
    *col1,
]


In [None]:
# Wrap the feature array in a DataFrame again, labelling its columns with `df_columns`.
data_df = pd.DataFrame(datax, columns=df_columns)

In [None]:
def create_numeric_feature(input_df):
    """Return a copy of ``input_df`` restricted to the columns in the global ``df_columns``."""
    selected = input_df[df_columns]
    return selected.copy()

In [None]:
from contextlib import contextmanager
from time import time

class Timer:
    """Context manager that times a ``with`` block and reports the elapsed seconds.

    Parameters
    ----------
    logger : object with an ``info(str)`` method, optional
        Receives the formatted message. When omitted the message is printed.
    format_str : str
        ``str.format`` template that receives the duration in seconds.
    prefix, suffix : optional
        Text placed before / after the formatted duration, joined with ``sep``.
    sep : str
        Separator used between prefix, duration and suffix.
    """

    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' '):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds of the last completed block; 0 while no block has finished."""
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()
        # Clear any previous end time so a reused timer does not report a
        # stale (negative) duration while the new block is still running.
        self.end = None
        # Return the instance so `with Timer() as t:` binds the timer, not None.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)

In [None]:
from tqdm import tqdm

def to_feature(input_df):
    """Run each feature processor on ``input_df`` and join the results column-wise.

    Every processor is timed with ``Timer`` and must return a frame with the
    same number of rows as ``input_df``; otherwise an ``AssertionError``
    carrying the processor's name is raised.
    """
    processors = [create_numeric_feature]

    out_df = pd.DataFrame()
    for processor in tqdm(processors, total=len(processors)):
        name = processor.__name__
        with Timer(prefix='create' + name + ' '):
            features = processor(input_df)

        assert len(features) == len(input_df), name
        out_df = pd.concat([out_df, features], axis=1)

    return out_df

In [None]:
# Drop the first 7 rows from both target and features — presumably to skip the
# warm-up of the 7-row rolling means (TODO confirm; only the first 6 rows are NaN).
y = datay[7:]
train_feat_df = to_feature(data_df.iloc[7:])

In [None]:
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error

def fit_lgbm(X, y, cv, params: dict = None, verbose: int = 50):
    """Train one ``LGBMRegressor`` per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X, y : array-like indexable by integer index arrays (``X[idx]`` / ``y[idx]``)
        Feature matrix and target.
    cv : iterable of ``(idx_train, idx_valid)`` pairs
        Fold definitions, e.g. ``list(KFold(...).split(X, y))``.
    params : dict, optional
        Keyword arguments for ``lgbm.LGBMRegressor``; defaults to ``{}``.
    verbose : int
        Evaluation results are logged every ``verbose`` boosting rounds.

    Returns
    -------
    oof_pred : np.ndarray
        Float array shaped like ``y`` holding each sample's validation-fold
        prediction (0 for samples never placed in a validation fold).
    models : list
        The fitted regressors, one per fold, in fold order.
    """
    if params is None:
        params = {}

    # `log_evaluation` replaced `print_evaluation` in LightGBM 3.3; fall back for older installs.
    log_evaluation = getattr(lgbm, 'log_evaluation', None) or lgbm.print_evaluation

    models = []
    # `np.float` was removed in NumPy 1.24; np.float64 is the same dtype.
    oof_pred = np.zeros_like(y, dtype=np.float64)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMRegressor(**params)

        # Early stopping and periodic logging go through callbacks: the
        # `early_stopping_rounds=` / `verbose=` arguments of fit() were removed
        # in LightGBM 4.0.
        callbacks = [
            lgbm.early_stopping(100, verbose=bool(verbose)),
            log_evaluation(period=verbose),
        ]

        with Timer(prefix='fit fold={} '.format(i)):
            clf.fit(x_train, y_train,
                    eval_set=[(x_valid, y_valid)],
                    callbacks=callbacks)

        pred_i = clf.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        # sqrt(MSE) on the raw target is RMSE (no log transform is applied).
        print(f'Fold {i} RMSE: {mean_squared_error(y_valid, pred_i) ** .5:.4f}')
        print()

    score = mean_squared_error(y, oof_pred) ** .5
    print('-' * 50)
    print('FINISHED | Whole RMSE: {:.4f}'.format(score))
    return oof_pred, models

In [None]:
# Keyword arguments forwarded to lgbm.LGBMRegressor by fit_lgbm.
params = dict(
    objective='rmse',
    learning_rate=0.1,
    reg_lambda=1.0,
    reg_alpha=0.1,
    max_depth=5,
    n_estimators=10000,
    colsample_bytree=0.5,
    min_child_samples=10,
    subsample_freq=3,
    subsample=0.9,
    importance_type='gain',
    random_state=71,
)

In [None]:
from sklearn.model_selection import KFold

# Shuffled 10-fold split with a fixed seed; the index pairs are materialised as a list.
fold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=71,
)
cv = list(fold.split(train_feat_df, y))

# Fit one model per fold on the raw value array and keep the out-of-fold predictions.
oof, models = fit_lgbm(
    train_feat_df.values,
    y,
    cv,
    params=params,
    verbose=500,
)

In [None]:
# Out-of-fold predictions (x) against the training target (y).
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_ylabel('train_y', fontsize=20)
ax.set_xlabel('oof', fontsize=20)
ax.scatter(oof, y)

In [None]:
def visualize_importance(models, feat_train_df):
    """Draw a boxen plot of per-fold feature importances for the top features.

    One row per (fold, feature) is collected from each model's
    ``feature_importances_``; features are ranked by their importance summed
    over folds and at most the first 50 are plotted.

    Returns the ``(fig, ax)`` pair of the plot.
    """
    feature_importance_df = pd.DataFrame()
    for fold_no, model in enumerate(models, start=1):
        fold_df = pd.DataFrame({
            'feature_importance': model.feature_importances_,
            'column': feat_train_df.columns,
            'fold': fold_no,
        })
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_df],
            axis=0,
            ignore_index=True,
        )

    # Rank features by total importance across folds, highest first.
    summed = feature_importance_df.groupby('column').sum()[['feature_importance']]
    ranked = summed.sort_values('feature_importance', ascending=False)
    order = ranked.index[:50]

    # Figure height grows with the number of plotted features.
    fig_height = max(6, len(order) * .25)
    fig, ax = plt.subplots(figsize=(8, fig_height))
    sns.boxenplot(
        data=feature_importance_df,
        x='feature_importance',
        y='column',
        order=order,
        ax=ax,
        palette='viridis',
        orient='h',
    )
    ax.tick_params(axis='x', rotation=0)
    ax.set_title('Importance')
    ax.grid()
    fig.tight_layout()
    return fig, ax

# Plot importances of the fold models, labelled with the training feature columns.
fig, ax = visualize_importance(models, train_feat_df)

## Conclusion
#### 'cp-12' (current positives 12 days before) is the parameter most strongly related to 'deaths'.
#### Among the 'ep-*' features, 'ep-20' (examined positives 20 days before) is the most strongly related.

In [None]:
# Scatter of the 'cp-11' lag feature (rows from label 7 onward) against the target.
# NOTE(review): the conclusion text names 'cp-12' but this plots 'cp-11' — confirm which lag is intended.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_ylabel('deaths', fontsize=20)
ax.set_xlabel('cp-11', fontsize=20)
ax.scatter(data_df.loc[7:, 'cp-11'], y)