# Note

This notebook focuses on finding hints for a better score by using simplified data, a simple strategy, and visualization.

## strategy of this notebook

The training data contains breaths with non-linear time_step values and breaths with negative pressure values.
I drop these breaths to simplify the data, which makes it easier to find the reasons for a poor score.

Overview:
* ignore breaths with non-linear time_step data
* ignore breaths with negative pressure data
* use only positive-shift (lag) features; do not use future data

## refer to
For visualization<br>
https://www.kaggle.com/tfukuda675/data-visualization-plotly-seaborn-matplot

# glossary

### LGBMRegressor
LGBM for regression<br>

https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html

### LGBMClassifier
LGBM for classification


# Import libraries

In [None]:
#!pip install joypy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
#from joypy import joyplot for matplotlib
import tensorflow as tf
import tqdm
import optuna
import time
import lightgbm as lgb
from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler, normalize, StandardScaler
from sklearn.model_selection import train_test_split, GroupKFold, KFold

%matplotlib inline

# Read Data

In [None]:
# Directory that holds the competition CSV files.
path = '../input/ventilator-pressure-prediction'
train_csv = f"{path}/train.csv"
test_csv = f"{path}/test.csv"

# Load the raw train and test tables exactly as stored on disk.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Confirm linearity of time_step

In [None]:
# A breath is flagged as "non-linear" when at least one gap between two
# consecutive time_step values inside it is larger than this limit.
time_step_diff_limit = 0.04

# diff() yields NaN for the first row of each breath; NaN > limit is False,
# so the first row can never flag a breath on its own.
non_liner_timestep_breath_ids = [
    breath_id
    for breath_id, breath_rows in train.groupby("breath_id")
    if (breath_rows["time_step"].diff() > time_step_diff_limit).any()
]

#print(non_liner_timestep_breath_ids)
## results are following:
## [803, 2327, 3178, 4199, 5830, 10277, 11502, 13238, 15803, 16315, 16634, 18117, 18600, 24127, 25397, 28189, 28942, 30181, 32296, 36128, 36175, 37711, 38237, 38415, 39045, 39722, 42317, 42988, 43344, 44245, 45197, 46324, 49849, 53877, 54129, 55244, 55851, 61454, 64662, 67422, 67748, 72104, 74766, 76037, 78768, 79105, 80375, 87127, 87776, 89084, 91883, 93186, 98677, 102063, 104001, 106034, 107067, 109693, 111439, 112027, 115588, 119689, 120878, 121135, 125136]

# Visualize non-linear "time_step"

In [None]:
# Rows that belong to the breaths flagged as non-linear.
is_non_linear = train["breath_id"].isin(non_liner_timestep_breath_ids)
non_liner_timestep_df = train[is_non_linear]

# One line per breath: x is the row position inside the breath,
# y is the recorded time_step.
fig = go.Figure()
for breath_id, breath_rows in non_liner_timestep_df.groupby("breath_id"):
    breath_rows = breath_rows.reset_index(drop=True)
    trace = go.Scatter(x=breath_rows.index, y=breath_rows["time_step"], mode='lines', name=breath_id)
    fig.add_trace(trace)
fig.show()

# Visualize linear "time_step"

There are several different gradients (slopes) of "time_step".

In [None]:
# Everything that was NOT flagged as non-linear.
is_linear = ~train["breath_id"].isin(non_liner_timestep_breath_ids)
liner_timestep_df = train[is_linear]

# Only the first 80*10000 rows are drawn to limit the number of traces
# (presumably 80 rows per breath x 10000 breaths - TODO confirm).
plot_rows = liner_timestep_df.iloc[:80 * 10000]

fig = go.Figure()
for breath_id, breath_rows in plot_rows.groupby("breath_id"):
    breath_rows = breath_rows.reset_index(drop=True)
    trace = go.Scatter(x=breath_rows.index, y=breath_rows["time_step"], mode='lines', name=breath_id)
    fig.add_trace(trace)
fig.show()

<br>
<br>

# END Visualize

<br>
<br>

# Utilities

In [None]:
def data_clean(df, time_step_diff_limit=0.04):
    """Drop every breath that has an irregular time step or a negative pressure.

    A breath (all rows sharing one ``breath_id``) is removed completely when

    * any difference between consecutive ``time_step`` values inside the
      breath is greater than ``time_step_diff_limit``, or
    * any ``pressure`` value inside the breath is below zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``breath_id``, ``time_step`` and ``pressure`` columns.
    time_step_diff_limit : float, default 0.04
        Largest allowed gap between consecutive ``time_step`` values of a
        breath. The default reproduces the previously hard-coded threshold.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame containing only the surviving breaths, with
        the original index labels and row order. ``df`` itself is not changed.
    """
    breath_ids = df["breath_id"]

    # diff() restarts for each breath, so the first row of every breath is NaN
    # and "NaN > limit" evaluates to False.
    step_too_large = df.groupby("breath_id")["time_step"].diff() > time_step_diff_limit
    irregular_ids = breath_ids[step_too_large].unique()

    # "min(pressure) < 0" for a breath is the same as "any pressure < 0".
    negative_ids = breath_ids[df["pressure"] < 0].unique()

    drop_rows = breath_ids.isin(irregular_ids) | breath_ids.isin(negative_ids)

    # Return a real copy so that callers can add columns to the result without
    # pandas raising SettingWithCopyWarning.
    return df[~drop_rows].copy()

def change_type(df):
    """One-hot encode the ``R`` and ``C`` columns.

    The ``R`` and ``C`` columns are replaced by indicator columns named
    ``R_<value>`` and ``C_<value>``. The indicator columns are appended after
    the remaining columns (all ``R_*`` first, then all ``C_*``).

    The encoding is done row by row with ``pd.get_dummies`` rather than by
    merging on the index, so the number of rows is preserved even when the
    index contains duplicate labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``R`` and ``C`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with ``R`` and ``C`` replaced by their indicator columns.
        ``df`` itself is not changed.
    """
    # NOTE(review): the indicator columns depend on the values present in
    # `df`; two frames with different sets of R/C values will get different
    # columns - confirm train and test cover the same values.
    return pd.get_dummies(df, columns=['R', 'C'], prefix=['R', 'C'])

def add_features(df):
    """Add per-breath cumulative, lag and difference features.

    All features only look at the current and earlier rows of the same
    ``breath_id`` (positive shifts), never at later rows.

    Added columns, in this order:

    * ``u_in_cumsum``  - running sum of ``u_in`` inside the breath
    * ``u_in_lag1``, ``u_out_lag1`` - value one row earlier in the breath
    * ``u_in_lag2``, ``u_out_lag2`` - value two rows earlier in the breath
    * ``u_in_diff1``, ``u_out_diff1`` - current value minus the lag-1 value
    * ``u_in_diff2``, ``u_out_diff2`` - current value minus the lag-2 value

    Lag and difference columns are NaN for the first one / two rows of each
    breath, where no earlier row exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``breath_id``, ``u_in`` and ``u_out`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the feature columns appended. ``df`` itself is
        not modified.
    """
    # Work on a copy so the caller's frame is not mutated and so that pandas
    # does not warn when `df` is a filtered slice of another frame.
    out = df.copy()

    # Build the grouping once on the untouched input and reuse it.
    by_breath = df.groupby('breath_id')
    u_in_by_breath = by_breath['u_in']
    u_out_by_breath = by_breath['u_out']

    out['u_in_cumsum'] = u_in_by_breath.cumsum()
    out['u_in_lag1'] = u_in_by_breath.shift(1)
    out['u_out_lag1'] = u_out_by_breath.shift(1)
    out['u_in_lag2'] = u_in_by_breath.shift(2)
    out['u_out_lag2'] = u_out_by_breath.shift(2)
    out['u_in_diff1'] = out['u_in'] - out['u_in_lag1']
    out['u_out_diff1'] = out['u_out'] - out['u_out_lag1']
    out['u_in_diff2'] = out['u_in'] - out['u_in_lag2']
    out['u_out_diff2'] = out['u_out'] - out['u_out_lag2']
    return out

# Read data again and set config


In [None]:
# Reload untouched copies of the competition files for the modelling part.
path = '../input/ventilator-pressure-prediction'
train_path, test_path = f"{path}/train.csv", f"{path}/test.csv"
train, test = pd.read_csv(train_path), pd.read_csv(test_path)
submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

# When True, the training data is truncated before modelling.
debug_mode = True

# In Debug mode

reduce data size

In [None]:
# Debug runs keep only the first 80*1000 rows of the training data
# (presumably 80 rows per breath x 1000 breaths - TODO confirm).
if debug_mode:
    train = train.iloc[:80 * 1000]

# Apply utilities to the data

In [None]:
# Train: drop unusable breaths, then add features, then one-hot encode R / C.
train = train.pipe(data_clean).pipe(add_features).pipe(change_type)

# Test: same feature pipeline, but without the cleaning step.
test = test.pipe(add_features).pipe(change_type)

# Prepare data for LGBM

In [None]:
# Target is kept as a single-column DataFrame.
y = train.loc[:, ['pressure']]

# Features: everything except the target and the identifier columns.
X = train.drop(columns=['pressure', 'id', 'breath_id'])
test_X = test.drop(columns=['id', 'breath_id'])

In [None]:
# Per-fold results collected by the cross-validation loop below.
scores = []
models = []
fold_importances = []

params = {'objective': 'regression',
          'learning_rate': 0.10,
          "boosting_type": "gbdt",
          'min_data_in_leaf':600,
          'max_bin': 196,
          #'device':'gpu',
          'feature_fraction':0.4,
          'lambda_l1':36, 'lambda_l2':80,
          'max_depth':16,
          'num_leaves':1000,
          "metric": 'mae',
          'n_jobs': -1
         }

# Split by breath_id so that all rows of one breath land in the same fold.
folds = GroupKFold(n_splits=5)

for fold_n, (train_index, valid_index) in enumerate(folds.split(train, y, groups=train['breath_id'])):
    print(f'Fold {fold_n} started at {time.ctime()}')
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    model = lgb.LGBMRegressor(**params, n_estimators=8000)
    # Early stopping and logging are passed as callbacks: the `verbose` and
    # `early_stopping_rounds` keyword arguments of fit() were removed in
    # LightGBM 4.0.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),
            lgb.log_evaluation(period=100),
        ],
    )
    score = metrics.mean_absolute_error(y_valid, model.predict(X_valid))

    models.append(model)
    scores.append(score)

    # Collect per-fold importances in a list; they are concatenated once
    # after the loop instead of growing a DataFrame on every iteration.
    fold_importance = pd.DataFrame()
    fold_importance["feature"] = X.columns.tolist()
    fold_importance["importance"] = model.feature_importances_
    fold_importance["fold"] = fold_n + 1
    fold_importances.append(fold_importance)

# Long-format table: one row per (feature, fold).
feature_importance = pd.concat(fold_importances, axis=0)

print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

In [None]:
# Average the test predictions of all fold models.
# The mean is computed over however many models were trained (no hard-coded
# fold count) and is assigned rather than accumulated, so re-running this
# cell produces the same file and does not depend on the initial contents of
# the sample submission's `pressure` column.
fold_predictions = [model.predict(test_X) for model in models]
submission['pressure'] = np.mean(fold_predictions, axis=0)

submission.to_csv('submission.csv', index=False)