## 0. Introduction
- Updated 10/15/2021
- GPU usage
- This notebook is a baseline that uses the LightGBM train method.
- Plots the training curves to check for overfitting
- Shows feature importance using LightGBM's feature_importance function
- Produces a submission file

In [None]:
class Config:
    """Notebook-wide switches and constants (flags are 0/1 integers)."""

    def __init__(self):
        # Directory that holds train.csv, test.csv and sample_submission.csv.
        self.data_dir = '../input/ventilator-pressure-prediction/'

        # Debug mode: when truthy, the readers keep only `config_size`
        # randomly sampled breath_ids.
        self.config = 0
        self.config_size = 754

        # Build/use the GPU version of LightGBM.
        self.gpu_on = 1

        # optuna_tuner: run the Optuna LightGBMTunerCV search instead of training.
        # optuna_train: train through the Optuna LightGBM integration.
        self.optuna_tuner = 0
        self.optuna_train = 0

        # Constants used when post-processing the predicted pressure.
        self.post_processing = dict(
            max_pressure=64.82099173863948,
            min_pressure=-1.8957442945646408,
            diff_pressure=0.07030215,
        )


config = Config()

### 0-1. GPU Prepare
- You need to turn on the "GPU Accelerator"
- You need to turn on the Internet setting
- https://www.kaggle.com/dromosys/gpu-accelerated-lightgbm-full

In [None]:
# Remove the installed lightgbm package directory and fetch the LightGBM sources.
if config.gpu_on:
    # NOTE(review): the path is hard-coded to python3.6 — confirm it matches the kernel's Python.
    !rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
    !git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
# Install the Boost development packages (presumably required by the GPU build — confirm).
if config.gpu_on:
    !apt-get install -y -qq libboost-all-dev

- If the build cell below is commented out, uncomment it when you want to use the GPU

In [None]:
%%bash
# Build LightGBM with GPU (OpenCL) support from the sources cloned earlier.
# Nothing to build when the sources are absent (e.g. the clone cell was skipped);
# without this guard the commands below would run in the working directory.
if [ ! -d LightGBM ]; then
    echo "LightGBM sources not found; skipping GPU build." >&2
    exit 0
fi
cd LightGBM
# -f: a fresh clone has no build directory yet, so do not treat that as an error.
rm -rf build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

In [None]:
# Install the Python package from the cloned sources, reusing the library built above (--precompile).
if config.gpu_on:
    !cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
# Write the NVIDIA OpenCL vendor file, then delete the cloned sources.
if config.gpu_on:
    !mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
    !rm -r LightGBM

In [None]:
# Print the GPU status reported by nvidia-smi.
if config.gpu_on:
    !nvidia-smi

### 0-2. Library

In [None]:
import os
import gc
import glob
import time
import random
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as lgbo
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler

## 1 EDA & Preprocessing
### 1-1. Train & Test data

In [None]:
# Column dtypes applied after loading the CSV files.
# The integer columns use narrow types (int32 / int8) to reduce memory usage.
dtypes = {'id': 'int32',
          'breath_id': 'int32',
          'R' : 'int8',
          'C' : 'int8',
          'time_step': 'float64',
          'u_in': 'float64',
          'u_out': 'int8',
          'pressure': 'float64'}

# Read train CSV data
def read_train():
    """Load ``train.csv`` from ``config.data_dir`` and cast it to ``dtypes``.

    When ``config.config`` is truthy (debug mode), only ``config.config_size``
    randomly chosen breaths are kept. The random seed is fixed, so the same
    breaths are selected on every call. Rows are returned breath by breath in
    sampled order, with the original row order inside each breath.

    Returns:
        pandas.DataFrame: the (possibly sub-sampled) training data.
    """
    train = pd.read_csv(config.data_dir + 'train.csv')
    # Select random breath_id for debug
    if config.config:
        random.seed(2021)
        # random.sample() no longer accepts a set (TypeError since Python 3.11),
        # so draw from a list. Never request more ids than exist.
        breath_ids = train['breath_id'].unique().tolist()
        lst_train = random.sample(breath_ids, min(config.config_size, len(breath_ids)))

        # Filter once with isin() instead of concatenating one breath at a
        # time, then restore the sampled breath order with a stable sort.
        rank = {breath_id: pos for pos, breath_id in enumerate(lst_train)}
        train = train[train['breath_id'].isin(lst_train)]
        order = np.argsort(train['breath_id'].map(rank).to_numpy(), kind='stable')
        train = train.iloc[order]
    train = train.astype(dtypes)
    return train

# Read test CSV data
def read_test():
    """Load ``test.csv`` from ``config.data_dir`` and cast it to ``dtypes``.

    When ``config.config`` is truthy (debug mode), only ``config.config_size``
    randomly chosen breaths are kept. The random seed is fixed, so the same
    breaths are selected on every call. Rows are returned breath by breath in
    sampled order, with the original row order inside each breath.

    Returns:
        pandas.DataFrame: the (possibly sub-sampled) test data.
    """
    test = pd.read_csv(config.data_dir + 'test.csv')
    # Select random breath_id for debug
    if config.config:
        random.seed(2021)
        # random.sample() no longer accepts a set (TypeError since Python 3.11),
        # so draw from a list. Never request more ids than exist.
        breath_ids = test['breath_id'].unique().tolist()
        lst_test = random.sample(breath_ids, min(config.config_size, len(breath_ids)))

        # Filter once with isin() instead of concatenating one breath at a
        # time, then restore the sampled breath order with a stable sort.
        rank = {breath_id: pos for pos, breath_id in enumerate(lst_test)}
        test = test[test['breath_id'].isin(lst_test)]
        order = np.argsort(test['breath_id'].map(rank).to_numpy(), kind='stable')
        test = test.iloc[order]
    # Cast only the columns that exist: the shared dtypes mapping may still
    # contain the target column, which the test file does not have.
    test = test.astype({col: dtype for col, dtype in dtypes.items() if col in test.columns})
    return test

# Load the training data and preview the first two rows.
train = read_train()   
train.head(2)

### 1-2. Exploratory Data Analysis
### Feature
- id - globally-unique time step identifier across an entire file
- breath_id - globally-unique time step for breaths
- R - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.
- C - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.
- time_step - the actual time stamp.
- u_in - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.
- u_out - the control input for the expiratory solenoid valve. Either 0 or 1.
- pressure - the airway pressure measured in the respiratory circuit, measured in cmH2O.

In [None]:
# Summary statistics for every column except the first one (id), rounded to 3 decimals
train[train.columns[1:]].describe(include='all').round(3)

### 1-3. Time series data(pressure/ u_in / u_out)
- from [https://www.kaggle.com/kaitohonda/beginner-lgbm](https://www.kaggle.com/kaitohonda/beginner-lgbm)

In [None]:
# Plot pressure / u_in (left axis) and u_out (right axis) for three random breaths.
# random.sample() rejects sets since Python 3.11, so sample from a list of ids
# and never ask for more ids than are available.
breath_ids = train['breath_id'].unique().tolist()
lst_train = random.sample(breath_ids, min(config.config_size, len(breath_ids)))
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
sns.set(font_scale=1.2)
for i, num in enumerate(random.sample(lst_train, 3)):
    df = train[train['breath_id']==num]
    # Secondary y-axis so the 0/1 u_out signal stays readable next to pressure and u_in.
    ax2 = ax[i].twinx()

    sns.lineplot(data=df, x='time_step', y='pressure', label='pressure', ax=ax[i])
    sns.lineplot(data=df, x='time_step', y='u_in', label='u_in', ax=ax[i])
    sns.lineplot(data=df, x='time_step', y='u_out', label='u_out', ax=ax2, color='r')

    ax[i].set(xlabel='Timestep', ylabel='pressure, u_in', title=f'breath_id: {num}', xlim=(-0.2, 3.2), ylim=(-5, 105))
    ax[i].legend(loc=(0.75, 0.7))
    ax2.legend(loc=(0.75, 0.6))
plt.show()

### 1-4. Preprocessing

In [None]:
def log_exp_return(series):
    """Return exp of the step-to-step difference of ``log1p(series)``.

    For consecutive values this is the ratio ``(1 + x_t) / (1 + x_{t-1})``.
    Missing differences (including the first element, which has no
    predecessor) are treated as 0 and therefore map to 1.
    """
    log_values = np.log1p(series)
    log_step = log_values.diff(1)
    log_step = log_step.fillna(0)
    return np.exp(log_step)

def data_clean(df, time_step_diff_limit=0.04, max_u_out_steps=51):
    """Drop whole breaths that fail any of three sanity checks.

    A breath (all rows sharing one ``breath_id``) is removed when:

    1. any gap between consecutive ``time_step`` values exceeds
       ``time_step_diff_limit`` (the time steps are not evenly spaced),
    2. it contains a negative ``pressure`` value, or
    3. it has more than ``max_u_out_steps`` rows with ``u_out == 1``.

    Every check works on the frame that was passed in (not on a notebook-level
    global), and a breath with no ``u_out == 1`` rows simply counts as 0.

    Args:
        df: frame with ``breath_id``, ``time_step``, ``pressure`` and ``u_out``.
        time_step_diff_limit: largest allowed gap between consecutive time steps.
        max_u_out_steps: largest allowed number of ``u_out == 1`` rows per breath.

    Returns:
        pandas.DataFrame: the rows of ``df`` belonging to the breaths that were
        kept; the input frame itself is not modified.
    """
    # Remove breaths whose time_step is not evenly spaced
    step_gap = df.groupby("breath_id")["time_step"].diff()
    irregular_ids = df.loc[step_gap > time_step_diff_limit, "breath_id"].unique()
    df = df[~df["breath_id"].isin(irregular_ids)]

    # Remove breaths that contain a negative pressure value
    min_pressure = df.groupby("breath_id")["pressure"].min()
    negative_ids = min_pressure.index[min_pressure < 0]
    df = df[~df["breath_id"].isin(negative_ids)]

    # Remove breaths with too many u_out == 1 steps (52 or more by default)
    open_steps = (df["u_out"] == 1).groupby(df["breath_id"]).sum()
    too_long_ids = open_steps.index[open_steps > max_u_out_steps]
    df = df[~df["breath_id"].isin(too_long_ids)]

    return df


def preprocessing(df):
    """Build the model features for a train or test frame.

    Adds, per breath: time differences, u_in / area ratios, aggregates over
    fixed time windows, forward/backward lags and differences of u_in and
    u_out, first/last values, cumulative sum and deviations from the per-breath
    max and mean. R, C, R__C and u_out are finally one-hot encoded (first level
    dropped).

    Args:
        df: frame with at least ``breath_id``, ``time_step``, ``u_in``,
            ``u_out``, ``R`` and ``C``. New columns are added to it in place
            before the first merge creates a new frame.

    Returns:
        pandas.DataFrame: the feature frame, rows in the same order as ``df``.
        NOTE(review): the one-hot columns depend on the category levels present
        in ``df`` — confirm train and test yield the same columns.
    """
    # time diff
    df['time_diff'] = df['time_step'].groupby(df['breath_id']).diff(1).fillna(0)

    # basic parameter
    # transform() keeps the result on the original row index; apply() may
    # prepend the group key to the index depending on the pandas version.
    df['u_in_ratio'] = df['u_in'].groupby(df['breath_id']).transform(log_exp_return)
    df['area_unit'] = df['u_in'] * df['time_diff']
    df['area_ratio'] = df['area_unit'].groupby(df['breath_id']).transform(log_exp_return)

    # Create Time Windows
    def create_time_window(df, time_min, time_max, diff_time):
        """Merge per-breath aggregates of each [t - diff_time, t) window onto every row."""
        feature_dict = {
                        'u_in': [np.max, np.std],
                        'area_unit': [np.max, np.std],
                        'u_in_ratio': [np.prod, np.std],
                        'area_ratio': [np.prod, np.std]
                        }
        for time_stamp in np.arange(time_min, time_max, diff_time):
            in_window = (df['time_step'] >= time_stamp - diff_time) & (df['time_step'] < time_stamp)
            df_tmp = df.loc[in_window].groupby('breath_id').agg(feature_dict)
            df_tmp.columns = ['_'.join(col) for col in df_tmp.columns]
            # Breaths without rows in this window get NaN here (filled with 0 later).
            df = pd.merge(df, df_tmp.add_suffix(f'_{time_stamp}_term').reset_index(), on='breath_id', how='left')
            del df_tmp
            gc.collect()

        return df

    df = create_time_window(df, 0.5, 2.0, 0.5)

    # u_in shift change
    for i in np.arange(1, 5, 1):
        df[f'u_in_lag_fwrd{i}'] = df['u_in'].groupby(df['breath_id']).shift(i)
        df[f'u_in_lag_back{i}'] = df['u_in'].groupby(df['breath_id']).shift(int(-i))

        df[f'u_out_lag_fwrd{i}'] = df['u_out'].groupby(df['breath_id']).shift(i)
        df[f'u_out_lag_back{i}'] = df['u_out'].groupby(df['breath_id']).shift(int(-i))

        df[f'u_in_diff{i}'] = df['u_in'] - df[f'u_in_lag_fwrd{i}']
        df[f'u_out_diff{i}'] = df['u_out'] - df[f'u_out_lag_fwrd{i}']

    # u_in parameter
    df['last_value_u_in'] = df['u_in'].groupby(df['breath_id']).transform('last')
    df['first_value_u_in'] = df['u_in'].groupby(df['breath_id']).transform('first')
    df['u_in_cumsum'] = df['u_in'].groupby(df['breath_id']).cumsum()
    # transform() broadcasts the per-breath statistic back to every row. A plain
    # .max()/.mean() is indexed by breath_id and would be subtracted from the
    # wrong rows (aligned on row index versus breath_id).
    df['u_in_diff_max'] = df['u_in'] - df['u_in'].groupby(df['breath_id']).transform('max')
    df['u_in_diff_ave'] = df['u_in'] - df['u_in'].groupby(df['breath_id']).transform('mean')

    # u_in area
    df['last_value_area'] = df['area_unit'].groupby(df['breath_id']).transform('last')
    df['first_value_area'] = df['area_unit'].groupby(df['breath_id']).transform('first')

    # Lags at breath boundaries and empty time windows are NaN up to here.
    df = df.fillna(0)

    # u_out parameter (computed while u_out is still numeric)
    df['cross_u_in']= df['u_in'] * df['u_out']
    df['cross_area']= df['area_unit'] * df['u_out']
    df['cross_time']= df['time_step'] * df['u_out']
    df['u_out'] = df['u_out'].astype('str')

    # R, C parameter: converted to strings so get_dummies one-hot encodes them
    df['R'] = df['R'].astype('str')
    df['C'] = df['C'].astype('str')
    df['R__C'] = df["R"].astype(str) + '__' + df["C"].astype(str)
    df = pd.get_dummies(df, drop_first=True)

    return df

# Remove unusable breaths, split off the target, then build the features.
train = data_clean(train)
# Target is taken after cleaning so it stays row-aligned with the features.
target = train["pressure"].values
train = train.drop(["id", "pressure"], axis=1)
train = preprocessing(train)
# breath_id is only needed for the per-breath features; it is not a model input.
train = train.drop(['breath_id'], axis=1)
# Keep the feature names; they are passed to the LGBM class for the importance table.
feature_column = train.columns.values

### 1-5. RobustScaler

In [None]:
# Fit the scaler on the training features; `rs` is reused later to transform the test set.
rs = RobustScaler()
# `train` is rebound to the scaler's output from here on.
train = rs.fit_transform(train)
print(f'train shape: {train.shape}')

## 2. LightGBM
### 2-1. LightGBM class
- Wrap training in a class
- This makes it easier to change any parameter

In [None]:
class LGBM:
    """K-fold LightGBM training helper.

    Holds the training matrix, the target and the fold splitter, and collects
    the per-fold boosters, evaluation histories and feature importances.
    """

    def __init__(self, feature_column, train, target):
        """Store the data and set the training / cross-validation settings."""
        # Data and feature names
        self.train, self.target = train, target
        self.feature_column = feature_column

        # Boosting rounds: a short schedule when training through Optuna,
        # a long one (bounded by early stopping) otherwise.
        self.num_boost_round_optuna = 100
        if config.optuna_train:
            self.num_boost_round, self.verbose_eval = 200, 100
        else:
            self.num_boost_round, self.verbose_eval = 15000, 5000
        self.early_stopping_rounds = 100

        # Cross-validation
        self.n_splits = 3
        self.kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=2021)

        # Results filled in by lgbm_train()
        self.boosters = []
        self.eval_results_lst = []
        self.feature_importance = pd.DataFrame()

        gc.collect()

    # Hyperparameter search with Optuna LightGBMTunerCV
    def optuna_tuner(self, params):
        """Tune ``params`` with LightGBMTunerCV on the stored folds; return the best parameters."""
        full_set = lgbo.Dataset(self.train, label=self.target)
        study_tuner = optuna.create_study(direction='minimize')
        tuner_kwargs = dict(
            params=params,
            train_set=full_set,
            num_boost_round=self.num_boost_round_optuna,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=self.verbose_eval,
            folds=self.kf,
            study=study_tuner,
        )
        tuner = lgbo.LightGBMTunerCV(**tuner_kwargs)
        tuner.run()
        return tuner.best_params

    def lgbm_train(self, params, reg):
        """Train one booster per fold with ``reg.train`` and record its results.

        ``reg`` is the module providing ``Dataset`` and ``train`` (the notebook
        passes either ``lgb`` or ``lgbo``). Each fold appends to ``boosters``,
        ``eval_results_lst`` and ``feature_importance``.
        """
        banner = "=" * 15
        splits = self.kf.split(self.train, self.target)
        for fold_no, (trn_idx, val_idx) in enumerate(splits, start=1):
            eval_results = {}
            print(f'{banner} Fold {fold_no} started at {time.ctime()} {banner}')
            lgb_trn = reg.Dataset(self.train[trn_idx], label=self.target[trn_idx])
            lgb_val = reg.Dataset(self.train[val_idx], label=self.target[val_idx])

            train_kwargs = dict(
                params=params,
                train_set=lgb_trn,
                valid_sets=[lgb_trn, lgb_val],
                valid_names=['Train', 'Valid'],
                num_boost_round=self.num_boost_round,
                early_stopping_rounds=self.early_stopping_rounds,
                verbose_eval=self.verbose_eval,
                evals_result=eval_results,
            )
            booster = reg.train(**train_kwargs)

            self.eval_results_lst.append(eval_results)
            self.boosters.append(booster)

            # One row per feature for this fold, appended to the running table.
            fold_importance = pd.DataFrame({
                "feature": self.feature_column,
                "importance": booster.feature_importance(),
                "fold": fold_no,
            })
            self.feature_importance = pd.concat([self.feature_importance, fold_importance], axis=0)

            del lgb_trn, lgb_val, fold_importance, booster, eval_results
            gc.collect()

    # Visualize the most important columns
    def importance_show(self, top=20):
        """Draw one bar chart per fold with that fold's ``top`` most important features."""
        importance = self.feature_importance
        sns.set(font_scale=1.2)
        fig, axes = plt.subplots(1, self.n_splits, figsize=(30, 20))
        for fold_no, axis in enumerate(axes, start=1):
            ranked = importance[importance["fold"] == fold_no].sort_values('importance', ascending=False)
            sns.barplot(data=ranked[:top], x="importance", y="feature", ci=None, ax=axis)
            axis.set_title(f"Fold: {fold_no}")
        plt.tight_layout()
        plt.show()

        del importance
        gc.collect()

### 2-3. LightGBM Train

In [None]:
# Trainer holding the scaled feature matrix, the target and the feature names.
lgbm_inst = LGBM(feature_column, train, target)

# Base parameters: L1 (MAE) objective and metric
best_params = {
                'objective': 'regression_l1', 
                'metric': 'l1', 
                'boosting_type': 'gbdt', 
                'verbose': -1, 'random_state': 2021
              }

# Add the GPU device settings when GPU training is enabled
if config.gpu_on:
    best_params.update({'device': 'gpu', 'gpu_platform_id': 0, 'gpu_device_id': 0})

# TunerCV mode: only search for parameters; no boosters are trained in this branch
if config.optuna_tuner:
    best_params = lgbm_inst.optuna_tuner(best_params)

# Train mode
else:
    # Train through the Optuna LightGBM integration (lgbo) with the base parameters
    if config.optuna_train:
        lgbm_inst.lgbm_train(best_params, lgbo)
    else:
        # Train plain LightGBM with a fixed set of hyperparameters
        # (presumably taken from an earlier tuning run — TODO confirm)
        best_params.update({
                            'feature_pre_filter': False, 
                            'lambda_l1': 0.0, 
                            'lambda_l2': 0.0, 
                            'num_leaves': 255, 
                            'feature_fraction': 0.8999999999999999, 
                            'bagging_fraction': 1.0, 
                            'bagging_freq': 0, 
                            'min_child_samples': 20
                            })
        lgbm_inst.lgbm_train(best_params, lgb)
        
gc.collect()    
# Show the parameters that were tuned or used for training
print(best_params)

### 2-4. Checking overfitting
- Checking overfitting using eval_results dict

In [None]:
# Plot the log of the train/valid L1 curves for each fold (skipped in tuner-only mode).
if not config.optuna_tuner:
    fig, ax = plt.subplots(1, lgbm_inst.n_splits, figsize=(30, 6))
    for i, results in enumerate(lgbm_inst.eval_results_lst):
        ax[i].plot(np.log(results['Train']['l1']), label='train')
        ax[i].plot(np.log(results['Valid']['l1']), label='valid')
        # The curve is the log of the L1 metric, not a log-loss metric.
        ax[i].set(xlabel="Boosting round", ylabel='log(L1 loss)', title=f'Training Fold {i+1}')
        # Per-axes legend: plt.legend() would only label the last subplot.
        ax[i].legend()
    plt.show()

### 2-5. Scores & Feature Importance
- Visualize how each explanatory variable affects the objective function

In [None]:
if not config.optuna_tuner:
    # CV mean and standard deviation of the validation score.
    # Take the best (lowest) validation L1 of each fold: predictions use
    # booster.best_iteration, while the last recorded value comes from the
    # final round that ran, which with early stopping is later than the best one.
    scores = [min(result['Valid']['l1']) for result in lgbm_inst.eval_results_lst]
    print('\n CV mean score: {0:.5f}, std: {1:.5f}.'.format(np.mean(scores), np.std(scores)))

    # Show Importance
    lgbm_inst.importance_show(top=20)

### 2-6. Predict Train data show

In [None]:
# Debug mode only: compare the averaged fold predictions with the actual pressure.
if  config.config and not config.optuna_tuner:
    tmp = [booster.predict(train, num_iteration=booster.best_iteration) for booster in lgbm_inst.boosters]
    mean_pred = sum(tmp) / lgbm_inst.n_splits

    train_pred = read_train()
    # NOTE(review): the model matrix was built after data_clean(), whereas
    # read_train() returns the uncleaned sample; if any breath was removed the
    # rows no longer line up. Fail with a clear message instead of an opaque
    # pandas length error.
    if len(train_pred) != len(mean_pred):
        raise ValueError(
            f'Row count mismatch: read_train() returned {len(train_pred)} rows but the '
            f'predictions cover {len(mean_pred)} rows (rows removed by data_clean?).'
        )
    train_pred["pred_pressure"] = mean_pred
    train_pred["mae_unit"] = np.abs(train_pred["pred_pressure"] - train_pred["pressure"])
    # Attach the per-breath mean absolute error to every row of that breath.
    train_pred = pd.merge(train_pred, pd.DataFrame(train_pred.groupby("breath_id")["mae_unit"].mean()).rename(columns={"mae_unit": "mae"}), how="left", on="breath_id")

    # random.sample() rejects sets since Python 3.11, so sample from a list of ids.
    breath_ids = train_pred['breath_id'].unique().tolist()
    lst_train_pred = random.sample(breath_ids, min(config.config_size, len(breath_ids)))

    fig, ax = plt.subplots(2, 3, figsize=(30, 12))
    sns.set(font_scale=1.2)
    for i, num in enumerate(random.sample(lst_train_pred , 3)):
        df = train_pred[train_pred['breath_id']==num]

        # Top row: actual vs predicted pressure together with u_in.
        sns.lineplot(data=df, x='time_step', y='pressure', label='actual', ax=ax[0, i])
        sns.lineplot(data=df, x='time_step', y='pred_pressure', label='predict', ax=ax[0, i])
        sns.lineplot(data=df, x='time_step', y='u_in', label='u_in', ax=ax[0, i])
        # Bottom row: log of the per-step absolute error, with u_out on a secondary axis.
        sns.lineplot(x=df['time_step'], y=np.log(df['mae_unit']), label='mae_unit', ax=ax[1, i])
        ax[0, i].set(xlabel='Timestep', ylabel='pressure, u_in', title=f'breath_id: {num}, MAE: {round(df["mae"].mean(), 3)}', xlim=(-0.2, 3.2))
        ax[0, i].legend(loc=(0.75, 0.7))
        ax[1, i].set(xlabel='Timestep', ylabel='mae_unit', title=f'breath_id: {num}, MAE: {round(df["mae"].mean(), 3)}', xlim=(-0.2, 3.2), ylim=(-2.1, 0.6))
        ax2 = ax[1, i].twinx()
        sns.lineplot(data=df, x='time_step', y='u_out', label='u_out', ax=ax2, color='r')
        ax[1, i].legend(loc=(0.75, 0.2))
        ax2.legend(loc=(0.75, 0.1))
    plt.tight_layout()
    plt.show()

## 3.Submission

In [None]:
# Predict the test set with every fold booster and write the averaged submission.
if not config.optuna_tuner:
    # Free the training data before loading the test set.
    del train, target
    gc.collect()

    # The test file has no target column; a default avoids a KeyError if it was already removed.
    dtypes.pop("pressure", None)
    test = read_test()
    test = preprocessing(test)
    test = test.drop(["id", 'breath_id'], axis=1)
    # Align columns (names and order) with the training features; a one-hot
    # column that is absent in the test data is filled with 0.
    test = test.reindex(columns=feature_column, fill_value=0)
    test = rs.transform(test)
    print(f'test shape: {test.shape}')

    submission = pd.read_csv(config.data_dir + "sample_submission.csv")[:test.shape[0]]
    # Accumulate in a fresh array instead of adding onto the sample file's
    # pressure column, so the result does not depend on that column being zero.
    pred_sum = np.zeros(test.shape[0])
    for booster in lgbm_inst.boosters:
        pred_sum += booster.predict(test, num_iteration=booster.best_iteration)

    del test
    gc.collect()

    submission['pressure'] = pred_sum / len(lgbm_inst.boosters)
    # Optional post-processing, kept disabled: snap predictions to the pressure grid.
#   submission["pressure"] = np.round((submission["pressure"] - config.post_processing["min_pressure"]) / config.post_processing["diff_pressure"]) * config.post_processing["diff_pressure"] + config.post_processing["min_pressure"]
    # Clip predictions to the configured min/max pressure.
    submission["pressure"] = np.clip(submission["pressure"], config.post_processing["min_pressure"], config.post_processing["max_pressure"])
    submission.to_csv('submission_lgb.csv', index=False)
    print(submission.tail(2))