In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

import optuna 
from optuna import Trial, visualization
from optuna.samplers import *

from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import mean_absolute_error

import warnings
import joblib
warnings.filterwarnings(action='ignore')

In [None]:
def create_new_feat(df):
    """Return a copy of ``df`` extended with per-breath engineered features.

    NOTE(review): a later cell defines ``create_new_feat`` again, so on a
    top-to-bottom run that later definition is the one actually called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``breath_id``, ``time_step``,
        ``u_in``, ``u_out``, ``R`` and ``C``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the engineered ones.
        ``R``, ``C`` and ``R__C`` are cast to strings and one-hot encoded by
        ``pd.get_dummies``.  Missing values that exist once the lag features
        are built are replaced by 0; features created after that point (time
        differences, EWM / rolling / expanding statistics) may still hold NaN.
        The frame passed in is left unchanged.
    """
    # Work on a private copy so the column assignments below never leak into
    # the frame owned by the caller.
    df = df.copy()

    # Whole-breath summaries of u_in, broadcast to every row of the breath.
    for stat in ("sum", "std", "min", "first", "last"):
        df["u_in_" + stat] = df.groupby("breath_id")["u_in"].transform(stat)

    df["time_passed"] = df.groupby("breath_id")["time_step"].diff()
    df["area"] = df["time_step"] * df["u_in"]
    df["area_2"] = df.groupby("breath_id")["area"].cumsum()
    df["u_in_cumsum"] = df["u_in"].groupby(df["breath_id"]).cumsum()

    # Neighbouring values inside a breath: "lag" looks k rows back,
    # "lag_back" looks k rows ahead.
    for k in range(1, 5):
        for suffix, periods in (("lag", k), ("lag_back", -k)):
            for col in ("u_in", "u_out"):
                df[f"{col}_{suffix}{k}"] = df.groupby("breath_id")[col].shift(periods)

    # Rows at the edge of a breath have no neighbour; treat those gaps as 0.
    df = df.fillna(0)

    df["breath_id__u_in__diffmax"] = df.groupby(["breath_id"])["u_in"].transform("max") - df["u_in"]
    df["breath_id__u_in__diffmean"] = df.groupby(["breath_id"])["u_in"].transform("mean") - df["u_in"]
    df["cross"] = df["u_in"] * df["u_out"]
    df["cross2"] = df["time_step"] * df["u_out"]

    # Cast R and C to strings so get_dummies one-hot encodes them, together
    # with their combination.
    df["R"] = df["R"].astype(str)
    df["C"] = df["C"].astype(str)
    df["R__C"] = df["R"] + "__" + df["C"]

    df = pd.get_dummies(df)

    # Time elapsed over the previous 1..8 rows of the same breath.
    for k in range(1, 9):
        name = "time_diff" if k == 1 else f"time_diff{k}"
        df[name] = df["time_step"].groupby(df["breath_id"]).diff(k)

    # Change relative to the (zero-filled) lagged values computed above.
    for k in range(1, 5):
        for col in ("u_in", "u_out"):
            df[f"{col}_diff{k}"] = df[col] - df[f"{col}_lag{k}"]

    # Windowed statistics of u_in per breath.  The windowed result is indexed
    # by (breath_id, original index); dropping level 0 restores the original
    # index so the assignment aligns row by row.
    # NOTE(review): corr() is called without a second series -- confirm that
    # the resulting feature is what was intended.
    for stat in ("mean", "std", "corr"):
        ewm = df.groupby("breath_id")["u_in"].ewm(halflife=10)
        df["ewm_u_in_" + stat] = getattr(ewm, stat)().reset_index(level=0, drop=True)

    for stat in ("mean", "max", "std"):
        rolling = df.groupby("breath_id")["u_in"].rolling(window=10, min_periods=1)
        df["rolling_10_" + stat] = getattr(rolling, stat)().reset_index(level=0, drop=True)

    for stat in ("mean", "max", "std"):
        expanding = df.groupby("breath_id")["u_in"].expanding(2)
        df["expand_" + stat] = getattr(expanding, stat)().reset_index(level=0, drop=True)

    return df

In [None]:
def create_new_feat(df):
    """Return a copy of ``df`` extended with per-breath engineered features.

    NOTE(review): an earlier cell defines a function with the same name; this
    later definition is the one in effect after a top-to-bottom run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``breath_id``, ``time_step``,
        ``u_in``, ``u_out``, ``R`` and ``C``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the engineered ones.
        ``R``, ``C`` and ``R__C`` are cast to strings and one-hot encoded by
        ``pd.get_dummies``.  Missing values that exist once the lag features
        are built are replaced by 0, and the derivative columns are always
        finite; the EWM / rolling / expanding statistics added at the end may
        still hold NaN.  The frame passed in is left unchanged.
    """
    # Work on a private copy so the column assignments below never leak into
    # the frame owned by the caller.
    df = df.copy()

    # Elapsed time and change of u_in over the previous 1 and 2 rows.
    for k in (1, 2):
        df[f"time_passed_{k}"] = df.groupby("breath_id")["time_step"].diff(k)
    for k in (1, 2):
        df[f"u_in_diff{k}"] = df.groupby("breath_id")["u_in"].diff(k)

    df["area"] = df["time_step"] * df["u_in"]
    df["area_2"] = df.groupby("breath_id")["area"].cumsum()
    df["u_in_cumsum"] = df["u_in"].groupby(df["breath_id"]).cumsum()

    # Neighbouring values inside a breath: "lag" looks k rows back,
    # "lag_back" looks k rows ahead.
    for k in (1, 2):
        for suffix, periods in (("lag", k), ("lag_back", -k)):
            for col in ("u_in", "u_out"):
                df[f"{col}_{suffix}{k}"] = df.groupby("breath_id")[col].shift(periods)

    # Rows at the edge of a breath have no neighbour; treat those gaps as 0.
    df = df.fillna(0)

    # Finite-difference rate of change of u_in.  After the zero fill the
    # leading rows of a breath divide 0 by 0 (NaN), and a repeated time stamp
    # would divide by 0 (inf); both undefined cases are mapped to 0, matching
    # the zero-fill convention used above.
    for k in (1, 2):
        rate = df[f"u_in_diff{k}"] / df[f"time_passed_{k}"]
        df[f"u_in_der_{k}"] = rate.replace([np.inf, -np.inf], np.nan).fillna(0)

    df["cross"] = df["u_in"] * df["u_out"]
    df["cross2"] = df["time_step"] * df["u_out"]

    # Cast R and C to strings so get_dummies one-hot encodes them, together
    # with their combination.
    df["R"] = df["R"].astype(str)
    df["C"] = df["C"].astype(str)
    df["R__C"] = df["R"] + "__" + df["C"]

    df = pd.get_dummies(df)

    # Windowed statistics of u_in per breath.  The windowed result is indexed
    # by (breath_id, original index); dropping level 0 restores the original
    # index so the assignment aligns row by row.
    # NOTE(review): corr() is called without a second series -- confirm that
    # the resulting feature is what was intended.
    for stat in ("mean", "std", "corr"):
        ewm = df.groupby("breath_id")["u_in"].ewm(halflife=10)
        df["ewm_u_in_" + stat] = getattr(ewm, stat)().reset_index(level=0, drop=True)

    for stat in ("mean", "max", "std"):
        rolling = df.groupby("breath_id")["u_in"].rolling(window=10, min_periods=1)
        df["rolling_10_" + stat] = getattr(rolling, stat)().reset_index(level=0, drop=True)

    for stat in ("mean", "max", "std"):
        expanding = df.groupby("breath_id")["u_in"].expanding(2)
        df["expand_" + stat] = getattr(expanding, stat)().reset_index(level=0, drop=True)

    return df

In [None]:
# Read the competition training set, derive the engineered features, and
# zero-fill whatever gaps the feature step leaves behind.
train_df = (
    pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
    .pipe(create_new_feat)
    .fillna(0)
)

In [None]:
# Display the engineered training frame for a visual sanity check.
train_df

In [None]:
# Feature columns: every column whose name does not contain 'pressure',
# minus the first three of those.
# NOTE(review): confirm which three leading columns the [3:] slice drops.
x_col = train_df.columns[~train_df.columns.str.contains('pressure')][3:]
# Target columns: every column whose name contains 'pressure'.
y_col = train_df.columns[train_df.columns.str.contains('pressure')]
print(x_col, y_col, sep='\n')

In [None]:
# Hold out the last 30% of rows for validation.  Positional slicing keeps the
# two partitions disjoint; label-based ``.loc`` slices include both end
# points, which would put the boundary row in both sets.
split_at = int(train_df.shape[0] * 0.7)

train_x = train_df.iloc[:split_at][x_col]
train_y = train_df.iloc[:split_at][y_col]
valid_x = train_df.iloc[split_at:][x_col]
valid_y = train_df.iloc[split_at:][y_col]

# Learn the min/max scaling from the training rows only and apply that same
# transform to the validation rows, so no validation statistics influence
# what the model is trained on.  (RobustScaler is the alternative that was
# being considered here.)
sc = MinMaxScaler()
sc.fit(train_x)
train_x = pd.DataFrame(sc.transform(train_x), index=train_x.index, columns=x_col)
valid_x = pd.DataFrame(sc.transform(valid_x), index=valid_x.index, columns=x_col)

# The full frame is no longer needed; drop the reference to release memory.
del train_df

In [None]:
def objectiveXGB(trial: Trial, tx, ty, vx, vy):
    """Optuna objective: fit one XGBoost regressor and score it on held-out data.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample the hyper-parameters below.
    tx, ty
        Training features and targets passed to ``XGBRegressor.fit``.
    vx, vy
        Validation features and targets used for scoring.

    Returns
    -------
    float
        Mean absolute error of the fitted model on ``(vx, vy)``; lower is
        better.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'max_depth': trial.suggest_int('max_depth', 8, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_int('gamma', 1, 3),
        # XGBoost documents the learning rate (eta) as lying in [0, 1], so the
        # log-uniform search is capped at 1.0 instead of 10.0.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
        # suggest_float(step=...) / suggest_float(log=True) are the current
        # spellings of suggest_discrete_uniform / suggest_loguniform.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),
        'nthread': -1,
        # NOTE(review): 'gpu_hist' / 'gpu_predictor' need a GPU build of
        # XGBoost -- confirm they are still accepted by the installed version.
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'lambda': trial.suggest_float('lambda', 1e-5, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-5, 10.0, log=True),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
        'random_state': 42,
    }

    model = XGBRegressor(**param)
    model.fit(tx, ty, verbose=False)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    predictions = model.predict(vx)
    return mean_absolute_error(vy, predictions)

In [None]:
# TPE is Optuna's default sampler; passing it explicitly with a fixed seed
# makes the sequence of proposed hyper-parameters reproducible between runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Each trial trains on the training split and is scored on the validation split.
study.optimize(lambda trial : objectiveXGB(trial, train_x,  train_y, valid_x, valid_y), n_trials=10)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [None]:
# Plot the importance Optuna assigns to each searched hyper-parameter.
visualization.plot_param_importances(study)

In [None]:
# Plot the objective value of every trial in the study.
visualization.plot_optimization_history(study)

In [None]:
# Persist the study object to ./optuna_result.pkl via joblib.
# NOTE(review): joblib files are pickle-based -- only load this file back
# from a trusted source.
joblib.dump(study, './optuna_result.pkl')
# Tabular view of the trials recorded by the study; preview the first three rows.
df = study.trials_dataframe()
df.head(3)