In [None]:
# Importing core libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
# Importing LightGBM
from lightgbm import LGBMRegressor
import lightgbm as lgbm
# Scikit Learn
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
# Metrics for models evaluation
from sklearn.metrics import mean_absolute_error

### YOUR FEATURE ENGINEERING GOES HERE
import optuna
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
import gc
import os
import sys
from tqdm import tqdm
# Notebook-wide plot styling.
sns.set_style('whitegrid')
# NOTE(review): `plt` is bound to the top-level `matplotlib` package by the
# import at the top of this cell (`import matplotlib as plt`), not to
# `matplotlib.pyplot`; this line therefore calls `matplotlib.style.use`.
plt.style.use("fivethirtyeight")
%matplotlib inline

# Reduce memory usage imported from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

In [None]:
# Reduce mem usage
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to the narrowest dtype that fits.

    * signed / unsigned integer columns are narrowed within their own family
      (int8..int64, uint8..uint64) based on the column min and max;
    * float columns are narrowed to float16 / float32 / float64 based on the
      value *range* only, so float16 / float32 results can lose precision;
    * ``object`` columns are converted to ``category``;
    * every other dtype (bool, datetime, timedelta, categorical, pandas
      extension dtypes, ...) is left untouched.  Previously such columns fell
      into the float branch, which turned bools into float16 and raised
      ``TypeError`` for datetimes and unordered categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per numpy "kind" code, narrowest first.
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32, np.float64),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy int / uint / float columns are downcast.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits = np.finfo if kind == 'f' else np.iinfo
        for candidate in candidates_by_kind[kind]:
            info = limits(candidate)
            # Inclusive bounds: a value equal to the dtype limit is representable.
            if c_min >= info.min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No candidate matched (all-NaN / empty / infinite data).  Floats
            # fall back to float64 as before; integer columns stay unchanged.
            if kind == 'f':
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a feather file into a DataFrame and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like
        Location of the feather file, passed straight to ``pd.read_feather``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without its ``index`` column (if it had one), after
        ``reduce_mem_usage`` has downcast its columns.
    """
    df = pd.read_feather(file)
    # errors='ignore': files saved without a leftover 'index' column used to
    # make this call raise KeyError; there is simply nothing to drop for them.
    df = df.drop(columns='index', errors='ignore')
    return reduce_mem_usage(df)

def free_mem(objects):
    """Release the references held by ``objects`` and run the garbage collector.

    The previous implementation executed ``del obj`` inside a ``for`` loop,
    which only unbinds the loop variable: the container kept every reference,
    nothing could be collected and both printed figures were always identical.
    The container is now emptied in place (when it supports ``clear``) before
    collecting.

    Note that a function cannot unbind names in its caller: an object is only
    really freed once the caller's own variables are deleted or rebound too.
    The printed figures are shallow ``sys.getsizeof`` sums over whatever
    ``objects`` holds at that moment.

    Parameters
    ----------
    objects : iterable
        Objects to report on; typically a throw-away list literal.

    Returns
    -------
    None
    """
    start_mem = sum(sys.getsizeof(obj) for obj in objects) / 1024
    print('Memory usage before cleanup {:.2f} KB'.format(start_mem))
    # Drop the references owned by the container itself.
    if hasattr(objects, 'clear'):
        objects.clear()
    gc.collect()
    end_mem = sum(sys.getsizeof(obj) for obj in objects) / 1024
    print('Memory usage after cleanup: {:.2f} KB'.format(end_mem))

In [None]:
# Load the train and test feather files through import_data, which also
# downcasts their columns to save memory.
train = import_data('../input/folds-just-added-in-feather-format/train_folds_10.ftr') 
test = import_data('../input/folds-just-added-in-feather-format/test_stratfold.ftr')

In [None]:
# Sorted array of every distinct pressure value present in the training frame.
all_pressure = np.sort(train['pressure'].unique())
print('The first 25 unique pressures...')
# Smallest and largest observed pressure as plain Python scalars.
PRESSURE_MIN, PRESSURE_MAX = all_pressure[0].item(), all_pressure[-1].item()
print(all_pressure[:25])


In [None]:
print('The differences between first 25 pressures...')
# Gap between the two smallest distinct pressures; make_submission uses it as
# the grid step when rounding predictions.
PRESSURE_STEP = ( all_pressure[1] - all_pressure[0] ).item()
# Cell output: element-wise gaps between consecutive values among the first 26.
all_pressure[1:26] - all_pressure[:25]

In [None]:
# Report the size of all_pressure and trigger a garbage-collection pass.
# The array stays bound to the module-level name `all_pressure` afterwards.
free_mem([all_pressure])

In [None]:
# Name of the label column read from the training frame and written to the
# submission files.
TARGET_VAR='pressure'
# Number of cross-validation folds (values of the `kfold` column, starting at 0)
# that model training iterates over.
FOLDS=5
# Template CSV read by model_trainer and make_submission before they fill in
# predictions.
submission_file = '../input/ventilator-pressure-prediction/sample_submission.csv'

In [None]:
def add_features(df):
    """Build per-breath lag / diff / aggregate features and one-hot encode R and C.

    Features are computed within each ``breath_id`` group:

    * ``area``: cumulative sum of ``time_step * u_in``;
    * ``u_in_cumsum``: cumulative sum of ``u_in``;
    * ``u_in`` / ``u_out`` values shifted by 1..4 rows backwards (``*_lagN``)
      and forwards (``*_lag_backN``); missing neighbours become 0;
    * per-breath max of ``u_in`` / ``u_out`` and the distance of ``u_in`` from
      its per-breath max and mean;
    * differences between the current value and its 1..4-row lags;
    * ``cross`` (``u_in * u_out``) and ``cross2`` (``time_step * u_out``);
    * ``R``, ``C`` and their combination ``R__C`` cast to strings and expanded
      into indicator columns by ``pd.get_dummies``.

    The input frame is no longer modified: earlier versions wrote the columns
    created before the ``fillna`` call into the caller's frame as a side
    effect.  Column names and their order in the result are unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``u_out``, ``R``
        and ``C``.

    Returns
    -------
    pandas.DataFrame
        A new frame with all feature columns; every NaN present before the
        aggregate features are added is replaced by 0.
    """
    # Work on a copy so the caller's frame is left exactly as it was passed in.
    df = df.copy()

    def per_breath(column):
        # Looks `df` up at call time, so it follows the rebinding done by fillna.
        return df.groupby('breath_id')[column]

    df['area'] = df['time_step'] * df['u_in']
    df['area'] = per_breath('area').cumsum()

    df['u_in_cumsum'] = per_breath('u_in').cumsum()

    # Neighbouring values inside the same breath, 1..4 rows away in each direction.
    for step in range(1, 5):
        df[f'u_in_lag{step}'] = per_breath('u_in').shift(step)
        df[f'u_out_lag{step}'] = per_breath('u_out').shift(step)
        df[f'u_in_lag_back{step}'] = per_breath('u_in').shift(-step)
        df[f'u_out_lag_back{step}'] = per_breath('u_out').shift(-step)
    df = df.fillna(0)

    # Computed once and reused below (it used to be recomputed three times).
    u_in_max = per_breath('u_in').transform('max')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = per_breath('u_out').transform('max')

    df['u_in_diff1'] = df['u_in'] - df['u_in_lag1']
    df['u_out_diff1'] = df['u_out'] - df['u_out_lag1']
    df['u_in_diff2'] = df['u_in'] - df['u_in_lag2']
    df['u_out_diff2'] = df['u_out'] - df['u_out_lag2']

    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = per_breath('u_in').transform('mean') - df['u_in']

    df['u_in_diff3'] = df['u_in'] - df['u_in_lag3']
    df['u_out_diff3'] = df['u_out'] - df['u_out_lag3']
    df['u_in_diff4'] = df['u_in'] - df['u_in_lag4']
    df['u_out_diff4'] = df['u_out'] - df['u_out_lag4']
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # String-typed R / C / R__C are the columns get_dummies expands.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return pd.get_dummies(df)

#train = add_features(train)
#test = add_features(test)
#useful_features = [col for col in train.columns if col not in ['id', 'breath_id', 'pressure','kfold']]
#RS = RobustScaler()
#train[useful_features] = RS.fit_transform(train[useful_features])
#test[useful_features] = RS.transform(test[useful_features])

In [None]:
#targets = train[['pressure']].to_numpy().reshape(-1, 80)
#display(targets.shape)
#train.drop(['pressure', 'id', 'breath_id'], axis=1, inplace=True)
#test = test.drop(['id', 'breath_id'], axis=1)

#RS = RobustScaler()
#train = RS.fit_transform(train[useful_features])
#test = RS.transform(test[useful_features])

#train = train.reshape(-1, 80, train.shape[-1])
#test = test.reshape(-1, 80, train.shape[-1])

In [None]:
# Display the training frame as loaded, before any feature engineering.
train

In [None]:
# rewritten calculation of lag features from this notebook: https://www.kaggle.com/patrick0302/add-lag-u-in-as-new-feat
# some of ideas from this notebook: https://www.kaggle.com/mst8823/google-brain-lightgbm-baseline
def _engineer_features(df):
    """Add the per-breath features fed to the model and one-hot encode R, C and R__C.

    The same sequence of statements used to be written out twice, once for
    ``train`` and once for ``test``.  Keeping it in one function means both
    frames always receive the same columns in the same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``breath_id``, ``time_step``, ``u_in``, ``u_out``, ``R`` and
        ``C`` columns.  Columns created before the ``fillna`` call are also
        written into the passed frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the feature columns added and ``R``, ``C``, ``R__C``
        replaced by their indicator columns.
    """
    df['last_value_u_in'] = df.groupby('breath_id')['u_in'].transform('last')
    # https://machinelearningmastery.com/time-series-forecasting-supervised-learning/
    # Values 1 and 2 rows before / after the current row within the same breath.
    for step in (1, 2):
        df[f'u_in_lag{step}'] = df.groupby('breath_id')['u_in'].shift(step)
        df[f'u_out_lag{step}'] = df.groupby('breath_id')['u_out'].shift(step)
        df[f'u_in_lag_back{step}'] = df.groupby('breath_id')['u_in'].shift(-step)
        df[f'u_out_lag_back{step}'] = df.groupby('breath_id')['u_out'].shift(-step)
    df = df.fillna(0)

    df['R__C'] = df["R"].astype(str) + '__' + df["C"].astype(str)

    # max value of u_in and u_out for each breath
    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_out__max'] = df.groupby(['breath_id'])['u_out'].transform('max')

    # difference between consecutive values
    df['u_in_diff1'] = df['u_in'] - df['u_in_lag1']
    df['u_out_diff1'] = df['u_out'] - df['u_out_lag1']
    df['u_in_diff2'] = df['u_in'] - df['u_in_lag2']
    df['u_out_diff2'] = df['u_out'] - df['u_out_lag2']
    # from here: https://www.kaggle.com/yasufuminakama/ventilator-pressure-lstm-starter
    # NOTE(review): 'u_in_diff' / 'u_out_diff' do not exist before these two
    # assignments, so they become columns that are 0 where time_step == 0 and
    # NaN everywhere else.  Kept as-is so the model's feature set is unchanged;
    # confirm whether they should be dropped.
    first_step = df['time_step'] == 0
    df.loc[first_step, 'u_in_diff'] = 0
    df.loc[first_step, 'u_out_diff'] = 0

    # difference between the current value of u_in and the max / mean value within the breath
    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']

    # https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974
    df['u_in_cumsum'] = df.groupby(['breath_id'])['u_in'].cumsum()
    df['time_step_cumsum'] = df.groupby(['breath_id'])['time_step'].cumsum()
    # https://www.kaggle.com/yasufuminakama/ventilator-pressure-lstm-starter
    # Time elapsed since the previous row of the breath (NaN on the first row).
    df['breath_time'] = df['time_step'] - df.groupby('breath_id')['time_step'].shift(1)

    # OHE: append the indicator columns, then drop the source column.
    for source in ('R', 'C', 'R__C'):
        indicators = pd.get_dummies(df[source], prefix=source)
        df = df.merge(indicators, left_index=True, right_index=True).drop([source], axis=1)
    return df


train = _engineer_features(train)
# all the same for the test data
test = _engineer_features(test)

# Every column except identifiers, the label and the fold assignment is a model input.
useful_features = [col for col in train.columns if col not in ['id', 'breath_id', 'pressure','kfold']]

# Scaling statistics are fitted on train only and then applied to test.
RS = RobustScaler()
train[useful_features] = RS.fit_transform(train[useful_features])
test[useful_features] = RS.transform(test[useful_features])
useful_features

In [None]:
# Display the training frame after feature engineering and scaling.
train

In [None]:
def prep():
    """Return a fresh, unfitted ColumnTransformer.

    No transformers are configured and ``remainder='passthrough'`` is set, so
    the columns handed to it are passed through as they are.
    """
    return ColumnTransformer(
        transformers=[
            #('num', RobustScaler(), useful_features)
        ],
        remainder='passthrough',
    )

In [None]:
# utility functions

def optunaOpt(modelName,model,folds,n_trials,
              params,
              direction="minimize",
              metric = lambda x,y:mean_absolute_error(x,y),
              callbacks=(lambda trial: [])):
    """Search hyper-parameters with Optuna using a single hold-out fold.

    Parameters
    ----------
    modelName : str
        Prefix of the Optuna study name (``"<modelName>-study"``).
    model : callable
        Receives the dict of sampled parameters and returns an estimator whose
        ``fit`` accepts ``early_stopping_rounds``, ``eval_set``, ``verbose``
        and ``callbacks``.
    folds : int
        Value of the ``kfold`` column used as the validation fold; all other
        rows of the global ``train`` frame are used for fitting.
    n_trials : int
        Number of trials handed to ``study.optimize``.
    params : dict
        Maps a parameter name to a callable taking the trial and returning the
        sampled value.
    direction : str
        Optimisation direction passed to ``optuna.create_study``.
    metric : callable
        ``metric(y_true, y_pred)``; its result is the trial's objective value.
    callbacks : callable
        Receives the trial and returns the callback list given to ``fit``.

    Returns
    -------
    The finished Optuna study; best trial, value and parameters are printed.
    """
    def objective(trial):
        """Fit one candidate model and return its score on the hold-out fold."""
        holdout_fold = folds
        frame = train
        transformer = prep()
        sampled = {}
        for key, suggest in params.items():
            sampled[key] = suggest(trial)
        estimator = model(sampled)

        in_holdout = frame.kfold == holdout_fold
        fit_part = frame[~in_holdout].reset_index(drop=True)
        holdout_part = frame[in_holdout].reset_index(drop=True)

        fit_matrix = transformer.fit_transform(fit_part[useful_features])
        holdout_matrix = transformer.transform(holdout_part[useful_features])
        fit_target = fit_part[TARGET_VAR].to_numpy()
        holdout_target = holdout_part[TARGET_VAR].to_numpy()

        estimator.fit(
            fit_matrix,
            fit_target,
            early_stopping_rounds=300,
            eval_set=[(holdout_matrix, holdout_target)],
            verbose=False,
            callbacks=callbacks(trial),
        )
        holdout_pred = estimator.predict(holdout_matrix)
        trial_score = metric(holdout_target, holdout_pred)
        free_mem([frame, fit_part, holdout_part, holdout_matrix])
        return trial_score

    study = optuna.create_study(study_name=f"{modelName}-study",
                                direction=direction)
    study.optimize(objective, n_trials)
    # Header line followed by the corresponding study attribute.
    for header, attribute in (('\n Best Trial:', 'best_trial'),
                              ('\n Best value', 'best_value'),
                              ('\n Best hyperparameters:', 'best_params')):
        print(header)
        print(getattr(study, attribute))
    return study

def model_trainer(name,reg,folds,_X=train,_Y=train[TARGET_VAR],X_test_m=test,
                  useful_features= useful_features,
                  prep=prep,
                  metric = lambda x,y:mean_absolute_error(x,y),
                  earlyStoppingRounds = 400,
                  usePreprocessor=True):
    """Train one model per cross-validation fold and save its predictions.

    For every fold a fresh estimator and preprocessor are built, the estimator
    is fitted on the rows whose ``kfold`` differs from the fold id and scored
    on the rows whose ``kfold`` equals it; the test frame is predicted with
    each fold model.

    Parameters
    ----------
    name : str
        Tag used in log lines, output file names and prediction column names.
    reg : callable
        ``reg(fold)`` returns the unfitted estimator for that fold.
    folds : int
        Number of folds to train (fold ids ``0 .. folds-1``).  This argument
        used to be ignored in favour of the global ``FOLDS``; a falsy value
        still falls back to ``FOLDS``.
    _X : pandas.DataFrame
        Training frame with ``kfold``, ``id``, the target column and the
        feature columns.
    _Y : unused
        Kept only so existing calls keep working.
    X_test_m : pandas.DataFrame
        Test frame containing the feature columns.
    useful_features : list of str
        Names of the feature columns.
    prep : callable
        Zero-argument factory returning an unfitted preprocessor.
    metric : callable
        ``metric(y_true, y_pred)`` used to score each fold.
    earlyStoppingRounds : int
        Passed to ``fit`` as ``early_stopping_rounds`` together with the
        validation set; a falsy value fits without early stopping.
    usePreprocessor : bool
        Unused; kept only so existing calls keep working.

    Returns
    -------
    list
        One array of test predictions per fold.

    Side effects
    ------------
    Writes ``train_<name>_pred.csv`` (out-of-fold predictions keyed by ``id``)
    and ``test_<name>_pred.csv`` (per-row median of the fold test predictions
    placed into the sample-submission frame).
    """
    n_folds = folds or FOLDS
    fold_test_predictions = []   # one array of test predictions per fold
    oof_predictions = {}         # row id -> out-of-fold prediction
    scores = []
    X_m = _X
    # The test features do not change between folds; select them once.
    test_features = X_test_m[useful_features]
    print("training model ...",name)
    for fold in tqdm(range(n_folds)):
        tempModel = reg(fold)
        preprocessor = prep()
        X_train = X_m[X_m.kfold != fold].reset_index(drop=True)
        X_valid = X_m[X_m.kfold == fold].reset_index(drop=True)
        train_data = preprocessor.fit_transform(X_train[useful_features])
        val_data = preprocessor.transform(X_valid[useful_features])
        train_labels, val_labels = X_train[TARGET_VAR].to_numpy(), X_valid[TARGET_VAR].to_numpy()
        # Training
        if earlyStoppingRounds:
            tempModel.fit(train_data, train_labels,early_stopping_rounds=earlyStoppingRounds,eval_set=[(val_data, val_labels)],verbose=False)
        else:
            tempModel.fit(train_data, train_labels)
        valid_ids = X_valid['id'].tolist()

        test_preds = tempModel.predict(preprocessor.transform(test_features))
        preds_valid = tempModel.predict(val_data)
        fold_test_predictions.append(test_preds)
        foldStat = metric(val_labels, preds_valid)
        oof_predictions.update(dict(zip(valid_ids, preds_valid)))
        scores.append(foldStat)
        print(f"Fold {fold} mae {foldStat}")
        # Report sizes, then unbind the fold-local data so the collector can
        # reclaim it before the next fold allocates its own copies.
        free_mem([train_labels,val_labels,X_train,X_valid,train_data,val_data])
        del train_labels, val_labels, X_train, X_valid, train_data, val_data
        gc.collect()
    print(f"score : mean {np.mean(scores)} median {np.median(scores)} std  {np.std(scores)}")
    # https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/276138
    # Persist the out-of-fold predictions keyed by row id.
    oof_frame = pd.DataFrame.from_dict(oof_predictions, orient="index").reset_index()
    oof_frame.columns = ["id", f"pred_{name}"]
    oof_frame.to_csv(f"train_{name}_pred.csv", index=False)
    # Persist the fold-median test prediction in the sample-submission layout.
    sample_submission_temp = pd.read_csv(submission_file)
    sample_submission_temp[TARGET_VAR] = np.median(np.column_stack(fold_test_predictions), axis=1)
    sample_submission_temp.columns = ["id", f"pred_{name}"]
    sample_submission_temp.to_csv(f"test_{name}_pred.csv", index=False)
    free_mem([sample_submission_temp])
    gc.collect()
    return fold_test_predictions

def make_submission(name,final_predictions):
    """Combine per-fold test predictions into submission CSV files.

    Reads the sample submission, tries to delete ``submission_<name>.csv``
    (printing a short notice when that fails), then writes three files:

    * ``submission_mean_<name>.csv``: row-wise mean of the fold predictions;
    * ``submission_median_<name>.csv``: row-wise median;
    * ``submission.csv``: the median snapped to the pressure grid defined by
      ``PRESSURE_MIN`` / ``PRESSURE_STEP`` and clipped to
      ``[PRESSURE_MIN, PRESSURE_MAX]``.

    Parameters
    ----------
    name : str
        Tag inserted into the output file names.
    final_predictions : sequence of 1-D arrays
        One array of test predictions per fold.
    """
    sample_submission = pd.read_csv(submission_file)
    stale_path = f"submission_{name}.csv"
    try:
        os.remove(stale_path)
    except (OSError, IOError):
        #gulp
        print(f"Gulp {name}")

    # https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/276138
    stacked = np.column_stack(final_predictions)
    outputs = (
        (np.mean, f"submission_mean_{name}.csv"),
        (np.median, f"submission_median_{name}.csv"),
    )
    for reducer, path in outputs:
        sample_submission[TARGET_VAR] = reducer(stacked, axis=1)
        sample_submission.to_csv(path, index=False)

    # ENSEMBLE FOLDS WITH MEDIAN AND ROUND PREDICTIONS
    # The target column holds the median at this point (written last above).
    median_pred = sample_submission[TARGET_VAR]
    grid_index = np.round((median_pred - PRESSURE_MIN) / PRESSURE_STEP)
    snapped = grid_index * PRESSURE_STEP + PRESSURE_MIN
    sample_submission[TARGET_VAR] = snapped
    sample_submission[TARGET_VAR] = np.clip(sample_submission[TARGET_VAR], PRESSURE_MIN, PRESSURE_MAX)
    sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Doing tuning with LGBM .
# Optuna search space for LGBMRegressor: each entry maps a parameter name to a
# callable that samples its value from the given trial.
# `suggest_float` replaces the deprecated `suggest_uniform` /
# `suggest_loguniform`; `log=True` samples on a log scale over the same bounds.
lgbm_params = {
    "n_estimators": lambda trial: trial.suggest_int("n_estimators", 5000, 10000),
    "learning_rate": lambda trial: trial.suggest_float("learning_rate", 0.01, 1, log=True),
    "reg_lambda": lambda trial: trial.suggest_float("reg_lambda", 1e-9, 100.0, log=True),
    "reg_alpha": lambda trial: trial.suggest_float("reg_alpha", 1e-9, 100.0, log=True),
    "colsample_bytree": lambda trial: trial.suggest_float("colsample_bytree", 0.01, 1),
    "subsample": lambda trial: trial.suggest_float("subsample", 0.01, 1.0),
    "subsample_freq": lambda trial: trial.suggest_int("subsample_freq", 0, 10),
    #"cat_l2":lambda trial :trial.suggest_float("cat_l2", 0.01, 0.4, log=True),
    "min_child_samples": lambda trial: trial.suggest_int("min_child_samples", 1, 256),
    "min_child_weight": lambda trial: trial.suggest_float("min_child_weight", 0.01, 10.0),
    #"scale_posn_weight":lambda trial :trial.suggest_uniform("scale_posn_weight", 1.0, 500.0),
    #"min_data_in_leaf":lambda trial :trial.suggest_int("min_data_in_leaf", 90, 110),
    "num_leaves": lambda trial: trial.suggest_int("num_leaves", 400, 1024),
    "max_bin": lambda trial: trial.suggest_int("max_bin", 150, 256),
    "max_depth": lambda trial: trial.suggest_int("max_depth", -1, 256),
    #"seed":lambda trial :trial.suggest_categorical("seed", [0,42]),
    #"feature_fraction":lambda trial :trial.suggest_float("feature_fraction", 0.1, 1.0, log=True),
}

def tune():
    """Run a 100-trial Optuna search for LGBMRegressor and return the study.

    Fold 0 is used as the validation fold and the search space is
    ``lgbm_params``.  Each trial is given a ``LightGBMPruningCallback``
    watching the ``'l1'`` metric.  The study used to be assigned to a local
    variable and discarded; it is now returned so callers can read
    ``best_params`` from it.
    """
    def build_model(xargs):
        # xargs: dict of hyper-parameters sampled for the current trial.
        return LGBMRegressor(boosting_type = 'gbdt',n_jobs=-1,  metric = 'mae',
                             #device = 'gpu', gpu_platform_id=0,gpu_device_id=0,
                             **xargs)

    def pruning_callbacks(trial):
        return [optuna.integration.LightGBMPruningCallback(trial,'l1')]

    return optunaOpt("lgbmModel", build_model, 0, 100, lgbm_params,
                     callbacks=pruning_callbacks)
    
#tune()    

In [None]:
# Hyper-parameters used for the final LightGBM models.  Two earlier candidate
# dicts were assigned to this name and immediately overwritten; only this last
# assignment ever took effect, so the dead ones were removed.
# NOTE(review): the keys match the `lgbm_params` search space, so these values
# presumably come from an Optuna run of `tune()`; confirm.
best_params = {'n_estimators': 7022, 'learning_rate': 0.07852321810528877, 'reg_lambda': 1.7862446759505994e-07, 'reg_alpha': 1.2765906103250663e-06, 'colsample_bytree': 0.7566430939237064,
               'subsample': 0.7338768565837517, 'subsample_freq': 0, 'min_child_samples': 99, 'min_child_weight': 4.008859124270689, 'num_leaves': 697, 'max_bin': 239, 'max_depth': 197}
# One model per fold, seeded with the fold number.  FOLDS (= 5) replaces the
# bare literal 5 that was passed here before.
final_predictions_lgbm = model_trainer(
    "lgbm_1",
    lambda fold: LGBMRegressor(
        metric='mae', n_jobs=-1,
        # device = 'gpu', gpu_platform_id=0,gpu_device_id=0,  #num_boost_round=100000,
        random_state=fold, **best_params),
    FOLDS,
    earlyStoppingRounds=220,
    #int(best_params['n_estimators']*0.10
)
# 0.515 score

In [None]:
# Write the mean, median and grid-rounded submission CSVs from the per-fold
# test predictions.
make_submission("lgbm_1",final_predictions_lgbm)