In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, normalize
# Read the competition CSVs (train, test, sample submission) in one pass.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ventilator-pressure-prediction/train.csv",
        "../input/ventilator-pressure-prediction/test.csv",
        "../input/ventilator-pressure-prediction/sample_submission.csv",
    )
)


In [None]:
def add_features(df, rc_stats=None):
    """Return a copy of ``df`` extended with engineered per-breath features.

    The input must contain the columns ``breath_id``, ``R``, ``C``,
    ``time_step``, ``u_in`` and ``u_out``. The raw ``R`` and ``C`` columns are
    replaced by one-hot columns (``R_*``, ``C_*``, ``R__C_*``); their numeric
    values stay available as ``R1`` and ``C1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to featurise. The frame is copied; the caller's object is not
        modified.
    rc_stats : pandas.DataFrame, optional
        Table indexed by ``(R1, C1)`` with columns ``RC_max`` and ``RC_mean``.
        When given, those two features are looked up from it instead of being
        computed from ``df['pressure']``. This is required for frames that
        have no ``pressure`` column.

    Returns
    -------
    pandas.DataFrame
        The featurised frame, with every remaining NaN replaced by 0.

    Raises
    ------
    ValueError
        If ``rc_stats`` is None and ``df`` has no ``pressure`` column.

    Notes
    -----
    ``RC_max`` / ``RC_mean`` are aggregates of the target itself.
    NOTE(review): when they are computed on the full training frame they also
    include rows that later end up in validation folds - confirm this is
    acceptable for the reported CV score.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    df['R1'] = df['R']
    df['C1'] = df['C']

    df['last_value_u_in'] = df.groupby('breath_id')['u_in'].transform('last')

    # Running sum of time_step * u_in within each breath.
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()

    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    df['time_step_cumsum'] = df.groupby('breath_id')['time_step'].cumsum()

    # Backward (lagN) and forward (lag_backN) shifts inside each breath.
    for lag in (1, 2, 3, 4):
        for suffix, periods in ((f'lag{lag}', lag), (f'lag_back{lag}', -lag)):
            for signal in ('u_in', 'u_out'):
                df[f'{signal}_{suffix}'] = df.groupby('breath_id')[signal].shift(periods)
    for signal in ('u_in', 'u_out'):
        df[f'{signal}_lag_back10'] = df.groupby('breath_id')[signal].shift(-10)
    df = df.fillna(0)

    # transform() broadcasts the per-breath value back onto every row; a plain
    # .first() would return one row per breath_id and be misaligned against the
    # row index on assignment.
    df['u_in_first'] = df.groupby('breath_id')['u_in'].transform('first')
    df['u_out_first'] = df.groupby('breath_id')['u_out'].transform('first')

    # Time since the previous step of the same breath (0 on the first step).
    df['time_step_diff'] = df.groupby('breath_id')['time_step'].diff().fillna(0)

    # Exponentially weighted statistics of u_in within each breath.
    df['ewm_u_in_mean'] = (df
                           .groupby('breath_id')['u_in']
                           .ewm(halflife=9)
                           .mean()
                           .reset_index(level=0, drop=True))
    df['ewm_u_in_std'] = (df
                          .groupby('breath_id')['u_in']
                          .ewm(halflife=10)
                          .std()
                          .reset_index(level=0, drop=True))
    # Correlation of u_in with itself; kept because downstream feature lists
    # reference this column.
    df['ewm_u_in_corr'] = (df
                           .groupby('breath_id')['u_in']
                           .ewm(halflife=15)
                           .corr()
                           .reset_index(level=0, drop=True))

    # Rolling-window statistics of u_in within each breath. The "<w>_out_std"
    # columns hold the rolling std of u_in; the name is kept as-is because
    # downstream feature lists select them under that name.
    for window in (15, 45):
        rolled = (df
                  .groupby('breath_id')['u_in']
                  .rolling(window=window, min_periods=1))
        for stat, column in (('sum', f'{window}_in_sum'),
                             ('min', f'{window}_in_min'),
                             ('max', f'{window}_in_max'),
                             ('mean', f'{window}_in_mean'),
                             ('std', f'{window}_out_std')):
            df[column] = getattr(rolled, stat)().reset_index(level=0, drop=True)

    df['15_out_mean'] = (df
                         .groupby('breath_id')['u_out']
                         .rolling(window=15, min_periods=1)
                         .mean()
                         .reset_index(level=0, drop=True))

    df = df.fillna(0)

    # Whole-breath aggregates, broadcast to every row of the breath.
    df['breath_id__u_in__max'] = df.groupby('breath_id')['u_in'].transform('max')
    df['breath_id__u_out__max'] = df.groupby('breath_id')['u_out'].transform('max')

    # Same alignment fix as for u_in_first: use transform(), not .mean().
    df['breath_id__u_out__mean'] = df.groupby('breath_id')['u_out'].transform('mean')
    df['breath_id__u_in__mean'] = df.groupby('breath_id')['u_in'].transform('mean')

    df['breath_id__u_in__min'] = df.groupby('breath_id')['u_in'].transform('min')
    df['breath_id__u_out__min'] = df.groupby('breath_id')['u_out'].transform('min')

    df['R_div_C'] = df['R'].div(df['C'])
    df['R__C'] = df['R'].astype(str) + '__' + df['C'].astype(str)

    # Differences between the current value and each backward lag.
    for lag in (1, 2, 3, 4):
        for signal in ('u_in', 'u_out'):
            df[f'{signal}_diff{lag}'] = df[signal] - df[f'{signal}_lag{lag}']

    df['u_in_diff_1_2'] = df['u_in_lag1'] - df['u_in_lag2']
    df['u_out_diff_1_2'] = df['u_out_lag1'] - df['u_out_lag2']
    df['u_in_lagback_diff_1_2'] = df['u_in_lag_back1'] - df['u_in_lag_back2']
    df['u_out_lagback_diff_1_2'] = df['u_out_lag_back1'] - df['u_out_lag_back2']

    # Differences between the current value and each forward lag.
    for lag in (1, 2):
        for signal in ('u_in', 'u_out'):
            df[f'{signal}_lagback_diff{lag}'] = df[signal] - df[f'{signal}_lag_back{lag}']

    # These two columns were only ever set to 0 (on time_step == 0 rows) and
    # NaN-filled with 0 elsewhere, so they are constant zero. They are kept
    # only so the output schema does not change.
    df['u_in_diff'] = 0.0
    df['u_out_diff'] = 0.0

    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = df['breath_id__u_in__mean'] - df['u_in']

    # One-hot encode R, C and their combination, dropping the source columns.
    for categorical in ('R', 'C', 'R__C'):
        dummies = pd.get_dummies(df[categorical], prefix=categorical)
        df = df.join(dummies).drop(columns=[categorical])

    df['u_in_partition_out_sum'] = df.groupby(['breath_id', 'u_out'])['u_in'].transform('sum')

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    df = df.fillna(0)

    # Forward differences. The shift is done per breath so the last row of a
    # breath no longer reads values from the first row of the next breath;
    # the missing "next" value is taken as 0, as before.
    next_time_step = df.groupby('breath_id')['time_step'].shift(-1).fillna(0)
    next_u_in = df.groupby('breath_id')['u_in'].shift(-1).fillna(0)
    df['delta_time'] = next_time_step - df['time_step']
    df['area_u_in'] = df['u_in'] * df['delta_time']
    df['u_in_change'] = next_u_in - df['u_in']
    # A zero delta_time would give +/-inf, which fillna() does not remove.
    df['uin_in_time'] = (df['u_in_change'] / df['delta_time']).replace([np.inf, -np.inf], 0)
    df['area_u_in_abs'] = df['u_in_change'] * df['delta_time']

    # Target statistics per (R1, C1) pair.
    if rc_stats is not None:
        # Look the values up from the supplied table; join() keeps df's index.
        df = df.join(rc_stats[['RC_max', 'RC_mean']], on=['R1', 'C1'])
    elif 'pressure' in df.columns:
        df['RC_max'] = df.groupby(['R1', 'C1'])['pressure'].transform('max')
        df['RC_mean'] = df.groupby(['R1', 'C1'])['pressure'].transform('mean')
    else:
        raise ValueError(
            "df has no 'pressure' column; pass rc_stats (indexed by (R1, C1) "
            "with columns 'RC_max' and 'RC_mean') computed from labelled data"
        )

    df = df.fillna(0)

    return df
train = add_features(train)
# RC_max / RC_mean are constant within each (R1, C1) group of the engineered
# train frame; collect them so the unlabelled test frame can reuse them.
rc_stats = train.groupby(['R1', 'C1'])[['RC_max', 'RC_mean']].first()
test = add_features(test, rc_stats=rc_stats)

In [None]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import numpy as np
import time
import lightgbm as lgb

from sklearn.model_selection import GroupKFold 
from sklearn.model_selection import  KFold
from sklearn import metrics

In [None]:
# Keep only the rows whose u_out is below 1 and renumber the index from 0.
keep_rows = train['u_out'] < 1
train = train.loc[keep_rows].reset_index(drop=True)

In [None]:
# Per-fold results collected by the cross-validation loop below.
scores = []
feature_importance = pd.DataFrame()
models = []

# Hand-picked feature subset fed to the model.
columns = [
    'time_step', 'last_value_u_in', 'u_in_cumsum', 'time_step_cumsum',
    'u_in_lag_back1', 'u_in_lag_back3', 'u_out_lag_back3', 'u_in_lag4',
    'u_in_lag_back4', 'u_out_lag_back4', 'u_in_lag_back10', 'time_step_diff',
    'ewm_u_in_mean', 'ewm_u_in_std', 'ewm_u_in_corr', '15_in_min', '15_in_max',
    '15_out_std', '45_in_sum', '45_in_min', '45_in_max', '45_in_mean', '45_out_std',
    'breath_id__u_in__max', 'breath_id__u_out__mean', 'breath_id__u_in__min',
    'R_div_C', 'u_in_diff2', 'u_in_diff3', 'u_in_diff4', 'breath_id__u_in__diffmean',
    'R__C_20__20', 'R__C_20__50', 'R__C_50__50', 'R__C_5__10', 'R__C_5__20',
    'R__C_5__50', 'u_in_partition_out_sum', 'delta_time', 'u_in_change',
    'area_u_in_abs', 'u_in', 'u_in_lag2', 'u_out_lagback_diff2',
    'breath_id__u_in__diffmax', 'R_50', 'R__C_50__10', 'R1', 'C_10', 'C_50', 'C1',
    'u_in_lag3', 'RC_max', 'RC_mean',
]
X = train[columns]
y = train['pressure']

# The scaler is fitted once on all selected training rows and reused for
# every fold (and later for the test set).
RS = RobustScaler()
Xt = RS.fit_transform(X)

params = dict(
    objective='regression',
    learning_rate=0.2,
    boosting_type='gbdt',
    min_data_in_leaf=30,
    max_bin=700,
    num_leaves=1200,
    metric='mae',
    n_jobs=-1,
)

# Group by breath_id so that all rows of a breath fall into the same fold.
folds = GroupKFold(n_splits=5)
fold_splits = folds.split(train, y, groups=train['breath_id'])
for fold_n, (train_index, valid_index) in enumerate(fold_splits):
    print(f'Fold {fold_n} started at {time.ctime()}')

    X_train = RS.transform(X.iloc[train_index])
    X_valid = RS.transform(X.iloc[valid_index])
    y_train = y.iloc[train_index]
    y_valid = y.iloc[valid_index]

    # NOTE(review): the `verbose` / `early_stopping_rounds` fit keywords are
    # version-dependent in LightGBM (newer releases expect callbacks instead)
    # - confirm against the installed version.
    model = lgb.LGBMRegressor(n_estimators=600, **params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=100,
        early_stopping_rounds=25,
    )

    valid_pred = model.predict(X_valid)
    score = metrics.mean_absolute_error(y_valid, valid_pred)

    models.append(model)
    scores.append(score)

print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

In [None]:
X_test = RS.transform(test[columns])

# Average the fold models' predictions. The column is assigned rather than
# accumulated with `+=`, so the result does not depend on the values already
# in the sample submission, re-running the cell gives the same output, and the
# divisor always matches the number of trained models.
fold_predictions = [fold_model.predict(X_test) for fold_model in models]
submission['pressure'] = np.mean(fold_predictions, axis=0)

submission.to_csv('submission.csv', index=False)