In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
#plt.rcParams.update({'font.size': 18})
#plt.style.use('fivethirtyeight')
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names; use whichever name this installation provides.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the competition files. The first CSV column becomes the index of the
# train/test frames; the integer dtype keys are presumably column positions
# (4-7) to be parsed as float32 -- confirm against the CSV header.
float32_columns = {position: np.float32 for position in (4, 5, 6, 7)}
train_data = pd.read_csv(
    '../input/ventilator-pressure-prediction/train.csv',
    index_col=0,
    dtype=float32_columns,
)
test_data = pd.read_csv(
    '../input/ventilator-pressure-prediction/test.csv',
    index_col=0,
    dtype=float32_columns,
)
sample = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the test frame.
test_data.info()

In [None]:
# Number of missing values per column in the training frame, shown as a one-column table.
train_data.isnull().sum().to_frame()

In [None]:
# Number of missing values per column in the test frame, shown as a one-column table.
test_data.isnull().sum().to_frame()

In [None]:
# Extract a single breath (breath_id 3928) as a small worked example,
# renumbering its rows from 0.
is_example_breath = train_data['breath_id'].eq(3928)
breath_one = train_data.loc[is_example_breath].reset_index(drop=True)
breath_one

In [None]:
# Number of distinct values in each column for the example breath.
breath_one.nunique().to_frame()

In [None]:
# Plot the three signals of the example breath against time_step,
# one stacked panel per signal, each titled with its column name.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(12, 15))
for row, signal in enumerate(("u_in", "u_out", "pressure")):
    sns.lineplot(data=breath_one, x='time_step', y=signal, ax=axes[row])
    axes[row].set_title(signal)

In [None]:
# Summary statistics of every numeric column for the example breath.
breath_one.describe()

In [None]:
# How many training rows carry each distinct value of R.
train_data.R.value_counts().to_frame()

In [None]:
# How many training rows carry each distinct value of C.
train_data.C.value_counts().to_frame()

In [None]:
# Summary statistics of every numeric column in the training frame.
train_data.describe()

In [None]:
# Distribution of the target (pressure) over the whole training frame.
fig, axes = plt.subplots(figsize=(10, 5))
sns.histplot(x="pressure", data=train_data, ax=axes)

In [None]:
# Feature last_value_u_in: the u_in recorded on the row with the largest
# time_step of each breath, broadcast to every row of that breath.
# Note that merge() returns a frame with a fresh 0..n-1 index.
idxmax_time_step = train_data.groupby('breath_id')['time_step'].idxmax()
last_value_u_in = (
    train_data
    .loc[idxmax_time_step, ['breath_id', 'u_in']]
    .rename(columns={'u_in': 'last_value_u_in'})
)

train_data = train_data.merge(last_value_u_in, on='breath_id')
train_data

In [None]:
# Feature last_value_u_in for the test frame: the u_in recorded on the row
# with the largest time_step of each breath, broadcast to the whole breath.
idxmax_time_step = test_data.groupby('breath_id')['time_step'].idxmax()
last_value_u_in = (
    test_data
    .loc[idxmax_time_step, ['breath_id', 'u_in']]
    .rename(columns={'u_in': 'last_value_u_in'})
)

test_data = test_data.merge(last_value_u_in, on='breath_id')
test_data

In [None]:
# Feature mean_value_u_in: per-breath mean of u_in, joined back on breath_id.
mean_u_in = (
    train_data
    .groupby('breath_id')['u_in']
    .mean()
    .to_frame('mean_value_u_in')
)
train_data = train_data.merge(mean_u_in, on='breath_id')

In [None]:
# Display the current state of the training frame.
train_data

In [None]:
# Feature mean_value_u_in for the test frame: per-breath mean of u_in,
# joined back on breath_id.
mean_u_in = (
    test_data
    .groupby('breath_id')['u_in']
    .mean()
    .to_frame('mean_value_u_in')
)
test_data = test_data.merge(mean_u_in, on='breath_id')
test_data

In [None]:
# Feature diff_u_in: change in u_in from the previous row of the same breath
# (NaN on the first row of each breath).
train_data['diff_u_in'] = train_data['u_in'].groupby(train_data['breath_id']).diff()

In [None]:
# Replace every NaN in the training frame with 0 (the grouped diff leaves NaN
# on the first row of each breath), then display the result.
train_data = train_data.fillna(0)
train_data

In [None]:
# Feature diff_u_in for the test frame: per-breath first difference of u_in,
# with all NaNs in the frame (including the first row of each breath) set to 0.
test_data = test_data.assign(
    diff_u_in=test_data.groupby('breath_id')['u_in'].diff()
).fillna(0)
test_data

In [None]:
# Feature diff_diff_u_in: per-breath difference of diff_u_in (second-order
# difference of u_in); all NaNs in the frame are then set to 0.
train_data = train_data.assign(
    diff_diff_u_in=train_data.groupby('breath_id')['diff_u_in'].diff()
).fillna(0)
train_data

In [None]:
# Feature diff_diff_u_in for the test frame: per-breath difference of
# diff_u_in; all NaNs in the frame are then set to 0.
test_data = test_data.assign(
    diff_diff_u_in=test_data.groupby('breath_id')['diff_u_in'].diff()
).fillna(0)
test_data

In [None]:
# Feature u_in_cumsum: running total of u_in within each breath.
train_data['u_in_cumsum'] = train_data.groupby('breath_id')['u_in'].cumsum()
test_data['u_in_cumsum'] = test_data.groupby('breath_id')['u_in'].cumsum()

In [None]:
# Feature sum_value_u_in: per-breath total of u_in, joined back on breath_id.
sum_u_in = (
    train_data
    .groupby('breath_id')['u_in']
    .sum()
    .to_frame('sum_value_u_in')
)
train_data = train_data.merge(sum_u_in, on='breath_id')

In [None]:
# Feature sum_value_u_in for the test frame: per-breath total of u_in,
# joined back on breath_id.
sum_u_in = (
    test_data
    .groupby('breath_id')['u_in']
    .sum()
    .to_frame('sum_value_u_in')
)
test_data = test_data.merge(sum_u_in, on='breath_id')

In [None]:
# Feature u_in_cumsum_rate: fraction of the breath's total u_in delivered so far.
# Breaths whose total is 0 yield 0/0 = NaN here.
train_data["u_in_cumsum_rate"] = train_data["u_in_cumsum"].div(train_data["sum_value_u_in"])
test_data["u_in_cumsum_rate"] = test_data["u_in_cumsum"].div(test_data["sum_value_u_in"])

In [None]:
# Some breaths have a total u_in of exactly 0; list their training rows
# (these are the rows where u_in_cumsum / sum_value_u_in is 0/0).
train_data[train_data["sum_value_u_in"] == 0]

In [None]:
# Test rows belonging to breaths whose total u_in is exactly 0.
test_data[test_data["sum_value_u_in"] == 0]

In [None]:
# Inspect the example breath (breath_id 3928) with the features added so far.
train_data[train_data["breath_id"] == 3928]

In [None]:
# Replace every remaining NaN (e.g. the 0/0 values of u_in_cumsum_rate) with 0
# in both frames.
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [None]:
# Features lag_u_in / lag_2_u_in: u_in from 1 and 2 rows earlier in the same
# breath. Rows without a predecessor get NaN from shift(), which the
# frame-wide fillna(0) then turns into 0.
for lag_column, periods in (('lag_u_in', 1), ('lag_2_u_in', 2)):
    train_data[lag_column] = train_data.groupby('breath_id')['u_in'].shift(periods)
    train_data = train_data.fillna(0)

    test_data[lag_column] = test_data.groupby('breath_id')['u_in'].shift(periods)
    test_data = test_data.fillna(0)

In [None]:
# Features lag_-1_u_in / lag_-2_u_in: u_in from 1 and 2 rows later in the
# same breath (negative shift). Rows without a successor become 0 through the
# frame-wide fillna(0).
for lead_column, periods in (('lag_-1_u_in', -1), ('lag_-2_u_in', -2)):
    train_data[lead_column] = train_data.groupby('breath_id')['u_in'].shift(periods)
    train_data = train_data.fillna(0)
    test_data[lead_column] = test_data.groupby('breath_id')['u_in'].shift(periods)
    test_data = test_data.fillna(0)

In [None]:
# Features lag_-3_u_in / lag_3_u_in: u_in from 3 rows later and 3 rows earlier
# in the same breath; out-of-range rows become 0 through the frame-wide fillna(0).
for shifted_column, periods in (('lag_-3_u_in', -3), ('lag_3_u_in', 3)):
    train_data[shifted_column] = train_data.groupby('breath_id')['u_in'].shift(periods)
    train_data = train_data.fillna(0)
    test_data[shifted_column] = test_data.groupby('breath_id')['u_in'].shift(periods)
    test_data = test_data.fillna(0)

In [None]:
# Per-breath aggregate features, built the same way for train and test.
# Columns are added in place, in the same order for both frames.
for frame in (train_data, test_data):
    # max_u_in_breathid: largest u_in of the breath
    frame["max_u_in_breathid"] = frame.groupby("breath_id")["u_in"].transform("max")

    # R*C: product of the two lung attributes
    frame["R*C"] = frame["R"].mul(frame["C"])

    # breath_id__u_in__min: smallest u_in of the breath
    frame["breath_id__u_in__min"] = frame.groupby(["breath_id"])["u_in"].transform("min")

    # breath_id__u_in__diffmax / diffmean: distance of this row's u_in from the
    # breath's max and mean u_in
    frame["breath_id__u_in__diffmax"] = (
        frame.groupby(["breath_id"])["u_in"].transform("max").sub(frame["u_in"])
    )
    frame["breath_id__u_in__diffmean"] = (
        frame.groupby(["breath_id"])["u_in"].transform("mean").sub(frame["u_in"])
    )

    # u_in_partition_out_sum: total u_in of the breath, split by u_out value
    frame["u_in_partition_out_sum"] = frame.groupby(["breath_id", "u_out"])["u_in"].transform("sum")

    # area: running per-breath total of time_step * u_in
    frame["area"] = frame["time_step"].mul(frame["u_in"])
    frame["area"] = frame.groupby("breath_id")["area"].cumsum()

# Drop the loop alias so it does not keep a frame alive after a later `del`.
del frame



In [None]:
#scatter plot (u_out = 0)
# Scatter each engineered feature against pressure for a 0.1% sample of the
# training rows, restricted to rows with u_out == 0.
GRAPH = True
if(GRAPH):
    # Fixed seed so the sample (and therefore the figure) is the same on every run.
    sample_train = train_data.sample(frac=0.001, random_state=0)
    sample_train = sample_train[sample_train["u_out"] == 0]

    # Features to plot, in row-major panel order.
    scatter_features = [
        'last_value_u_in', 'mean_value_u_in', 'diff_u_in', 'u_in_cumsum',
        'time_step', 'diff_diff_u_in', 'sum_value_u_in',
        'u_in_cumsum_rate', 'lag_u_in', 'lag_2_u_in', 'max_u_in_breathid',
        'R*C', 'lag_-3_u_in', 'lag_3_u_in',
        'breath_id__u_in__min', 'breath_id__u_in__diffmax',
        'breath_id__u_in__diffmean', 'u_in_partition_out_sum', 'area',
    ]

    fig,axes = plt.subplots(3,7,figsize=(25,15))
    # Index into axes.flat instead of binding panels to names, so the later
    # `del fig` / `del axes` really releases the figure.
    for panel_no, feature in enumerate(scatter_features):
        sns.scatterplot(data=sample_train,x=feature,y='pressure',ax=axes.flat[panel_no])
    # The grid has more slots than features; hide the unused panels.
    for panel_no in range(len(scatter_features), axes.size):
        axes.flat[panel_no].set_visible(False)

In [None]:
#scatter plot (u_out = 1)
# Scatter each engineered feature against pressure for a 0.1% sample of the
# training rows, restricted to rows with u_out == 1.
GRAPH = True
if(GRAPH):
    # Fixed seed so the sample (and therefore the figure) is the same on every run.
    sample_train = train_data.sample(frac=0.001, random_state=0)
    sample_train = sample_train[sample_train["u_out"] == 1]

    # Features to plot, in row-major panel order.
    scatter_features = [
        'last_value_u_in', 'mean_value_u_in', 'diff_u_in', 'u_in_cumsum',
        'time_step', 'diff_diff_u_in', 'sum_value_u_in',
        'u_in_cumsum_rate', 'lag_u_in', 'lag_2_u_in', 'max_u_in_breathid',
        'R*C', 'lag_-3_u_in', 'lag_3_u_in',
        'breath_id__u_in__min', 'breath_id__u_in__diffmax',
        'breath_id__u_in__diffmean', 'u_in_partition_out_sum', 'area',
    ]

    fig,axes = plt.subplots(3,7,figsize=(25,15))
    # Index into axes.flat instead of binding panels to names, so the later
    # `del fig` / `del axes` really releases the figure.
    for panel_no, feature in enumerate(scatter_features):
        sns.scatterplot(data=sample_train,x=feature,y='pressure',ax=axes.flat[panel_no])
    # The grid has more slots than features; hide the unused panels.
    for panel_no in range(len(scatter_features), axes.size):
        axes.flat[panel_no].set_visible(False)

In [None]:
# Release the plotting objects and the sampled frame to free memory.
# pop(..., None) instead of `del` so the cell also runs when GRAPH is False
# and sample_train was never created.
for _plot_name in ('fig', 'axes', 'sample_train'):
    globals().pop(_plot_name, None)

In [None]:
# Run a full garbage-collection pass to reclaim memory; the cell output is
# the number of unreachable objects the collector found.
import gc
gc.collect()

In [None]:
# Tag every row with its origin and stack train on top of test, so the
# following feature steps run once over both sets. The test frame gains
# columns it lacks (such as pressure) as NaN.
train_data.insert(len(train_data.columns), "train_test", "train")
test_data.insert(len(test_data.columns), "train_test", "test")

train_test_all = pd.concat((train_data, test_data), axis=0)

# The separate frames are no longer needed; free them.
del train_data, test_data
gc.collect()

In [None]:
# Display the combined train + test frame.
train_test_all

In [None]:
# Feature R_C: the "<R>_<C>" pair as a string label, one per row.
# Converted to a plain array so the assignment is positional (the combined
# frame carries repeated index labels).
train_test_all['R_C'] = (
    train_test_all['R'].astype(str) + '_' + train_test_all['C'].astype(str)
).to_numpy()

In [None]:
# Column dtypes, non-null counts and memory usage of the combined frame.
train_test_all.info()

In [None]:
# One-hot encode R_C: the column is replaced by one indicator column per
# distinct value (named "R_C_<value>"); then list the resulting columns.
train_test_all = pd.get_dummies(train_test_all,columns=["R_C"])
train_test_all.columns

In [None]:
##add feature time_diff
# Elapsed time since the previous row of the same breath (0 on the first row
# of every breath). The difference is taken per (train_test, breath_id) group:
# a plain diff() over the stacked frame would subtract the last time_step of
# one breath from the first of the next, and would also run across the
# train/test boundary. to_numpy() makes the assignment positional, because the
# stacked frame carries repeated index labels.
train_test_all['time_diff'] = (
    train_test_all
    .groupby(['train_test', 'breath_id'], sort=False)['time_step']
    .diff()
    .fillna(0)
    .to_numpy()
)

In [None]:
# Split the combined frame back into its train and test parts using the
# origin tag (preparation for the simple regression submission).
row_origin = train_test_all["train_test"]
train_data = train_test_all.loc[row_origin == "train"]
test_data = train_test_all.loc[row_origin == "test"]



In [None]:
# The combined frame has been split again; drop it and collect garbage.
del train_test_all
gc.collect()

In [None]:
# LM: run the linear-regression baseline cells (fit, submission, diagnostics).
LM = True
u_out_zero_only = False ## when True, train only on rows with u_out == 0

In [None]:
# Build the model matrices and, when LM is set, fit a linear-regression
# baseline on standardised features and write its submission file.
if u_out_zero_only:
    # Keep only u_out == 0 rows and renumber them from 0.
    train_data = train_data.loc[train_data["u_out"] == 0].reset_index(drop=True)

# Target, grouping id and origin tag are not model inputs.
non_feature_columns = ["pressure", "breath_id", "train_test"]
X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['pressure']
X_test = test_data.drop(columns=non_feature_columns)

if LM:
    # Standardise with statistics learned from the training rows only.
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)

    lm = LinearRegression()
    lm.fit(X_train_std, y_train)
    print("coefficient of determination = ",lm.score(X_train_std, y_train))

    # Predict the test rows with the same scaling and save the submission.
    X_test_std = scaler.transform(X_test)
    sample['pressure'] = lm.predict(X_test_std)
    sample.to_csv("submission_lm.csv",index=False)

In [None]:
# In-sample check of the linear model: predicted vs. actual pressure on the
# training rows, with the y = x line for reference.
if LM:
    insample_result = pd.DataFrame({
        'correct': y_train,
        'result': lm.predict(X_train_std),
    })

    fig, axes = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x='correct', y='result', data=insample_result, ax=axes)

    # Reference diagonal: points on this line are predicted exactly.
    x = y = np.linspace(0, 60, 10)
    axes.plot(x, y, color="r")

In [None]:
#calc in-sample MAE
# The metric computed here is the mean absolute error, so the value is named
# MAE (the former name said MSE, which misdescribed what is printed).
if(LM):
    insample_MAE = mean_absolute_error(insample_result['correct'],insample_result['result'])
    print(insample_MAE)

In [None]:
# Free the linear-model intermediates (they exist only when LM ran) ...
if LM:
    del insample_result, fig, axes, X_train_std, X_test_std

# ... and the test frame; its features remain available in X_test.
del test_data

In [None]:
# NEW_GBM: run the tuned LightGBM cross-validation cells guarded by this flag.
NEW_GBM = False
#LightGBM

In [None]:
!pip install lightgbm

In [None]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import numpy as np
import time
import lightgbm as lgb

from sklearn.model_selection import GroupKFold 
from sklearn.model_selection import  KFold
from sklearn import metrics

In [None]:
# Display the training target series.
y_train

In [None]:
# Holder for the LightGBM validation scatter plot: the true pressure per
# training row; the cross-validation cells fill in a "result" column.
gbm_val_result = pd.DataFrame({'correct': y_train})

In [None]:
# Tuned LightGBM with 5-fold GroupKFold (grouped by breath_id so a breath
# never appears in both the training and the validation part of a fold).
# Collects the fitted models, per-fold MAE, out-of-fold predictions and
# feature importances.
if(NEW_GBM):
    scores = []
    fold_importances = []
    columns = [col for col in train_data.columns if col not in ['id', 'breath_id', 'pressure',"train_test"]]

    models = []
    # X / y are aliases of the full training matrices. X_train / y_train are
    # deliberately left untouched (fold subsets get their own names) because
    # later cells still need the full data.
    X = X_train
    y = y_train

    params = {'objective': 'regression',
              'learning_rate': 0.25,
              "boosting_type": "gbdt",
              'min_data_in_leaf':600,
              'max_bin': 196,
              #'device':'gpu',
              'feature_fraction':0.4,
              'lambda_l1':36, 'lambda_l2':80,
              'max_depth':16,
              'num_leaves':1000,
              "metric": 'mae',
              'n_jobs': -1
             }
    folds = GroupKFold(n_splits=5)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(train_data, y, groups=train_data['breath_id'])):
        print(f'Fold {fold_n} started at {time.ctime()}')
        X_fold_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
        y_fold_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model = lgb.LGBMRegressor(**params, n_estimators=8000)
        # Early stopping and periodic logging are passed as callbacks: the
        # `early_stopping_rounds` / `verbose` fit arguments no longer exist in
        # LightGBM 4.
        model.fit(X_fold_train, y_fold_train,
                eval_set=[(X_fold_train, y_fold_train), (X_valid, y_valid)],
                callbacks=[lgb.early_stopping(stopping_rounds=10),
                           lgb.log_evaluation(period=100)])

        # Predict the held-out rows once; reuse for the score and the plot.
        y_pred = model.predict(X_valid)
        score = metrics.mean_absolute_error(y_valid, y_pred)

        models.append(model)
        scores.append(score)

        # valid_index is positional; translate to index labels before writing
        # the out-of-fold predictions (for the scatter plot).
        gbm_val_result.loc[gbm_val_result.index[valid_index], "result"] = y_pred

        fold_importances.append(pd.DataFrame({
            "feature": columns,
            "importance": model.feature_importances_,
            "fold": fold_n + 1,
        }))

    # Concatenate once instead of growing a frame inside the loop.
    feature_importance = pd.concat(fold_importances, axis=0)

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

In [None]:
# Average the fold models' test predictions, write the submission and plot
# out-of-fold predictions against the true pressure.
if(NEW_GBM):
    # Start from the models' predictions only: adding onto the existing
    # sample['pressure'] would mix in whatever an earlier cell stored there
    # (e.g. the linear-model predictions). The mean is taken over however many
    # models were trained rather than a hard-coded fold count.
    fold_predictions = [model.predict(X_test[columns]) for model in models]
    sample['pressure'] = np.mean(fold_predictions, axis=0)

    sample.to_csv('submission.csv', index=False)

    fig,axes = plt.subplots(1,1,figsize=(10,10))
    sns.scatterplot(data=gbm_val_result,x='correct',y='result',ax=axes)

    # y = x reference line: points on it are predicted exactly.
    x = np.linspace(0, 60, 10)
    y = x
    axes.plot(x, y, color = "r")

In [None]:
#LightGBM (old)
#GroupID for Group-KFold
from sklearn.model_selection import GridSearchCV, StratifiedKFold, GroupKFold, KFold, train_test_split
# tqdm.auto picks the notebook widget or the console bar as appropriate; the
# top-level `tqdm_notebook` alias is deprecated.
from tqdm.auto import tqdm
import lightgbm as lgb
# breath_id of every training row: the grouping key for GroupKFold.
groups = train_data["breath_id"]
#groups

In [None]:
# Default-parameter LightGBM with 5-fold GroupKFold (grouped by breath_id).
# Accumulates the test predictions of every fold model in y_pred_test (the
# sum, not yet the average) and records per-fold MAE, out-of-fold predictions
# and gain-based feature importances.
OLD_GBM = True
if(OLD_GBM):
    # CV Averaging
    scores = []
    importance = []
    y_pred_test = np.zeros(len(X_test)) #array for predict value
    gkf = GroupKFold(n_splits=5)

    # total= lets the progress bar show fold k of n (enumerate has no length).
    for i, (train_ix, test_ix) in tqdm(enumerate(gkf.split(X_train, y_train, groups)), total=gkf.get_n_splits()):

        # The fold indices are positional, so every object is sliced with
        # .iloc -- including groups, where plain [] would look up index labels.
        X_train_, y_train_, groups_train_ = X_train.iloc[train_ix], y_train.iloc[train_ix], groups.iloc[train_ix]
        X_val, y_val, groups_val = X_train.iloc[test_ix], y_train.iloc[test_ix], groups.iloc[test_ix]

        print('Train Groups', np.unique(groups_train_))
        print('Val Groups', np.unique(groups_val))
        print(X_train_.shape, X_val.shape)

        model = lgb.LGBMRegressor(random_state=71, importance_type='gain')

        model.fit(X_train_, y_train_)
        y_pred = model.predict(X_val)

        # Translate positions to index labels before storing the
        # out-of-fold predictions (for the scatter plot).
        gbm_val_result.loc[gbm_val_result.index[test_ix], "result"] = y_pred

        y_pred_test += model.predict(X_test) # add predict value

        score =  mean_absolute_error(y_val, y_pred)
        scores.append(score)

        #importance
        importance_df = pd.DataFrame(model.feature_importances_, index = X_test.columns, columns=['importance'])
        importance.append(importance_df)

        print('CV Score of Fold_%d is %f' % (i, score))

In [None]:
# Per-fold validation MAE values, followed by their mean.
if(OLD_GBM):
    print(scores)
    print(np.mean(scores))

In [None]:
# Show each fold's feature-importance table, most important feature first.
if(OLD_GBM):
    for df in importance:
        display(df.sort_values('importance',ascending=False))

In [None]:
# Turn the summed fold predictions into their average and write the submission.
if(OLD_GBM):
    # One score is recorded per fold model, so len(scores) is the number of
    # predictions that were summed -- no hard-coded fold count to keep in sync.
    y_pred_test_submit = y_pred_test / len(scores)
    sample['pressure'] = y_pred_test_submit
    sample.to_csv("submission.csv",index=False)

In [None]:
# Out-of-fold LightGBM predictions against the true pressure, with the
# y = x line for reference.
if OLD_GBM:
    fig, axes = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x='correct', y='result', data=gbm_val_result, ax=axes)

    # Reference diagonal: points on this line are predicted exactly.
    x = y = np.linspace(0, 60, 10)
    axes.plot(x, y, color="r")