In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for filepath in (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
):
    print(filepath)


In [None]:
# Load the three competition CSVs: the training table, the test table and the
# sample submission that is later filled in with predictions.
train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

## EDA

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Show every row belonging to breath_id 1 to see what a single breath looks like.
train[train['breath_id']==1]

In [None]:
# Number of distinct values per column within breath_id 1, shown as a one-column frame.
train[train['breath_id']==1].nunique().to_frame()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Preview the first rows of the sample submission.
submission.head()

In [None]:
# Count missing values per column in the training table.
train.isnull().sum()

In [None]:
# Count missing values per column in the test table.
test.isnull().sum()

In [None]:
# Compare how often each R and C value occurs in train versus test.
# The 2x2 grid is created once and every panel is drawn on its own explicit axes,
# so no extra full-figure axes is left sitting underneath the panels.
fig, axes = plt.subplots(2, 2, figsize=(12, 7), constrained_layout=True)

count_panels = [
    ('R', 'train', train),
    ('R', 'test', test),
    ('C', 'train', train),
    ('C', 'test', test),
]
for ax, (col, split_name, frame) in zip(axes.flat, count_panels):
    sns.countplot(x=col, data=frame, ax=ax)
    ax.set_title(f'count of {col} in {split_name}')

In [None]:
# Histograms (100 bins) of u_in and u_out, train next to test, on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 7))

hist_panels = [
    ('u_in', 'train', train),
    ('u_in', 'test', test),
    ('u_out', 'train', train),
    ('u_out', 'test', test),
]
for ax, (col, split_name, frame) in zip(axes.ravel(), hist_panels):
    ax.set_title(f'{col} in {split_name}')
    ax.hist(frame[col], bins=100)

fig.tight_layout()

In [None]:
import warnings
# Suppress every warning from here on.
# NOTE(review): this is a blanket filter, presumably added to quiet the seaborn
# plotting calls in the following cells — confirm nothing important is being hidden.
warnings.filterwarnings('ignore')

In [None]:
# rewritten from https://www.kaggle.com/mst8823/google-brain-lightgbm-baseline?scriptVersionId=75500028&cellId=14
# Overlay the pressure distribution for u_out == 0 and u_out == 1 on a single axes.
fig, ax = plt.subplots(figsize=(14, 7))

for u_out_value in (0, 1):
    pressure_subset = train.loc[train["u_out"] == u_out_value, "pressure"]
    ax = sns.distplot(pressure_subset, ax=ax, label=f"u_out={u_out_value}", bins=200)
ax.legend(loc='upper right')

In [None]:
# Overlay the pressure distribution for each of the R values 5, 20 and 50.
fig, ax = plt.subplots(figsize=(12, 7))

for r_value in (5, 20, 50):
    pressure_subset = train.loc[train['R'] == r_value, 'pressure']
    ax = sns.distplot(pressure_subset, ax=ax, label=f'R={r_value}', bins=200)
ax.legend(loc='upper right')

In [None]:
# Overlay the pressure distribution for each of the C values 10, 20 and 50.
fig, ax = plt.subplots(figsize=(12, 7))

for c_value in (10, 20, 50):
    pressure_subset = train.loc[train['C'] == c_value, 'pressure']
    ax = sns.distplot(pressure_subset, ax=ax, label=f'C={c_value}', bins=200)
ax.legend(loc='upper right')

In [None]:
# referred https://www.kaggle.com/carlmcbrideellis/ventilator-pressure-eda-and-simple-submission?scriptVersionId=76671996&cellId=32
# Row counts for every (R, C) pair, scaled down by 80.
# NOTE(review): the divisor presumably is the number of rows per breath, which would
# turn row counts into breath counts — confirm every breath really has 80 rows.
pd.crosstab(train['R'],train['C'])/80

In [None]:
# Tag every training row with its "<R>_<C>" label, then list the distinct labels.
train['R_C'] = train['R'].astype(str).str.cat(train['C'].astype(str), sep='_')
train['R_C'].unique()

In [None]:
# referred https://www.kaggle.com/artgor/ventilator-pressure-prediction-eda-fe-and-models?scriptVersionId=75639741&cellId=13
# For every R_C combination, plot the first breath that uses it: pressure and u_in on
# the left axis, u_out on a twin right axis.
# The breath is looked up by column name (not by column position), and the first
# breath of each combination comes from one grouped pass; sort=False keeps the
# combinations in order of first appearance.
first_breath_per_rc = train.groupby('R_C', sort=False)['breath_id'].first()
for i, bid in first_breath_per_rc.items():
    fig, ax1 = plt.subplots(figsize=(10, 7))
    data = train.loc[train['breath_id'] == bid]
    ax2 = ax1.twinx()
    ax1.set_title(f'breath_id= {bid} / R_C: {i}')
    ax1.plot(data['time_step'], data['pressure'], 'r-', label='pressure')
    ax1.plot(data['time_step'], data['u_in'], 'g-', label='u_in')
    ax2.plot(data['time_step'], data['u_out'], 'b-', label='u_out')
    ax1.set_xlabel('Timestep')
    ax1.title.set_size(20)
    ax1.legend(loc='center right')
    ax2.legend(loc='upper right')
    plt.show()

#### Observation: once `u_out` switches to 1, the pressure drops

In [None]:
# Highest pressure recorded for each R_C label, sorted by label in descending order.
max_pressure_by_rc = train.groupby('R_C', as_index=False)['pressure'].max()
max_pressure_by_rc.sort_values('R_C', ascending=False)

In [None]:
# Lowest pressure recorded for each R_C label, sorted by label in descending order.
min_pressure_by_rc = train.groupby('R_C', as_index=False)['pressure'].min()
min_pressure_by_rc.sort_values('R_C', ascending=False)

## Feature Engineering

In [None]:
def fe(df):
    """Add per-breath ``u_in``/``u_out`` features and one-hot encode the R/C combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``breath_id``, ``R``, ``C``, ``u_in`` and ``u_out``.
        The ``R_C`` label and the lag columns are written onto this frame in place
        before it is replaced by the one-hot encoded copy that is returned.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns (with ``R_C`` expanded into
        ``R_C_<R>_<C>`` indicator columns) plus, all computed within each breath:

        * ``u_in_lag1/2``, ``u_in_lagN1/N2`` and the same for ``u_out``:
          values shifted by +1, +2, -1 and -2 rows;
        * ``rolling_mean`` / ``rolling_std``: 10-row rolling statistics of ``u_in``;
        * ``expanding_mean`` / ``expanding_std``: expanding statistics of ``u_in``
          requiring at least 2 observations;
        * ``u_in_cumsum``: running sum of ``u_in``.

        NaNs produced at breath boundaries (shifts, single-row std, first expanding
        row) are left in place; filling them is up to the caller.
    """
    # Build the label from the frame being processed. Reading ``C`` from the global
    # training frame would pair this frame's R with index-aligned training C values
    # and corrupt the encoding for any other frame (e.g. the test set).
    df['R_C'] = df['R'].astype(str) + '_' + df['C'].astype(str)

    # Backward (positive) and forward (negative) shifts within each breath.
    shifts = (('lag1', 1), ('lag2', 2), ('lagN1', -1), ('lagN2', -2))
    for col in ('u_in', 'u_out'):
        per_breath = df.groupby('breath_id')[col]
        for suffix, periods in shifts:
            df[f'{col}_{suffix}'] = per_breath.shift(periods)

    # Encode only the label column explicitly; returns a new frame.
    df = pd.get_dummies(df, columns=['R_C'])

    u_in_per_breath = df.groupby('breath_id')['u_in']

    # Window statistics come back indexed by (breath_id, original index); dropping
    # the breath_id level realigns them with the frame's own index.
    rolling = u_in_per_breath.rolling(window=10, min_periods=1)
    df['rolling_mean'] = rolling.mean().reset_index(level=0, drop=True)
    df['rolling_std'] = rolling.std().reset_index(level=0, drop=True)

    expanding = u_in_per_breath.expanding(2)
    df['expanding_mean'] = expanding.mean().reset_index(level=0, drop=True)
    df['expanding_std'] = expanding.std().reset_index(level=0, drop=True)

    #https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974
    df['u_in_cumsum'] = u_in_per_breath.cumsum()
    return df

In [None]:
# Replace the training table with its feature-engineered version.
train = fe(train)

In [None]:
# Replace the test table with its feature-engineered version.
test=fe(test)

In [None]:
# Preview the training table after feature engineering.
train.head()

In [None]:
# List the column names of the feature-engineered training table.
list(train)

In [None]:
# Replace every missing value with 0 in both tables (the shift/rolling/expanding
# features built in fe() leave NaNs at the edges of each breath).
train=train.fillna(0)
test=test.fillna(0)

In [None]:
# Re-count missing values per column in train after the fill.
train.isnull().sum()

In [None]:
# Re-count missing values per column in test after the fill.
test.isnull().sum()

In [None]:
import os
import time
import lightgbm as lgb

from sklearn.model_selection import GroupKFold 
from sklearn.model_selection import  KFold
from sklearn import metrics

In [None]:
# Feature columns: everything except the identifiers and the target.
# NOTE(review): `column` does not appear to be referenced anywhere else in the visible
# notebook; the training cell builds its own identical `columns` list.
column = [col for col in train.columns if col not in ['id', 'breath_id', 'pressure']]

In [None]:
#referred https://www.kaggle.com/shivansh002/lgbm-lover-s?scriptVersionId=75633660&cellId=6
# Cross-validated LightGBM regression on pressure.
# Folds are grouped by breath_id, so all rows of one breath fall in the same fold.
scores = []
models = []
fold_importances = []
columns = [col for col in train.columns if col not in ['id', 'breath_id', 'pressure']]
X = train[columns]
y = train['pressure']
params = {'objective': 'regression',
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          'max_bin': 196,
          'feature_fraction':0.4,
          'max_depth':16,
          "metric": 'mae',
          'n_jobs': -1
         }
folds = GroupKFold(n_splits=5)
for fold_n, (train_index, valid_index) in enumerate(folds.split(train, y, groups=train['breath_id'])):
    print(f'Fold {fold_n} started at {time.ctime()}')
    # X already holds exactly the feature columns, so no re-selection is needed.
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    model = lgb.LGBMRegressor(**params, n_estimators=8000)
    # Early stopping and periodic logging are passed as callbacks: the `verbose` and
    # `early_stopping_rounds` keyword arguments of fit() were removed in LightGBM 4.0.
    # (lgb.log_evaluation requires LightGBM >= 3.3.)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              callbacks=[lgb.early_stopping(stopping_rounds=10),
                         lgb.log_evaluation(period=100)])
    score = metrics.mean_absolute_error(y_valid, model.predict(X_valid))

    models.append(model)
    scores.append(score)

    # Collect the per-fold importances and concatenate once after the loop.
    fold_importances.append(pd.DataFrame({
        "feature": columns,
        "importance": model.feature_importances_,
        "fold": fold_n + 1,
    }))

feature_importance = pd.concat(fold_importances, axis=0)

print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

In [None]:
# Average the fold models' test predictions into the submission.
# The column is assigned rather than accumulated with += and divided by a fixed 5,
# so the cell gives the same result when re-run, follows the actual number of trained
# models, and does not depend on the values the sample submission started with.
fold_predictions = [fold_model.predict(test[columns]) for fold_model in models]
submission['pressure'] = np.mean(fold_predictions, axis=0)

In [None]:
# Preview the submission now that the pressure column holds predictions.
submission.head()

In [None]:
# Write the submission to CSV without the DataFrame index column.
submission.to_csv('1submit.csv',index=False)