In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# import file

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the competition files: training rows (with the pressure target),
# test rows, and the sample submission that is filled in at the end.
df_train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
df_test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
submit = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

In [None]:
submit.head()

In [None]:
df_train.head()

In [None]:
df_test.head()

In [None]:
# (rows, columns) of the training set, then of the test set.
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
# Number of missing values per column, training set first, then test set.
for frame in (df_train, df_test):
    print(frame.isnull().sum())

In [None]:
# Column dtypes and non-null counts, training set first, then test set.
for frame in (df_train, df_test):
    frame.info()

In [None]:
#summary statistic
df_train.describe()

In [None]:
df_test.describe()

# histogram

In [None]:
# Histogram of the target (pressure) over all training rows.
sns.set()  # switch matplotlib to seaborn's default theme for the plots that follow
target = "pressure"
plt.hist(df_train[target])
plt.ylabel("Frequency")
plt.xlabel(target)
plt.show()

In [None]:
# One figure per test-set column, overlaying the train and test histograms.
# Each column is looked up in both frames.
for column in df_test.columns:
    for frame, tag in ((df_train, "train"), (df_test, "test")):
        plt.hist(frame[column], alpha=0.5, label=tag)
    plt.ylabel("Frequency")
    plt.xlabel(column)
    plt.legend(loc=1)  # loc=1 is matplotlib's numeric code for "upper right"
    plt.show()

# relationship with pressure (boxplot & scatter plot)

In [None]:
# Box plot of pressure grouped by the values of R.
x = "R"
# Pass the vectors as keywords: seaborn deprecated positional x/y in 0.11 and
# rejects them with a TypeError from 0.12 on (only `data` may be positional).
sns.boxplot(x=df_train[x], y=df_train["pressure"])
plt.xlabel(x)
plt.ylabel("pressure")
plt.show()

In [None]:
# Box plot of pressure grouped by the values of C.
x = "C"
# Pass the vectors as keywords: seaborn deprecated positional x/y in 0.11 and
# rejects them with a TypeError from 0.12 on (only `data` may be positional).
sns.boxplot(x=df_train[x], y=df_train["pressure"])
plt.xlabel(x)
plt.ylabel("pressure")
plt.show()

In [None]:
# Scatter of pressure against time_step, one point per training row.
x = "time_step"
plt.scatter(x=df_train[x], y=df_train["pressure"])
plt.ylabel("pressure")
plt.xlabel(x)
plt.show()

In [None]:
# Scatter of pressure against u_in, one point per training row.
x = "u_in"
plt.scatter(x=df_train[x], y=df_train["pressure"])
plt.ylabel("pressure")
plt.xlabel(x)
plt.show()

In [None]:
# Box plot of pressure grouped by the values of u_out.
x = "u_out"
# Pass the vectors as keywords: seaborn deprecated positional x/y in 0.11 and
# rejects them with a TypeError from 0.12 on (only `data` may be positional).
sns.boxplot(x=df_train[x], y=df_train["pressure"])
plt.xlabel(x)
plt.ylabel("pressure")
plt.show()

# breath ID

In [None]:
# Number of distinct breath_id values in the training set, then in the test set.
for frame in (df_train, df_test):
    print(frame["breath_id"].nunique())

In [None]:
df_train["breath_id"]

In [None]:
def lineplot(df):
    """Plot pressure against time_step for the rows of ``df`` and show the figure.

    ``df`` must provide "time_step" and "pressure" columns; the points are
    connected in the row order of ``df``.
    """
    time_axis, pressure = df["time_step"], df["pressure"]
    plt.plot(time_axis, pressure)
    plt.ylabel("pressure")
    plt.xlabel("time_step")
    plt.show()

In [None]:
# Rows of the breaths with breath_id 1 to 4, kept as separate frames for plotting.
breath1, breath2, breath3, breath4 = (
    df_train.loc[df_train["breath_id"] == breath_no] for breath_no in (1, 2, 3, 4)
)

In [None]:
lineplot(breath1)

In [None]:
lineplot(breath2)

In [None]:
lineplot(breath3)

In [None]:
lineplot(breath4)

In [None]:

# Pressure curve of breath 1, drawn with the lineplot helper defined earlier
# in this notebook (same plot call, axis labels and show() as the inline version).
lineplot(breath1)

# Simple Model

In [None]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, StratifiedKFold, GroupKFold, KFold, train_test_split

# tqdm.tqdm_notebook is deprecated and slated for removal; tqdm.auto selects
# the notebook widget when available and falls back to the console bar.
from tqdm.auto import tqdm

In [None]:
df_train.head(2)

In [None]:
# Target vector, and the feature matrix with the identifier and target columns removed.
y_train = df_train["pressure"]
x_train = df_train.drop(columns=["id", "breath_id", "pressure"])


In [None]:
# Test feature matrix: the same identifier columns are removed as for training.
x_test = df_test.drop(columns=["id", "breath_id"])

In [None]:
x_train

In [None]:
x_test

In [None]:
# breath_id of every training row, used as the grouping key for GroupKFold.
groups = df_train.loc[:, "breath_id"]
groups

In [None]:
# Grouped 5-fold cross-validation with test-time averaging.
# Folds are split on breath_id (`groups`), so all rows of one breath land on
# the same side of each train/validation split.
scores = []  # validation MAE of each fold
y_pred_test = np.zeros(len(x_test))  # running SUM of the per-fold test predictions
gkf = GroupKFold(n_splits=5)

# tqdm wraps the split generator (not enumerate) and is given the fold count,
# because a generator has no len() and the bar would otherwise have no total.
fold_splits = tqdm(gkf.split(x_train, y_train, groups), total=gkf.get_n_splits())

for i, (train_ix, test_ix) in enumerate(fold_splits):

    # train_ix / test_ix are positional indices, so everything is sliced with
    # .iloc; plain groups[...] would be a label lookup that is only correct
    # while the index happens to be the default RangeIndex.
    X_train_, y_train_, groups_train_ = x_train.iloc[train_ix], y_train.iloc[train_ix], groups.iloc[train_ix]
    X_val, y_val, groups_val = x_train.iloc[test_ix], y_train.iloc[test_ix], groups.iloc[test_ix]

    print('Train Groups', np.unique(groups_train_))
    print('Val Groups', np.unique(groups_val))
    print(X_train_.shape, X_val.shape)

    # A fresh model per fold; `model` still refers to the last fold's model
    # after the loop ends.
    model = lgb.LGBMRegressor(random_state=71, importance_type='gain')

    model.fit(X_train_, y_train_)
    y_pred = model.predict(X_val)

    y_pred_test += model.predict(x_test)  # accumulate this fold's test prediction

    score = mean_absolute_error(y_val, y_pred)
    scores.append(score)

    print('CV Score of Fold_%d is %f' % (i, score))
    
   
   

In [None]:
# Per-fold MAE list on the first line, its mean on the second.
print(scores, np.mean(scores), sep="\n")

In [None]:
# Turn the summed fold predictions into their mean. The divisor is read from
# the splitter so it cannot drift from the n_splits used during CV.
y_pred_test_submit = y_pred_test / gkf.get_n_splits()


In [None]:
y_pred_test_submit

In [None]:
# Overwrite the pressure column of the sample submission with the averaged
# test predictions, then preview the first rows.
# NOTE(review): this assumes the rows of `submit` are in the same order as the
# rows of df_test — nothing here checks it; confirm against the id columns.
submit["pressure"]=y_pred_test_submit
submit.head()

In [None]:
# Write the submission frame to submission.csv in the working directory,
# without the DataFrame index as an extra column.
submit.to_csv("submission.csv",index=False)

# Feature Importance

In [None]:
# Feature importances of `model`, which after the CV loop is the model fitted
# on the last fold only (it was created with importance_type='gain').
importance = pd.DataFrame(
    {"importance": model.feature_importances_},
    index=x_train.columns,
)
importance

In [None]:
# Bar chart with one bar per feature, bar height = its importance value.
sns.barplot(data=importance, x=importance.index, y="importance")

In [None]:
#End