In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)


In [None]:
# Read the training set, the test set and the submission template, in that order
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ventilator-pressure-prediction/train.csv',
        '../input/ventilator-pressure-prediction/test.csv',
        '../input/ventilator-pressure-prediction/sample_submission.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of the training and test frames
print(f"Shape of training data : {train_df.shape}")
print(f"Shape of test data : {test_df.shape}")

In [None]:
def add_features(df, lags=(1, -1), window=3):
    """Return a copy of ``df`` with per-breath ``u_in`` / ``u_out`` features appended.

    Every feature is computed inside a single ``breath_id`` group, so values are
    never mixed across breaths, whatever the row order of ``df`` is. The input
    frame itself is not modified.

    Columns added, in this order:

    * ``u_in_ma_lag{k}`` for each ``k`` in ``lags``: trailing ``window``-row mean
      of ``u_in`` shifted by ``k`` rows within the breath. The value is missing
      until ``window`` shifted values are available inside the breath.
    * ``u_in_lag{k}`` for each ``k`` in ``lags``: ``u_in`` shifted by ``k`` rows
      within the breath (a negative ``k`` looks ahead).
    * ``u_in_mean``, ``u_in_std``, ``u_out_mean``, ``u_out_std``: per-breath mean
      and standard deviation, repeated on every row of the breath.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``u_in`` and ``u_out``.
    lags : iterable of int, default ``(1, -1)``
        Row offsets to build lag / moving-average features for.
    window : int, default 3
        Number of rows averaged by the moving-average features.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns added. Every NaN in the result,
        including NaNs that were already present in ``df``, is replaced by 0.
    """
    # Positional group keys: grouping by a plain array avoids any index
    # alignment when the helper below groups an intermediate Series.
    breath_keys = df["breath_id"].to_numpy()
    u_in_by_breath = df.groupby("breath_id")["u_in"]
    u_out_by_breath = df.groupby("breath_id")["u_out"]

    def _trailing_mean_within_breath(series):
        # Add the series to its 1..window-1 row lags taken inside the breath.
        # A missing term makes the whole sum NaN, which mirrors
        # rolling(window).mean() with its default min_periods == window.
        by_breath = series.groupby(breath_keys)
        total = series
        for back in range(1, window):
            total = total + by_breath.shift(back)
        return total / window

    # Shift once per lag and reuse the result for both feature families.
    shifted = {lag: u_in_by_breath.shift(lag) for lag in lags}

    features = {}

    # Moving average of the shifted signal
    for lag, series in shifted.items():
        features["u_in_ma_lag{}".format(lag)] = _trailing_mean_within_breath(series)

    # Plain lag / lead features
    for lag, series in shifted.items():
        features["u_in_lag{}".format(lag)] = series

    # Per-breath summary statistics of u_in and u_out
    features["u_in_mean"] = u_in_by_breath.transform("mean")
    features["u_in_std"] = u_in_by_breath.transform("std")
    features["u_out_mean"] = u_out_by_breath.transform("mean")
    features["u_out_std"] = u_out_by_breath.transform("std")

    # assign() builds a new frame, so the caller's DataFrame stays untouched.
    return df.assign(**features).fillna(0)
    

In [None]:
# Build the engineered features for the training frame first, then the test frame
train_df, test_df = map(add_features, (train_df, test_df))

In [None]:
# Preview the first rows of the training frame (head() defaults to 5 rows)
train_df.head()

In [None]:
# plt.figure(figsize=(15, 15))
# train_df_corr = train_df.drop(["id", "breath_id", "R", "C", "time_step"], axis=1)
# train_corr = train_df_corr.corr()
# sns.heatmap(train_corr, vmax=1, vmin=-1, center=0, annot=True)

In [None]:
# LightGBM
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from statistics import mean

# Cross-validation: split the rows into `folds` parts with KFold's default settings
folds = 4
kf = KFold(n_splits=folds)

# LightGBM hyper-parameters shared by every fold
lgbm_params = dict(objective="regression", random_seed=1234)

# Target column, and the feature matrix without the identifier columns or the target
train_y = train_df["pressure"]
train_x = train_df.drop(columns=["id", "breath_id", "pressure"])

In [None]:
# Train one LightGBM model per fold, score it on the held-out part and keep
# the out-of-fold predictions.
models = []                       # fitted booster of every fold
rmses = []                        # validation RMSE of every fold
preds = np.zeros(len(train_x))    # out-of-fold prediction for every training row

num_boost_round = 20000
early_stopping_rounds = 1000
verbose_eval = 100

# `lgb.train` no longer accepts `early_stopping_rounds=` / `verbose_eval=`
# (removed in LightGBM 4.0); the same behaviour is requested through callbacks.
# `log_evaluation` replaced `print_evaluation` in LightGBM 3.3, so fall back to
# the old name on earlier releases.
_log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

for train_index, val_index in kf.split(train_x):
    x_train = train_x.iloc[train_index]
    x_valid = train_x.iloc[val_index]
    y_train = train_y.iloc[train_index]
    y_valid = train_y.iloc[val_index]

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

    # Callbacks are created inside the loop so that no early-stopping state
    # can carry over from one fold to the next.
    fold_callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping_rounds),
        _log_evaluation(period=verbose_eval),
    ]

    model_lgb = lgb.train(lgbm_params,
                          lgb_train,
                          num_boost_round=num_boost_round,
                          valid_sets=[lgb_eval],
                          callbacks=fold_callbacks)

    # Score the fold at the best iteration found by early stopping
    y_pred = model_lgb.predict(x_valid, num_iteration=model_lgb.best_iteration)
    tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(tmp_rmse)

    models.append(model_lgb)
    rmses.append(tmp_rmse)
    preds[val_index] = y_pred
    


In [None]:
# Average the per-fold validation RMSE values collected in `rmses`.
# `mean` is statistics.mean from the standard library, not a NumPy function.
mean(rmses)

In [None]:
# Put the true pressure next to the predictions stored in `preds`,
# then draw both as lines with the x-axis limited to the range 0-320
actual_pred_df = train_y.to_frame(name="actual").assign(pred=preds)

actual_pred_df.plot(xlim=(0, 320))

In [None]:
# Draw one gain-based feature-importance chart per trained model
importance_kind = "gain"
for booster in models:
    lgb.plot_importance(booster, importance_type=importance_kind)

In [None]:
# Feature matrix for the test set: everything except the identifier columns
test_x = test_df.drop(columns=["id", "breath_id"])

# One prediction vector per trained model
submit_preds = [booster.predict(test_x) for booster in models]

# Stack the vectors into a (models, rows) array and average over the models
preds_array = np.array(submit_preds)
preds_mean = preds_array.mean(axis=0)

In [None]:
# Put the averaged predictions (`preds_mean`) into the submission template and save it
submission = submission.assign(pressure=preds_mean)
submission.to_csv("ventilator_submit01.csv", index=False)

### 