In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition CSVs: training rows, test rows and the sample submission
# (in that order) into three DataFrames.
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ventilator-pressure-prediction/train.csv',
        '../input/ventilator-pressure-prediction/test.csv',
        '../input/ventilator-pressure-prediction/sample_submission.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of the training and test frames
print(f"Shape of training data : {train_df.shape}")
print(f"Shape of test data : {test_df.shape}")

In [None]:
# Split the training frame into features and target:
# the target is the "pressure" column; the features are every column except
# the two identifier columns and the target itself.
train_y = train_df["pressure"]
train_x = train_df.drop(columns=["id", "breath_id", "pressure"])

In [None]:
# Implement Optuna
import optuna
import lightgbm as lgb
from sklearn.model_selection import KFold
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [None]:
# Optimize hyperparameters
def objective(trial):
    """Optuna objective: fit one LightGBM regressor and return its validation MSE.

    Reads the module-level ``train_x`` / ``train_y`` frames, holds out the last
    20% of rows for validation, samples a hyperparameter set from ``trial`` and
    trains with early stopping on the validation set.

    Args:
        trial: Optuna trial object used to sample hyperparameters.

    Returns:
        Mean squared error of the best-iteration model on the validation rows
        (lower is better, so the study should use ``direction="minimize"``).
    """
    # Hold out the last 20% of rows. With shuffle=False the split is
    # deterministic, so no random_state is needed.
    x_train, x_valid, y_train, y_valid = train_test_split(
        train_x, train_y, test_size=0.2, shuffle=False)

    # Only canonical LightGBM names are used for bagging. The former
    # 'subsample' / 'subsample_freq' entries were aliases of
    # 'bagging_fraction' / 'bagging_freq' (and 'subsample_freq' was sampled as
    # a float although the parameter is an integer), so they only added
    # meaningless search dimensions.
    # 'n_estimators' is intentionally absent: it is an alias of
    # num_iterations and would silently override num_boost_round below.
    params = {
        'objective': 'regression',
        'random_state': 42,
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006, 0.008, 0.01, 0.014, 0.017, 0.02]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 256),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    }

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks; the
    # early_stopping_rounds / verbose_eval keyword arguments are no longer
    # accepted by lgb.train in LightGBM >= 4.0 (callbacks need LightGBM >= 3.3).
    model_lgb = lgb.train(
        params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_eval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=50),
        ],
    )

    # Score the model at the iteration selected by early stopping.
    y_pred = model_lgb.predict(x_valid, num_iteration=model_lgb.best_iteration)

    return mean_squared_error(y_valid, y_pred)

In [None]:
# # Get optimized parameters
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=10)
# print("Number of finished trials: ", len(study.trials))
# print("Best parameters: ", study.best_params)

In [None]:
# LightGBM cross-validation setup
# The rows are cut into `folds` consecutive, unshuffled blocks.
folds = 4  # number of cross-validation folds
kf = KFold(n_splits=folds, shuffle=False)

# Set LGBM hyper parameters
# Bagging is configured only through the canonical names 'bagging_fraction'
# and 'bagging_freq'. The previous dict also carried their aliases
# 'subsample' and 'subsample_freq' (the latter as a float, although the
# parameter is an integer); when both forms are present LightGBM keeps the
# canonical value and ignores the alias with a warning, so dropping the
# aliases leaves the trained model unchanged and removes the warnings.
lgbm_params = {
    'objective': 'regression',
    'random_state': 42,
    'learning_rate': 0.02,
    'colsample_bytree': 0.9982054887945049,
    'reg_alpha': 0.021951494356699672,
    'reg_lambda': 0.8902816396602072,
    'min_child_weight': 60,
    'min_child_samples': 23,
    'bagging_fraction': 0.9024418505688159,
    'bagging_freq': 7,
}

In [None]:
# Train model and predict
# One LightGBM model is trained per fold of `kf`; each model is scored on its
# own validation fold and its predictions are stored in `preds`.
models = []                       # trained booster of every fold
rmses = []                        # validation RMSE of every fold
preds = np.zeros(len(train_x))    # validation-fold predictions, one slot per training row

num_boost_round = 10000
early_stopping_rounds = 1000
verbose_eval = 100

for train_index, val_index in kf.split(train_x):
    x_train = train_x.iloc[train_index]
    x_valid = train_x.iloc[val_index]
    y_train = train_y.iloc[train_index]
    y_valid = train_y.iloc[val_index]

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

    # Early stopping and logging are passed as callbacks: the
    # early_stopping_rounds / verbose_eval keyword arguments are no longer
    # accepted by lgb.train in LightGBM >= 4.0 (callbacks need LightGBM >= 3.3).
    model_lgb = lgb.train(lgbm_params,
                          lgb_train,
                          num_boost_round=num_boost_round,
                          valid_sets=[lgb_eval],
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=early_stopping_rounds),
                              lgb.log_evaluation(period=verbose_eval),
                          ])

    # Predict the validation fold at the iteration chosen by early stopping.
    y_pred = model_lgb.predict(x_valid, num_iteration=model_lgb.best_iteration)
    tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(tmp_rmse)

    models.append(model_lgb)
    rmses.append(tmp_rmse)
    preds[val_index] = y_pred
    


In [None]:
# Average the validation RMSEs collected from the folds and display the result
mean_rmse = mean(rmses)
mean_rmse

In [None]:
# Put the true target and the cross-validation predictions side by side
# and plot both lines for x between 0 and 320.
actual_pred_df = train_y.to_frame(name="actual").assign(pred=preds)

actual_pred_df.plot(xlim=(0, 320))

In [None]:
# Draw one gain-based feature-importance chart per trained fold model
for fold_model in models:
    lgb.plot_importance(fold_model, importance_type="gain")

In [None]:
# Build the test feature matrix by removing the identifier columns
test_x = test_df.drop(columns=["id", "breath_id"])

# Predict the test rows with every fold model ...
submit_preds = [fold_model.predict(test_x) for fold_model in models]

# ... and average the per-model predictions row by row
preds_array = np.array(submit_preds)
preds_mean = preds_array.mean(axis=0)

In [None]:
# Put the averaged predictions into the submission frame and save it as CSV
submission = submission.assign(pressure=preds_mean)
submission.to_csv("ventilator_submit01.csv", index=False)
print("done")