In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import GroupKFold, KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error
import optuna

In [None]:
# Suppress every warning raised from here on so library chatter does not
# flood the cell outputs.
from warnings import filterwarnings

filterwarnings(action='ignore')

In [None]:
# Read the training frame, the test frame and the submission template, in
# that order, and bind them to their three names in one step.
df_train, df_test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ventilator-pressure-prediction/train.csv',
        '../input/ventilator-pressure-prediction/test.csv',
        '../input/ventilator-pressure-prediction/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training frame exactly as read from disk.
df_train.head()

In [None]:
# Remove the row identifier from both frames so it is not picked up as a
# model feature later on. errors='ignore' keeps the cell re-runnable: a second
# execution is a no-op instead of raising KeyError on the missing column.
for frame in (df_train, df_test):
    frame.drop(columns='id', inplace=True, errors='ignore')

In [None]:
def add_breath_features(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Columns added (in this order):
      * ``RC_sum``      - ``R + C``.
      * ``u_in_cumsum`` - running total of ``u_in`` within each ``breath_id``.
      * ``time_lag``    - ``time_step`` two rows earlier within the same breath.
      * ``u_in_lag``    - ``u_in`` two rows earlier within the same breath.

    Both lags are computed per ``breath_id``; the first two rows of every
    breath have no predecessor inside that breath and are filled with 0, so
    nothing is carried over from the end of the previous breath.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``breath_id``, ``R``, ``C``, ``u_in`` and
        ``time_step``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns attached.
    """
    by_breath = df.groupby('breath_id')

    u_in_cumsum = by_breath['u_in'].cumsum()
    time_lag = by_breath['time_step'].shift(2).fillna(0)
    u_in_lag = by_breath['u_in'].shift(2).fillna(0)

    df['RC_sum'] = df['R'] + df['C']
    df['u_in_cumsum'] = u_in_cumsum
    df['time_lag'] = time_lag
    df['u_in_lag'] = u_in_lag
    return df


# One code path for both frames, so train and test features cannot drift apart.
for frame in (df_train, df_test):
    add_breath_features(frame)

# Preview both frames in a single output; `pressure` shows as NaN for the
# test rows because only the training frame carries the target.
pd.concat([df_train.head(), df_test.head()], keys=['train', 'test'])

In [None]:
# Columns that are label-encoded before modelling.
categorical_features = [
    'R',
    'C',
    'u_out',
    'RC_sum',
]

# Columns that are standardised before modelling.
continuous_features = [
    'time_step',
    'u_in',
    'u_in_lag',
    'u_in_cumsum',
    'time_lag',
]

In [None]:
labelencoder = LabelEncoder()
scaler       = StandardScaler()

# Categorical columns: learn the value -> integer mapping on the training
# frame only and reuse it for the test frame. Re-fitting on the test frame
# would give the same raw value a different code whenever the two frames do
# not contain exactly the same set of values. transform() raises ValueError
# if the test frame holds a value that never appears in training.
for col in categorical_features:
    labelencoder.fit(df_train[col])
    df_train[col] = labelencoder.transform(df_train[col])
    df_test[col]  = labelencoder.transform(df_test[col])

# Continuous columns: mean/std are estimated on the training frame and the
# same statistics are applied to the test frame. The scaler expects a 2-D
# input, hence the reshape; ravel() turns the result back into one column.
for col in continuous_features:
    train_values = df_train[col].to_numpy().reshape(-1, 1)
    test_values  = df_test[col].to_numpy().reshape(-1, 1)
    scaler.fit(train_values)
    df_train[col] = scaler.transform(train_values).ravel()
    df_test[col]  = scaler.transform(test_values).ravel()

In [None]:
# Preview the training frame after label-encoding and scaling.
df_train.head()

In [None]:
# Model inputs are every training column except the grouping key and the target.
columns = [col for col in df_train.columns if col not in ['breath_id', 'pressure']]

# breath_id is not a model input, so it is removed from the test frame.
# errors='ignore' lets the cell be executed again without raising KeyError
# once the column is already gone.
df_test.drop(columns='breath_id', inplace=True, errors='ignore')

X = df_train[columns]
y = df_train['pressure']

In [None]:
def objective(trial, data=X, target=y, groups=df_train['breath_id']):
    """Optuna objective: hold-out MAE of one LightGBM configuration.

    The hold-out set is built by sampling 20% of the distinct ``groups``
    values (breaths) with a fixed seed, so every row of a breath lands on the
    same side of the split. This mirrors the group-wise cross-validation used
    for the final models and avoids scoring on rows whose neighbours from the
    same breath were seen during training.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    data : pandas.DataFrame
        Feature matrix. Defaults to the notebook-level ``X``.
    target : pandas.Series
        Regression target aligned row-by-row with ``data``. Defaults to ``y``.
    groups : array-like
        Group label of every row of ``data``. Defaults to the training
        ``breath_id`` column.

    Returns
    -------
    float
        Mean absolute error on the held-out groups.
    """
    group_labels = np.asarray(groups)
    _, valid_groups = train_test_split(
        np.unique(group_labels), test_size=0.2, random_state=228
    )
    valid_mask = np.isin(group_labels, valid_groups)

    train_x, test_x = data[~valid_mask], data[valid_mask]
    train_y, test_y = target[~valid_mask], target[valid_mask]

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 15000),
        # NOTE(review): with max_depth capped at 3 a tree cannot hold more
        # than 8 leaves, so the num_leaves range below is probably never the
        # binding limit - confirm whether a wider depth range was intended.
        'max_depth': trial.suggest_int('max_depth', 2, 3),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 50, 500),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 200),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'boosting_type': 'gbdt',
        'random_state': 228,
        'metric': 'mae',
        }

    # NOTE(review): early_stopping_rounds/verbose as fit() keywords depend on
    # the installed LightGBM version - confirm against the runtime in use.
    model = LGBMRegressor(**params)
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)], early_stopping_rounds=300, verbose=False)

    preds = model.predict(test_x)
    mae = mean_absolute_error(test_y, preds)

    return mae

In [None]:
# Seed the sampler so the hyperparameter proposals are drawn from a fixed
# random stream instead of a fresh one on every kernel restart.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=228),
)
study.optimize(objective, n_trials=15)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Optimization history of the study: objective value of each trial.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Empirical distribution function of the objective values reached by the study.
optuna.visualization.plot_edf(study)

In [None]:
# Hyperparameter importances estimated from the trials of the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Slice plot: objective value against each individual hyperparameter.
optuna.visualization.plot_slice(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameter combinations and
# their objective values.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# study.best_params holds only the values drawn through trial.suggest_*.
# The settings that objective() fixes by hand are added back here so the
# final models use the same booster type, seed and MAE metric as the tuning.
params = {
    **study.best_params,
    'boosting_type': 'gbdt',
    'random_state': 228,
    'metric': 'mae',
}
print(params)

In [None]:
# 5-fold cross-validation grouped by breath_id: all rows of one breath stay
# in the same fold. Each fold's model also predicts the test frame; those
# predictions are collected in `preds` for later averaging.
folds = GroupKFold(n_splits=5)

preds = []   # one array of test predictions per fold
mae   = []   # one validation MAE per fold

# Select the test columns by name, in the training order, so prediction does
# not depend on the incidental column order (or extra columns) of df_test.
test_features = df_test[X.columns]

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=df_train['breath_id'])):

    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    model = LGBMRegressor(**params)
    model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],early_stopping_rounds=300,verbose=False)

    preds_valid = model.predict(X_valid)
    # Keep every fold's score; rebinding `mae` to the scalar would throw the
    # list away and make the summary below report only the last fold.
    fold_mae = mean_absolute_error(y_valid, preds_valid)
    mae.append(fold_mae)

    print(f"Fold: {fold+1} MAE: {fold_mae}")
    print('-'*25)

    test_preds = model.predict(test_features)
    preds.append(test_preds)

print(f"Overall Validation MAE: {np.mean(mae)}")

In [None]:
# Put each fold's test predictions in its own column, then average across
# the columns to get one blended prediction per test row.
fold_matrix = np.column_stack(preds)
predictions = np.mean(fold_matrix, axis=1)

# Fill the submission template and write it next to the notebook.
ss['pressure'] = predictions
ss.to_csv('./lgbm.csv', index=False)

ss.head()