In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import optuna

In [None]:
# Read the three competition CSV files from the shared input directory.
DATA_DIR = '../input/ventilator-pressure-prediction'
train, test, sample_submission = (
    pd.read_csv(f'{DATA_DIR}/{table}.csv')
    for table in ('train', 'test', 'sample_submission')
)

In [None]:
# Report the number of rows and columns of each loaded table.
for label, frame in (('train', train), ('test', test), ('sample_submission', sample_submission)):
    n_rows, n_cols = frame.shape
    print(f'{label} set have {n_rows} rows and {n_cols} columns.')

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Remove the `id` column from both tables.
train = train.drop(columns='id')
test = test.drop(columns='id')

In [None]:
print('train: ')
# Transposed summary statistics: bar on the mean, colour gradients on std and median.
(
    train.describe().T.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='PuBu')
    .background_gradient(subset=['50%'], cmap='PuBu')
)

In [None]:
print('test: ')
# Transposed summary statistics: bar on the mean, colour gradients on std and median.
(
    test.describe().T.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='PuBu')
    .background_gradient(subset=['50%'], cmap='PuBu')
)

In [None]:
# Number of distinct values in each training column.
train.nunique()

In [None]:
# Storage dtype of each training column.
train.dtypes

#### Data visualization

In [None]:
# Count the occurrences of each distinct R value and draw them as outlined bars.
R_values = train['R'].value_counts()
fig, ax = plt.subplots(figsize=(17, 5))
sns.barplot(
    x=R_values.index,
    y=R_values.values,
    linewidth=1.5,
    facecolor='white',
    errcolor=".2",
    edgecolor=".2",
    ax=ax,
)
ax.set_title("R feature unique values", fontdict={'fontsize': 20})
plt.show()

In [None]:
# Count the occurrences of each distinct C value and draw them as outlined bars.
C_values = train['C'].value_counts()
fig, ax = plt.subplots(figsize=(17, 5))
sns.barplot(
    x=C_values.index,
    y=C_values.values,
    linewidth=1.5,
    facecolor='white',
    errcolor=".2",
    edgecolor=".2",
    ax=ax,
)
ax.set_title("C feature unique values", fontdict={'fontsize': 20})
plt.show()

In [None]:
# Count the occurrences of each distinct u_out value and draw them as outlined bars.
u_out_values = train['u_out'].value_counts()
fig, ax = plt.subplots(figsize=(17, 5))
sns.barplot(
    x=u_out_values.index,
    y=u_out_values.values,
    linewidth=1.5,
    facecolor='white',
    errcolor=".2",
    edgecolor=".2",
    ax=ax,
)
ax.set_title("u_out feature unique values", fontdict={'fontsize': 20})
plt.show()

In [None]:
# Kernel density plots for the training columns at positions 3 and 4.
fig = plt.figure(figsize=(40, 60))
kde_columns = train.columns.tolist()[3:5]
for position, column in enumerate(kde_columns, start=1):
    plt.subplot(24, 5, position)
    sns.set_style("white")
    plt.title(column, size=10, fontname='monospace')
    a = sns.kdeplot(
        train[column],
        shade=True,
        alpha=0.9,
        linewidth=1.5,
        facecolor='white',
        edgecolor=".2",
    )
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname='monospace')
    plt.yticks([])
    # Keep only the bottom axis line, slightly thickened.
    for side in ('right', 'left', 'top'):
        a.spines[side].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad=3)
plt.show()

#### Feature engineering

In [None]:
# Reference : https://www.kaggle.com/patrick0302/add-last-u-in-as-new-feat
def add_u_in_features(df):
    """Return `df` with the `last_value_u_in` and `u_in_lag` columns added.

    last_value_u_in : the `u_in` value on the row with the largest
        `time_step` of each breath, broadcast to every row of that breath.
    u_in_lag : the `u_in` value of the previous row *within the same breath*;
        0 on the first row of each breath.

    Applying the same function to train and test keeps their feature
    definitions and column order identical.
    """
    # Row label of the latest time step in every breath.
    last_step_idx = df.groupby('breath_id')['time_step'].idxmax()
    last_value_u_in = (
        df.loc[last_step_idx, ['breath_id', 'u_in']]
        .rename(columns={'u_in': 'last_value_u_in'})
    )
    df = df.merge(last_value_u_in, on='breath_id')

    # Add u_in_lag
    # Shift inside each breath: a plain `shift(1)` over the whole column would
    # hand the first row of a breath the last `u_in` of the previous breath.
    # NOTE(review): assumes rows of a breath are already ordered by time_step.
    # Only the lag column is filled, so missing values in other columns are
    # not silently turned into zeros.
    df['u_in_lag'] = df.groupby('breath_id')['u_in'].shift(1).fillna(0)
    return df


train = add_u_in_features(train)
test = add_u_in_features(test)

In [None]:
# reference: https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974
# Running total of u_in within each breath, computed the same way for both tables.
train['u_in_cumsum'] = train.groupby('breath_id')['u_in'].cumsum()

test['u_in_cumsum'] = test.groupby('breath_id')['u_in'].cumsum()

In [None]:
# Preview the training table with the engineered columns added.
train.head()

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

le = LabelEncoder()
ss = StandardScaler()
cat = ['R', 'C', 'u_out']
num = ['time_step', 'u_in', 'u_in_lag', 'u_in_cumsum']

# Integer-encode each categorical column: the encoder is refitted on the
# training column and then applied unchanged to the matching test column.
for col in cat:
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

# Standardise each numeric column using statistics fitted on train only.
for col in num:
    train_values = train[col].to_numpy().reshape(-1, 1)
    test_values = test[col].to_numpy().reshape(-1, 1)
    train[col] = ss.fit_transform(train_values).ravel()
    test[col] = ss.transform(test_values).ravel()

#### Model building:

In [None]:
# Remove breath_id from both tables before modelling.
train = train.drop(columns='breath_id')
test = test.drop(columns='breath_id')

# Target is `pressure`; every other remaining column is a feature.
y = train['pressure']
X = train.drop(columns='pressure')

In [None]:
def fit_xgb(trial, x_train, y_train, x_test, y_test):
    """Fit one XGBoost regressor with hyperparameters sampled from `trial`.

    Parameters
    ----------
    trial : Optuna trial that supplies the sampled hyperparameters.
    x_train, y_train : features and target the model is fitted on.
    x_test, y_test : hold-out features and target, used as the early-stopping
        evaluation set and for the reported validation score.

    Returns
    -------
    (model, log) : the fitted regressor and a dict with the keys
        "train mae" and "valid mae", each the mean absolute error of the
        predictions after clipping them to a minimum of 0.1.
    """
    # Hyperparameters drawn from the trial (sampling order is kept as-is).
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),  # Extremely prone to overfitting!
        'learning_rate': trial.suggest_uniform('learning_rate', 0.02, 1),
        'n_estimators': trial.suggest_int('n_estimators', 400, 20000, 100),  # Extremely prone to overfitting!
        'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 0.99, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 0.9, 0.1),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-4, 1e4),  # I've had trouble with LB score until tuning this.
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 1e4),  # L2 regularization
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 1e4),  # L1 regularization
    }
    # Settings that are the same for every trial.
    fixed = {
        'booster': "gbtree",
        'eval_metric': 'mae',
        'tree_method': 'gpu_hist',
    }

    model = XGBRegressor(**sampled, **fixed)
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_test, y_test)],
        early_stopping_rounds=150,
        verbose=False,
    )

    def clipped_mae(features, target):
        # MAE of the model's predictions after flooring them at 0.1.
        return mean_absolute_error(target, np.clip(model.predict(features), 0.1, None))

    log = {
        "train mae": clipped_mae(x_train, y_train),
        "valid mae": clipped_mae(x_test, y_test),
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective: validation MAE of one XGBoost fit on a 75/25 split.

    Parameters
    ----------
    trial : Optuna trial forwarded to `fit_xgb` for hyperparameter sampling.

    Returns
    -------
    The "valid mae" value reported by `fit_xgb` for the hold-out quarter of
    the module-level `X` / `y`.
    """
    # A fixed seed gives every trial the same hold-out rows, so differences in
    # the returned score are not caused by a different random split. The seed
    # matches the one used for the KFold in the final training loop.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=2021
    )
    _, log = fit_xgb(trial, x_train, y_train, x_test, y_test)
    return log['valid mae']

In [None]:
# Fixed hyperparameters passed to every XGBRegressor in the final training loop.
xgb_params = dict(
    max_depth=10,
    learning_rate=0.8244792936012061,
    n_estimators=1500,
    subsample=0.6000000000000001,
    colsample_bytree=0.8,
    min_child_weight=0.5365522431887669,
    reg_lambda=0.1988726192189123,
    reg_alpha=0.6360428371059048,
    tree_method='gpu_hist',
    booster="gbtree",
    eval_metric='mae',
)

In [None]:
folds = KFold(n_splits=5, random_state=2021, shuffle=True)

# Accumulates the average of the per-fold predictions on the test table.
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    print(f"Fold: {fold}")
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_test, y_test = X.iloc[val_idx], y.iloc[val_idx]

    # The held-out fold doubles as the early-stopping evaluation set.
    model = XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=400,
        verbose=False,
    )

    mae = mean_absolute_error(y_test, model.predict(X_test))
    print(f" mae: {mae}")
    print("-" * 50)

    # Each fold contributes an equal share to the final test prediction.
    predictions += model.predict(test) / folds.n_splits

In [None]:
# Put the fold-averaged predictions into the submission template and save it.
sample_submission['pressure'] = predictions
sample_submission.to_csv('xgb.csv', index=False)