![](https://1.bp.blogspot.com/-zeFUWzrz2FA/YUup-VsSKWI/AAAAAAAAIME/4s_s--D68xEJmBI8VdAaOJXNO3cd8qkqwCLcBGAsYHQ/s1900/header.png)

In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from tqdm import tqdm
import xgboost as xgb

In [None]:
#===========================================================================
# Load the competition train and test tables into DataFrames.
# Original kernel: https://www.kaggle.com/carlmcbrideellis/very-simple-xgboost-regression
#===========================================================================
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ventilator-pressure-prediction/train.csv',
        '../input/ventilator-pressure-prediction/test.csv',
    )
)

# Remember the test row ids now; the 'id' column is dropped from test_data
# before modelling and the ids are needed again for the submission file.
test_ids = test_data.loc[:, 'id'].tolist()

In [None]:
# Preview the first five training rows (five is also head()'s default).
train_data.head(n=5)

# Missing Values Count

In [None]:
# Count the NaN cells in every column of the training table.
missing_values_count = train_data.isnull().sum()
print (missing_values_count)
# rows * columns. np.prod is used because the np.product alias was
# deprecated and then removed in NumPy 2.0.
total_cells = np.prod(train_data.shape)
total_missing = missing_values_count.sum()
print ("% of missing data = ",(total_missing/total_cells) * 100)

In [None]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with per-breath engineered features.

    The input must contain the columns ``breath_id``, ``time_step``,
    ``u_in``, ``u_out``, ``R`` and ``C``. The caller's frame is left
    untouched; all work happens on a copy.

    Added columns, in this order:

    * ``area``: running sum of ``time_step * u_in`` within each breath.
    * ``u_in_cumsum``: running sum of ``u_in`` within each breath.
    * ``{u_in,u_out}_lag{k}`` and ``{u_in,u_out}_lag_back{k}`` for
      ``k = 1..4``: the value ``k`` rows earlier / later in the same breath.
    * ``breath_id__u_in__max`` and ``breath_id__u_out__max``: per-breath
      maxima.
    * ``{u_in,u_out}_diff{k}`` for ``k = 1..4``: current value minus the
      matching lag column.
    * ``breath_id__u_in__diffmax`` and ``breath_id__u_in__diffmean``:
      per-breath max / mean of ``u_in`` minus the current ``u_in``.
    * ``cross`` (``u_in * u_out``) and ``cross2`` (``time_step * u_out``).

    Every NaN in the frame (including the lag values that fall outside a
    breath) is replaced by 0 before the diff features are derived. Finally
    ``R``, ``C`` and their combination ``R__C`` are cast to strings and
    one-hot encoded with ``pd.get_dummies``; the dummy columns replace
    ``R``, ``C`` and ``R__C`` and are placed after all other columns.

    Parameters
    ----------
    df : pd.DataFrame
        Table with one row per time step of a breath.

    Returns
    -------
    pd.DataFrame
        New frame with the original non-encoded columns, the engineered
        columns and the one-hot columns.
    """
    # Copy first so no new column leaks onto the caller's DataFrame.
    df = df.copy()
    signals = ('u_in', 'u_out')

    breath = df['breath_id']
    df['area'] = (df['time_step'] * df['u_in']).groupby(breath).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(breath).cumsum()

    # Build each group-by once and reuse it for all eight shifts per signal.
    by_breath = {name: df[name].groupby(breath) for name in signals}
    for k in range(1, 5):
        for name in signals:
            df[f'{name}_lag{k}'] = by_breath[name].shift(k)
        for name in signals:
            df[f'{name}_lag_back{k}'] = by_breath[name].shift(-k)

    # Shifts leave NaN at the edges of every breath; treat them as 0.
    df = df.fillna(0)

    # Regroup on the filled frame so the statistics see the filled values.
    breath = df['breath_id']
    u_in_by_breath = df['u_in'].groupby(breath)
    df['breath_id__u_in__max'] = u_in_by_breath.transform('max')
    df['breath_id__u_out__max'] = df['u_out'].groupby(breath).transform('max')

    for k in (1, 2):
        for name in signals:
            df[f'{name}_diff{k}'] = df[name] - df[f'{name}_lag{k}']

    # The per-breath max is already a column; no need to recompute it.
    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']

    for k in (3, 4):
        for name in signals:
            df[f'{name}_diff{k}'] = df[name] - df[f'{name}_lag{k}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Cast R and C to strings so get_dummies treats them as categories.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return pd.get_dummies(df)

# Engineer the features for each table, then discard the identifier columns
# so they are not used as model inputs.
train_data = add_features(train_data).drop(columns=['id', 'breath_id'])
test_data = add_features(test_data).drop(columns=['id', 'breath_id'])

In [None]:
#===========================================================================
# select the model inputs: every column except the target 'pressure'
#===========================================================================
features = [column for column in train_data.columns if column != 'pressure']
# features = ['R', 'C', 'time_step', 'u_in', 'u_out']

#===========================================================================
# design matrix / target for training, and the matching test matrix
#===========================================================================
X_train = train_data[features]
y_train = train_data["pressure"]
final_X_test = test_data[features]

#===========================================================================
# XGBoost regression: 
# Parameters: 
# n_estimators  "Number of gradient boosted trees. Equivalent to number 
#                of boosting rounds."
# learning_rate "Boosting learning rate (xgb’s “eta”)"
# max_depth     "Maximum depth of a tree. Increasing this value will make 
#                the model more complex and more likely to overfit." 
#===========================================================================
# regressor=xgb.XGBRegressor(n_estimators  = 500,
#                            learning_rate = 0.1,
#                            max_depth     = 5)
# regressor.fit(X_train, y_train)

#===========================================================================
# To use early_stopping_rounds: 
# "Validation metric needs to improve at least once in every 
# early_stopping_rounds round(s) to continue training."
#===========================================================================
# Hold out 10% of the rows as the early-stopping validation set.
# X_train / y_train are rebound to the remaining 90%.
# NOTE(review): the split is row-wise, so rows of one breath can land on
# both sides; the validation score may be optimistic -- confirm.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=0)


# `metric_period` is not an XGBoost parameter and was only reported as
# unused, so it is omitted. `verbosity=0` is the supported replacement for
# the deprecated `silent=1`.
# NOTE(review): recent XGBoost releases prefer device='cuda' with
# tree_method='hist' over 'gpu_hist' -- confirm against the installed version.
regressor = xgb.XGBRegressor(
                 tree_method='gpu_hist',
                 colsample_bytree=0.9,
                 alpha=0.01563,
                 #gamma=0.0,
                 learning_rate=0.5,
                 max_depth=13,
                 min_child_weight=257,
                 n_estimators=1500,
                 #reg_alpha=0.9,
                 reg_lambda=0.003,
                 subsample=0.9,
                 random_state=2020,
                 verbosity=0)

# Stop adding trees once the hold-out metric has not improved for 6 rounds.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds in the
# constructor rather than in fit() -- confirm against the installed version.
regressor.fit(X_train, y_train, early_stopping_rounds=6, eval_set=[(X_test, y_test)], verbose=0)

# Prediction

In [None]:
#===========================================================================
# use the fitted XGBoost model to predict the pressure for the test data
#===========================================================================
predictions = regressor.predict(final_X_test)

In [None]:
#===========================================================================
# Assemble the submission table (saved test ids + predicted pressure) and
# write it to CSV without the DataFrame index.
#===========================================================================
submission_columns = {"id": test_ids, "pressure": predictions}
output = pd.DataFrame(data=submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
# Bare expression as the last statement of the cell: shows the submission frame.
output