# Import Libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Read datasets

In [None]:
# Load the competition CSVs in the order: train, test, sample submission.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ventilator-pressure-prediction/train.csv',
        '../input/ventilator-pressure-prediction/test.csv',
        '../input/ventilator-pressure-prediction/sample_submission.csv',
    )
)

# EDA

## 1. View the data information & quantitative measure

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
train.describe()

## 2. View the first, last & sample rows in the dataframe

In [None]:
# First five rows of the training data.
train.head()

In [None]:
# Last five rows of the training data.
train.tail()

In [None]:
# One randomly chosen row (sample() defaults to n=1; no seed, so it varies per run).
train.sample()

## 3. Check for missing values.
No missing values in this dataset

In [None]:
# Number of missing (NaN) entries in each column.
train.isnull().sum()

## 4. Try to Find Correlation

In [None]:
# Heatmap of the pairwise (Pearson, the corr() default) correlations between columns.
sns.heatmap(train.corr())

## 5. Split Categorical and numeric columns

In [None]:
# Columns with fewer than this many distinct values are treated as categorical.
CATEGORICAL_MAX_UNIQUE = 10

cat_cols = []
num_cols = []
for col in train.columns:
    # nunique() counts distinct non-null values -- the same figure as
    # value_counts().count() -- and is computed once per column, not twice.
    n_unique = train[col].nunique()
    if n_unique < CATEGORICAL_MAX_UNIQUE:
        cat_cols.append(col)
    else:
        num_cols.append(col)
    # Report the cardinality that drove the decision.
    print(f'{col}---{n_unique}')

## 6. Find Unique values

In [None]:
# Show the distinct values taken by each low-cardinality column.
for cat_col in cat_cols:
    distinct_values = train[cat_col].unique()
    print(f'{cat_col}---{distinct_values}')

## 7. Find Distribution of data

In [None]:
# Columns classified as numeric by the cardinality split.
num_cols

In [None]:
# One histogram per continuous training column, laid out side by side.
train_hist_cols = ('time_step', 'u_in', 'pressure')

fig, axes = plt.subplots(1, len(train_hist_cols), figsize=(15, 5))
fig.suptitle('TRAIN')

for ax, column in zip(axes, train_hist_cols):
    sns.histplot(ax=ax, data=train, x=column)
    ax.set_title(column)

In [None]:
# One histogram per continuous test column (the test set has no 'pressure').
test_hist_cols = ('time_step', 'u_in')

fig, axes = plt.subplots(1, len(test_hist_cols), figsize=(15, 5))
fig.suptitle('TEST')

for ax, column in zip(axes, test_hist_cols):
    sns.histplot(ax=ax, data=test, x=column)
    ax.set_title(column)

In [None]:
# Box plots of the continuous training columns, one axis per column.
fig, axes = plt.subplots(1, 3, figsize=(18, 10))

for ax, column in zip(axes, ('time_step', 'u_in', 'pressure')):
    sns.boxplot(ax=ax, data=train, x=column)

In [None]:
# Same training box plots, this time under a 'TRAIN' figure title.
fig, axes = plt.subplots(1, 3, figsize=(18, 10))
fig.suptitle('TRAIN')

for position, column in enumerate(['time_step', 'u_in', 'pressure']):
    sns.boxplot(ax=axes[position], data=train, x=column)

In [None]:
# Box plots of the continuous columns of the TEST set.
fig, axes = plt.subplots(1, 2, figsize=(18, 10))
fig.suptitle('TEST')

# Must read from `test`, not `train`, so the figure matches its 'TEST' title.
sns.boxplot(ax=axes[0], data=test, x='time_step')
sns.boxplot(ax=axes[1], data=test, x='u_in')

### Outliers

In [None]:
# Count, per column, the training values outside the 1.5 * IQR fences.
Q1, Q3 = (train.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
(train.lt(lower_fence) | train.gt(upper_fence)).sum()

In [None]:
# Count, per column, the test values outside the 1.5 * IQR fences.
# Q1 and Q3 are rebound here to the test-set quartiles.
Q1, Q3 = (test.quantile(q) for q in (0.25, 0.75))
testIQR = Q3 - Q1
lower_fence = Q1 - 1.5 * testIQR
upper_fence = Q3 + 1.5 * testIQR
(test.lt(lower_fence) | test.gt(upper_fence)).sum()

In [None]:
# Target is the 'pressure' column; every remaining column is used as a feature.
y = train['pressure']
X = train.drop(columns=['pressure'])

# Split train & test dataset

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
#regr = RandomForestRegressor(max_depth=2, random_state=0)

# Model hyperparameters

In [None]:
# Keyword arguments forwarded to XGBRegressor.
# NOTE(review): 'gpu_hist', 'gpu_id' and 'predictor' are deprecated from
# XGBoost 2.0 in favour of device='cuda' with tree_method='hist' -- confirm
# the installed version before changing them.
xgb_params = dict(
    # Boosting schedule
    n_estimators=5000,
    learning_rate=0.1,
    # Row / column subsampling per tree
    subsample=0.95,
    colsample_bytree=0.11,
    # Tree shape and booster type
    max_depth=2,
    booster='gbtree',
    # L2 / L1 regularisation on the weights
    reg_lambda=66.1,
    reg_alpha=15.9,
    random_state=42,
    # GPU execution settings
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
)

# Fit the model
***Note : Turn On GPU Accelerator***

In [None]:
# Build the regressor from xgb_params and train it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
# Alternative configuration kept for reference:
# XGBRegressor(n_estimators=5000, learning_rate=0.1, random_state=1,
#              tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
model = XGBRegressor(**xgb_params).fit(X_train, y_train)

# Evaluate

In [None]:
# Score on the held-out split; for scikit-learn-style regressors score() is R^2.
model.score(X_test, y_test)

# Predict

In [None]:
# Predict pressure for every row of the competition test set.
# NOTE(review): assumes `test` still has exactly the feature columns the
# model was fitted on (i.e. no 'pressure' column has been added) -- confirm.
predicted = model.predict(test)
predicted

In [None]:
# Wrap the predictions in a one-column DataFrame named 'pressure'.
predicted_pressure = pd.DataFrame({'pressure': predicted[:]})

In [None]:
# Work on a copy: plain assignment would alias `test`, so adding the
# 'pressure' column would also change the frame passed to model.predict()
# and break any re-run of the prediction step.
test_result = test.copy()
# Assign the predictions positionally as 1-D values (row order matches `test`).
test_result['pressure'] = predicted_pressure['pressure'].to_numpy()

In [None]:
# Keep only the two columns written to the submission: row id and predicted pressure.
submission_columns = ['id', 'pressure']
submit_result = test_result.loc[:, submission_columns]
submit_result

# Submit

In [None]:
# Write the submission to the working directory, omitting the DataFrame index.
submit_result.to_csv('submission.csv', index=False)

# **TADA** :D

Coming soon: more EDA and feature engineering (FE).