In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.tree import DecisionTreeRegressor

In [None]:
def df_overview(df):
    """Print and display a quick overview of a dataframe.

    Shows, in order: the first rows, a fixed-seed random sample of 10 rows,
    the ``df.info()`` report, per-column NaN counts, ``df.describe()`` and the
    pairwise correlation matrix.

    Args:
        df: pandas DataFrame to summarise. It needs at least 10 rows, because
            ``df.sample(10)`` samples without replacement.

    Returns:
        None. Everything goes to stdout / the notebook display.
    """
    print('Dataframe overview:\n')
    display(df.head())
    print('--------------------------------------------\nSample:\n')
    display(df.sample(10, random_state=555))
    print('--------------------------------------------\nInfo:\n')
    # info() writes its report itself and returns None; wrapping it in print()
    # only appended a stray "None" line after the report.
    df.info()
    print('--------------------------------------------\nNaN\'s:\n')
    print(df.isna().sum())
    print('--------------------------------------------\nDescribe:\n')
    display(df.describe())
    print('--------------------------------------------\nFeature correlation:\n')
    display(df.corr())

In [None]:
def show_correlogram(df):
    """Draw an annotated heatmap of the pairwise correlations of ``df``.

    Args:
        df: pandas DataFrame whose column correlations are plotted.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Computed once and reused for the data and both tick-label sets;
    # corr() over a large frame is expensive.
    corr = df.corr()
    plt.figure(figsize=(6, 6), dpi=80)
    sns.heatmap(
        corr, xticklabels=corr.columns, yticklabels=corr.columns,
        cmap='RdYlGn', center=0, annot=True, cbar=False
    )
    plt.title('Correlogram between features', fontsize=16)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.show()

In [None]:
def plot_create(x, y):
    """Plot ``y`` against ``x`` as a solid line on the current matplotlib figure.

    The legend label is read from ``y.name``, so ``y`` must expose a ``name``
    attribute (the callers in this notebook pass pandas Series).
    """
    plt.plot(x, y, '-', label=y.name)
    
def process_visualisation(df, breath_id):
    """Plot pressure, u_in and u_out over time_step for a single breath.

    Args:
        df: DataFrame with 'breath_id', 'time_step', 'pressure', 'u_in' and
            'u_out' columns.
        breath_id: value of 'breath_id' selecting the rows to plot.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Filter once and reuse the slice: the boolean mask scans the whole frame,
    # and it was previously rebuilt for every plotted series.
    breath = df[df['breath_id'] == breath_id]

    plt.figure(figsize=(14, 6))
    plt.title('Breath Id - {}'.format(breath_id))
    for column in ('pressure', 'u_in', 'u_out'):
        plot_create(breath['time_step'], breath[column])
    plt.grid()
    plt.legend()
    plt.ylabel('Value')
    plt.show()

In [None]:
def process_visualisation_with_preds(df, df_preds, breath_id):
    """Plot one breath's pressure, u_in and u_out together with predictions.

    Args:
        df: DataFrame with 'breath_id', 'time_step', 'pressure', 'u_in' and
            'u_out' columns.
        df_preds: predictions for the selected breath. They are plotted
            against that breath's time steps, so they need one value per row
            of the breath and a ``name`` attribute for the legend.
        breath_id: value of 'breath_id' selecting the rows to plot.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Filter once and reuse the slice instead of re-scanning the whole frame
    # for every plotted series.
    breath = df[df['breath_id'] == breath_id]
    time_steps = breath['time_step']

    plt.figure(figsize=(14, 6))
    plt.title('Breath Id - {}'.format(breath_id))
    for column in ('pressure', 'u_in', 'u_out'):
        plot_create(time_steps, breath[column])
    plot_create(time_steps, df_preds)
    plt.grid()
    plt.legend()
    plt.ylabel('Value')
    plt.show()

In [None]:
def add_features(df):
    """Build per-breath u_in features and return the model-ready frame.

    Adds, per 'breath_id': the cumulative sum of u_in, lags 1-3, rolling means
    (windows 3 and 5) of the previous values, and the first / last / min / max /
    median of u_in. NaNs are then replaced by 0 and the 'breath_id', 'u_in'
    and 'u_out' columns are dropped.

    The input frame is left untouched: all work happens on a copy.

    Args:
        df: DataFrame with at least 'breath_id', 'u_in' and 'u_out' columns.

    Returns:
        A new DataFrame with the remaining input columns followed by the
        engineered features, in a fixed column order.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # One grouped view, reused for every feature.
    u_in_by_breath = df['u_in'].groupby(df['breath_id'])
    u_in_prev = u_in_by_breath.shift(1)

    df['u_in_cumsum'] = u_in_by_breath.cumsum()
    df['u_in_lag_1'] = u_in_prev
    df['u_in_lag_2'] = u_in_by_breath.shift(2)
    df['u_in_lag_3'] = u_in_by_breath.shift(3)
    # The rolling window runs over row order, not per group. The per-group
    # shift leaves NaN on the first row of each breath, and a window holding a
    # NaN yields NaN, so windows spanning two breaths never produce a value.
    # This assumes the rows of a breath are contiguous.
    df['u_in_rolling_mean_3'] = u_in_prev.rolling(3).mean()
    df['u_in_rolling_mean_5'] = u_in_prev.rolling(5).mean()

    # Per-breath summary statistics broadcast back to every row of the breath.
    summary_stats = (
        ('begin', 'first'),
        ('end', 'last'),
        ('min', 'min'),
        ('max', 'max'),
        ('median', 'median'),
    )
    for suffix, stat in summary_stats:
        df['u_in_' + suffix] = u_in_by_breath.transform(stat)

    df = df.fillna(0)
    df = df.drop(['breath_id', 'u_in', 'u_out'], axis=1)
    return df

In [None]:
def train_and_score(model):
    """Fit ``model`` on the training split and return its validation MAE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid`` variables, so they must exist before this is called.
    """
    model.fit(X_train, y_train)
    valid_predictions = model.predict(X_valid)
    validation_mae = mean_absolute_error(y_valid, valid_predictions)
    return validation_mae

# Open data

In [None]:
# Load the competition train / test tables and the sample submission from the
# read-only Kaggle input directory.
df_train = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/train.csv')
df_test = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/test.csv')
df_sample_submission = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/sample_submission.csv')

In [None]:
# The row id is dropped from the training frame before exploration and modelling.
df_train = df_train.drop(columns=['id'])

In [None]:
df_overview(df_train)

In [None]:
df_overview(df_test)

# EDA

In [None]:
show_correlogram(df_train)

In [None]:
# One 30-bin histogram per training column.
for column_name in df_train.columns:
    df_train[column_name].plot.hist(bins=30, title=column_name)
    plt.show()

In [None]:
# Box plots of the continuous columns.
for column_name in ('time_step', 'u_in', 'pressure'):
    df_train[column_name].plot.box()
    plt.show()

In [None]:
# Zoom into the upper tails of pressure and u_in.
pressure_tail = df_train.loc[df_train['pressure'] > 55, 'pressure']
pressure_tail.plot(kind='hist', bins=30, title='Pressure > 55')
plt.show()

u_in_tail = df_train.loc[df_train['u_in'] > 70, 'u_in']
u_in_tail.plot(kind='hist', bins=30, title='U in > 70')
plt.show()

In [None]:
# Occurrence counts of the pressure values above 64.5, largest value first.
df_train[df_train['pressure'] > 64.5]['pressure'].value_counts().sort_index(ascending=False)

In [None]:
# Occurrence counts of the training u_in values above 99.98, largest value first.
df_train[df_train['u_in'] > 99.98]['u_in'].value_counts().sort_index(ascending=False)

In [None]:
# Same check on the test set: counts of u_in values above 99.98, largest value first.
df_test[df_test['u_in'] > 99.98]['u_in'].value_counts().sort_index(ascending=False)

In [None]:
# Distribution of each variable on the rows where the other one is at its extreme.
pressure_at_high_u_in = df_train.loc[df_train['u_in'] > 99.98, 'pressure']
pressure_at_high_u_in.plot(kind='hist', bins=40, title='Pressure when U in is high')
plt.show()

u_in_at_high_pressure = df_train.loc[df_train['pressure'] > 64.5, 'u_in']
u_in_at_high_pressure.plot(kind='hist', bins=40, title='U in when Pressure is high')
plt.show()

In [None]:
# Collapse every breath to one row holding the median of each column;
# 'breath_id' is kept as a regular column (as_index=False).
df_breath = df_train.groupby('breath_id', as_index=False).median()
df_breath

In [None]:
# Histograms of the per-breath medians.
for column_name in ('time_step', 'u_in', 'pressure'):
    df_breath[column_name].plot.hist(bins=30, title=column_name)
    plt.show()

In [None]:
df_breath[df_breath['pressure'] < 0].head(10)

In [None]:
df_breath[df_breath['pressure'] < 0]['pressure'].plot(kind='hist', bins=30, title='Negative pressure')
plt.show()

In [None]:
# Per-breath median u_in for breaths whose median pressure is negative ...
df_breath[df_breath['pressure'] < 0]['u_in'].plot(kind='hist', bins=30, title='U in when Pressure < 0')
plt.show()
# ... and for breaths whose median pressure equals the median of the per-breath medians.
# NOTE(review): this is an exact float equality test, and the sampling cell further down
# compares against df_train['pressure'].median() instead — confirm which reference is intended.
df_breath[df_breath['pressure'] == df_breath['pressure'].median()]['u_in'].plot(
    kind='hist', bins=30, title='U in when Pressure is normal')
plt.show()

Let's look at individual breath processes. We take breaths with normal, low and high pressure:

In [None]:
# Draw three example breaths (fixed seed) from each pressure regime of the per-breath medians.
# "Normal" means a median pressure exactly equal to the row-level median of df_train.
# sample(3) raises if fewer than three breaths match a filter.
print('Some data where pressure is normal:')
display(df_breath[df_breath['pressure'] == df_train['pressure'].median()].sample(3, random_state=1))
print('\nSome data where pressure is below 0:')
display(df_breath[df_breath['pressure'] < 0].sample(3, random_state=1))
print('\nSome data where pressure is high:')
display(df_breath[df_breath['pressure'] > 12].sample(3, random_state=1))

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim (same filters, same seed),
# so it displays the same rows again; it can be deleted without changing any result.
print('Some data where pressure is normal:')
display(df_breath[df_breath['pressure'] == df_train['pressure'].median()].sample(3, random_state=1))
print('\nSome data where pressure is below 0:')
display(df_breath[df_breath['pressure'] < 0].sample(3, random_state=1))
print('\nSome data where pressure is high:')
display(df_breath[df_breath['pressure'] > 12].sample(3, random_state=1))

In [None]:
print('Process visualisation where pressure is normal:')
# Hand-picked example breaths for the "normal" regime.
for example_breath_id in (48945, 28141, 109737):
    process_visualisation(df_train, example_breath_id)

In [None]:
print('Process visualisation where pressure is below 0:')
# Hand-picked example breaths for the "below 0" regime.
for example_breath_id in (98041, 118131, 11216):
    process_visualisation(df_train, example_breath_id)

In [None]:
print('Process visualisation where pressure is high:')
# Hand-picked example breaths for the "high" regime.
for example_breath_id in (104581, 14416, 69384):
    process_visualisation(df_train, example_breath_id)

# Train and test

In [None]:
# Target first, then the engineered feature matrix built from everything else.
# drop() returns a new frame, so df_train itself is not modified.
y = df_train['pressure']
X = add_features(df_train.drop(columns=['pressure']))

In [None]:
# Split by breath rather than by row. The features are computed per breath
# (lags, cumulative sum, per-breath min/max/median ...), so a random row split
# puts rows of the same breath on both sides and makes the validation MAE
# optimistic. X keeps df_train's row order, so df_train['breath_id'] lines up
# with X row by row.
breath_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=555)
train_idx, valid_idx = next(breath_splitter.split(X, y, groups=df_train['breath_id']))
X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

In [None]:
# Candidate models; DummyRegressor is the baseline the others are compared against.
linear_model = LinearRegression()
tree_model = DecisionTreeRegressor(max_depth=15, random_state=555)
# CatBoost is trained on GPU with an MAE objective.
# NOTE(review): devices='0-3' requests GPUs 0 through 3 — confirm the runtime exposes that many.
cb_model = CatBoostRegressor(
    depth=15, loss_function='MAE', task_type='GPU', devices='0-3',
    gpu_ram_part=0.3, random_seed=555, verbose=0
)
dummy = DummyRegressor()

In [None]:
# Fit every model on the training split and tabulate the validation MAE.
# The dict literal is evaluated top to bottom, so the models are trained in
# the listed order.
model_scores = pd.Series(
    {
        'Linear': train_and_score(linear_model),
        'Tree': train_and_score(tree_model),
        'CatBoost': train_and_score(cb_model),
        'Dummy': train_and_score(dummy),
    },
    name='Result MAE',
)
display(model_scores.to_frame())

In [None]:
# Horizontal bar chart of the CatBoost feature importances, smallest first.
cb_importances = pd.DataFrame(
    cb_model.feature_importances_, index=cb_model.feature_names_, columns=['importances']
)
cb_importances_sorted = cb_importances.sort_values(by='importances')
cb_importances_sorted.plot(kind='barh', figsize=(8, 6), title='CatBoost feature importances')
plt.show()

In [None]:
# Refit the three real models on the full training data (train + validation rows)
# before predicting; from here on they are no longer the models that were scored above.
linear_model.fit(X, y)
tree_model.fit(X, y)
cb_model.fit(X, y)

In [None]:
# Let's check prediction quality with our process visualisation.
# The breath id lives in one variable so the feature frame and the plots
# cannot drift apart.
vis_breath_id = 28141
# drop=True avoids creating an 'index' column that would have to be removed again.
X_df_vis = df_train[df_train['breath_id'] == vis_breath_id].reset_index(drop=True)
X_df_vis = add_features(X_df_vis).drop('pressure', axis=1)

for model_label, fitted_model in (
    ('Linear', linear_model), ('Tree', tree_model), ('CatBoost', cb_model)
):
    print('Pressure predictions by {} Model where pressure is normal:'.format(model_label))
    process_visualisation_with_preds(
        df_train,
        pd.Series(fitted_model.predict(X_df_vis), name='predictions'),
        vis_breath_id,
    )

In [None]:
# Same check for a breath whose median pressure is below 0.
# The breath id lives in one variable so the feature frame and the plots
# cannot drift apart.
vis_breath_id = 98041
# drop=True avoids creating an 'index' column that would have to be removed again.
X_df_vis = df_train[df_train['breath_id'] == vis_breath_id].reset_index(drop=True)
X_df_vis = add_features(X_df_vis).drop('pressure', axis=1)

for model_label, fitted_model in (
    ('Linear', linear_model), ('Tree', tree_model), ('CatBoost', cb_model)
):
    print('Pressure predictions by {} Model where pressure is below 0:'.format(model_label))
    process_visualisation_with_preds(
        df_train,
        pd.Series(fitted_model.predict(X_df_vis), name='predictions'),
        vis_breath_id,
    )

In [None]:
# Same check for a breath with high median pressure.
# The breath id lives in one variable so the feature frame and the plots
# cannot drift apart.
vis_breath_id = 104581
# drop=True avoids creating an 'index' column that would have to be removed again.
X_df_vis = df_train[df_train['breath_id'] == vis_breath_id].reset_index(drop=True)
X_df_vis = add_features(X_df_vis).drop('pressure', axis=1)

for model_label, fitted_model in (
    ('Linear', linear_model), ('Tree', tree_model), ('CatBoost', cb_model)
):
    print('Pressure predictions by {} Model where pressure is high:'.format(model_label))
    process_visualisation_with_preds(
        df_train,
        pd.Series(fitted_model.predict(X_df_vis), name='predictions'),
        vis_breath_id,
    )

In [None]:
# Build the test feature matrix on a copy of df_test, then remove the row id
# so the columns match the training features.
df_test_featured = add_features(df_test.copy()).drop(columns=['id'])

In [None]:
preds = cb_model.predict(df_test_featured)

In [None]:
# Pair every test row id with its predicted pressure and write the submission file
# (no index column) to the working directory.
output = pd.DataFrame({'id': df_test['id'].values,
                       'pressure': preds})
display(output)
output.to_csv('submission.csv', index=False)

In [None]:
import os

# Start from None so both names always exist, even when no file matches.
__training_path = None
__test_path = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Match on the file name only: testing the full path would let a
        # directory whose name contains 'train' or 'test' capture every file in it.
        if 'train' in filename:
            __training_path = os.path.join(dirname, filename)
        elif 'test' in filename:
            __test_path = os.path.join(dirname, filename)

In [None]:
#loaded files
print(f'Training path:{__training_path}\nTest path:{__test_path}')

In [None]:
# Kaggle environment preparation
# Update the Kaggle environment
import sys
# Pin scikit-learn to 0.24.2 in the interpreter running this kernel so that the whole code can run.
# NOTE(review): if sklearn was already imported in this session, the new version presumably
# only takes effect after a kernel restart — confirm.
!{sys.executable} -m pip install --upgrade scikit-learn=="0.24.2"

In [None]:
#record this information if you need to run the Kernel internally
import sklearn; sklearn.show_versions()

In [None]:
def __load__data(__training_path, __test_path, concat=False):
    """Load the training and test CSV files.

    Args:
        __training_path: path of the training CSV file.
        __test_path: path of the test CSV file.
        concat: accepted for backward compatibility only. It has no effect:
            the two datasets are always returned separately, never concatenated.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` of pandas DataFrames.
    """
    train_dataset = pd.read_csv(__training_path, delimiter=',')
    test_dataset = pd.read_csv(__test_path, delimiter=',')
    return train_dataset, test_dataset


# `concat` is not passed: the loader ignores it, so passing True was misleading.
__train_dataset, __test_dataset = __load__data(__training_path, __test_path)
__train_dataset.head()

In [None]:
# STORE SUBMISSION RELEVANT COLUMNS
__test_dataset_submission_columns = __test_dataset['id']

In [None]:
# DISCARD IRRELEVANT COLUMNS
# Rebind instead of dropping in place, and tolerate an already-removed 'id',
# so re-running this cell does not raise KeyError.
__train_dataset = __train_dataset.drop(columns=['id'], errors='ignore')
__test_dataset = __test_dataset.drop(columns=['id'], errors='ignore')

Target Column
The target column is the value we need to predict, so it must be separated from the features before training. If this field is left among the features, the model will look very accurate on the training data but perform poorly on the test data (because the value in the test dataset is null). The target column here is: pressure

In [None]:
# DETACH TARGET
# Target series first, then the training features without it; the test frame
# is used as the test feature set as-is.
__target_train = __train_dataset['pressure']
__feature_train = __train_dataset.drop(columns=['pressure'])
__feature_test = __test_dataset

Training Model and Prediction

First, we will train a model based on preprocessed values of training data set. Second, let's predict test values based on the trained model.


LightGBM Regressor

We will use the LightGBM Regressor, a gradient boosting model provided by the lightgbm package. More detail about the LightGBM Regressor can be found in the LightGBM documentation.

In [None]:
# MODEL
import numpy as np
from lightgbm import LGBMRegressor
# LightGBM regressor with default hyper-parameters, fitted on the training features
# and then used to predict the test features.
__model = LGBMRegressor()
__model.fit(__feature_train, __target_train) 
__y_pred = __model.predict(__feature_test)

Submission File

We have to maintain the target columns in "submission.csv" which will be submitted as our prediction results.

In [None]:
# SUBMISSION
# Start from the stored id column and attach the predictions next to it.
submission = __test_dataset_submission_columns.to_frame(name='id')
submission['pressure'] = __y_pred
submission.head()

In [None]:
# save submission file
submission.to_csv("kaggle_submission.csv", index=False)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.callbacks import TensorBoard

Intro

Saving your pre-trained model will save you a lot of time. I made this notebook to show how it works:

In [None]:
train = pd.read_csv("../input/ventilator-pressure-prediction/train.csv")

In [None]:
# Keep a fixed-seed 1% random sample (without replacement) to keep the demo fast.
# `train` is overwritten, so re-running this cell alone samples 1% of the already
# reduced frame; reload the CSV first to get back to the full data.
train = train.sample(frac =.01,random_state = 90, replace = False)
train.shape

In [None]:
train.head()

In [None]:
train.isnull().sum()

In [None]:
train.info()

In [None]:
train.corr()

In [None]:
sns.scatterplot(x=train["pressure"],y=train['u_in'])

In [None]:
sns.histplot( x=train["u_in"],binwidth=5 ,kde=True)

In [None]:
train = train.drop("breath_id",axis=1)

In [None]:
# Features are every remaining column except the target.
# NOTE(review): 'id' has not been dropped from `train`, so it is fed to the network
# as a feature — confirm that this is intended.
X = train.drop("pressure",axis=1)
y = train["pressure"]

Defining, compiling, saving, loading and predicting with the model

In [None]:
# Three 512-unit hidden layers followed by a single linear output for the
# pressure regression. The input width is taken from X instead of being
# hard-coded, so the model stays valid if the feature columns change.
model = keras.Sequential([
    layers.Dense(512, activation='relu', input_shape=[X.shape[1]]),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='selu'),
    layers.Dense(1),
])

In [None]:
model.compile(
    optimizer='adam',
    loss='mae',
)

In [None]:
# Hold out the last 20% of the rows for validation instead of validating on
# the training data itself, which made val_loss a copy of the training loss.
# The rows were randomly sampled earlier, so the tail of the frame is a random
# hold-out. NumPy arrays are passed because validation_split is documented for
# NumPy arrays and tensors.
history = model.fit(
    X.to_numpy(), y.to_numpy(),
    validation_split=0.2,
    batch_size=256,
    epochs=10,
)

In [None]:
model.save("saved_model")

In [None]:
loaded_model = keras.models.load_model("./saved_model")

In [None]:
# Sanity-check the reloaded model: MAE on the first 1000 rows of X / y.
eval_rows = 1000
preds = loaded_model.predict(X[:eval_rows])
mae = mean_absolute_error(y[:eval_rows], preds)
print(mae)