# Table of Contents
* [Import and first Glance](#1)
* [Numerical Features](#2)
* [Categorical Features](#3)
* [Target vs Features](#4)
* [Pick an Example](#5)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time  # used to time the CSV loading

# plots
import matplotlib.pyplot as plt
import seaborn as sns
# default seaborn axes style for the notebook: plain 'white'
sns.set_style('white')

<a id='1'></a>
# Import and first Glance

In [None]:
# import data: read the train, test and sample-submission CSVs and report how
# long the load took.
# perf_counter is the clock intended for measuring intervals; unlike time.time
# it is monotonic, so system clock adjustments cannot distort the result.
t1 = time.perf_counter()
df_train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
df_test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
df_sub = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')
t2 = time.perf_counter()
print('Elapsed time [s]:', np.round(t2-t1,4))

In [None]:
# preview train data (a bare expression at the end of a cell is rendered as output)
df_train

In [None]:
# per-column summary of the training frame; show_counts=True forces the
# non-null counts to be printed
df_train.info(verbose=True, show_counts=True)

#### => no missing values...

In [None]:
# number of training rows recorded for each breath_id
df_train['breath_id'].value_counts()

In [None]:
# preview test data (a bare expression at the end of a cell is rendered as output)
df_test

In [None]:
# per-column summary of the test frame; show_counts=True forces the
# non-null counts to be printed
df_test.info(verbose=True, show_counts=True)

In [None]:
# number of test rows recorded for each breath_id
df_test['breath_id'].value_counts()

<a id='2'></a>
# Numerical Features

In [None]:
# columns treated as numerical features in this section
features_num = ['time_step', 'u_in', 'pressure']

In [None]:
# summary statistics of the numerical columns
df_train.loc[:, features_num].describe()

In [None]:
# one 100-bin histogram per numerical feature of the training data
for col in features_num:
    df_train[col].plot.hist(bins=100)
    plt.title(f'{col} [training]')
    plt.grid()
    plt.show()

#### Some rows have a negative pressure:

In [None]:
# keep only the training rows whose pressure is below zero
is_negative = df_train['pressure'] < 0
pressure_neg = df_train[is_negative]
pressure_neg

In [None]:
# breaths that contain negative-pressure rows, with the number of such rows each
pressure_neg['breath_id'].value_counts()

In [None]:
# pairwise 2D histograms of the numerical features, coloured by the binary u_out flag
pair_cols = features_num + ['u_out']
sns.pairplot(data=df_train[pair_cols], hue='u_out', kind='hist')
plt.show()

In [None]:
# pressure vs time_step restricted to rows with u_out == 0;
# a very low alpha keeps the overlapping points readable
rows_u_out_0 = df_train.loc[df_train['u_out'] == 0]
sns.jointplot(x='time_step', y='pressure', data=rows_u_out_0,
              kind='scatter', joint_kws=dict(alpha=0.01))
plt.show()

In [None]:
# pressure vs time_step restricted to rows with u_out == 1;
# a very low alpha keeps the overlapping points readable
rows_u_out_1 = df_train.loc[df_train['u_out'] == 1]
sns.jointplot(x='time_step', y='pressure', data=rows_u_out_1,
              kind='scatter', joint_kws=dict(alpha=0.01))
plt.show()

### Evaluate the u_out=0 phase at its last available time step

In [None]:
# latest time_step reached by each breath while u_out == 0
rows_u_out_0 = df_train[df_train['u_out'] == 0]
max_time_0 = rows_u_out_0.groupby('breath_id')['time_step'].max()
# flatten into a frame with columns (time_step, breath_id) and a fresh 0..n-1 index
df_max_time_0 = max_time_0.reset_index()[['time_step', 'breath_id']]
df_max_time_0

In [None]:
# distribution of the last u_out == 0 time_step across breaths
ax = plt.gca()
ax.hist(df_max_time_0['time_step'], bins=100)
ax.set_xlabel('time_step')
ax.grid()
plt.show()

In [None]:
# look up the full training row at each breath's final u_out == 0 time_step
df_final_0 = pd.merge(df_max_time_0, df_train,
                      how='left', on=['breath_id', 'time_step'])
df_final_0

In [None]:
# pressure at the final u_out == 0 time step, one point per breath
plt.scatter(x=df_final_0['time_step'], y=df_final_0['pressure'], alpha=0.2)
plt.xlabel('max time_step')
plt.ylabel('pressure')
plt.title('Pressure at final time step for each breath | u_out=0')
plt.grid()
plt.show()

In [None]:
# 20-bin histogram of the pressure at the final u_out == 0 time step
final_pressure = df_final_0['pressure']
final_pressure.plot.hist(bins=20)
plt.title('Pressure distribution at last available time step | u_out=0')
plt.xlabel('pressure')
plt.grid()
plt.show()

In [None]:
# u_in at the final u_out == 0 time step, one point per breath
plt.scatter(x=df_final_0['time_step'], y=df_final_0['u_in'], alpha=0.2)
plt.xlabel('max time_step')
plt.ylabel('u_in')
plt.title('u_in at final time step for each breath | u_out=0')
plt.grid()
plt.show()

In [None]:
# 20-bin histogram of u_in at the final u_out == 0 time step
final_u_in = df_final_0['u_in']
final_u_in.plot.hist(bins=20)
plt.title('u_in distribution at last available time step | u_out=0')
plt.xlabel('u_in')
plt.grid()
plt.show()

<a id='3'></a>
# Categorical Features

In [None]:
# combine C and R into one label of the form "<C>/<R>" and count its levels
c_text = df_train['C'].map(str)
r_text = df_train['R'].map(str)
df_train['CR'] = c_text + '/' + r_text
df_train['CR'].value_counts()

In [None]:
# columns treated as categorical features ('CR' is the derived C/R combination)
features_cat = ['C', 'R', 'CR', 'u_out']

In [None]:
# bar chart of level frequencies for every categorical feature, levels in sorted order
for col in features_cat:
    level_counts = df_train[col].value_counts().sort_index()
    level_counts.plot.bar()
    plt.title(f'{col} [training]')
    plt.grid()
    plt.show()

<a id='4'></a>
# Target vs Features

In [None]:
# show grid lines also.
# set_style changes only the axes style; set_theme would additionally override
# the plotting context, palette and fonts for every later figure, which the
# subsequent sns.set_style('white') reset cannot undo.
sns.set_style('whitegrid')
# distribution of the target for each level of every categorical feature
for f in features_cat:
    sns.violinplot(data=df_train, x=f, y='pressure')
    plt.show()

In [None]:
# show log version of the target.
# log10 is undefined for pressure <= 0, so those values are masked to NaN up
# front: this avoids the RuntimeWarning and -inf entries that a direct
# np.log10 call on the raw column would produce.
positive_pressure = df_train.pressure.where(df_train.pressure > 0)
df_train['pressure_log'] = np.log10(positive_pressure)

for f in features_cat:
    sns.violinplot(data=df_train, x=f, y='pressure_log')
    plt.ylim(0,2)  # restrict the view to log10(pressure) between 0 and 2
    plt.show()

In [None]:
# reset style: switch seaborn back to the plain 'white' axes style
sns.set_style('white')

<a id='5'></a>
# Pick an Example

In [None]:
# select a single breath by its id and display all of its rows
my_ex = 1
is_example = df_train['breath_id'] == my_ex
df_ex = df_train[is_example]
df_ex

In [None]:
# u_in, u_out and pressure of the example breath over time, one series each
plt.figure(figsize=(10,4))
for col in ('u_in', 'u_out', 'pressure'):
    plt.scatter(df_ex['time_step'], df_ex[col], label=col)
plt.title(f'Example id={my_ex}')
plt.legend()
plt.grid()
plt.show()

### Look only at u_out=0:

In [None]:
# rows of the example breath with u_out == 0
df_ex_0 = df_ex.loc[df_ex['u_out'] == 0]

In [None]:
# pressure histogram of the example breath for u_out == 0
df_ex_0['pressure'].plot.hist()
plt.title('Histogram of pressure | u_out=0')
plt.grid()
plt.show()

In [None]:
# u_in histogram of the example breath for u_out == 0
df_ex_0['u_in'].plot.hist()
plt.title('Histogram of u_in | u_out=0')
plt.grid()
plt.show()

In [None]:
# pressure against u_in for the example breath, u_out == 0 rows only
plt.scatter(x=df_ex_0['u_in'], y=df_ex_0['pressure'])
plt.title('pressure vs u_in | u_out = 0')
plt.grid()
plt.show()

### Look only at u_out=1:

In [None]:
# rows of the example breath with u_out == 1
df_ex_1 = df_ex.loc[df_ex['u_out'] == 1]

In [None]:
# pressure histogram of the example breath for u_out == 1
df_ex_1['pressure'].plot.hist()
plt.title('Histogram of pressure | u_out=1')
plt.grid()
plt.show()

In [None]:
# u_in histogram of the example breath for u_out == 1
df_ex_1['u_in'].plot.hist()
plt.title('Histogram of u_in | u_out=1')
plt.grid()
plt.show()

In [None]:
# pressure against u_in for the example breath, u_out == 1 rows only
plt.scatter(x=df_ex_1['u_in'], y=df_ex_1['pressure'])
plt.title('pressure vs u_in | u_out = 1')
plt.grid()
plt.show()

### Work in progress...