This notebook is an exploratory data analysis (EDA) for the Ventilator Pressure Prediction (VPP) competition (<a href="https://www.kaggle.com/dmitryuarov/ventilator-pressure-eda-lstm-0-189/data">Google Brain - Ventilator Pressure Prediction</a>).   
    
Reference: <a href="https://www.kaggle.com/dmitryuarov/ventilator-pressure-eda-lstm-0-189">https://www.kaggle.com/dmitryuarov/ventilator-pressure-eda-lstm-0-189</a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Input data

In [None]:
# Location of the competition files; both splits are read into memory.
INPUT_PATH = '../input/ventilator-pressure-prediction/'
df_train = pd.read_csv(f'{INPUT_PATH}train.csv')
df_test = pd.read_csv(f'{INPUT_PATH}test.csv')

# Count the distinct breaths once per split and reuse the numbers below.
n_train_breaths = len(df_train['breath_id'].unique())
n_test_breaths = len(df_test['breath_id'].unique())

print(f'Length of train records : {len(df_train)}')
print(f'Length of test records  : {len(df_test)}')
print(f'Number of unique breath_id in train data : {n_train_breaths}')
print(f'Number of unique breath_id in test data  : {n_test_breaths}')
# Rows divided by breaths, i.e. the average number of rows per breath_id
# (the original author notes this was confirmed to hold for all breath_ids).
print(f'Number of time_steps in a breath_id: {len(df_train) / n_train_breaths}')
df_train.head(10)

# Correlation

Some correlations are observed among "time_step", "u_in", "u_out" and "pressure".

In [None]:
# Pairwise correlation between all columns of the training frame, drawn as an
# annotated heatmap on a diverging 200-step 'coolwarm' palette.
corr = df_train.corr()
fig = plt.figure(figsize=(9, 9))
sns.heatmap(corr, annot=True, fmt='.2f', cmap=sns.color_palette('coolwarm', 200))
# Title spelling fixed (it previously rendered as "Corralation").
plt.title('Correlation')
plt.show()

## R and C

R and C are constant within each breath_id, and each takes only three distinct values. The distribution of these values is similar in the train and test sets.

In [None]:
# Stack the train and test columns so the unique values cover both splits.
R_all, C_all = (
    pd.concat([df_train[col], df_test[col]], axis=0) for col in ('R', 'C')
)
# Sorted distinct values; reused later to enumerate the (R, C) combinations.
R_unique, C_unique = (np.sort(values.unique()) for values in (R_all, C_all))
print(f'Unique R: {R_unique}')
print(f'Unique C: {C_unique}')

In [None]:
# Define figure layout (label, ticks, frame, annotation,...)
def fig_layout(seaborn_plot):
    """Reduce a bar plot to its bars and label every bar with its share in percent.

    Parameters
    ----------
    seaborn_plot : matplotlib Axes
        The axes returned by the plotting call (e.g. ``sns.countplot``). Its
        ``patches`` are treated as the bars of the plot.

    Effects on ``seaborn_plot``
    ---------------------------
    - x/y axis labels are cleared and the y ticks are removed.
    - The right, top and left spines are hidden.
    - Each bar gets a ``"<ratio>%"`` annotation 10 points above its top, where
      ``ratio`` is the bar height divided by the sum of all bar heights,
      expressed in percent and rounded to one decimal.

    Returns
    -------
    None
    """
    # Work on the axes that was passed in rather than on pyplot's "current
    # axes" or on a global variable, so the function does not depend on how
    # the caller named its variables or on which subplot is currently active.
    seaborn_plot.set_xlabel('')
    seaborn_plot.set_ylabel('')
    seaborn_plot.set_yticks([])

    # Hide frame
    for side in ('right', 'top', 'left'):
        seaborn_plot.spines[side].set_visible(False)

    # Total record count = sum of all bar heights
    bars = list(seaborn_plot.patches)
    record_length = sum(bar.get_height() for bar in bars)
    if not record_length:
        # No bars (or only empty bars): a ratio is undefined, nothing to annotate.
        return

    # Add annotation of ratio, centred horizontally on each bar
    for bar in bars:
        height = bar.get_height()
        ratio = round(height / record_length * 100, 1)
        seaborn_plot.annotate(f'{ratio}%',
                              xy=(bar.get_x() + bar.get_width() / 2, height),
                              ha='center', va='center', size=10,
                              xytext=(0, 10), textcoords='offset points')

# Plot
# 2x2 grid: columns are the attributes (R, C), rows are the splits (train, test).
fig = plt.figure(figsize=(12, 8))
split_frames = (df_train, df_test)  # row 0 = train, row 1 = test
for col_idx, rc in enumerate(['R', 'C']):
    for row_idx, frame in enumerate(split_frames):
        # Subplot numbering is row-major: 1, 2 on the top row and 3, 4 below.
        plt.subplot(2, 2, 2 * row_idx + col_idx + 1)
        sns_plot = sns.countplot(x=rc, data=frame)
        fig_layout(sns_plot)
        if row_idx == 0:
            # Only the top (train) row carries the attribute name as its title.
            plt.title(rc, size=20)

# Row labels placed in the left margin of the figure.
for y_pos, split_name in ((0.75, 'Train'), (0.25, 'Test')):
    plt.figtext(0.05, y_pos, split_name, size=20)
plt.show()

## u_in, u_out, and pressure  
- u_out switches from 0 to 1 at time_step=1. After that, the pressure rapidly decreases to around 6 cmH2O.
- Pressure and u_in are somewhat correlated while u_out=0: the pressure changes according to the oscillation of u_in.


In [None]:
# Sampling breath_ids for display
# Number of breaths displayed per (R, C) combination; also the number of
# subplot columns in each figure.
N_SAMPLES_PER_RC = 4

# Sampling breath_ids for display: the first N_SAMPLES_PER_RC breaths found
# for every combination of R and C.
sample_breath_ids = []
for r in R_unique:
    for c in C_unique:
        df_RC = df_train[(df_train['R']==r) & (df_train['C']==c)]
        breath_ids_RC = df_RC['breath_id'].unique()[:N_SAMPLES_PER_RC]
        sample_breath_ids.append(breath_ids_RC)

# Set figure parameters
color_palette = sns.color_palette("tab10")

# Plot: one figure (one row of subplots) per (R, C) combination
for breath_ids_RC in sample_breath_ids:
    if len(breath_ids_RC) == 0:
        # No breath with this (R, C) pair: skip it rather than draw an empty
        # figure labelled with the R/C values of the previous iteration.
        continue

    fig = plt.figure(figsize=(20, 4))
    # The spacing must be set on the figure that is actually drawn. Calling
    # plt.subplots_adjust() before any figure exists only creates (and
    # adjusts) a stray empty figure.
    fig.subplots_adjust(wspace=0.4, hspace=0.4)

    for i, b_id in enumerate(breath_ids_RC): # Loop over the samples sharing the same R and C
        df_tmp = df_train[df_train['breath_id']==b_id]
        R = df_tmp["R"].iloc[0]
        C = df_tmp["C"].iloc[0]
        # Axis
        ax1 = plt.subplot(1, N_SAMPLES_PER_RC, i+1) # 1st axis (pressure and u_in)
        ax2 = ax1.twinx() # 2nd axis (for u_out)
        ax1.set_xlabel('time_step')
        # Plot: pressure and u_in share the left axis and its colour; u_in is
        # told apart by the thin dashed line style.
        ax1.plot(df_tmp['time_step'], df_tmp['pressure'], color=color_palette[0], label='pressure')
        ax1.plot(df_tmp['time_step'], df_tmp['u_in'],     color=color_palette[0], label='u_in', linestyle='dashed', linewidth=0.75)
        ax2.plot(df_tmp['time_step'], df_tmp['u_out'],    color=color_palette[3], label='u_out', linestyle='dashed', linewidth=0.75)
        # y-limit: widen the left axis when u_in reaches 50 or more
        y_max_lim = 50 if df_tmp['u_in'].max()<50 else 100
        ax1.set_ylim([0, y_max_lim])
        ax2.set_ylim([0, 1.5])
        # Ticks coloured like the series plotted on the respective axis
        ax1.tick_params(axis='y', colors=color_palette[0])
        ax2.tick_params(axis='y', colors=color_palette[3])
        ax2.set_yticks(np.linspace(0, 1.5, 4))
        # Legend: merge the entries of both axes into a single box
        h1, l1 = ax1.get_legend_handles_labels()
        h2, l2 = ax2.get_legend_handles_labels()
        ax1.legend(h1+h2, l1+l2, loc='upper right')
        # Title
        ax1.set_title(f'breath_id={b_id}')

    # Row label in the left margin; R and C are identical for every breath in
    # this figure, so the values read from the last sample apply to all.
    plt.figtext(0.01, 0.5, f'R={R}, C={C}', size=20)
    plt.show()


### Feature engineering

In [None]:
def generate_features(df):
    """Add u_in-derived features to ``df`` and return it.

    ``df`` is modified in place (columns are added and, when present,
    ``pressure`` is moved to the last position); pass a copy to keep the
    original frame untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``u_in`` and ``u_out``.
        ``pressure`` is optional, so frames without a target column can be
        processed as well.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with these additional columns:
        ``u_in_out``, ``u_in_diff``, ``u_in_lag1``, ``u_in_lag2``,
        ``u_in_10mv_mean``, ``u_in_10mv_std``, ``u_in_ew_mean``,
        ``u_in_ew_std``.
    """
    # All sequence features are computed per breath so that values never leak
    # from the end of one breath into the start of the next one.
    u_in_by_breath = df.groupby('breath_id')['u_in']

    # u_in and u_out interaction
    df['u_in_out'] = df['u_in'] * (df['u_out'] - 0.5)
    # u_in difference to the previous row of the same breath (0 at the first row)
    df['u_in_diff'] = u_in_by_breath.diff().fillna(0)
    # u_in shifted by one and two rows within the breath (0 where no history
    # exists). Both lags were previously computed with the same shift(4).
    df['u_in_lag1'] = u_in_by_breath.shift(1).fillna(0)
    df['u_in_lag2'] = u_in_by_breath.shift(2).fillna(0)
    # moving mean and std over a window of up to 10 rows
    df['u_in_10mv_mean'] = u_in_by_breath.rolling(window=10, min_periods=1).mean().reset_index(level=0, drop=True)
    df['u_in_10mv_std']  = u_in_by_breath.rolling(window=10, min_periods=1).std().reset_index(level=0, drop=True)
    # moving mean and std with exponential weights
    df['u_in_ew_mean'] = u_in_by_breath.ewm(halflife=10).mean().reset_index(level=0, drop=True)
    df['u_in_ew_std']  = u_in_by_breath.ewm(halflife=10).std().reset_index(level=0, drop=True)

    # 'pressure' moved to end column; skipped when the frame has no target
    if 'pressure' in df.columns:
        df['pressure'] = df.pop('pressure')
    return df

In [None]:
# Correlation for generated features
# Build the features on a copy so the raw training frame stays unchanged
# (generate_features modifies the frame it receives).
df_train_ = generate_features(df_train.copy())
# Drop the first two columns by position before correlating.
# NOTE(review): presumably these are the 'id' and 'breath_id' identifier
# columns - confirm against the column order of train.csv.
df_train_ = df_train_.drop(df_train_.columns[[0,1]], axis=1)
corr = df_train_.corr()
fig = plt.figure(figsize=(16, 16))
sns.heatmap(corr, annot=True, fmt='.2f', cmap=sns.color_palette('coolwarm', 200))
# Title spelling fixed (it previously rendered as "Corralation ...").
plt.title('Correlation for generated features')
plt.show()