In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns 

# Reference: https://www.kaggle.com/subinium/tps-aug-simple-eda by Subin An

In [None]:
# Read the three competition CSV files (train, test, sample submission)
# in one pass; the paths are relative to the notebook's working directory.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ventilator-pressure-prediction/train.csv",
        "../input/ventilator-pressure-prediction/test.csv",
        '../input/ventilator-pressure-prediction/sample_submission.csv',
    )
)

In [None]:
# Report (rows, columns) for both splits.
for split_name, split_frame in (('Train', train), ('Test', test)):
    print(f'{split_name} Shape :  {split_frame.shape}')

In [None]:
# Keep a handle on the target column before reshaping the frames.
target = train['pressure']

# Remove the identifier columns from both splits.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises a KeyError.
id_columns = ['id', 'breath_id']
train = train.drop(columns=id_columns, errors='ignore')
test = test.drop(columns=id_columns, errors='ignore')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Summary statistics (count, mean, std, quantiles) for the numeric columns.
train.describe()

In [None]:
# Bar chart of how often each distinct pressure value occurs in `train`.
fig, ax = plt.subplots(1, 1, figsize=(17, 8))

target_cnt = train['pressure'].value_counts().sort_index()

# One colour per bar so the two shades really alternate; a fixed-length
# colour list is cycled by matplotlib and breaks the alternation as soon as
# there are more bars than colours.
n_bars = len(target_cnt)
bar_colors = ['#d4dddd' if i % 2 == 0 else '#fafafa' for i in range(n_bars)]

# Derive the bar width from the smallest gap between neighbouring pressure
# values so adjacent bars never overlap; 0.55 stays the upper bound and is
# used as-is when there is only a single bar.
if n_bars > 1:
    min_gap = float(np.diff(target_cnt.index.to_numpy()).min())
    bar_width = min(0.55, 0.8 * min_gap)
else:
    bar_width = 0.55

ax.bar(target_cnt.index, target_cnt, color=bar_colors,
       width=bar_width,
       edgecolor='black',
       linewidth=0.7)

ax.margins(0.02, 0.05)
ax.set_title('Pressure Distribution', weight='bold', fontsize=15)
ax.set_xlabel('pressure')
ax.set_ylabel('count')
ax.grid(axis='y', linestyle='-', alpha=0.4)

fig.tight_layout()
plt.show()


In [None]:
# Show the per-value pressure counts as a table.
pd.DataFrame(target_cnt)

In [None]:
# Tabulate each pressure value's share of all rows, ordered from the most
# to the least frequent value, together with the running total of shares.
target_cnt_df = target_cnt.to_frame()
target_cnt_df['ratio(%)'] = target_cnt / target_cnt.sum() * 100
target_cnt_df = target_cnt_df.sort_values('ratio(%)', ascending=False)
target_cnt_df['cummulated_sum(%)'] = target_cnt_df['ratio(%)'].cumsum()

# Render the cumulative column as in-cell bars.
target_cnt_df.style.bar(subset=['cummulated_sum(%)'], color='#205ff2')

In [None]:
# Transposed summary statistics (one row per column), styled so that the
# mean is drawn as a bar and std / median are shaded with colour gradients.
(
    train.describe().T.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

# Discrete Features

In [None]:
# A column counts as discrete when casting its values to int leaves every
# value unchanged, i.e. it holds whole numbers only.
discrete_features = [
    col
    for col in train.columns
    if np.array_equal(train[col].values, train[col].values.astype(int))
]

print(f'Total {len(discrete_features)} : ')
print(discrete_features)

In [None]:
# Number of distinct values in each discrete column, computed in one call.
for dcol, n_unique in train[discrete_features].nunique().items():
    print(f'{dcol} unique value : {n_unique}')

In [None]:
# Mean pressure for each value of R, ordered from lowest to highest mean.
R_pressure = (
    train.groupby('R')['pressure']
    .mean()
    .sort_values()
)
R_pressure

In [None]:
# Bar chart of the mean training pressure for each value of R.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))

bar_positions = list(range(len(R_pressure)))
# The means come from `train`, so the legend entry must say so.
ax.bar(bar_positions, R_pressure, alpha=0.7, color='lightgray', label='Train Dataset')

# Label every bar with the R value it belongs to; bare positions 0..n-1
# say nothing about which group is which (the series is sorted by mean).
ax.set_xticks(bar_positions)
ax.set_xticklabels(R_pressure.index)
ax.set_xlabel('R')
ax.set_ylabel('Mean pressure')

ax.set_yticks(range(0, 20, 3))
ax.margins(0.01)
ax.grid(axis='y', linestyle='--', zorder=5)
ax.set_title('Average of pressure grouped by R', loc='left', fontweight='bold')
ax.legend()
plt.show()

In [None]:
# Mean pressure for each value of C, ordered from lowest to highest mean.
C_pressure = (
    train.groupby('C')['pressure']
    .mean()
    .sort_values()
)
C_pressure

In [None]:
# Bar chart of the mean training pressure for each value of C.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))

bar_positions = list(range(len(C_pressure)))
# The means come from `train`, so the legend entry must say so.
ax.bar(bar_positions, C_pressure, alpha=0.7, color='lightgray', label='Train Dataset')

# Label every bar with the C value it belongs to; bare positions 0..n-1
# say nothing about which group is which (the series is sorted by mean).
ax.set_xticks(bar_positions)
ax.set_xticklabels(C_pressure.index)
ax.set_xlabel('C')
ax.set_ylabel('Mean pressure')

ax.set_yticks(range(0, 20, 3))
ax.margins(0.01)
ax.grid(axis='y', linestyle='--', zorder=5)
ax.set_title('Average of pressure grouped by C', loc='left', fontweight='bold')
ax.legend()
plt.show()

# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize: every column whose name contains neither "id"
# nor "pressure".
features = [col for col in train.columns if not ("id" in col or "pressure" in col)]

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
ss = StandardScaler()
ss.fit(train[features])
train[features] = ss.transform(train[features])
test[features] = ss.transform(test[features])

In [None]:
# Display the training frame after the feature columns were standardized.
train

# Target & Feature Relation

In [None]:
# Bucket the target to whole numbers so rows can be grouped by it.
train['pressure_rounded'] = train['pressure'].round()

In [None]:
# Heatmap of the mean of every scaled feature per rounded-pressure bucket.
fig, ax = plt.subplots(1, 1, figsize=(12, 20))

# Average the feature columns only. The raw `pressure` column is the
# (unscaled) target itself; leaving it in stretches the colour range, which
# has no vmin/vmax here, and washes out the standardized feature columns.
feature_means = train.groupby('pressure_rounded')[features].mean().sort_index()

sns.heatmap(feature_means,
            square=False, center=0, linewidth=2,
            cmap=sns.diverging_palette(240, 100, as_cmap=True),
            cbar=False,
            ax=ax,  # draw on the axes created above instead of the implicit current one
           )

ax.set_title('Mean : Group by Target(Pressure)',loc='left')
plt.show()

In [None]:
# Same per-bucket feature means as a square-cell heatmap with the colour
# range clipped to [-0.5, 0.5] so small deviations from zero stay visible.
fig, ax = plt.subplots(1,1, figsize=(12, 30))

# Average the feature columns only; the raw `pressure` column is the target
# being grouped on and would just show up as a fully saturated column.
feature_means = train.groupby('pressure_rounded')[features].mean().sort_index()

sns.heatmap(feature_means,
            square=True, vmin=-0.5, vmax=0.5, center=0, linewidth=1,
            cmap=sns.diverging_palette(240, 10, as_cmap=True),
            cbar=False,
            ax=ax,  # draw on the axes created above instead of the implicit current one
           )

ax.set_title('Mean : Group by Target(Pressure)',loc='left')
plt.show()

# Feature Distribution

In [None]:
# Overlay the train and test density estimate of every scaled feature,
# one panel per feature.
# One row per feature instead of a hard-coded 5, with the figure height
# growing accordingly (4.8 per row gives the original 24 for five features).
# `squeeze=False` keeps `axes` an array even when there is a single feature.
n_features = len(features)
fig, axes = plt.subplots(n_features, 1, figsize=(12, 4.8 * n_features), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, features):
    # Label both curves so the two splits can be told apart in the legend.
    sns.kdeplot(data=train, x=feature,
                fill=True,
                label='Train',
                ax=ax)
    sns.kdeplot(data=test, x=feature,
                fill=True,
                label='Test',
                ax=ax)
    # Strip ticks and axis labels: only the shape of the curves matters here.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(feature, loc='right', weight='bold', fontsize=10)

# A single legend on the first panel is enough; colours repeat on every panel.
axes[0].legend(loc='upper left')

# These panels show densities, not averages.
fig.supxlabel('Distribution by feature (train vs. test)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()