In [None]:
import numpy as np 
import pandas as pd
import os 
import matplotlib.pyplot as plt 
import matplotlib.cm as cm
import seaborn as sns 
from mpl_toolkits.mplot3d import Axes3D

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans 
from sklearn.manifold import TSNE
from sklearn.model_selection import GroupKFold
from lightgbm import LGBMRegressor

import warnings 

warnings.simplefilter("ignore")

In [None]:
# Read the competition's train and test tables; preview the first training rows.
train, test = map(
    pd.read_csv,
    (
        "../input/ventilator-pressure-prediction/train.csv",
        "../input/ventilator-pressure-prediction/test.csv",
    ),
)
train.head()

In [None]:

# Prepare one row per breath holding its u_in values in chronological order.

# Position of each row inside its breath (0, 1, 2, ...); used as the pivot column.
train["time_step_class"] = train.groupby("breath_id").cumcount()
test["time_step_class"] = test.groupby("breath_id").cumcount()

piv_train = train.pivot_table(values="u_in", columns="time_step_class", index="breath_id")
piv_test = test.pivot_table(values="u_in", columns="time_step_class", index="breath_id")

# Fit the scaler on train only, then apply it exactly once to each table.
# piv_test has to be scaled as well, otherwise the PCA fitted on the scaled
# train table would be applied to test data on a different scale.
m = MinMaxScaler(feature_range=(0.0, 1.0)).fit(piv_train)
piv_train = pd.DataFrame(m.transform(piv_train), columns=piv_train.columns, index=piv_train.index)
piv_test = pd.DataFrame(m.transform(piv_test), columns=piv_test.columns, index=piv_test.index)

piv_train.head()

In [None]:
# Fit a 3-component PCA on the per-breath training table and project both
# tables onto it, keeping breath_id as the index.
pca = PCA(n_components=3, random_state=42)
pca.fit(piv_train)

pca_train = pd.DataFrame(
    pca.transform(piv_train),
    index=piv_train.index,
    columns=["c" + str(idx) for idx in range(3)],
)
pca_test = pd.DataFrame(
    pca.transform(piv_test),
    index=piv_test.index,
    columns=["c" + str(idx) for idx in range(3)],
)

pca_train.head()

# Clustering 

In [None]:
# Learn the two clusters on the training breaths only.
km = KMeans(n_clusters=2, random_state=42)
y_km = km.fit_predict(pca_train)

# Assign test breaths to the clusters learned on train. Refitting on the test
# set (fit_predict) would produce independent centroids whose 0/1 labels are
# not guaranteed to mean the same thing as the training labels.
y_km_te = km.predict(pca_test)

pca_train["cluster"] = y_km
pca_test["cluster"] = y_km_te

In [None]:
# 3-D view of the training breaths in PCA space, coloured by cluster id.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1, projection="3d")
sc = ax.scatter(
    pca_train["c0"],
    pca_train["c1"],
    zs=pca_train["c2"],
    zdir="z",
    c=pca_train["cluster"],
    cmap=cm.jet,
    vmin=0,
    vmax=1,
    s=50,
)
fig.colorbar(sc, ax=ax)
plt.show()

In [None]:
# 3-D view of the test breaths in PCA space, coloured by cluster id.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1, projection="3d")
sc = ax.scatter(
    pca_test["c0"],
    pca_test["c1"],
    zs=pca_test["c2"],
    zdir="z",
    c=pca_test["cluster"],
    cmap=cm.jet,
    vmin=0,
    vmax=1,
    s=50,
)
fig.colorbar(sc, ax=ax)
plt.show()

In [None]:

# Attach each breath's cluster label to the row-level train/test tables.
# The PCA frames are indexed by breath_id, so the index is first turned into a
# regular column and only (breath_id, cluster) is kept for the join.
pca_train = (
    pca_train
    .assign(breath_id=pca_train.index)
    .reset_index(drop=True)
    .loc[:, ["breath_id", "cluster"]]
)
train = train.merge(pca_train, how="left", on="breath_id")

pca_test = (
    pca_test
    .assign(breath_id=pca_test.index)
    .reset_index(drop=True)
    .loc[:, ["breath_id", "cluster"]]
)
test = test.merge(pca_test, how="left", on="breath_id")


# helper 
def find_cluster_r_c(df):
    """Draw count plots of the ``R`` and ``C`` columns for clusters 0 and 1.

    The figure is a 2x2 grid: one row per cluster, ``R`` in the left column
    and ``C`` in the right column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``cluster``, ``R`` and ``C``.
    """
    fig, ax = plt.subplots(2, 2, figsize=(15, 6))
    for c in range(2):
        for r_c, col in enumerate(("R", "C")):
            values = df.loc[df.cluster == c, col]
            # Pass the values by keyword: recent seaborn releases treat the
            # first positional argument as `data`, not as `x`.
            sns.countplot(x=values, ax=ax[c][r_c])
            ax[c][r_c].set_title(f"Cluster={c}")
    plt.tight_layout()
    
    
def find_cluster_transition(df, is_train=True):
    """Plot the signals of the first five breaths of each cluster.

    Produces a 2x5 grid (one row per cluster). Every panel shows ``u_in`` and
    ``u_out`` against ``time_step``; ``pressure`` is added when ``is_train``
    is true.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level table with ``cluster``, ``breath_id``, ``time_step``,
        ``u_in``, ``u_out`` and, if ``is_train``, ``pressure``.
    is_train : bool, default True
        Whether ``pressure`` is available and should be drawn.
    """
    signal_cols = ["time_step", "u_in", "u_out"]
    if is_train:
        signal_cols = signal_cols + ["pressure"]

    fig, axes = plt.subplots(2, 5, figsize=(15, 6))
    for cluster_id, axes_row in enumerate(axes):
        members = df.loc[df.cluster == cluster_id]
        breath_ids = members.breath_id.unique()
        for col_idx, panel in enumerate(axes_row):
            breath_id = breath_ids[col_idx]
            trace = members.loc[members.breath_id == breath_id, signal_cols]
            trace.set_index("time_step").plot(ax=panel)
            panel.set_title(f"breath_id={breath_id}")
            panel.set_xticks([])

            # Label only the left-most panel of each row with its cluster.
            if col_idx == 0:
                panel.set_ylabel(f"Cluster={cluster_id}")
    plt.tight_layout()

In [None]:
# Count plots of R and C for each cluster in the training table.
find_cluster_r_c(train)

In [None]:
# Same count plots for the test table.
find_cluster_r_c(test)

In [None]:

'''
The difference in the distribution of u_in is clear by comparing the two.
'''

# Plot u_in / u_out / pressure against time_step for the first five breaths of each cluster (train).
find_cluster_transition(train)

In [None]:
# Same plot for the test table; is_train=False because test has no pressure column to draw.
find_cluster_transition(test, False)

In [None]:
# Per-cluster mean of time_step, u_in and pressure (clusters as columns).
train.groupby("cluster")[["time_step", "u_in", "pressure"]].mean().T

In [None]:
# Row counts for every (R, C) combination, split by cluster (clusters as columns).
pd.crosstab(index=train["cluster"], columns=[train["R"], train["C"]]).T

# Train 

In [None]:
# GroupFold
# Remove the row id and the within-breath position so they are not used as features.
train = train.drop(columns=["time_step_class", "id"])
test = test.drop(columns=["time_step_class", "id"])

def k_split(df, n_splits=2):
    """Add a ``fold`` column assigning every row to exactly one validation fold.

    Rows sharing a ``breath_id`` always receive the same fold, because the
    split is produced by ``GroupKFold`` grouped on ``breath_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pressure`` and ``breath_id``. Modified in place.
    n_splits : int, default 2
        Number of folds.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a ``uint8`` ``fold`` column added.
    """
    kf = GroupKFold(n_splits=n_splits)
    fold = np.zeros(len(df), dtype=np.uint8)
    for i, (_, val_idx) in enumerate(kf.split(df, df.pressure, groups=df.breath_id)):
        # split() yields positional (train, validation) index arrays. Only the
        # validation part is disjoint across splits, so it is the one that
        # defines the fold; writing into a NumPy array keeps the assignment
        # positional regardless of df's index labels.
        fold[val_idx] = i
    df["fold"] = fold
    return df


train = k_split(train)


In [None]:
# Preview the training table after the cluster and fold columns were added.
train.head()

In [None]:
def fit(train, test):
    """Train one LightGBM regressor per fold and average their test predictions.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``pressure`` (target), ``breath_id`` and ``fold``. Every
        other column is used as a feature.
    test : pandas.DataFrame
        Must contain every feature column of ``train``.

    Returns
    -------
    predict : numpy.ndarray
        Element-wise mean of the per-fold predictions for ``test``.
    models : list of LGBMRegressor
        The fitted models, in ascending fold order.
    """
    # Local import: only the callback factories are needed in addition to the
    # LGBMRegressor imported at the top of the notebook.
    import lightgbm as lgb

    # Kept for compatibility with the original side effect; nothing in this
    # function writes into the directory.
    os.makedirs("models", exist_ok=True)

    features = train.drop(["pressure", "breath_id", "fold"], axis=1)
    target = train["pressure"]
    x_test = test[features.columns]

    # Early stopping and logging are configured through callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # removed in LightGBM 4.0. Older releases expose the logging callback as
    # `print_evaluation` instead of `log_evaluation`.
    log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation

    predict = []
    models = []
    # Iterate over the folds actually present instead of assuming exactly two.
    for fold in sorted(train["fold"].unique()):
        is_val = train["fold"] == fold
        x_train, y_train = features[~is_val], target[~is_val]
        x_val, y_val = features[is_val], target[is_val]

        model = LGBMRegressor(random_state=42, n_estimators=1000).fit(
            x_train,
            y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            callbacks=[lgb.early_stopping(10), log_evaluation(100)],
        )
        predict.append(model.predict(x_test))
        models.append(model)
    predict = np.mean(predict, axis=0)
    return predict, models

In [None]:
# Train the per-fold models and get the averaged test prediction.
pred, models = fit(train, test)

# a simple submission
# The predictions are written into the sample file's pressure column by row position.
sub = (
    pd.read_csv("../input/ventilator-pressure-prediction/sample_submission.csv")
    .assign(pressure=pred)
)
sub.to_csv("submission.csv", index=False)

# EDA Clustering

In [None]:

'''
Let's see the difference between each by separating each cluster when making a prediction. 
Analyze how u_out affects forecast data.
'''

def predict(models, df):
    """Average the predictions of every model in ``models`` for the rows of ``df``.

    Parameters
    ----------
    models : iterable
        Objects exposing ``predict(features)``.
    df : pandas.DataFrame
        Must contain ``breath_id``, ``pressure`` and ``fold``; these are
        dropped and all remaining columns are passed to each model. ``df``
        itself is not modified.

    Returns
    -------
    list of float
        One averaged prediction per row of ``df``.
    """
    # Build the feature frame once: it is the same for every model, and
    # dropping columns copies the whole table.
    features = df.drop(["breath_id", "pressure", "fold"], axis=1)
    per_model = [model.predict(features) for model in models]
    return np.mean(per_model, axis=0).tolist()

def predict_cluster(models, df):
    """Predict separately for cluster 0 and cluster 1.

    Parameters
    ----------
    models : iterable
        Fitted models, forwarded to ``predict``.
    df : pandas.DataFrame
        Table with a ``cluster`` column plus whatever ``predict`` requires.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cluster_0_rows, cluster_1_rows)``; each is a copy of the matching
        rows of ``df`` with an added ``predict`` column.
    """
    frames = []
    for cluster_id in (0, 1):
        subset = df.loc[df.cluster == cluster_id]
        # assign() works on a copy, so neither df nor the slice is modified.
        frames.append(subset.assign(predict=predict(models, subset)))
    first, second = frames
    return first, second


def metrics(df):
    """Return a styled table of mean absolute errors of ``predict`` vs ``pressure``.

    Three errors are reported: rows with ``u_out == 0``, rows with
    ``u_out == 1``, and all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``u_out``, ``pressure`` and ``predict``.

    Returns
    -------
    pandas Styler
        A one-column table (``Error``) with a ``coolwarm`` background gradient.
    """
    mask_out0 = df.u_out == 0
    mask_out1 = df.u_out == 1
    errors = {
        "u_out_0": [mean_absolute_error(df.loc[mask_out0, "pressure"], df.loc[mask_out0, "predict"])],
        "u_out_1": [mean_absolute_error(df.loc[mask_out1, "pressure"], df.loc[mask_out1, "predict"])],
        "all": [mean_absolute_error(df["pressure"], df["predict"])],
    }
    table = pd.DataFrame(errors, index=["Error"]).T
    return table.style.background_gradient(cmap="coolwarm")

def viz_predict(df, c_name=0):
    """Plot signals, true pressure and prediction for the first nine breaths of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``u_out``,
        ``pressure`` and ``predict``.
    c_name : int, default 0
        Cluster id shown in each panel title.
    """
    plot_cols = ["time_step", "u_in", "u_out", "pressure", "predict"]
    breath_ids = df.breath_id.unique()
    fig, axes = plt.subplots(3, 3, figsize=(10, 10))
    for idx, panel in enumerate(axes.ravel()):
        breath_id = breath_ids[idx]
        rows = df[df.breath_id == breath_id]
        rows[plot_cols].set_index("time_step").plot(ax=panel)
        panel.set_title(f"Cluster={c_name}, breath_id={breath_id}")
    plt.tight_layout()

In [None]:
# Predict on the training rows, split by cluster.
# NOTE(review): every training row was part of the fitting data of one of the
# averaged fold models, so these are not out-of-fold predictions and the
# errors computed from them are likely optimistic.
c0, c1 = predict_cluster(models, train)

In [None]:
# Cluster 0: histogram of the true pressure (left) next to the predictions (right).
plt.subplot(1, 2, 1)
sns.histplot(c0["pressure"])
plt.title("Cluster=0")
plt.subplot(1, 2, 2)
sns.histplot(c0["predict"])

In [None]:
# Cluster 1: histogram of the true pressure (left) next to the predictions (right).
plt.subplot(1, 2, 1)
sns.histplot(c1["pressure"])
plt.title("Cluster=1")
plt.subplot(1, 2, 2)
sns.histplot(c1["predict"])

In [None]:
# Mean absolute error for cluster 0: u_out == 0 rows, u_out == 1 rows, and all rows.
metrics(c0)

In [None]:

'''
Comparing the error of the predicted value around 1.0,
it is inferred that the degree of dispersion is wider before 1.0 second. 
By comparing with cluster 0, the distribution difference before 1.0 second is clear.


My guess is that the score will fluctuate depending on whether
or not you can accurately predict 1.0 seconds ago from the data belonging to 
cluster 1 which is different from the normal distribution.
'''

# Mean absolute error for cluster 1, for comparison with the cluster 0 table.
metrics(c1)

In [None]:
# True vs predicted pressure for the first nine breaths of cluster 0.
viz_predict(c0)

In [None]:
# True vs predicted pressure for the first nine breaths of cluster 1 (1 is the cluster id used in the titles).
viz_predict(c1, 1)