In [None]:
import pandas as pd 
import numpy as np 
import scipy as sp 
import random 
import matplotlib.pyplot as plt 

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA 

In [None]:
# Load only a small sample of each file to keep the notebook fast.
# The row limit is expressed as whole breaths so the last breath read is not
# cut short; a partial breath would leave NaN cells in the per-breath pivot.
# NOTE(review): assumes every breath spans 80 rows - confirm against the data.
N_BREATHS = 40
STEPS_PER_BREATH = 80
N_ROWS = N_BREATHS * STEPS_PER_BREATH  # 3200 rows
INPUT_DIR = "../input/ventilator-pressure-prediction"

train = pd.read_csv(f"{INPUT_DIR}/train.csv", nrows=N_ROWS)
test = pd.read_csv(f"{INPUT_DIR}/test.csv", nrows=N_ROWS)
train.head()

In [None]:

# Put every breath on a common time axis: one row per breath_id, one column per
# position within the breath, holding u_in. The values are then min-max scaled
# (fitted on train only) because the pivot is fed to a dimensionality reduction.

# Serial number of each row inside its breath (0, 1, 2, ... in row order).
for frame in (train, test):
    frame["time_step_class"] = frame.groupby("breath_id").cumcount()

# breath_id x time_step_class table of u_in.
piv_train, piv_test = (
    frame.pivot_table(values="u_in", index="breath_id", columns="time_step_class")
    for frame in (train, test)
)

# Scale each column to [0, 1] using the training range for both sets.
m = MinMaxScaler(feature_range=(0.0, 1.0))
m.fit(piv_train)
piv_train, piv_test = (
    pd.DataFrame(m.transform(piv), columns=piv.columns, index=piv.index)
    for piv in (piv_train, piv_test)
)

piv_train.head()

In [None]:
# Fit a PCA with all components to see how much variance the leading ones keep.
pca = PCA()
pca.fit(piv_train)

cumulative_ratio = pca.explained_variance_ratio_.cumsum()
# Position k of the cumulative sum corresponds to keeping k + 1 components,
# so the x axis starts at 1 rather than at the default 0-based position.
n_components_axis = np.arange(1, len(cumulative_ratio) + 1)

plt.figure(figsize=(15, 6))
plt.plot(n_components_axis, cumulative_ratio)
plt.grid()
plt.xlabel("n_components")
plt.ylabel("cumulative explained_variance_ratio_")
plt.show()

In [None]:
# Project the scaled pivots onto three principal components fitted on train.
component_names = [f"c{k}" for k in range(3)]

pca = PCA(n_components=3, random_state=42)
pca.fit(piv_train)

# Keep breath_id as the index so each projected row can be traced to its breath.
pca_train = pd.DataFrame(
    pca.transform(piv_train), columns=component_names, index=piv_train.index
)
pca_test = pd.DataFrame(
    pca.transform(piv_test), columns=component_names, index=piv_test.index
)

pca_train.head()

In [None]:
# Number of training breaths; in the stacked frame below, rows before this
# position come from train and rows from this position on come from test.
last_train_breath_idx = len(pca_train)

# Stack the PCA scores, train first, then test.
df = pd.concat([pca_train, pca_test], axis=0)

In [None]:


# Cosine similarity between every training breath and every test breath in
# PCA space. Only the train x test block is used, so it is computed directly
# rather than building the square matrix over all breaths and slicing it.
#   index:   train.breath_id
#   columns: test.breath_id
breath = pd.DataFrame(
    cosine_similarity(pca_train.to_numpy(), pca_test.to_numpy()),
    index=pca_train.index,
    columns=pca_test.index,
)

# Last expression of the cell, so the preview is what gets displayed.
breath.head()

In [None]:
# Helper functions
def find_similar_breath_id(te_breath, n=10):
    """Return the `n` training breaths most similar to one test breath.

    Reads the module-level `breath` similarity table and looks up the column
    labelled `te_breath`.

    Parameters
    ----------
    te_breath : label of a column of `breath` (a test breath_id).
    n : number of top-ranked rows to keep.

    Returns
    -------
    pandas.DataFrame with a single column "similar" holding the similarity
    scores in descending order, indexed by the row labels of `breath`.
    """
    ranked = breath[te_breath].sort_values(ascending=False)
    top = ranked.iloc[:n]
    return pd.DataFrame(data={"similar": top.to_numpy()}, index=top.index)


def find_similar_pressure(te_breath):
    """Return the pressure trace of the training breath closest to `te_breath`.

    Takes the top-ranked breath from `find_similar_breath_id` and reads its
    rows from the module-level `train` frame.

    Returns
    -------
    (pressure, breath_id) : the "pressure" values of that training breath
    ordered by ascending "time_step", and the breath_id they were taken from.
    """
    best_train_id = find_similar_breath_id(te_breath, n=1).index[0]
    matched_rows = train.loc[train.breath_id == best_train_id]
    ordered = matched_rows.sort_values("time_step", ascending=True)
    return ordered["pressure"].values, best_train_id

In [None]:
# Pick five distinct test breaths to inspect. A seeded generator makes the
# selection reproducible across kernel restarts, and replace=False prevents
# the same breath from being drawn twice.
rng = np.random.default_rng(42)
random_test_breath_id = rng.choice(test.breath_id.unique(), size=5, replace=False)

# Ten most similar training breaths for the first sampled test breath.
find_similar_breath_id(random_test_breath_id[0]).style.background_gradient(cmap="Blues")

In [None]:
# Ten most similar training breaths for the second sampled test breath,
# shaded by similarity score.
second_neighbours = find_similar_breath_id(random_test_breath_id[1])
second_neighbours.style.background_gradient(cmap="Blues")

In [None]:
# Pressure trace of the training breath closest to the first sampled test breath.
# The returned (pressure, breath_id) pair is indexed rather than unpacked into
# `_`, because IPython uses `_` for the previous cell output.
a = find_similar_pressure(random_test_breath_id[0])[0]
a

In [None]:
# Pressure trace of the training breath closest to the second sampled test breath.
# The returned (pressure, breath_id) pair is indexed rather than unpacked into
# `_`, because IPython uses `_` for the previous cell output.
b = find_similar_pressure(random_test_breath_id[1])[0]
b

In [None]:

# Process every breath in the sampled test data: copy the pressure trace of its
# most similar training breath and record which training breath was used.

test_breath_id = test.breath_id.unique()

predict_list, sim_breath_list = [], []
for te_breath in test_breath_id:
    sim_pressure, sim_breath = find_similar_pressure(te_breath)
    predict_list.append(sim_pressure)
    # One copy of the matched breath_id per copied pressure value, so the two
    # arrays always stay the same length whatever the breath length is.
    sim_breath_list.append(np.full(len(sim_pressure), sim_breath))
predict_list = np.concatenate(predict_list)
sim_breath_list = np.concatenate(sim_breath_list)

# The assignment below is positional: it relies on the rows of `test` being
# grouped by breath in order of first appearance and on matched breaths having
# the same number of rows. Fail loudly if the lengths do not line up.
assert len(predict_list) == len(test), (
    f"expected {len(test)} predicted rows, got {len(predict_list)}"
)

test["pressure"] = predict_list
test["similar_breath_id"] = sim_breath_list
test.head()

In [None]:

# For each sampled test breath, draw the most similar training breath (top row)
# above the test breath itself (bottom row). The "pressure" line in the bottom
# row is the value copied from the training breath shown above it.
# Author's reading of the result: the match captures the general tendency of
# u_in, and seems to reflect the similarity most strongly after about 1.5 seconds.

fig, axes = plt.subplots(2, 5, figsize=(15, 6))
ax = axes.ravel()
plot_cols = ["time_step", "u_in", "u_out", "pressure"]

for i in range(5):
    te_breath_id = random_test_breath_id[i]
    te_rows = test[test.breath_id == te_breath_id].sort_values("time_step", ascending=True)
    tr_breath_id = te_rows.similar_breath_id.values[0]
    tr_rows = train[train.breath_id == tr_breath_id].sort_values("time_step", ascending=True)

    # Top row holds the training breath and bottom row the test breath, so the
    # data drawn on each axis agrees with its title and row label.
    tr_rows[plot_cols].set_index("time_step").plot(ax=ax[i])
    te_rows[plot_cols].set_index("time_step").plot(ax=ax[i+5])

    ax[i].set_title(f"breath_id={tr_breath_id}")
    ax[i+5].set_title(f"breath_id={te_breath_id}")

    # Same y range on every panel so the traces are directly comparable.
    for panel in (ax[i], ax[i+5]):
        panel.set_xlabel("")
        panel.set_yticks(range(0, 50+1, 10))
        panel.set_ylim(0, 50)

    if i == 0:
        ax[i].set_ylabel("Train", c="r")
        ax[i+5].set_ylabel("Test", c="b")
plt.tight_layout()

# A simple submission 

In [None]:

# However, we do not recommend submitting forecasts from this process.

# Keep only the row id and the copied pressure, and write them without the index.
submission = test[["id", "pressure"]]
submission.to_csv("submission.csv", index=False)