In [None]:
### ※ In this notebook, the main language is English and the secondary language is Japanese.

# Overview

#### ※ In this notebook, the main language is English and the secondary language is Japanese.

### References (with thanks)

* [Ventilator Pressure: EDA and simple submission](https://www.kaggle.com/carlmcbrideellis/ventilator-pressure-eda-and-simple-submission)
* [[V11]Ventilator: EDA + Understanding + Model + W&B](https://www.kaggle.com/ishandutta/v11-ventilator-eda-understanding-model-w-b)
* [Deep Learning Starter : Simple LSTM](https://www.kaggle.com/theoviel/deep-learning-starter-simple-lstm)




## EDA

R - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow. / 気道がどの程度制限されているかを示す肺属性（単位：cmH2O/L/S）。物理的には、流量（時間当たりの空気量）の変化に対する圧力の変化です。直感的には、ストローで風船を膨らませるようなイメージです。ストローの直径を変えることでRを変化させることができ、Rが大きいほど吹きにくくなります。

C - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow. / 肺の適合性を示す肺属性（単位：mL/cmH2O）。物理的には、圧力の変化に対する体積の変化を表します。直感的には、同じ風船の例を想像してください。風船のラテックスの厚さを変えることでCを変化させることができます。Cが大きいほどラテックスが薄く、吹きやすくなります

### Library

In [None]:
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn import metrics 

In [None]:
# Directory holding the competition CSV files (relative path; assumes the Kaggle
# notebook layout — TODO confirm when running elsewhere).
path = "../input/ventilator-pressure-prediction/"
# List the directory contents as the cell output to see which files are available.
os.listdir(path)

In [None]:
# Read the three competition tables from `path`: training rows, test rows and
# the sample submission file.
train, test, submission = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Display the training DataFrame as the cell output.
train

In [None]:
# Display the test DataFrame as the cell output.
test

#### pressure(objective variable)

In [None]:
# Histogram of pressure (50 bins), with its mean and standard deviation printed.
plt.figure(figsize=(10, 5))
train.pressure.hist(bins=50)
print(f"mean: {train.pressure.mean()}, std: {train.pressure.std()}")
plt.show()

#### Visualize TimeStamp

Each time series represents an approximately 3-second breath

In [None]:
# Distribution of time_step over all training rows (20 bins), plus the largest value.
plt.figure(figsize=(10, 5))
sns.histplot(x='time_step', data=train, bins=20)
print(f"max time_step is {train['time_step'].max()} \n")
plt.show()

#### u_in  
The control input for the inspiratory solenoid valve. Ranges from 0 to 100 (i.e., 0 is completely closed and no air is let in and 100 is completely open)  
空気を肺に入れるために吸気電磁弁を開く割合を表す。0は完全に閉じて空気を入れず、100は完全に開く    
Looking at the histograms below:
* most values lie between 0 and 5
* the train and test distributions look essentially the same

In [None]:
# u_in distribution for train and test, side by side.
# The panels are created up front with plt.subplots(1, 2) and handed to seaborn
# via `ax=`; creating one full-figure Axes and then calling plt.subplot(2, 2, k)
# leaves an extra empty Axes behind the panels.
fig, axes = plt.subplots(1, 2, figsize=(20, 6))
for ax, (split_name, frame) in zip(axes, (("train", train), ("test", test))):
    sns.histplot(data=frame, x='u_in', bins=100, ax=ax)
    ax.set_title('count of u_in {}'.format(split_name))
    # mode() returns a Series (ties are possible); print its values rather than
    # the multi-line Series repr.
    print("u_in {} mean is {} , mode is {} ".format(
        split_name, frame['u_in'].mean(), frame['u_in'].mode().tolist()))
plt.show()

#### u_out  
The control input for the expiratory solenoid valve: a binary variable representing whether the valve is open (1) or closed (0) to let air out.  
- In this competition the expiratory phase (`u_out == 1`) is not scored.

In [None]:
# Count of each u_out value in train and test, side by side.
# Panels come from plt.subplots(1, 2) and are passed to seaborn explicitly, so
# no stray full-figure Axes is left underneath them.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (split_name, frame) in zip(axes, (("train", train), ("test", test))):
    sns.countplot(x='u_out', data=frame, ax=ax)
    ax.set_title('Count of u_out in {}'.format(split_name))
plt.show()

#### Pressure distribution when u_out == 0

In [None]:
# Pressure histogram restricted to rows with u_out == 0.
u_out_is_zero = train.query("u_out == 0").reset_index(drop=True)

fig, ax = plt.subplots(figsize=(12, 5))
# sns.distplot is deprecated; histplot with `binrange` draws the same
# 120-bin count histogram over the 0..50 pressure range.
sns.histplot(u_out_is_zero['pressure'],
             bins=120,
             binrange=(0, 50),
             color='skyblue',
             ax=ax)

plt.xlabel("Histogram of pressures (u_out=0)", size=14)
print("median is {}".format(u_out_is_zero.pressure.median()))
plt.show()

#### Pressure distribution when u_out == 1

In [None]:
# Pressure histogram restricted to rows with u_out == 1.
# Stored under its own name so the u_out == 0 frame from the previous cell is
# not silently overwritten by rows of the opposite phase.
u_out_is_one = train.query("u_out == 1").reset_index(drop=True)

fig, ax = plt.subplots(figsize=(12, 5))
# sns.distplot is deprecated; histplot with `binrange` draws the same
# 120-bin count histogram over the 0..50 pressure range.
sns.histplot(u_out_is_one['pressure'],
             bins=120,
             binrange=(0, 50),
             color='skyblue',
             ax=ax)

plt.xlabel("Histogram of pressures (u_out=1)", size=14)
print("median is {}".format(u_out_is_one.pressure.median()))
plt.show()

### R / C value  
* R : the higher R is, the more restricted the airway and the harder it is to blow air in
* C : the higher C is, the thinner the latex and the easier it is to blow air in
* Do R and C (and their combination) affect pressure?

In [None]:
# Value counts of R (top row) and C (bottom row) for train (left) and test (right).
# The 2x2 grid is created once and each panel is passed to seaborn via `ax=`,
# which avoids the extra empty Axes left by plt.subplots() + plt.subplot().
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
panels = (
    ('R', 'train', train),
    ('R', 'test', test),
    ('C', 'train', train),
    ('C', 'test', test),
)
for ax, (column, split_name, frame) in zip(axes.flat, panels):
    sns.countplot(x=column, data=frame, ax=ax)
    ax.set_title('Count of {} in {}'.format(column, split_name))
fig.tight_layout()
plt.show()

#### Check the correlation

In [None]:
# Pairwise correlation of every column in the training table, drawn as an
# annotated heatmap (two decimals per cell).
df_corr = train.corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=df_corr, cmap='Blues', annot=True, fmt=".2f", ax=ax)
plt.show()

#### Check 1 ventilation cycle

In [None]:
# Take the rows of a single breath (breath_id == 2) and print how many distinct
# values each column holds within it.
ventilation_cycle = train.loc[train['breath_id'] == 2]
print("Unique value counts in each time stamp\n{}\n".format(ventilation_cycle.nunique()))

### Time series data(pressure/ u_in)

In [None]:
def plot_u_in_and_pressure(frame, breath_id):
    """Plot u_in and pressure against time_step for one breath.

    Args:
        frame: DataFrame with `breath_id`, `time_step`, `u_in` and `pressure` columns.
        breath_id: id of the breath to draw.

    The title is built from `breath_id`, so the label always matches the rows
    that are actually plotted.
    """
    breath = frame[frame['breath_id'] == breath_id].reset_index(drop=True)
    fig, ax = plt.subplots(1, 1, figsize=(12, 4))
    ax.plot(breath["time_step"], breath["u_in"], lw=2, label='u_in')
    ax.plot(breath["time_step"], breath["pressure"], lw=2, label='pressure')
    ax.legend(loc="upper right")
    # The x values are the time_step column, so label the axis accordingly.
    ax.set_xlabel("time_step", fontsize=14)
    ax.set_title("breath_id = {}".format(breath_id), fontsize=14)
    plt.show()


# Same two breaths the cell queried before (202 and 119582); the first one used
# to be titled "breath_id = 542" although breath 202 was the one plotted.
for breath_id in (202, 119582):
    plot_u_in_and_pressure(train, breath_id)

### Time series data(pressure/ u_in / u_out)

In [None]:
# Single breath (breath_id == 9): pressure and u_in share the left axis,
# u_out is drawn on a twin right axis.
breath_1 = train[train['breath_id'] == 9]

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax2 = ax1.twinx()

for column, line_format, axis in (
    ('pressure', 'r-', ax1),
    ('u_in', 'g-', ax1),
    ('u_out', 'b-', ax2),
):
    axis.plot(breath_1['time_step'], breath_1[column], line_format, label=column)

ax1.set_xlabel('Timestep')

# Both legends are placed outside the plot area, one above the other.
ax1.legend(loc=(1.1, 0.8))
ax2.legend(loc=(1.1, 0.7))
plt.show()

#### Time series data(pressure/ u_in) 
#### in u_out=0 "pressure inhale"/ u_out=1 "pressure exhale"

In [None]:
# One figure per breath id in 25..29: pressure split by u_out (0 = inhale,
# 1 = exhale) together with the u_in signal, all plotted against the row id.
for i in range(25, 30):
    one_breath = train.loc[train["breath_id"] == i]
    inhale_rows = one_breath.loc[one_breath['u_out'] == 0]
    exhale_rows = one_breath.loc[one_breath['u_out'] == 1]

    plt.figure(figsize=(8, 6))
    sns.lineplot(data=inhale_rows, x='id', y='pressure', color='green', label='pressure inhale')
    sns.lineplot(data=exhale_rows, x='id', y='pressure', color='orange', label='pressure exhale')
    sns.lineplot(data=one_breath, x='id', y='u_in', color='blue', label='input valve')
    plt.title("Variation of Pressure and Input valve position during breath {}".format(i))
    plt.legend()

### Dataset

In [None]:
# Small subset (breath_id below 5) for trying the pipeline on little data first.
df = train.loc[train['breath_id'] < 5].reset_index(drop=True)

In [None]:
# Collapse the row-per-time-step table to one row per breath_id; every other
# column becomes a Python list of that breath's values.
df2 = df.groupby('breath_id').agg(list).reset_index()
df2

In [None]:
import torch
from torch.utils.data import Dataset

class VenilatorDataset(Dataset):
    """Per-breath dataset built from the flat (one row per time step) table.

    Rows of ``df`` are grouped by ``breath_id`` so that one item is one breath.
    Each item is a dict with:

    * ``"input"``: float tensor of shape (seq_len, 5) whose columns are
      ``[R, C, u_in, cumsum(u_in), u_out]``;
    * ``"u_out"``: float tensor of shape (seq_len,);
    * ``"pressure"``: float tensor of shape (seq_len,), all zeros when ``df``
      has no ``pressure`` column.

    Assumes every breath has the same number of rows, because the per-breath
    lists are stacked into rectangular arrays.

    Args:
        df: DataFrame with ``breath_id``, ``R``, ``C``, ``u_in``, ``u_out`` and
            optionally ``pressure`` columns. It is not modified.
    """

    def __init__(self, df):
        if "pressure" not in df.columns:
            # No target available: use a zero placeholder on a copy so the
            # caller's DataFrame does not silently gain a column.
            df = df.assign(pressure=0)

        self.df = df.groupby('breath_id').agg(list).reset_index()

        self.prepare_data()

    def __len__(self):
        """Number of breaths."""
        return self.df.shape[0]

    def prepare_data(self):
        """Stack the per-breath lists into arrays and assemble the feature tensor."""
        self.pressures = np.array(self.df['pressure'].tolist())

        rs = np.array(self.df['R'].tolist())
        # Read C from its own column (it was previously read from 'R', which
        # duplicated R and dropped C from the features).
        cs = np.array(self.df['C'].tolist())
        u_ins = np.array(self.df['u_in'].tolist())

        self.u_outs = np.array(self.df['u_out'].tolist())

        # Each array is (n_breaths, seq_len); adding an axis and concatenating
        # gives (n_breaths, 5, seq_len), transposed to (n_breaths, seq_len, 5).
        self.inputs = np.concatenate([
            rs[:, None],
            cs[:, None],
            u_ins[:, None],
            np.cumsum(u_ins, 1)[:, None],
            self.u_outs[:, None]
        ], 1).transpose(0, 2, 1)

    def __getitem__(self, idx):
        """Return the tensors of breath number ``idx`` as a dict."""
        data = {
            "input": torch.tensor(self.inputs[idx], dtype=torch.float),
            "u_out": torch.tensor(self.u_outs[idx], dtype=torch.float),
            "pressure": torch.tensor(self.pressures[idx], dtype=torch.float)
        }

        return data

In [None]:
# Build the dataset from the small sample frame and show its first item as a sanity check.
dataset = VenilatorDataset(df)
dataset[0]

### Model

In [None]:
import torch
import torch.nn as nn
import random
import gc
import time
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import GroupKFold

In [None]:
class RNNModel(nn.Module):
    """Per-time-step regressor: MLP -> bidirectional LSTM -> MLP head.

    Args:
        input_dim: number of features per time step. The default (4) is kept
            for backward compatibility; the dataset in this notebook stacks
            five feature columns, so pass ``input_dim=5`` when using it.
        lstm_dim: hidden size of the LSTM (per direction).
        dense_dim: output width of the input MLP (and LSTM input size).
        logit_dim: hidden width of the output head.
        num_classes: number of outputs per time step.

    Input is ``(batch, seq_len, input_dim)``; output is
    ``(batch, seq_len, num_classes)``.
    """

    def __init__(
        self,
        input_dim = 4,
        lstm_dim = 256,
        dense_dim = 256,
        logit_dim = 256,
        num_classes = 1
    ):

        super().__init__()

        # The activation class is nn.ReLU (nn.Relu does not exist).
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, dense_dim // 2),
            nn.ReLU(),
            nn.Linear(dense_dim // 2, dense_dim),
            nn.ReLU()
        )

        # nn.LSTM(input_size, hidden_size, ...) returns (output, (h_n, c_n)).
        # batch_first=True makes input/output (batch, seq_len, features)
        # instead of (seq_len, batch, features).
        self.lstm = nn.LSTM(dense_dim, lstm_dim, batch_first=True, bidirectional=True)

        self.logits = nn.Sequential(
            nn.Linear(lstm_dim * 2, logit_dim), # *2: forward and backward outputs are concatenated
            nn.ReLU(),
            nn.Linear(logit_dim, num_classes)
        )

    def forward(self, x):
        """Return one prediction vector per time step of ``x``."""
        features = self.mlp(x)
        features, _ = self.lstm(features)  # drop the final (h_n, c_n) states
        pred = self.logits(features)
        return pred

### Utils 

### Metrics

The competition will be scored as the mean absolute error between the predicted and actual pressures during the inspiratory phase of each breath. The expiratory phase is not scored.

In [None]:
class VentilatorLoss(nn.Module):
    """
    Directly optimizes the competition metric: mean absolute error over the
    steps where ``u_out == 0``; steps with ``u_out == 1`` get zero weight.
    """
    def forward(self, preds, y, u_out=None):
        """Return the masked MAE reduced over the last dimension.

        Args:
            preds: predicted pressures.
            y: target pressures, same shape as ``preds``.
            u_out: 0/1 tensor of the same shape; steps with 1 are ignored.
                When omitted, every step is weighted equally.

        Returns:
            Tensor with the last dimension reduced (one value per sequence).
            A sequence with no scored step divides by zero and yields nan.
        """
        # `u_out` used to be read as an undefined global; it is now an argument.
        if u_out is None:
            w = torch.ones_like(y)
        else:
            w = 1 - u_out
        mae = w * (y - preds).abs()
        mae = mae.sum(-1) / w.sum(-1)

        return mae

#### Fit

In [None]:
def fit(model,
        train_dataset,
        val_dataset,
        optimize="Adam",
        epochs=3,
        batch_size=32,
        val_bs=32,
        lr=1e-3,
        num_workers=0,
        device=None,
        ):
    """Train ``model`` and return its predictions on ``val_dataset``.

    Each dataset item must be a dict with ``"input"``, ``"u_out"`` and
    ``"pressure"`` tensors. The model output is expected to have a trailing
    dimension of size 1, which is squeezed away.

    Args:
        model: torch module mapping (batch, seq_len, features) to (batch, seq_len, 1).
        train_dataset: map-style dataset used for optimisation.
        val_dataset: map-style dataset evaluated after every epoch.
        optimize: name of an optimizer class in ``torch.optim``.
        epochs: number of passes over ``train_dataset``.
        batch_size: training batch size (the last incomplete batch is dropped).
        val_bs: validation batch size.
        lr: learning rate passed to the optimizer.
        num_workers: DataLoader worker processes.
        device: device to train on; defaults to CUDA when available, else CPU.

    Returns:
        ``np.ndarray`` of shape (len(val_dataset), seq_len) with the predictions
        from the last epoch, in dataset order. Empty when ``epochs`` is 0.
        No learning-rate scheduler is applied.
    """
    def _masked_mae(preds, target, u_out):
        # Competition metric: absolute error averaged over the steps with
        # u_out == 0. The clamp avoids 0/0 for a sequence with no such step.
        weight = 1 - u_out
        return ((target - preds).abs() * weight).sum(-1) / weight.sum(-1).clamp(min=1)

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    pin_memory = torch.cuda.is_available()  # pinning only helps host->GPU copies

    # Optimizer (looked up by the name given in `optimize`)
    optimizer = getattr(torch.optim, optimize)(model.parameters(), lr=lr)

    # Data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers,
        pin_memory=pin_memory,
        )

    # Validation is never shuffled so that row i of the returned array
    # corresponds to val_dataset[i].
    val_loader = DataLoader(
        val_dataset,
        batch_size=val_bs,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory,
        )

    pred = np.empty((0, 0), dtype=np.float32)

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            out = model(batch["input"].to(device)).squeeze(-1)
            loss = _masked_mae(
                out, batch["pressure"].to(device), batch["u_out"].to(device)
            ).mean()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses, val_chunks = [], []
        with torch.no_grad():
            for batch in val_loader:
                out = model(batch["input"].to(device)).squeeze(-1)
                per_breath = _masked_mae(
                    out, batch["pressure"].to(device), batch["u_out"].to(device)
                )
                val_losses.extend(per_breath.tolist())
                val_chunks.append(out.cpu().numpy())

        if val_chunks:
            pred = np.concatenate(val_chunks)

        avg_train_loss = float(np.mean(train_losses)) if train_losses else float("nan")
        avg_val_loss = float(np.mean(val_losses)) if val_losses else float("nan")
        print("Epoch {}/{}  train_mae={:.4f}  val_mae={:.4f}".format(
            epoch + 1, epochs, avg_train_loss, avg_val_loss))

    return pred

#### Predict

In [None]:
def predict(model, dataset, batch_size=32, num_workers=0, device=None):
    """Run ``model`` over ``dataset`` without gradients and return the predictions.

    Args:
        model: torch module mapping (batch, seq_len, features) to (batch, seq_len, 1).
        dataset: map-style dataset whose items are dicts with an ``"input"`` tensor.
        batch_size: inference batch size.
        num_workers: DataLoader worker processes.
        device: device to run on; defaults to CUDA when available, else CPU.

    Returns:
        ``np.ndarray`` of shape (len(dataset), seq_len), in dataset order.
        Empty when the dataset has no items.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    # No shuffling: row i of the result corresponds to dataset[i].
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    chunks = []
    with torch.no_grad():
        for batch in loader:
            out = model(batch["input"].to(device)).squeeze(-1)
            chunks.append(out.cpu().numpy())

    pred = np.concatenate(chunks) if chunks else np.empty((0, 0), dtype=np.float32)
    return pred


### Train

In [None]:
def train(config=None, df_train=None, df_val=None, df_test=None, weights_path=None):
    """Train one RNNModel and return its validation and test predictions.

    NOTE: defining this function rebinds the notebook-level name ``train``,
    which earlier cells use for the training DataFrame. Pass the frames in
    explicitly instead of relying on that global.

    Args:
        config: object exposing ``seed``, ``input_dim``, ``lstm_dim``,
            ``dense_dim``, ``logit_dim``, ``optimizer``, ``epochs``,
            ``bathc_size`` and ``save_weithts`` (attribute names as spelled
            in ``Config``).
        df_train: training rows (one row per time step).
        df_val: validation rows.
        df_test: optional test rows; when omitted no test prediction is made.
        weights_path: where to store the model ``state_dict``; nothing is
            saved when it is None or ``config.save_weithts`` is false.

    Returns:
        ``(pred_val, pred_test)``; ``pred_test`` is None without ``df_test``.

    Raises:
        ValueError: if ``config``, ``df_train`` or ``df_val`` is missing.
    """
    if config is None or df_train is None or df_val is None:
        raise ValueError("train() needs a config plus training and validation DataFrames")

    # Seed the Python, NumPy and torch RNGs (no seed_configure helper is
    # defined in this notebook).
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)

    model = RNNModel(
        input_dim=config.input_dim,
        lstm_dim=config.lstm_dim,
        dense_dim=config.dense_dim,
        logit_dim=config.logit_dim,
    )
    model.zero_grad()

    train_dataset = VenilatorDataset(df_train)
    val_dataset = VenilatorDataset(df_val)
    test_dataset = VenilatorDataset(df_test) if df_test is not None else None

    pred_val = fit(
        model,
        train_dataset,
        val_dataset,
        optimize=config.optimizer,
        epochs=config.epochs,
        batch_size=config.bathc_size,
        val_bs=config.bathc_size,
    )

    pred_test = predict(model, test_dataset) if test_dataset is not None else None

    # save weights
    if config.save_weithts and weights_path is not None:
        torch.save(model.state_dict(), weights_path)

    return pred_val, pred_test

### Predict

#### k-fold

In [None]:
def k_fold(config, df_train, df_test):
    """Run grouped k-fold training and average the test predictions.

    Folds are split with GroupKFold on ``breath_id`` so that all rows of a
    breath stay in the same fold. Only the folds listed in
    ``config.selction_fold`` (attribute name as spelled in ``Config``) are run.

    Args:
        config: object exposing ``k``, ``selction_fold``, ``selected_model``
            and the attributes needed by ``train``.
        df_train: training rows with a ``breath_id`` column.
        df_test: test rows.

    Returns:
        ``(fold_val_preds, pred_test)``: a list with the validation predictions
        of each fold that was run, and the mean of the per-fold test
        predictions (None when no fold was run).
    """
    splitter = GroupKFold(n_splits=config.k)
    fold_val_preds, fold_test_preds = [], []

    splits = splitter.split(df_train, groups=df_train["breath_id"])
    for fold, (train_idx, val_idx) in enumerate(splits):
        if fold not in config.selction_fold:
            continue

        pred_val, pred_test = train(
            config,
            df_train.iloc[train_idx].reset_index(drop=True),
            df_train.iloc[val_idx].reset_index(drop=True),
            df_test,
            weights_path="{}_fold{}.pt".format(config.selected_model, fold),
        )
        fold_val_preds.append(pred_val)
        fold_test_preds.append(pred_test)

    pred_test = np.mean(fold_test_preds, axis=0) if fold_test_preds else None
    return fold_val_preds, pred_test


### Config

In [None]:
class Config:
    """
    Parameters used for training
    """
    # General
    seed = 42
    verbose = 1
    device = "cuda" if torch.cuda.is_available() else "cpu"
    save_weithts = True            # (sic) original attribute name, kept for compatibility
    save_weights = save_weithts    # correctly spelled alias

    # k-fold
    k = 5
    selction_fold = [0, 1, 2, 3, 4]    # (sic) original attribute name
    selected_folds = selction_fold     # correctly spelled alias (same list object)

    # Model
    selected_model = 'rnn'
    input_dim = 5

    # These were left as `?` placeholders, which does not parse. 256 matches
    # the defaults in RNNModel's signature.
    dense_dim = 256
    lstm_dim = 256
    logit_dim = 256

    # Training
    loss = None  # TODO: was left blank (a syntax error); choose the loss identifier to use
    optimizer = "Adam"
    bathc_size = 128               # (sic) original attribute name
    batch_size = bathc_size        # correctly spelled alias
    epochs = 100

    lr = 1e-3
    #later
    
    
    

### Inference

In [None]:
# `df_train` / `df_test` were never defined, and the name `train` has been
# rebound to the training function by the cell that defines it, so the frames
# are loaded again from `path` here.
df_train = pd.read_csv(path + 'train.csv')
df_test = pd.read_csv(path + 'test.csv')

k_fold(Config, df_train, df_test)