In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import optuna
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler, normalize, MinMaxScaler
from sklearn.model_selection import train_test_split, GroupKFold, KFold

from IPython.display import display

In [None]:
# Flip to True to run the whole notebook on a truncated training set.
DEBUG = False

train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
sub = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

# In debug mode, cut the training data down to its first 80*10000 rows.
if DEBUG:
    train = train.iloc[:80 * 10000]

In [None]:
def add_features(df):
    """Return a new frame with engineered features and one-hot encoded R/C.

    The input frame is NOT modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``time_step``, ``u_in``,
        ``u_out``, ``R`` and ``C``. Any other columns are carried through.
        NOTE(review): the lag features compare each row's ``breath_id`` with
        the row 1/2 positions away, so they assume the rows of one breath are
        stored contiguously and in time order -- confirm against the data.

    Returns
    -------
    pandas.DataFrame
        The copy with the added feature columns, passed through
        ``pd.get_dummies`` (``R``, ``C`` and ``RC`` are strings at that point
        and are therefore replaced by indicator columns). Helper columns such
        as ``one``, ``count`` and the ``breath_id_*`` columns are still present
        in the result; the caller is expected to drop them.
    """
    # Work on a copy so the caller's DataFrame is left untouched (this also
    # avoids SettingWithCopyWarning when a sliced frame is passed in).
    df = df.copy()

    # Running sum of time_step * u_in within each breath.
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    # Interactions with the u_out flag.
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Running sum of u_in per breath, the 1-based row position inside the
    # breath, and their ratio (running mean of u_in).
    df['u_in_cumsum'] = df['u_in'].groupby(df['breath_id']).cumsum()
    df['one'] = 1
    df['count'] = df['one'].groupby(df['breath_id']).cumsum()
    df['u_in_cumsum_mean'] = df['u_in_cumsum'] / df['count']

    # breath_id of the neighbouring rows; the *same flags are 1 when that
    # neighbour belongs to the same breath as the current row, else 0.
    df['breath_id_lag'] = df['breath_id'].shift(1).fillna(0)
    df['breath_id_lag2'] = df['breath_id'].shift(2).fillna(0)
    df['breath_id_neg_lag2'] = df['breath_id'].shift(-2).fillna(0)
    df['breath_id_lagsame'] = (df['breath_id_lag'] == df['breath_id']).astype(int)
    df['breath_id_lag2same'] = (df['breath_id_lag2'] == df['breath_id']).astype(int)
    df['breath_id_neg_lag2same'] = (df['breath_id_neg_lag2'] == df['breath_id']).astype(int)

    # Shifted u_in values, zeroed wherever the shift crossed a breath boundary.
    df['u_in_lag'] = df['u_in'].shift(1).fillna(0)
    df['u_in_lag'] = df['u_in_lag'] * df['breath_id_lagsame']
    df['u_in_lag2'] = df['u_in'].shift(2).fillna(0)
    df['u_in_lag2'] = df['u_in_lag2'] * df['breath_id_lag2same']
    df['u_in_neg_lag2'] = df['u_in'].shift(-2).fillna(0)
    df['u_in_neg_lag2'] = df['u_in_neg_lag2'] * df['breath_id_neg_lag2same']

    # Difference between the current u_in and the lagged values.
    df['u_in_lag_diff'] = df['u_in'] - df['u_in_lag']
    df['u_in_lag2_diff'] = df['u_in'] - df['u_in_lag2']

    # Last u_in value of each breath, broadcast to every row of that breath.
    df['u_in_last'] = df.groupby('breath_id')['u_in'].transform('last')
    #df['u_in_mean_for_breath'] = df.groupby('breath_id')['u_in'].transform('mean')
    #df['u_in_max_for_breath'] = df.groupby('breath_id')['u_in'].transform('max')

    # Treat R, C and their concatenation as categorical labels so that
    # get_dummies turns them into indicator columns.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['RC'] = df['R'] + df['C']

    output_df = pd.get_dummies(df)
    return output_df

# Run the feature builder on both the training and the test frame; each name
# is rebound to the frame returned by add_features.
train = add_features(train)
test = add_features(test)

In [None]:
# Identifier and helper columns that are removed from the test frame.
test_drop_cols = [
    'id', 'breath_id', 'one', 'count',
    'breath_id_lag', 'breath_id_lag2', 'breath_id_neg_lag2',
    'breath_id_lagsame', 'breath_id_lag2same', 'breath_id_neg_lag2same',
]
# The training frame loses the same columns plus the 'pressure' target.
train_drop_cols = ['pressure'] + test_drop_cols

In [None]:
# Pull the target out before it is dropped, laid out as rows of 80 values,
# then strip the non-feature columns from both frames.
targets = train['pressure'].to_numpy().reshape(-1, 80)
train = train.drop(columns=train_drop_cols)
test = test.drop(columns=test_drop_cols)

In [None]:
# Quick visual check of the remaining feature columns.
train.head()

In [None]:
# Fit the scaler on the training features only and reuse those statistics for
# the test features. Re-fitting on the test set would map the two splits with
# different centre/scale values, so the model would see test inputs on a
# different scale than the one it was trained on.
RS = RobustScaler()
train = RS.fit_transform(train)
test = RS.transform(test)

In [None]:
# Both splits must have the same number of feature columns; reshaping the test
# array with a mismatching width could otherwise succeed and silently scramble
# the values across time steps and features.
assert train.shape[-1] == test.shape[-1], (
    f"train/test feature count mismatch: {train.shape[-1]} vs {test.shape[-1]}"
)
# Regroup the flat rows into blocks of 80 rows: (n_blocks, 80, n_features).
train = train.reshape(-1, 80, train.shape[-1])
test = test.reshape(-1, 80, test.shape[-1])

In [None]:
# Training hyper-parameters: epochs per fold and mini-batch size.
EPOCH = 300
BATCH_SIZE = 1024

# Connect to the TPU and build the distribution strategy used for training.
# NOTE(review): tf.distribute.experimental.TPUStrategy appears to be the
# pre-stable alias of tf.distribute.TPUStrategy -- confirm against the
# installed TensorFlow version before switching.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# 5-fold cross-validation: for every fold a fresh stacked bidirectional-LSTM
# model is trained, scored on the held-out part, and used to predict the test
# set. Results are collected in `fold_mae` and `test_preds`.
with tpu_strategy.scope():
    kf = KFold(n_splits=5, shuffle=True, random_state=2021)
    test_preds = []
    fold_mae = []
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train, targets)):
        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
        X_train, X_valid = train[train_idx], train[valid_idx]
        y_train, y_valid = targets[train_idx], targets[valid_idx]
        model = keras.models.Sequential([
            keras.layers.Input(shape=train.shape[-2:]),
            keras.layers.Bidirectional(keras.layers.LSTM(300, return_sequences=True)),
            keras.layers.Bidirectional(keras.layers.LSTM(250, return_sequences=True)),
            keras.layers.Bidirectional(keras.layers.LSTM(150, return_sequences=True)),
            keras.layers.Bidirectional(keras.layers.LSTM(100, return_sequences=True)),
            keras.layers.Dense(50, activation='selu'),
            keras.layers.Dense(1),
        ])

        # ExponentialDecay is a per-optimizer-step schedule and decay_steps is
        # expressed in steps (400 epochs' worth of batches for an 80% training
        # split), so it has to be handed to the optimizer. Wrapping it in a
        # LearningRateScheduler callback would feed it the epoch index
        # instead, leaving the learning rate almost constant over training.
        scheduler = ExponentialDecay(1e-3, 400*((len(train)*0.8)/BATCH_SIZE), 1e-5)
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=scheduler), loss='mae')
        #es = EarlyStopping(monitor='val_loss', patience=15, verbose=1, mode='min', restore_best_weights=True)

        history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=EPOCH, batch_size=BATCH_SIZE,
                 verbose=1)

        # Score the fold on its held-out part.
        y_pred = model.predict(X_valid)
        loss = mae(np.ravel(y_valid), np.ravel(y_pred))
        fold_mae.append(loss)

        print(f'Fold {fold+1} MAE: {loss}')

        # Use the recorded history length for the x-axis so the plot stays
        # valid if training ever stops before EPOCH epochs.
        epochs_run = np.arange(len(history.history['loss']))
        plt.figure(figsize=(15,5))
        plt.plot(epochs_run,history.history['loss'],label='Train',color='gray')
        plt.plot(epochs_run,history.history['val_loss'],label='Val',color='green')
        plt.ylabel('Loss',size=14)
        plt.legend()
        plt.title(f"Fold: {fold+1}")
        plt.show()

        # Flatten the test predictions to one value per input row.
        test_preds.append(model.predict(test).reshape(-1))
        

In [None]:
# Validation MAE of each fold (x-axis is the 0-based fold index), followed by
# the average over all folds.
ax = plt.gca()
ax.plot(fold_mae)
ax.set_title('Fold MAE')
ax.set_xlabel('Fold')
ax.set_ylabel('MAE')
print(f"Mean MAE: {np.mean(fold_mae)}")

In [None]:
# Average the per-fold test predictions. Dividing by the number of collected
# predictions (rather than a hard-coded 5) keeps the average correct if the
# number of folds is ever changed.
sub['pressure'] = sum(test_preds) / len(test_preds)
sub.to_csv('submission.csv', index=False)