In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Folder holding the competition files; file names are appended to it directly,
# so the trailing '/' is required.
BASE_FOLD = '/kaggle/input/ventilator-pressure-prediction/'
# When True, a later cell cuts the training frame down to its first 80*1000 rows.
DEBUG = False

In [None]:
# Read the train and test tables, using the 'id' column as the row index.
train = pd.read_csv(BASE_FOLD + 'train.csv', index_col='id')
test = pd.read_csv(BASE_FOLD + 'test.csv', index_col='id')
# Show both shapes as the cell output.
train.shape,test.shape

In [None]:
# Debug mode: keep only the first 80*1000 training rows.
# NOTE(review): presumably 1000 breaths of 80 rows each — confirm rows are grouped by breath.
if DEBUG:
    train = train[:80*1000]

# Train/Test data is batched by 80

In [None]:
train.breath_id.nunique(), train.breath_id.nunique()*80

In [None]:
test.breath_id.nunique(), test.breath_id.nunique()*80

# Train and test data have similar mean/std

In [None]:
train.describe()

In [None]:
test.describe()

In [None]:
#plot
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (10,10)

#sklearn
from sklearn.preprocessing import minmax_scale

# Check each batch

In [None]:
# Min-max scale the six raw columns over the whole training set, then keep the
# first 80 rows (one batch) as the sample inspected in the following cells.
scaled_cols = ['R', 'C', 'time_step', 'u_in', 'u_out', 'pressure']
df_full = pd.DataFrame(minmax_scale(train[scaled_cols]), columns=scaled_cols)
df = df_full.iloc[:80, :]
df.describe()

In [None]:
# One histogram per column of the sample batch, each preceded by the column name.
for column_name, column_values in df.items():
    print(column_name)
    column_values.hist()
    plt.show()


In [None]:
pd.plotting.scatter_matrix(df, alpha=0.2)

In [None]:
plt.rcParams["figure.figsize"] = (10,10)
# Pairwise correlations between the columns of the 80-row sample.
corr_df = df.corr()
# Flag the upper triangle (diagonal included); the flagged cells are passed to
# the heatmap as its mask so each pair appears once.
mask = np.zeros_like(corr_df)
mask[np.triu_indices_from(mask)] = True
#generate plot
sns.heatmap(corr_df, cmap='RdYlGn', vmax=1.0, vmin=-1.0 , mask = mask, linewidths=2.5)
plt.yticks(rotation=0) 
plt.xticks(rotation=90) 
plt.show()

In [None]:
df.mean().plot(style='.')

In [None]:
# Set the width and height of the figure
plt.figure(figsize=(16,6))

# Line chart showing
sns.lineplot(data=df)

In [None]:
# For the sample batch: number of distinct R values, number of distinct C values,
# and the summed absolute step-to-step change of u_out.
df['R'].nunique(),df['C'].nunique(),df['u_out'].diff().abs().sum()

In [None]:
#breath_id is not continuous
train['breath_id'].unique()[:81]

In [None]:
test['breath_id'].unique()[:81]

# From the above, we can see that R and C are constant within a batch and that u_out changes only once. Let's confirm this across all train/test data

In [None]:
def count_changes(series):
    """Return the sum of absolute differences between consecutive values.

    For a 0/1 indicator series this equals the number of times the value flips.
    The first element has no predecessor and contributes nothing.
    """
    step = series.diff()
    return step.abs().sum()

In [None]:
agg = train.groupby('breath_id').agg({'u_out': count_changes}).reset_index(drop=False)

In [None]:
agg.describe()

In [None]:
# Per-breath mean and standard deviation of R and C; a std of 0 everywhere means
# both are constant within each breath.
# String aliases are used instead of np.mean/np.std: newer pandas deprecates
# passing those callables to agg, while the resulting column names stay the same.
agg = train.groupby('breath_id').agg({'R': ['mean', 'std'], 'C': ['mean', 'std']}).reset_index(drop=False)
agg.describe()

In [None]:
# Same checks on the test set: summed u_out changes plus mean/std of R and C per breath.
# String aliases replace np.mean/np.std, which newer pandas deprecates inside agg;
# the resulting column names are unchanged.
agg = test.groupby('breath_id').agg({'u_out': count_changes, 'R': ['mean', 'std'], 'C': ['mean', 'std']}).reset_index(drop=False)
agg.describe()

In [None]:
# First, last, mean and standard deviation of pressure for every breath.
# 'mean'/'std' aliases replace the np.mean/np.std callables (deprecated inside agg
# in newer pandas); the second-level column names stay 'mean' and 'std'.
agg = train.groupby('breath_id').agg({'pressure': ['first', 'last', 'mean', 'std']}).reset_index(drop=False)
agg.head()

In [None]:
def flatten_name(prefix, src_names):
    """Turn multi-part column names into dotted strings that start with `prefix`.

    Each entry of `src_names` is unpacked into its parts and joined with '.',
    e.g. ('u_out', 'first') with prefix 'const' gives 'const.u_out.first'.
    An empty part leaves a trailing dot.
    """
    return ['.'.join([prefix, *parts]) for parts in src_names]

In [None]:
agg.columns = flatten_name('pressure', agg.columns)

In [None]:
# Absolute value of (first - last) pressure per breath, with both terms scaled by 0.5,
# i.e. half of the absolute gap.
# NOTE(review): the column name suggests the full difference — confirm the 0.5 factor is intended.
agg['last_first_diff'] = (agg['pressure.pressure.first']*0.5-agg['pressure.pressure.last']*0.5).abs()

In [None]:
agg.head()

In [None]:
agg.describe()

As we can see, the last value of pressure is similar to the first one, while the average is higher than both the first and the last value.

In [None]:
neg_pressure = agg[agg['pressure.pressure.first'] <= 0]
neg_pressure.shape

In [None]:
data=agg[['pressure.pressure.first', 'pressure.pressure.last', 'pressure.pressure.mean', 
          'last_first_diff']].plot()

## Split train to batch, change pressure to log change

In [None]:
# Measures how far into a breath u_out switches.
def change_index(serial):
    """Sum the absolute differences between every element and the last element.

    For a 0/1 series that ends in 1, the result is the number of 0 entries.
    """
    offsets = serial - serial.iloc[-1]
    return offsets.abs().sum()

In [None]:
%%script echo skipping
#add time serial id
df_train = train.copy()
df_train["time_id"] = train.groupby("breath_id")["time_step"].rank(method="first", ascending=True)


# make pressure not negtive
df_train['pressure.log'] = df_train['pressure']+2


df_train['pressure.log'] = np.log(df_train['pressure.log'])
# df_train.describe()

df_train["pressure.logdiff"] = df_train.groupby("breath_id")["pressure.log"].diff()

df_train["u_in.log1p"] = np.log1p(df_train['u_in'])
# df_train.describe()

df_train["u_in.log1pdiff"] = df_train.groupby("breath_id")["u_in.log1p"].diff()
# df_train.head(81)

pv_train = pd.pivot_table(df_train,index=['breath_id'], columns=df_train.groupby(['breath_id']).cumcount().add(1), 
               values=['u_in', 'u_in.log1p', 'u_in.log1pdiff', 'pressure'])
pv_train.columns=pv_train.columns.map('{0[0]}{0[1]}'.format)

agg = train.groupby('breath_id').agg({'u_out': ['first', 'last', change_index], 'R': np.mean, 'C': np.mean}).reset_index(drop=False)
agg.columns = flatten_name('const', agg.columns)
# agg.head(81)

agg.rename(columns={'const.breath_id.':'breath_id'}, inplace=True)
# agg.head()

pv_train = pd.merge(pv_train, agg, on='breath_id', how='left')
pv_train.head(81)

In [None]:
#Feature engineering
def add_features(df):
    """Add per-row engineered features computed within each breath.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'breath_id', 'time_step', 'u_in' and 'u_out'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the engineered
        ones, passed through ``pd.get_dummies``. Every NaN in the frame is
        replaced by 0. The input frame is not modified.
    """
    # Copy first: the column assignments below would otherwise be written into
    # the caller's frame (and raise SettingWithCopy warnings on a slice).
    df = df.copy()

    breath = df['breath_id']
    u_in_by_breath = df['u_in'].groupby(breath)
    u_out_by_breath = df['u_out'].groupby(breath)

    # Running sum of time_step * u_in and of u_in inside each breath.
    df['area'] = (df['time_step'] * df['u_in']).groupby(breath).cumsum()
    df['u_in_cumsum'] = u_in_by_breath.cumsum()

    # Past (lag) and future (lag_back) values of both control inputs, 1 to 4
    # steps away, never crossing a breath boundary.
    for k in (1, 2, 3, 4):
        df[f'u_in_lag{k}'] = u_in_by_breath.shift(k)
        df[f'u_out_lag{k}'] = u_out_by_breath.shift(k)
        df[f'u_in_lag_back{k}'] = u_in_by_breath.shift(-k)
        df[f'u_out_lag_back{k}'] = u_out_by_breath.shift(-k)
    # The shifts leave NaN at the edges of each breath; zero-fill the whole frame.
    df = df.fillna(0)

    # Group again on the filled frame so the statistics see the filled values.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']
    u_in_max = u_in_by_breath.transform('max')

    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = u_out_by_breath.transform('max')

    for k in (1, 2):
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']
        df[f'u_out_diff{k}'] = df['u_out'] - df[f'u_out_lag{k}']

    # Distance of the current u_in from the breath's maximum and mean.
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']

    for k in (3, 4):
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']
        df[f'u_out_diff{k}'] = df['u_out'] - df[f'u_out_lag{k}']

    # Interactions with the u_out indicator.
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    return pd.get_dummies(df)

In [None]:
train = add_features(train)
test = add_features(test)

In [None]:
train.shape, test.shape

In [None]:
def get_feature(train, target=False):
    """Reshape the per-row frame into one wide row per breath.

    Parameters
    ----------
    train : pandas.DataFrame
        Per-row frame containing 'breath_id', 'R', 'C', 'time_step', 'u_out'
        and any number of further value columns. It is only read, never modified.
    target : bool, optional
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        The pivoted value columns merged with per-breath summaries of
        'u_out' (first, last, change_index) and the means of 'R' and 'C'.
    """
    # Every column except the keys/constants listed here is spread out per time step.
    value_cols = [col for col in train.columns if col not in ['breath_id', 'R', 'C', 'time_step', 'u_out']]

    # 1-based position of each row inside its breath; used as the pivoted column key.
    step_in_breath = train.groupby(['breath_id']).cumcount().add(1)
    pv_train = pd.pivot_table(train, index=['breath_id'], columns=step_in_breath,
                              values=value_cols)
    print(pv_train.shape)

    # String aliases instead of np.mean: same column names, no pandas deprecation warning.
    features = {'u_out': ['first', 'last', change_index], 'R': 'mean', 'C': 'mean'}
    agg = train.groupby('breath_id').agg(features).reset_index(drop=False)
    agg.columns = flatten_name('const', agg.columns)
    agg = agg.rename(columns={'const.breath_id.': 'breath_id'})

    # NOTE(review): pv_train still has two-level columns here while agg is flat;
    # newer pandas versions may reject a merge between different column levels — confirm
    # against the pandas version in use before flattening, since downstream column
    # filters depend on the current labels.
    pv_train = pd.merge(pv_train, agg, on='breath_id', how='left')

    return pv_train

In [None]:
%%time
pv_train = get_feature(train, True)
#pv_train.to_csv('pv_train.csv', index=False)
pv_train.head()

In [None]:
import collections
# List every column label that occurs more than once in the pivoted training frame.
label_counts = collections.Counter(pv_train.columns.to_list())
[label for label, times_seen in label_counts.items() if times_seen > 1]

In [None]:
[col for col in pv_train.columns if 'pressure' in col]

## Get feature for test

In [None]:
pv_test = get_feature(test)
pv_test.head(81)

In [None]:
#pv_test.to_csv('pv_test.csv', index=False)

In [None]:
# Target block: every column of the pivoted training frame whose label contains 'pressure'.
y_features = [col for col in pv_train.columns if 'pressure' in col]
y_target = pv_train[y_features]
y_target.head()

# Model

In [None]:
import gc

In [None]:
# del train,test
_= gc.collect()

In [None]:
from numpy.random import seed
import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Seed NumPy's global generator as well as TensorFlow's; `seed` was imported
# above for this purpose but was never called.
seed(42)
tf.random.set_seed(42)
from tensorflow import keras
import numpy as nptensorflow
from tensorflow.keras import backend as K

In [None]:
#reset Keras Session
def reset_keras():
    """Close the current TF1-compat Keras session and install a newly configured one.

    Called between folds after the model has been deleted; finishes with a
    garbage-collection pass.
    """
    # Grab the active session, clear Keras' global state, then close that session.
    sess = tf.compat.v1.keras.backend.get_session()
    tf.compat.v1.keras.backend.clear_session()
    sess.close()
    # Fetch the session again after the reset; the returned handle is not used afterwards.
    sess = tf.compat.v1.keras.backend.get_session()

    # use the same config as you used to create the session
    config = tf.compat.v1.ConfigProto()
    # Memory fraction 1 and visible device "0" are set on the config that backs the new session.
    config.gpu_options.per_process_gpu_memory_fraction = 1
    config.gpu_options.visible_device_list = "0"
    tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))
    gc.collect()

## Baseline

In [None]:
def base_model(feature_num, out_num, hidden_units, dropout_rates):
    """Build a fully connected Keras model and print its summary.

    Parameters
    ----------
    feature_num : int
        Width of the input vector.
    out_num : int
        Width of the linear output layer named 'prediction'.
    hidden_units : sequence of int
        Width of each Dense block. A first entry of -1 makes the first block
        as wide as the input.
    dropout_rates : sequence of float
        One rate per Dense block; a rate below 1 adds a Dropout layer after
        the block, any other value skips it.

    Returns
    -------
    keras.Model
        The uncompiled model.
    """
    print('base_model', feature_num, out_num, hidden_units, dropout_rates)

    num_input = keras.Input(shape=(feature_num,), name='num_data')

    # Resolve the -1 placeholder so all blocks can be built by the same loop.
    first_width = feature_num if hidden_units[0] == -1 else hidden_units[0]
    layer_widths = [first_width] + [hidden_units[i] for i in range(1, len(hidden_units))]

    x = num_input
    for i, width in enumerate(layer_widths):
        # Dense -> BatchNorm -> swish, optionally followed by Dropout.
        x = keras.layers.Dense(width)(x)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation('swish')(x)
        if dropout_rates[i] < 1:
            x = keras.layers.Dropout(dropout_rates[i])(x)

    # Linear head producing out_num values per sample.
    prediction = keras.layers.Dense(out_num, activation='linear', name='prediction')(x)

    model = keras.Model(inputs=[num_input], outputs=prediction)
    model.summary()

    return model


In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

def train_and_evaluate_nn_base(train, test, params, n_splits=None):
    """Train the dense baseline with K-fold CV and predict the test set.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Wide frames. Columns whose label contains 'pressure' are the targets;
        columns whose label contains neither 'pressure' nor 'breath_id' are the
        inputs. Missing input values are filled in place, in both frames, with
        the training column means.
    params : dict
        Must provide 'hidden_units', 'dropout_rates', 'learning_rate',
        'batch_size' and 'epochs'.
    n_splits : int, optional
        Number of folds. Defaults to the notebook-level ``NFOLD``.

    Returns
    -------
    tuple of numpy.ndarray
        Out-of-fold predictions with the same shape as the target block, and
        test predictions averaged over the folds.
    """
    if n_splits is None:
        n_splits = NFOLD

    features = [col for col in train.columns if 'pressure' not in col and 'breath_id' not in col]
    y_features = [col for col in train.columns if 'pressure' in col]
    print(train.shape, test.shape, len(features), len(y_features))
    y = train[y_features]

    y_train = np.zeros(y.shape)
    y_test = np.zeros((test.shape[0], y.shape[1]))

    print('Check null in train', train[features].isnull().any())
    print('Check null in test', test[features].isnull().any())
    # Compute the training means once and use them for both frames.
    feature_means = train[features].mean()
    train[features] = train[features].fillna(feature_means)
    test[features] = test[features].fillna(feature_means)
    print('Check null in test again', test[features].isnull().any())

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train)):
        print('Fold:', fold)
        x_train, x_val = train.iloc[train_idx], train.iloc[valid_idx]
        y_tra, y_val = y.iloc[train_idx], y.iloc[valid_idx]

        # The scaler is fitted on the training part of the fold only and then
        # reused for the validation part and the test set.
        scaler = MinMaxScaler(feature_range=(-1, 1))
        num_data = scaler.fit_transform(x_train[features].values)
        num_data_val = scaler.transform(x_val[features].values)

        model = base_model(len(features), len(y_features), params['hidden_units'], params['dropout_rates'])

        model.compile(
            keras.optimizers.Adam(learning_rate=params['learning_rate']),
            loss=keras.losses.MeanAbsoluteError()
        )

        es = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=20, verbose=0,
            mode='min', restore_best_weights=True)

        plateau = tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss', factor=0.2, patience=7, verbose=0,
            mode='min')

        model.fit([num_data],
                  y_tra,
                  batch_size=params['batch_size'],
                  epochs=params['epochs'],
                  validation_data=([num_data_val], y_val),
                  callbacks=[es, plateau],
                  validation_batch_size=len(y_val),
                  shuffle=True,
                  verbose=1)

        pred_val = model.predict([num_data_val]).reshape(-1, len(y_features))
        y_train[valid_idx] = pred_val
        test_nn = scaler.transform(test[features].values)
        y_test += model.predict([test_nn]).reshape(-1, len(y_features))

        print(y_train[valid_idx][:3], y_test[:3])
        # Score against this fold's own targets rather than a notebook-level
        # global; the metric is mean absolute error, so it is labelled MAE.
        print('NN base MAE Fold:', mean_absolute_error(y_val, pred_val))

        #Delete model and release GPU memory
        del model, num_data, num_data_val, scaler, test_nn
        gc.collect()
        reset_keras()
    # Test predictions were summed over the folds; turn the sum into a mean.
    y_test /= n_splits

    return y_train, y_test

In [None]:
import time

# Wall-clock start, reported together with the CV score at the end of the cell.
nn_base_time = time.time()
NFOLD = 5


params = {
    'batch_size': 4096,
    'epochs': 1000,
    'learning_rate': 0.006,
#     'hidden_units': [128, 128, 64, 32], 1.180226882297499
    'hidden_units': [-1, 2048, 1024, 512],
    'dropout_rates': [0.03527936123679956, 0.32024444956111164,
                     0.2716856145683449,
                     0.4379233941604448] # 1 means no dropout
}

y_nn_train1, y_nn_test1 = train_and_evaluate_nn_base(pv_train, pv_test, params)
_= gc.collect()

# Out-of-fold entries start as zeros, so exact zeros left here may point to rows no fold predicted.
print('Check zero in prediction:', (y_nn_train1 == 0).sum())
np.savetxt('pred_nn.csv', y_nn_train1, delimiter=',')

# The score is a mean absolute error, so it is labelled MAE rather than MSE.
print( 'NN base MAE CV:', mean_absolute_error(y_target, y_nn_train1), 'time: ', int(time.time() - nn_base_time), 's', y_nn_test1[:3])

In [None]:
# Flatten the test prediction matrix row by row into a single vector.
# NOTE(review): assumes this breath-by-breath, step-by-step order matches the row
# order of sample_submission — confirm.
y_pressure = y_nn_test1.reshape((-1))

In [None]:
sample_submission = pd.read_csv(BASE_FOLD + 'sample_submission.csv')

In [None]:
sample_submission.pressure = y_pressure

In [None]:
sample_submission.head()

In [None]:
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Attach the flattened out-of-fold predictions to the per-row training frame.
# NOTE(review): assumes train rows are ordered by breath_id and step exactly like the
# pivoted prediction matrix — confirm.
train['pressure.pred'] = y_nn_train1.reshape((-1))

In [None]:
train.head(81)

In [None]:
# Set the width and height of the figure
plt.figure(figsize=(16,6))

# Line chart showing
sns.lineplot(data=train.iloc[:80,:][['R', 'C', 'time_step', 'u_in', 'u_out', 'pressure', 'pressure.pred']])