# Introduction

In [this discussion](https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/280996) @lucasmorin suggested that some of the R and C values in the training set might be mislabeled. In this notebook we use a reasonably good [model](https://www.kaggle.com/mistag/train-ventilator-lstm-model-part-i) by @mistag to **predict pressures** for **all training breaths** under **all possible R and C combinations** (so we have nine predictions for each training breath).

@mistag's original prediction notebook is available [here](https://www.kaggle.com/mistag/pred-ventilator-lstm-model-0-149).

We only use the **model trained on 80% of the data**, so that the remaining 20% of the data was not part of the training process. If the model were trained on all the data and overfit, we might not be able to distinguish R and C mislabelling. This model on its own scores 0.1756.

# Preliminaries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from pickle import load
import json
!cp ../input/ventilator-feature-engineering/VFE.py .
from VFE import add_features
import gc

Rescaler

In [None]:
# Load the fitted rescaler saved by the feature-engineering notebook.
# The `with` block closes the file handle as soon as unpickling is done.
# NOTE(review): unpickling executes arbitrary code, so this is only acceptable
# while the .pkl comes from a trusted dataset.
with open('../input/ventilator-feature-engineering/RS.pkl', 'rb') as rs_file:
    RS = load(rs_file)

Fetch batch size from training session:

In [None]:
# Parse the parameter file written by the training notebook; `BATCH_SIZE`
# is read from it when predicting.
with open('../input/train-ventilator-lstm-model-part-i/train_params.json', 'r') as fp:
    config = json.loads(fp.read())

Pretrained model, tensorflow strategy

In [None]:
# Current distribution strategy (the default one unless another was set up).
strategy = tf.distribute.get_strategy()

# Checkpoint of the pretrained fold-0 LSTM.
m = '../input/train-ventilator-lstm-model-part-i/lstm_fold0.hdf5'

# Restore the model inside the strategy scope.
with strategy.scope():
    model = tf.keras.models.load_model(m)

Pressure statistics used later for [rounding](https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/276083)

In [None]:
# Training pressures saved by the feature-engineering notebook.
pressure = np.load('../input/ventilator-feature-engineering/y_train.npy')
P_MIN = np.min(pressure)
P_MAX = np.max(pressure)

# Derive the grid step from the two smallest distinct pressure values
# (np.unique returns them sorted). Subtracting the first two samples of the
# first breath only gives the step when they happen to lie on adjacent grid
# points, and gives 0 (division by zero when rounding) when they are equal.
# NOTE(review): presumes the pressure grid is uniformly spaced, as the
# rounding formula used later already requires.
unique_pressures = np.unique(pressure)
P_STEP = unique_pressures[1] - unique_pressures[0]

print('Min pressure: {}'.format(P_MIN))
print('Max pressure: {}'.format(P_MAX))
print('Pressure step: {}'.format(P_STEP))
print('Unique values:  {}'.format(unique_pressures.shape[0]))

# Test dataset

In [None]:
# Read the submission template and the raw test rows
submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')
test_ori = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')

# Build the model's input features; the identifier columns are not model inputs
test = add_features(test_ori)
test.drop(columns=['id', 'breath_id'], inplace=True)

# Rescale, then group rows into sequences of 80 time steps
test = RS.transform(test)
test = test.reshape((-1, 80, test.shape[-1]))

# Run the model and flatten its output to one value per submission row
with strategy.scope():
    test_pred = model.predict(test, batch_size=config['BATCH_SIZE'], verbose=2)
submission['pressure'] = test_pred.reshape(-1)

# Snap each prediction to the nearest point of the pressure grid and keep it
# within the observed pressure range
grid_index = np.round((submission['pressure'] - P_MIN) / P_STEP)
submission['pressure'] = np.clip(grid_index * P_STEP + P_MIN, P_MIN, P_MAX)

# Write the submission file
submission.to_csv('submission.csv', index=False)

# Release the large intermediates before the next section
del submission, test, test_ori, test_pred, grid_index
tf.keras.backend.clear_session()
gc.collect()

# Train dataset - original R, C

In [None]:
# Load train features, remove the pressure to mock the test data
train_ori = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
train_ori.drop(['pressure'], axis=1, inplace=True)

# One prediction per training row. The size is taken from the data instead of
# a hard-coded row count, so the cell keeps working if the input changes.
results = np.zeros(len(train_ori))

# Add features used by the model, drop unused columns
train = add_features(train_ori)
train.drop(['id', 'breath_id'], axis=1, inplace=True)

# Scale and reshape into sequences of 80 time steps
train = RS.transform(train)
train = train.reshape(-1, 80, train.shape[-1])

# Predict; flatten the model output to one value per training row
with strategy.scope():
    results[:] = model.predict(train, batch_size=config['BATCH_SIZE'], verbose=1).reshape(-1)

# Round to the grid of pressure values and keep within the observed range
results = np.round((results - P_MIN) / P_STEP) * P_STEP + P_MIN
results = np.clip(results, P_MIN, P_MAX)

# Clean up
del train, train_ori
tf.keras.backend.clear_session()
gc.collect()

# Save the results
np.save('results.npy', results)

# Train dataset - all R, C combinations

Define all possible R, C combinations

In [None]:
# Every (R, C) pairing of the three R values with the three C values,
# ordered by R first and then by C.
RC_pairs = [[r_value, c_value] for r_value in (5, 20, 50) for c_value in (10, 20, 50)]

Predict pressure values for all breaths and all possible R, C combinations

In [None]:
# One-hot columns that encode R, C and the R/C interaction
rc_one_hot_columns = [
    'R_20', 'R_5', 'R_50', 'C_10', 'C_20', 'C_50',
    'R__C_20__10', 'R__C_20__20', 'R__C_20__50',
    'R__C_50__10', 'R__C_50__20', 'R__C_50__50',
    'R__C_5__10', 'R__C_5__20', 'R__C_5__50',
]

# Load train features, remove the pressure to mock the test data.
# This is done once: the CSV and the engineered features are the same for
# every R, C combination, only the one-hot columns above are overwritten.
train_ori = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
train_ori.drop(['pressure'], axis=1, inplace=True)

# Add features used by the model, drop unused columns
train = add_features(train_ori)
train.drop(['id', 'breath_id'], axis=1, inplace=True)

# One row of predictions per R, C combination; shape derived from the data
results = np.zeros((len(RC_pairs), len(train)))

# For each R, C combination, make predictions for each train breath assuming it has this R, C
for idx, (our_R, our_C) in enumerate(RC_pairs):

    # Artificially set R, C to the values assumed in this loop. All one-hot
    # columns are reset first, so the previous iteration leaves no trace.
    for x in rc_one_hot_columns:
        train[x] = 0

    train[f'R_{our_R}'] = 1
    train[f'C_{our_C}'] = 1
    train[f'R__C_{our_R}__{our_C}'] = 1

    # Scale and reshape into sequences of 80 time steps.
    # NOTE(review): relies on RS.transform returning a new array and leaving
    # `train` unmodified so it can be reused in the next iteration - confirm.
    scaled = RS.transform(train)
    scaled = scaled.reshape(-1, 80, scaled.shape[-1])

    # Predict; flatten the model output to one value per training row
    with strategy.scope():
        results[idx] = model.predict(scaled, batch_size=config['BATCH_SIZE'], verbose=1).reshape(-1)

    # Round to the grid of pressure values and keep within the observed range
    results[idx] = np.round((results[idx] - P_MIN) / P_STEP) * P_STEP + P_MIN
    results[idx] = np.clip(results[idx], P_MIN, P_MAX)

    # Clean up the per-iteration model input
    del scaled
    tf.keras.backend.clear_session()
    gc.collect()

# Clean up the shared features
del train, train_ori
gc.collect()

# Save the results
np.save('results_vary_RC.npy', results)