In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 0. Looking for an EDA?
Check out this [EDA](https://www.kaggle.com/carlmcbrideellis/ventilator-pressure-eda-and-simple-submission) by [Ellis](https://www.kaggle.com/carlmcbrideellis).

# 1. Motivation

I wrote this notebook as a simple sanity check and an easy baseline solution. The idea is to compute the **average of all breathing patterns** and measure its MAE against the target in the train set. After that, the same average pattern is submitted so that the train MAE can be compared with the public leaderboard score.

In [None]:
# Load the competition CSV files; the `id` column is used as the row index of each frame.
train = pd.read_csv(
    '../input/ventilator-pressure-prediction/train.csv',
    index_col='id',
)
test = pd.read_csv(
    '../input/ventilator-pressure-prediction/test.csv',
    index_col='id',
)

# breathing patterns blocks are 80 rows long 
# this function computes the average pattern for a feature
def compute_mean(df, feature, block_len=80):
    """Return the position-wise average of ``feature`` over fixed-length blocks.

    The column is cut into consecutive blocks of ``block_len`` rows (in the
    row order of ``df``) and the blocks are averaged position by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to average.
    feature : str
        Name of the column to average.
    block_len : int, default 80
        Number of consecutive rows that make up one block.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``block_len`` with the mean value at each
        position inside a block.

    Raises
    ------
    ValueError
        If ``block_len`` is not positive or the number of values is not a
        multiple of ``block_len``.
    """
    values = np.asarray(df[feature])
    # Fail with a readable message instead of numpy's generic reshape error.
    if block_len <= 0 or values.size % block_len != 0:
        raise ValueError(
            f'cannot split {values.size} values of {feature!r} '
            f'into blocks of {block_len} rows'
        )
    # One row per block, one column per position inside the block.
    return values.reshape(-1, block_len).mean(axis=0)



In [None]:
# Average target pattern: mean pressure at each position of a breath over the train set.
p_mean = compute_mean(train, 'pressure')

# Repeat the pattern once per breath so it lines up row-for-row with `train`.
# A scalar repeat count keeps the result 1-D (one value per row) instead of
# building an (N, 1) column vector, and the pattern's own length replaces the
# hard-coded breath length.
n_train_breaths = len(train) // len(p_mean)
p_mean_tile = np.tile(p_mean, n_train_breaths)

# Store the baseline prediction next to the target.
train['p_mean'] = p_mean_tile

# MAE of the average pattern over every row of the train set.
train_abs_err = np.abs(train['pressure'].to_numpy() - train['p_mean'].to_numpy())
print(f'train set MAE for average pattern: {train_abs_err.mean():.3f}')


# 2. Submit!
We got an expected MAE of 3.141. Okay, now let's make a submission file!

In [None]:
# Repeat the average train pattern once per test breath. A scalar repeat count
# gives a 1-D array (one value per test row) rather than an (N, 1) column
# vector, and the repeat count is derived from the pattern's own length.
n_test_breaths = len(test) // len(p_mean)
p_mean_tile = np.tile(p_mean, n_test_breaths)
test['pressure'] = p_mean_tile

# produce the submission file: the `id` index plus the predicted `pressure` column
test['pressure'].to_csv('submission_average.csv', header=True)

# 3. Something is not right...
To my surprise, the **public score** was **6.358**, which is hugely different from the above **estimate** of **3.141**. I wondered what the issue was. Could the train and test sets be that different? Unlikely. After reading Ellis's EDA, it became clear.
The scoring is performed on only part of each breathing pattern. More specifically, only the rows where the feature `u_out` is 0 are scored. When `u_out` is 1, the breathing "pressure" is close to zero. This makes the overall MAE smaller when the entire breathing pattern is included.

# 4. Let's fix the scoring estimation
Let's recompute the MAE taking u_out into account.

In [None]:
# Rows in which u_out = 0, as a boolean mask and as the matching index labels.
scored_rows = train['u_out'] == 0
u_out_idx = train.index[scored_rows.to_numpy()]

# Select with the boolean mask through `.loc`. `u_out_idx` holds `id` labels
# (the frame was loaded with index_col='id'), not row positions, so feeding it
# to `.iloc` would pick rows by position and can read the wrong rows.
scored_abs_err = np.abs(
    train.loc[scored_rows, 'pressure'].to_numpy()
    - train.loc[scored_rows, 'p_mean'].to_numpy()
)
print(f'train set MAE for average pattern (restricted to u_out=0): {scored_abs_err.mean():.3f}')

# 5. Result
Now we get **MAE=6.575**, which is closer to the **public score** of **6.358** for this simple averaging solution.