In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# First look: load the training table with pandas' default dtype inference.
train_csv_path = r'/kaggle/input/ventilator-pressure-prediction/train.csv'
train = pd.read_csv(train_csv_path)

# The sample submission is not loaded here; the line is kept for reference.
#sample_submission = pd.read_csv(r'/kaggle/input/ventilator-pressure-prediction/sample_submission.csv')

In [None]:
# Preview the first rows to see which columns the raw table contains.
train.head()

* First column is an id column, we can use it for the index later
* breath_id is another id
* R seems constant
* C seems constant
* u_out seems bool

In [None]:
# Summary statistics of the columns in the raw table.
train.describe()

The table above is hard to read, so I'll plot its quantiles as a boxplot, which is more readable.

In [None]:
# Box plot of every column except the running row id.
sns.boxplot(data=train.drop(columns=['id']));

`breath_id` is clearly just another ID column.

In [None]:
# Same box plot with both id-like columns removed so the remaining columns are visible.
sns.boxplot(data=train.drop(columns=['id', 'breath_id']));

* R and C, u_out look strange.
* u_in and pressure have many outliers

In [None]:
# Collect the distinct values of the columns that looked odd in the box plot.
train_R_set = set(train['R'].unique())
train_C_set = set(train['C'].unique())

# print() stringifies each argument; sep='' keeps the output free of extra spaces.
print('Unique Values in R column of the train data: ', train_R_set, sep='')
print('Unique Values in C column of the train data: ', train_C_set, sep='')
print('Unique Values in u_out column: ', train['u_out'].unique(), sep='')

* u_out is boolean (it said so already in the data description)
* R and C are lung-describing categories (the data description explains them as balloon hole diameter and latex thickness). The important part is that each has only three values, so only 9 types of lungs exist in the *training* data set.

Because of the findings above, I decided to use the breath_id, R, C and id columns as a MultiIndex.

## Adapted import

In [None]:
# Reload the training data, this time parsing u_out as a boolean column.
# Earlier variant that set the index at read time (not used; the MultiIndex is built after loading):
# train = pd.read_csv(r'/kaggle/input/ventilator-pressure-prediction/train.csv', index_col=['breath_id', 'id'], dtype={'u_out': 'bool'} )
train_csv_path = r'/kaggle/input/ventilator-pressure-prediction/train.csv'
train = pd.read_csv(train_csv_path, dtype={'u_out': 'bool'})

In [None]:
# Combine R and C into one label per row (e.g. R=5, C=20 -> 'R05_C20'), then index
# the frame by (lung_type, breath_id, id) and sort it so MultiIndex slicing works.
train = (
    train
    .assign(lung_type=lambda df: df['R'].map(lambda r: 'R%02d' % r)
                                 + df['C'].map(lambda c: '_C%02d' % c))
    .set_index(['lung_type', 'breath_id', 'id'])
    .drop(columns=['R', 'C'])  # both values now live in the lung_type label
    .sort_index()
)
train.head()

`time_step` looks like an x axis.

In [None]:
# Index, column dtypes and memory footprint of the re-indexed frame.
train.info()

In [None]:
# Number of distinct breaths, and the average number of rows each breath contributes.
len_breaths = len(train.index.get_level_values('breath_id').unique())
points_per_breath = len(train) / len_breaths
print('Amount of single breaths: ' + str(len_breaths))
print('Amount of data points per breath: ' + str(points_per_breath))

In [None]:
All = slice(None)  # "take everything" selector for the MultiIndex .loc lookups below
n_breaths_shown = 4  # example breaths (= grid columns) plotted per lung type

# One grid row per lung type found in the data instead of a hard-coded 9, so the
# grid always matches the data and ax[...] can never run out of axes.
lung_types = train.index.get_level_values('lung_type').unique()
fig, ax = plt.subplots(len(lung_types), n_breaths_shown,
                       figsize=(25, 30 * len(lung_types) / 9))  # 25x30 inches for 9 lung types
ax = ax.flatten()
for row, lung_type in enumerate(lung_types):  # iterate over lung types
    breath_ids = (train.loc[(lung_type, All, All), All]
                  .index.get_level_values('breath_id')
                  .unique()[:n_breaths_shown])
    for col, breath_id in enumerate(breath_ids):  # iterate over single breaths
        # Address the axis by (row, col): a lung type with fewer breaths than grid
        # columns leaves its spare axes empty instead of shifting later lung types.
        train.loc[(lung_type, breath_id, All), All].plot(
            x='time_step',
            title='Lung Type: %s\nBreath ID: %2d' %(lung_type, breath_id),
            include_bool=True,  # keep the boolean u_out column in the plot
            sharex=True,
            grid=True,
            ax=ax[row * n_breaths_shown + col])
fig.tight_layout()

## Investigate test data set

In [None]:
# Load the test table with the same boolean parsing of u_out as the training data.
test_csv_path = r'/kaggle/input/ventilator-pressure-prediction/test.csv'
test = pd.read_csv(test_csv_path, dtype={'u_out': 'bool'})

In [None]:
# Combine R and C into one label per row (e.g. R=5, C=20 -> 'R05_C20'), then index
# the frame by (lung_type, breath_id, id) and sort it, the same layout as the training frame.
test = (
    test
    .assign(lung_type=lambda df: df['R'].map(lambda r: 'R%02d' % r)
                                 + df['C'].map(lambda c: '_C%02d' % c))
    .set_index(['lung_type', 'breath_id', 'id'])
    .drop(columns=['R', 'C'])  # both values now live in the lung_type label
    .sort_index()
)
test.head()

In [None]:
# Gather the lung_type labels present in each data set; sets ignore order and repeats.
train_lung_set = set(train.index.unique(level='lung_type'))
test_lung_set = set(test.index.unique(level='lung_type'))

lungs_identical = train_lung_set == test_lung_set
print('Are the "lungs" in the train and test data identical? %r' % lungs_identical)