In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the train data and get the general feeling

In [None]:
# Load the training CSV into the `train` DataFrame that the later cells analyse.
train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')

First few lines

In [None]:
# Show the first rows of the frame (DataFrame.head with its default row count).
train.head()

Data columns and their types

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
train.info()

There are no missing values

In [None]:
# Per-column flag: True if the column contains at least one missing value.
train.isna().any()

Basic descriptive statistics

In [None]:
# Descriptive statistics of the numeric columns.
train.describe()

id numbers are unique as expected (no duplicates)

In [None]:
# True if any value of the `id` column occurs more than once.
train.duplicated(subset='id').any()

# Individual breaths

Number of different breath data

In [None]:
# Distinct breath ids in order of first appearance, and how many there are.
unique_breaths = pd.unique(train['breath_id'])
num_breaths = len(unique_breaths)
print(num_breaths)

There are several rows corresponding to each breath:

In [None]:
# breath_id of the first 500 rows: shows runs of rows sharing one breath id.
train['breath_id'].iloc[:500].plot();

For each breath we have exactly 80 data points:

In [None]:
# Number of rows recorded for each breath, then the distinct row counts.
breath_lengths = train.groupby('breath_id')['id'].count()
breath_lengths.unique()

In [None]:
# Later cells stride, reshape and take modulo by BREATH_LENGTH, which is only
# valid if every breath has the same number of rows - so verify it instead of
# silently taking the first of several lengths.
distinct_breath_lengths = breath_lengths.unique()
assert len(distinct_breath_lengths) == 1, 'breaths have differing numbers of rows'
BREATH_LENGTH = distinct_breath_lengths[0]

# R and C

R and C values are constant within each breath (having zero standard deviation)

In [None]:
# Standard deviation of R and of C inside each breath; print the distinct values found.
r_c_std_in_breaths = train.groupby('breath_id')[['R', 'C']].std()
for column in ('R', 'C'):
    print(r_c_std_in_breaths[column].unique())

R has only three distinct values:

In [None]:
# One R value per breath (the per-breath mean), indexed by breath_id.
r_values = train.groupby('breath_id')['R'].mean()
print(r_values)
print()
print('Unique values:')
print(r_values.value_counts())

# Sorted distinct R values as integers.
r_unique = np.unique(r_values.values).astype(int)

So does C:

In [None]:
# One C value per breath (the per-breath mean), indexed by breath_id.
c_values = train.groupby('breath_id')['C'].mean()
print(c_values)
print()
print('Unique values:')
print(c_values.value_counts())

# Sorted distinct C values as integers.
c_unique = np.unique(c_values.values).astype(int)

The number of breaths differs by about a factor of two between the various R/C combinations.

For R = 20 we see C = 50 most often, for R = 5, 50 we see C = 10 most often.

In [None]:
# One row per (R, C) pair: R, C and the number of breaths with that pair
# (matching row count divided by the rows per breath).
rc_values = np.array([
    [r, c, ((train['R'] == r) & (train['C'] == c)).sum() // BREATH_LENGTH]
    for r in r_unique
    for c in c_unique
])

x = range(rc_values.shape[0])
plt.bar(x, rc_values[:, 2])
plt.xticks(x, [f'{r}_{c}' for r, c in rc_values[:, :2]])
plt.xlabel('R_C')
plt.ylabel('Number counts')
plt.show()

# Time steps in individual breaths

Take a look at time sampling for the first two breaths. Looks like pretty uniform sampling in time.

In [None]:
# Rows of the breaths with breath_id 1 and 2.
first_breath  = train.loc[train['breath_id'] == 1]
second_breath = train.loc[train['breath_id'] == 2]

# time_step against the within-breath step index for both breaths.
x = range(BREATH_LENGTH)
t1 = first_breath['time_step']
t2 = second_breath['time_step']
plt.plot(x, t1)
plt.plot(x, t2, linestyle = '--')

One time step seems to correspond to about

In [None]:
# BREATH_LENGTH samples span BREATH_LENGTH - 1 intervals, so the average step
# is the elapsed time divided by the number of intervals, not of samples.
(max(t1) - min(t1)) / (BREATH_LENGTH - 1)

The two time series for the first two breaths are not perfectly aligned

In [None]:
# Element-wise difference between the time stamps of the two breaths.
plt.plot(t1.to_numpy() - t2.to_numpy());

All breaths start at timestep zero

In [None]:
# Spread of the per-breath minimum time_step; zero means all breaths share the same start value.
train.groupby('breath_id')['time_step'].min().std()

Each breath is ~ 2.5 to 3 seconds long

In [None]:
# Last (largest) time_step of every breath, indexed by breath_id.
time_step_max = train.groupby('breath_id')['time_step'].max()
time_step_max.describe()

In [None]:
# Distribution of the per-breath maximal time_step, 50 bins.
plt.hist(time_step_max, bins = 50);

Most of the maximal timesteps are unique, so there is some randomness in these

In [None]:
# Number of distinct maximal time_step values.
time_step_max.unique().size

The time steps within each individual breath are mostly equal sized, varying by a few percent max

In [None]:
# Step sizes inside each of the two sample breaths (first diff is NaN and is dropped),
# and the ratio of the largest to the smallest step.
first_dt = first_breath['time_step'].diff().iloc[1:]
print(first_dt.max() / first_dt.min())
second_dt = second_breath['time_step'].diff().iloc[1:]
print(second_dt.max() / second_dt.min())

Calculate and plot histogram of time step sizes

In [None]:
# Row-to-row time difference, stored as a new column of `train`.
train['dt'] = train['time_step'].diff()
dt_loc = train.columns.get_loc('dt')
# The first row of every breath would hold a difference against the previous breath: blank it.
breath_start_rows = np.arange(0, len(train), BREATH_LENGTH)
train.iloc[breath_start_rows, dt_loc] = np.nan
plt.hist(train['dt'], bins = 30);
plt.xlabel('time step size');

Zoom in on the main peak

In [None]:
# Same histogram restricted to [0, 0.05) with bin width 0.001.
dt_bins = np.arange(0, 0.05, 0.001)
plt.hist(train['dt'], bins = dt_bins)
plt.xlabel('time step size');

There is actually a very clear structure. Looks a bit like a sum of normal distributions.

In [None]:
# Very fine binning (width 0.000002) of the range 0.0315 - 0.0355.
dt_bins = np.arange(0.0315, 0.0355, 0.000002)
plt.hist(train['dt'], bins = dt_bins)
plt.xlabel('time step size');

The timesteps of a single breath do not all belong to the same Gaussian.

In [None]:
# Step sizes of rows 240-319 only, on the same range with bin width 0.00002.
dt_bins = np.arange(0.0315, 0.0355, 0.00002)
plt.hist(train['dt'].iloc[240:320], bins = dt_bins)
plt.xlabel('time step size');

No timestep is shorter than 0.031

In [None]:
# Rows whose step size is below 0.031.
train.loc[train['dt'] < 0.031]

And only a handful are longer than 0.05. All such timesteps but one occur for R = 50, C = 10. Both u_out zero and one present.

In [None]:
# Rows whose step size exceeds 0.05.
train.loc[train['dt'] > 0.05]

In most of the cases we miss 7-8 timesteps

In [None]:
# The long step sizes expressed in units of 0.03.
train.loc[train['dt'] > 0.05, 'dt'].to_numpy() / 0.03

Get the distribution of the median time step size for each breath. Find the approximate positions of the discovered peaks.

In [None]:
# One row per breath; column 0 (the first row of each breath) is skipped before taking the median.
dt_by_breath = train['dt'].to_numpy().reshape(-1, BREATH_LENGTH)
dt_means = np.median(dt_by_breath[:, 1:], axis = 1)
plt.hist(dt_means, bins = 90);
# Peak positions read off by eye from the histogram.
median_dt_peaks = [0.03170, 0.03195, 0.03340, 0.03355, 0.03375, 0.03402, 0.03428,]

for peak in median_dt_peaks:
    plt.axvline(peak, color = 'r', linewidth = 1)

# u_out control input

Two sample breaths

In [None]:
# u_out against the step index for the two sample breaths.
first_u_out  = first_breath['u_out']
second_u_out = second_breath['u_out']
steps = range(BREATH_LENGTH)
plt.plot(steps, first_u_out, marker = 'o')
plt.plot(steps, second_u_out, marker = 'o')

Within each breath, we have periods of both zero and unit u_out (so for example not always zero)

In [None]:
# Smallest and largest u_out inside every breath.
u_out_by_breath = train.groupby('breath_id')['u_out']
u_out_min = u_out_by_breath.min()
u_out_max = u_out_by_breath.max()
# Largest per-breath minimum and smallest per-breath maximum.
print(u_out_min.max())
print(u_out_max.min())

Within each breath, u_out never decreases, so it is always a step up:

In [None]:
# Row-to-row flags (first row dropped): u_out went down / breath_id did not change.
u_out_decreases        = train['u_out'].diff().iloc[1:].to_numpy() < 0
breath_id_remains_same = train['breath_id'].diff().iloc[1:].to_numpy() == 0
# No row may satisfy both at once.
assert not np.any(u_out_decreases & breath_id_remains_same)

The u_out transitions happen between time steps 25 and 32, with almost all between 30 and 32

In [None]:
u_out_diff = train['u_out'].diff()
u_out_diff.iloc[0] = 0. #remove the NaN
# Row labels where u_out steps up by one, reduced to the position inside the breath.
is_step_up = u_out_diff == 1
u_out_transitions = u_out_diff.index[is_step_up] % BREATH_LENGTH
plt.hist(u_out_transitions, bins = 7, align = 'right')
print('u_out transitions happen between timesteps:')
print(u_out_transitions.min())
print(u_out_transitions.max())

We find that the longer the breath, the sooner (in steps) the u_out transition happens. This suggests that the u_out transition always occurs at the same time.

In [None]:
# Transition step against the last time stamp of the breath; low alpha because points overlap heavily.
plt.scatter(x = u_out_transitions, y = time_step_max, alpha = 0.01);
plt.ylabel('Lenght of the breath')
plt.xlabel('step of u_out transition');

This is the distribution of the last recorded time for which u_out is still zero.

In [None]:
# Row label of the last row before the u_out step-up in every breath:
# offset of the breath's first row + within-breath transition step - 1.
breath_offsets = np.arange(0, BREATH_LENGTH*num_breaths, BREATH_LENGTH)
idxs = u_out_transitions + breath_offsets - 1
times_before_transition = train['time_step'].loc[idxs]
plt.hist(times_before_transition, bins = 50)
assert train['u_out'].loc[idxs].max() == 0 #Check all really before the transition

This is the distribution of the first recorded time for which u_out is one.

In [None]:
# Row label of the first row at/after the u_out step-up in every breath:
# offset of the breath's first row + within-breath transition step.
breath_offsets = np.arange(0, BREATH_LENGTH*num_breaths, BREATH_LENGTH)
idxs = u_out_transitions + breath_offsets
times_after_transition = train['time_step'].loc[idxs]
plt.hist(times_after_transition, bins = 50)
assert train['u_out'].loc[idxs].min() == 1 #Check all really after the transition

It is not possible to make a 100% perfect separation, though the time of the u_out transition seems to be very close to 0.99

In [None]:
# Time just after and just before the transition for the first 1000 breaths,
# with a reference line at 0.99.
for transition_times in (times_after_transition, times_before_transition):
    plt.plot(transition_times.iloc[:1000].to_numpy())
plt.axhline(0.99, color = 'k');
plt.ylabel('Time');
plt.xlabel('Breath');

# u_in control input

A handful of examples - after a varying initial sequence, there seems to be a standard protocol when u_out is switched on

In [None]:
# One figure per breath (first seven breaths): u_in and u_out against the step index.
steps = range(BREATH_LENGTH)
for idx in range(7):
    breath = train[train['breath_id'] == unique_breaths[idx]]
    for column in ('u_in', 'u_out'):
        plt.plot(steps, breath[column], marker = 'o')
    plt.show()

Most of the behavior with u_out turned on is pretty consistent

In [None]:
# u_in from the u_out transition onwards for the first 15 breaths, overplotted.
for idx in range(15):
    u_in = train.loc[train['breath_id'] == unique_breaths[idx], 'u_in']
    u_out_transition = u_out_transitions[idx]
    step_since_transition = range(BREATH_LENGTH - u_out_transition)
    plt.plot(step_since_transition, u_in.iloc[u_out_transition:])

But not all

In [None]:
# Same plot for three hand-picked breaths (positions 15, 16 and 29 in unique_breaths).
for idx in (15, 16, 29):
    u_in = train.loc[train['breath_id'] == unique_breaths[idx], 'u_in']
    u_out_transition = u_out_transitions[idx]
    step_since_transition = range(BREATH_LENGTH - u_out_transition)
    plt.plot(step_since_transition, u_in.iloc[u_out_transition:])

Using time on the x axis seems to lead to a bit better alignment

In [None]:
# u_in after the transition against time elapsed since the transition row, first 15 breaths.
for idx in range(15):
    breath = train[train['breath_id'] == unique_breaths[idx]]
    u_in = breath['u_in']
    u_out_transition = u_out_transitions[idx]
    time_steps = breath['time_step'].values
    time_since_transition = time_steps - time_steps[u_out_transition]
    plt.plot(time_since_transition[u_out_transition:], u_in.iloc[u_out_transition:])
# Zoom in on a small window of the curves.
plt.xlim([0.4,0.65])
plt.ylim([0,3.4])

u_in maxima are interestingly distributed

In [None]:
# Distribution of the largest u_in reached in each breath, 100 bins.
plt.hist(train.groupby('breath_id')['u_in'].max(), bins = 100);

A few examples where u_in reaches the maximal value:

In [None]:
    # u_in against the step index for the breath at position 31 of unique_breaths.
    plt.plot(range(BREATH_LENGTH), train.loc[train['breath_id'] == unique_breaths[31], 'u_in'], marker = 'o');

In [None]:
    # u_in against the step index for the breath at position 32 of unique_breaths.
    plt.plot(range(BREATH_LENGTH), train.loc[train['breath_id'] == unique_breaths[32], 'u_in'], marker = 'o');

In [None]:
    # u_in against the step index for the breath at position 40 of unique_breaths.
    plt.plot(range(BREATH_LENGTH), train.loc[train['breath_id'] == unique_breaths[40], 'u_in'], marker = 'o');

It is likely that the big first peak in the histogram above consists of breaths where, in the first part of the breath, u_in does not reach the peak of the typical u_in progression in the second part of the breath:

In [None]:
    # u_in against the step index for the breath at position 35 of unique_breaths.
    plt.plot(range(BREATH_LENGTH), train.loc[train['breath_id'] == unique_breaths[35], 'u_in'], marker = 'o');

# Pressure

Plot several

In [None]:
for idx in range(7):
    plt.plot(range(BREATH_LENGTH), train[train['breath_id'] == unique_breaths[idx]]['u_in'], marker = 'o', label = 'u_in')
    plt.plot(range(BREATH_LENGTH), train[train['breath_id'] == unique_breaths[idx]]['u_out'], marker = 'o', label = 'u_out')
    plt.plot(range(BREATH_LENGTH), train[train['breath_id'] == unique_breaths[idx]]['pressure'], marker = 'o', label = 'pressure')
    plt.legend()
    plt.show()

Average pressure

In [None]:
# Mean pressure at each within-breath step: every BREATH_LENGTH-th row starting at step t.
pressure = train['pressure']
avgs = np.zeros(BREATH_LENGTH)
for t in range(BREATH_LENGTH):
    avgs[t] = pressure.iloc[t::BREATH_LENGTH].mean()
plt.plot(avgs)

We know from train.describe that there are negative pressure values. There are actually plenty.

In [None]:
# Rows with zero or negative pressure.
train.loc[train['pressure'] <= 0]

Breaths where we encounter a negative pressure:

In [None]:
# Ids of the breaths containing at least one row with pressure <= 0.
negative_pressure_breaths = train.loc[train['pressure'] <= 0, 'breath_id'].unique()
negative_pressure_breaths

A few such examples. Interestingly, all of them have very small maximal values of u_in, and none of them has the standard second part of the breath.

In [None]:
# One figure per hand-picked breath id: u_in, u_out and pressure against the step index.
steps = range(BREATH_LENGTH)
for idx in (542, 851, 3928, 7949, 11216, 124575):
    breath = train[train['breath_id'] == idx]
    for column in ('u_in', 'u_out', 'pressure'):
        plt.plot(steps, breath[column], marker = 'o')
    plt.show()

Indeed, they all have maximal u_in below four

In [None]:
# Largest u_in of every breath that contains a non-positive pressure.
# A single groupby replaces one full-frame boolean scan per breath; the label
# lookup keeps the order of `negative_pressure_breaths`, and the result is still a list.
max_u_in_per_breath = train.groupby('breath_id')['u_in'].max()
max_u_in_for_negative_pressure_breaths = max_u_in_per_breath.loc[negative_pressure_breaths].tolist()
plt.hist(max_u_in_for_negative_pressure_breaths);

Pressure typically starts between 4 and 7, but there are outliers

In [None]:
# Pressure on the first row of every breath (every BREATH_LENGTH-th row).
initial_pressure = train['pressure'].iloc[::BREATH_LENGTH]
plt.hist(initial_pressure, bins = 20);

The outliers cannot be fully determined from the initial u_in alone

In [None]:
# u_in on the first row of every breath, scattered against the initial pressure.
initial_u_in = train['u_in'].iloc[::BREATH_LENGTH]
plt.scatter(x = initial_u_in, y = initial_pressure);
plt.ylabel('Initial pressure')
plt.xlabel('Initial u_in');

Turns out only R = 50, C = 10 shows very low initial pressures

In [None]:
# Grid of panels, one per (R, C) pair: initial u_in against initial pressure,
# with a dashed reference line at pressure 4.
fig, axs = plt.subplots(len(r_unique), len(c_unique), figsize=(14,14))
for i, r in enumerate(r_unique):
    for j, c in enumerate(c_unique):
        ax = axs[i, j]
        foo = train[(train['R'] == r) & (train['C'] == c)]
        # Every BREATH_LENGTH-th row of the subset, i.e. the first row of each breath.
        ax.scatter(foo['u_in'].iloc[::BREATH_LENGTH], foo['pressure'].iloc[::BREATH_LENGTH]);
        ax.set_ylim([-2,8])
        ax.text(35, -1.2, f'R={r},C={c}')
        ax.axhline(4, color = 'k', linestyle = '--')
        ax.set_ylabel('Initial pressure')
        ax.set_xlabel('Initial u_in');

Overplotting pressures for many breaths over each other reveals presence of several independent modes.

In [None]:
# Grid of panels, one per (R, C) pair: pressure over the first 35 steps of the
# first 600 breaths of that pair, drawn nearly transparent so dense regions stand out.
fig, axs = plt.subplots(len(r_unique), len(c_unique), figsize=(14,14))
steps = range(35)
for i, r in enumerate(r_unique):
    for j, c in enumerate(c_unique):
        ax = axs[i, j]
        foo = train[(train['R'] == r) & (train['C'] == c)]
        for k in range(600):
            first_row = k*BREATH_LENGTH
            ax.plot(steps, foo['pressure'].iloc[first_row:first_row+35], color = 'g', alpha = 0.005);
        ax.text(20, 5, f'R={r},C={c}')
        ax.set_ylabel('Pressure')
        ax.set_xlabel('Timestep');

# Analyze how often we see non-standard second part of u_in

We saw above that u_in very often follows a template once u_out is turned on. We want to see how often this happens.

Find the index of the first nonzero u_in after the u_out turned on. This should be the first point of the template for those breaths that follow it.

In [None]:
# One row of u_in values per breath.
u_in_by_breath = train['u_in'].to_numpy().reshape(-1, BREATH_LENGTH)
template_start = np.zeros(num_breaths, dtype = int)
for idx in range(num_breaths):
    u_in = u_in_by_breath[idx]
    transition = u_out_transitions[idx]
    u_in_second_stage = u_in[transition:]
    # Position of the first non-zero u_in after the transition
    # (argmax returns 0 when every value is zero).
    dt = np.argmax(u_in_second_stage != 0.)
    template_start[idx] = transition + dt #Offset from the beginning

Read off the template from the first breath:

In [None]:
# u_in of the first breath from its template start to the end of the breath.
first_breath_u_in = train.loc[train['breath_id'] == unique_breaths[0], 'u_in']
u_in_template = first_breath_u_in.iloc[template_start[0]:].to_numpy()
l_template = len(u_in_template)
# Keep a copy of the template on disk.
np.savetxt('u_in_template.txt', u_in_template)
plt.plot(u_in_template);
plt.xlabel('timestep')
plt.ylabel('u_in');

Investigate one breath with and one without the template. Plot the comparison, evaluate chi2. In principle, template can be shifted by one time step due to unequal time sampling, so we compare chi2 for three time shifts (-1, 0, 1 time step).

In [None]:
# Tail of the breath at position 2, from its template start, plotted against the template.
u_in = train.loc[train['breath_id'] == unique_breaths[2], 'u_in'].iloc[template_start[2]:].values
plt.plot(u_in)
plt.plot(u_in_template)
l1 = len(u_in)
l2 = len(u_in_template)
overlap = min(l1, l2)
# Sum of squared differences for three alignments: breath advanced by one step,
# no shift, template advanced by one step.
print(sum((u_in[1:overlap] - u_in_template[:overlap-1])**2))
print(sum((u_in[:overlap] - u_in_template[:overlap])**2))
print(sum((u_in[:overlap-1] - u_in_template[1:overlap])**2))

In [None]:
# Tail of the breath at position 29, from its template start, plotted against the template.
u_in = train.loc[train['breath_id'] == unique_breaths[29], 'u_in'].iloc[template_start[29]:].values
plt.plot(u_in)
plt.plot(u_in_template)
l1 = len(u_in)
l2 = len(u_in_template)
overlap = min(l1, l2)
# Sum of squared differences for three alignments: breath advanced by one step,
# no shift, template advanced by one step.
print(sum((u_in[1:overlap] - u_in_template[:overlap-1])**2))
print(sum((u_in[:overlap] - u_in_template[:overlap])**2))
print(sum((u_in[:overlap-1] - u_in_template[1:overlap])**2))

We find out whether the template is present by comparing u_in data during the final part of the breath and the template. We also store the chi2 value for a sanity check.

In [None]:
template_chi2 = np.zeros(num_breaths)
for idx in range(num_breaths):
    # u_in of this breath from its template start to its last row.
    first_row = idx*BREATH_LENGTH + template_start[idx]
    u_in = train['u_in'].iloc[first_row:(idx+1)*BREATH_LENGTH].values
    li = len(u_in)
    overlap = min(li, l_template)
    #compare breath data shifted by +-1, 0 with the template, calculate chi2
    s1 = sum((u_in[1:overlap] - u_in_template[:overlap-1])**2)
    s2 = sum((u_in[:overlap] - u_in_template[:overlap])**2)
    s3 = sum((u_in[:overlap-1] - u_in_template[1:overlap])**2)
    # Keep the best of the three alignments.
    template_chi2[idx] = min(s1, s2, s3)
# A breath counts as following the template when its best chi2 is below 4.
has_template = template_chi2 < 4.

Sanity check:

1) we confirm results for the two breaths we investigated explicitly above

2) there are no breaths where chi2 is small but slightly above the cutoff of 4 - we have a clear separation

In [None]:
# Flags of the two breaths inspected by hand, then chi2 counts in bins of width 4 up to 48.
for position in (2, 29):
    print(has_template[position])
np.histogram(template_chi2, bins = np.arange(0,50,4))

Vast majority of the breaths follows the standard protocol during the u_out = 1 phase:

In [None]:
# Count and share of breaths flagged as following the template.
n_with_template = sum(has_template)
print('Number of breaths with template: ', n_with_template)
print(f'Fraction: {n_with_template/num_breaths:.3}')

More interestingly, in R = 50, C = 10 case only 60% of the breaths follow the template in the second part of the breath. For all other R, C combinations, all breaths follow the template once u_out is turned on. This suggests that for these R, C combinations we can compress all u_in information once u_out has been turned on into a single number (start time of the template).

In [None]:
# Share of breaths following the template for every (R, C) pair.
print('R    C    fraction with template')
for r in r_unique:
    for c in c_unique:
        # Per-breath selector for this (R, C) pair.
        filt = (r_values == r) & (c_values == c)
        n_with_template = sum(has_template[filt])
        n_selected = sum(filt)
        print(f'{r:02}   {c}   {n_with_template/n_selected:.2f}')

We check that our choice of chi2 cutoff was not too conservative. We find that the breath we labeled as having a template indeed seems to follow the same u_in pattern, but some time steps seem to be missing. We should investigate this in more detail.

In [None]:
# Among the breaths accepted as following the template (chi2 < 4), pick the one
# with the largest chi2; rejected breaths are zeroed out before the argmax.
worst_template_id = np.argmax(template_chi2*(template_chi2 < 4))
print('Chi2: ', template_chi2[worst_template_id])
# Slice from this breath's OWN template start. The previous code indexed
# template_start with the loop variable `idx` left over from an earlier cell,
# so the plot started at another breath's offset.
first_row = worst_template_id*BREATH_LENGTH + template_start[worst_template_id]
last_row = (worst_template_id+1)*BREATH_LENGTH
u_in = train.iloc[first_row:last_row]['u_in'].values
plt.plot(u_in);
plt.ylabel("u_in")
plt.xlabel('time step');

Given R = 50, C = 10 is the most frequent combination, we still encounter plenty of cases with non-standard second part of the breath

In [None]:
# Positions (into unique_breaths) of the breaths not flagged as following the template.
without_template = np.flatnonzero(~has_template).tolist()
len(without_template)

There is a wide range of shapes:

In [None]:
# u_in from the template start onwards for the first 15 breaths without template.
for i in without_template[:15]:
    b = unique_breaths[i]
    ts = template_start[i]
    u_in = train.loc[train['breath_id'] == b, 'u_in'].iloc[ts:].values
    plt.plot(u_in)
plt.ylim([0, 30]);

For each row of the dataframe, keep info about whether it belongs to a breath with/without template

In [None]:
# Expand the per-breath flags to one float (1.0 / 0.0) per row: each flag is
# repeated for the BREATH_LENGTH consecutive rows of its breath.
full_has_template = np.repeat(has_template, BREATH_LENGTH).astype(float)
full_no_template  = np.repeat(1 - has_template, BREATH_LENGTH).astype(float)

Plot the initial pressure/u_in distribution for R = 50, C = 10 with and without the template. The strange initial pressures all correspond to the cases without the template present in the second part of u_in. This strongly suggests that we cannot consider the pressure at early times independent of the pressures at late times. Possibly because the same cycle was repeated several times?

In [None]:
# Rows of the R = 50, C = 10 breaths, split by the per-row template flags.
# The flags are 0/1 floats, so they are cast to bool before use as a mask.
is_r50_c10 = (train['R'] == 50) & (train['C'] == 10)
foo_template = train[is_r50_c10 & full_has_template.astype(bool)]
foo_no_template = train[is_r50_c10 & full_no_template.astype(bool)]
# Initial (first-row) u_in against initial pressure for both groups.
for subset, label in ((foo_no_template, 'Without template'), (foo_template, 'With template')):
    plt.scatter(subset['u_in'].iloc[::BREATH_LENGTH], subset['pressure'].iloc[::BREATH_LENGTH], label = label);
plt.ylim([-2,8])
plt.legend()
plt.text(35, -1.2, 'R=50,C=10')
plt.axhline(4, color = 'k', linestyle = '--')
plt.ylabel('Initial pressure')
plt.xlabel('Initial u_in');