# This EDA focuses on time_step and u_out.

If you find it useful, please upvote it. We plan to investigate other parameters in the future.

The chart plotting code is based on [Ventilator Pressure Prediction: EDA, FE and models](https://www.kaggle.com/artgor/ventilator-pressure-prediction-eda-fe-and-models). Thank you very much.

Following this, I made the u_in version: [EDA about u_in](https://www.kaggle.com/marutama/eda-about-u-in).
Please take a look here as well.

**Note (October 5th): The points where 'time_step' behaves irregularly are now marked with dashed lines. The notebook was also sped up by calculating time_delta in advance.**

The importance of the features introduced in the "EDA about" series is examined here:
- [EDA about: LSTM Feature Importance](https://www.kaggle.com/marutama/eda-about-lstm-feature-importance)

For modeling, see [finetune of Tensorflow Bi-LSTM EDA about](https://www.kaggle.com/marutama/finetune-of-tensorflow-bi-lstm-eda-about).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm 

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

In [None]:
def plot_bid(bid):
    """Plot pressure, u_in and u_out against time_step for one breath.

    pressure and u_in are drawn on the left axis; u_out is drawn on a twin
    right-hand axis. The title shows the breath's R and C values, read from
    the first row of the selection. Reads the module-level ``train``
    DataFrame.

    Args:
        bid: breath_id value to select from ``train``.
    """
    fig, left_ax = plt.subplots(figsize=(6, 4))  # original (12, 8)

    breath = train[train['breath_id'] == bid].reset_index(drop=True)
    right_ax = left_ax.twinx()

    t = breath['time_step']
    # The legend labels are the column names themselves.
    for column, fmt in (('pressure', 'm-'), ('u_in', 'g-')):
        left_ax.plot(t, breath[column], fmt, label=column)
    right_ax.plot(t, breath['u_out'], 'b-', label='u_out')

    left_ax.set_xlabel('Timestep')

    # Scalar lookups on the first row (index 0 after reset_index).
    R = breath.at[0, 'R']
    C = breath.at[0, 'C']
    left_ax.set_title(f'breath_id:{bid}, R:{R}, C:{C}')

    # Legends are placed outside the plot area, one below the other.
    left_ax.legend(loc=(1.1, 0.8))
    right_ax.legend(loc=(1.1, 0.7))
    plt.show()

In [None]:
def plot_time_step(bid):
    """Plot time_step against row number for a single breath.

    The title shows the breath's R and C values, read from the first row of
    the selection. Reads the module-level ``train`` DataFrame.

    Args:
        bid: breath_id value to select from ``train``.
    """
    # Select the breath once and reuse it for both the title and the curve
    # (the frame was previously filtered a second time just for the plot).
    tmp = train.loc[train['breath_id'] == bid].reset_index(drop=True)
    R = tmp['R'][0]
    C = tmp['C'][0]

    # The figure is created only after the lookup succeeded, so an unknown
    # breath_id does not leave an empty figure behind.
    plt.figure()
    plt.title(f'breath_id:{bid}, R:{R}, C:{C}')
    plt.ylabel('Timestep')
    plt.xlabel('Row No.')

    plt.plot(tmp['time_step'].tolist())
    plt.show()

In [None]:
oj = os.path.join

In [None]:
# Directory holding the competition CSVs; each file is read into its own
# DataFrame (train, test, sub) in that order.
path = '../input/ventilator-pressure-prediction'
train, test, sub = (
    pd.read_csv(oj(path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Add Features

In [None]:
%%time
# Difference between each row's time_step and the previous row's, computed
# separately within every breath_id; the first row of each breath has no
# predecessor, so its time_delta is NaN.
train['time_delta'] = train.groupby('breath_id')['time_step'].diff()

In [None]:
train

# Unique number in each column

In [None]:
# number of unique
train.nunique()

# Let's plot the first three

In [None]:
bid_list = list(train['breath_id'].unique())

In [None]:
print(len(bid_list))
bid_list[:10]

Note that the breath_id values are not consecutive numbers.

In [None]:
for bid in bid_list[:3]:
    plot_bid(bid)

# EDA about time_step

## The number of time_steps in each breath_id

In [None]:
6036000 / 75450

This gives 80 time_steps per breath_id.

## EDA about time_step

First, let's plot the time_step with breath_id = 1.

In [None]:
plot_time_step(1)

It looks like a straight line through the origin. Let's check whether this holds for every breath.

In [None]:
#first_one_list = []
#last_one_list  = []
#outlier_bid_list = []
#for bid in tqdm(bid_list):
#    tmpdf = train.loc[train['breath_id'] == bid]['time_step'].reset_index(drop=True)
#    first_one_list.append(tmpdf[0])
#    last_one_list.append(tmpdf[79])
#    if tmpdf[79] > 2.8:
#        outlier_bid_list.append(bid)

Sped-up version:

In [None]:
%%time
# First and last row of every breath, selected per group instead of with a
# fixed stride of 80. The result keeps the original rows, index and order,
# and no longer depends on train having a default 0..N-1 index or on every
# breath having exactly 80 contiguous rows.
by_breath = train.groupby('breath_id', sort=False)
first_df = by_breath.head(1)
last_df = by_breath.tail(1)

# time_step at the start and at the end of each breath.
first_one_list = list(first_df['time_step'])
last_one_list = list(last_df['time_step'])

In [None]:
list(last_df[last_df['time_step']>2.8]['breath_id'])

In [None]:
%%time
# breath_ids whose last_df row has a time_step greater than 2.8.
outlier_bid_list = list(last_df[last_df['time_step']>2.8]['breath_id'])

### Start point

In [None]:
plt.hist(first_one_list, bins=100)
plt.show()

All time_steps start at 0.

### End point

In [None]:
plt.hist(last_one_list, bins=100)
plt.show()

There is one big cluster around 2.5 seconds and four clusters around 2.7 seconds.

Let's zoom in.

In [None]:
plt.hist(last_one_list, bins=100)
plt.ylim(0,5)
plt.show()

In [None]:
outlier_bid_list

Zoomed in, there are two large clusters and seven outliers greater than 2.8. Let's take a look at the charts of those outliers.

In [None]:
for bid in outlier_bid_list:
    plot_bid(bid)

After u_out becomes 1, the shape of the chart is the same for all of them. For breath_id 44245, the rise of u_out is slanted. Let's plot the time_step of 44245.

In [None]:
plot_time_step(44245)

There seem to be time_step sequences that are not straight lines. Let's find them.

In [None]:
#no_prop_list = []
#for bid in tqdm(bid_list):
#    fx = train.loc[train['breath_id'] == bid]['time_step'].reset_index(drop=True)
#
#    x_max = 79
#    y_max = fx[x_max]
#    a = (fx[x_max] - fx[0]) / x_max
#    
#    for i in range(80):
#        d = fx[i] - a * i
#        if np.abs(d) > 0.1: # not proportional
#            no_prop_list.append(bid)
#            break

An even faster version:

In [None]:
# breath_ids that contain at least one row whose time_delta exceeds 0.15.
no_prop_list = list(train.loc[train['time_delta']>0.15]['breath_id'].unique())

In [None]:
for bid in no_prop_list:
    plot_time_step(bid)

Some charts are broken in one place and others are broken in two places. Let's separate them.

In [None]:
# Hard-coded breath_id lists: time_step broken in one place vs. in two places.
# NOTE(review): presumably picked by inspecting the plots of no_prop_list;
# the values are not derived from the data here - confirm they still match.
broken_one_list = [3178, 16315, 18117, 24127, 28942, 39045, 46324, 54129, 55244, 72104, 76037, 87776, 104001, 119689, 120878]
broken_two_list = [36175, 38415, 44245, 55851, 74766, 109693, 111439]

In [None]:
def plot_double_bid(bid, time_delta=False):
    """Draw two side-by-side charts for one breath.

    Left chart: time_step against row number, y-axis fixed to 0..3.0.
    Right chart: pressure and u_in on the left axis (fixed to 0..100) and
    u_out on a twin axis, all against time_step.

    Reads the module-level ``train`` DataFrame.

    Args:
        bid: breath_id value to select from ``train``.
        time_delta: when True, every row of the breath whose ``time_delta``
            exceeds 0.15 is highlighted with dashed red vertical lines. On
            the left chart the lines sit at the row before the jump and at
            the row itself; on the right chart they sit at
            ``time_step - time_delta`` and ``time_step`` of that row.
            Requires ``train`` to have a ``time_delta`` column.
    """
    # Select the breath once; every plot below reuses this frame.
    tmp = train.loc[train['breath_id'] == bid].reset_index(drop=True)
    R = tmp['R'][0]
    C = tmp['C'][0]
    title = f'breath_id:{bid}, R:{R}, C:{C}'

    fig = plt.figure(figsize=(12, 4))
    ax1 = fig.add_subplot(1, 2, 1)
    ax2 = fig.add_subplot(1, 2, 2)

    rows = []   # x positions (row numbers) to mark on the left chart
    lines = []  # x positions (time_step values) to mark on the right chart
    if time_delta:
        outlier = tmp.loc[tmp['time_delta'] > 0.15]
        # Row positions come from tmp's reset index, so they do not depend
        # on how the global 'id' column is laid out.
        for pos, ts, td in zip(outlier.index,
                               outlier['time_step'],
                               outlier['time_delta']):
            rows.extend((pos - 1, pos))
            lines.extend((ts - td, ts))

    # ---- left chart: time_step per row ----
    ax1.set_title(title)
    ax1.set_ylabel('Timestep')
    ax1.set_xlabel('Row No.')

    ts_ymax = 3.0
    ax1.set_ylim(0, ts_ymax)

    if time_delta:
        ax1.vlines(rows, 0, ts_ymax, "red", linestyles='dashed', alpha=0.2)

    ax1.plot(tmp['time_step'].tolist())

    # ---- right chart: pressure / u_in / u_out over time ----
    ax3 = ax2.twinx()

    ax2.plot(tmp['time_step'], tmp['pressure'], 'm-', label='pressure')
    ax2.plot(tmp['time_step'], tmp['u_in'], 'g-', label='u_in')
    ax3.plot(tmp['time_step'], tmp['u_out'], 'b-', label='u_out')

    ax2.set_xlabel('Timestep')
    ax2.set_title(title)

    val_ymax = 100
    ax2.set_ylim(0, val_ymax)

    if time_delta:
        ax2.vlines(lines, 0, val_ymax, "red", linestyles='dashed', alpha=0.2)

    # Legends are placed outside the plot area, one below the other.
    ax2.legend(loc=(1.1, 0.8))
    ax3.legend(loc=(1.1, 0.7))

    fig.tight_layout()
    plt.show()


In [None]:
for bid in broken_one_list:
    plot_double_bid(bid, time_delta=True)

In [None]:
for bid in broken_two_list:
    plot_double_bid(bid, time_delta=True)

The charts of 74766 and 109693 have different shapes. Only for 44245 does a time_step break coincide with the rise of u_out, and the rise of u_out is slanted.

The charts are similar except for breath_id: 16315.

# 

# EDA about u_out

u_out rises to 1.0 in around 1.0 seconds. Let's examine the distribution.

In [None]:
# u_out1_timing
# generate empty df
#df = pd.DataFrame(columns=['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure'])
#for i in tqdm(bid_list):
#    breath_one = train[train['breath_id']==i].reset_index(drop = True)
#    tmp_df=breath_one[breath_one['u_out']==1].head(1)
#    df = df.append(tmp_df)


Sped-up version:

In [None]:
%%time
# u_out1_timing: sped-up version
# Row-to-row change of u_out over the whole frame. The first row has no
# predecessor (NaN -> 0) and every -1 step is zeroed, so only 0 -> 1
# transitions remain marked with 1.
# Built as one chained expression and assigned back: calling
# fillna/replace with inplace=True on train['u_out_diff'] acts on a
# temporary selection and does not update train under pandas Copy-on-Write.
train['u_out_diff'] = train['u_out'].diff().fillna(0).replace(-1, 0)
# Rows at which u_out switches from 0 to 1.
df = train[train['u_out_diff']==1]

In [None]:
df

The time_step of each row in this df is the time at which u_out rises to 1. Let's plot it.

In [None]:
plt.hist(df['time_step'], bins=100)
plt.show()

Most values are clustered around 1.0 second. However, there appear to be outliers at 0.95 seconds or less and at 1.05 seconds or more. Let's zoom in.

In [None]:
plt.hist(df['time_step'], bins=100)
plt.ylim(0,5)
plt.show()

There are 3 outlier points.

In [None]:
df[(df['time_step']<0.95) | (df['time_step']>1.05)]

In [None]:
# breath_ids whose u_out switch time lies outside the 0.96..1.04 range.
# NOTE(review): the preview cell just above filters with 0.95 / 1.05 while
# this one uses 0.96 / 1.04 - confirm the difference is intended.
u_out1_outlier = list(df[(df['time_step']<0.96) | (df['time_step']>1.04)]['breath_id'])
u_out1_outlier

# u_out1 outlier

In [None]:
for bid in u_out1_outlier:
    plot_double_bid(bid, time_delta=True)

44245 and 129878 are also outliers for time_step. All three have similar chart shapes.