In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mlt

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print each path on its own line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ventilator-pressure-prediction/train.csv",
        "/kaggle/input/ventilator-pressure-prediction/test.csv",
    )
)

### 変数
- R 

気道がどれだけ制限されているかを示す肺の属性（単位：cmH2O/L/S）。物理的には、流量（時間当たりの空気量）の変化に対する圧力の変化です。直感的には、ストローで風船を膨らませるようなイメージです。ストローの直径を変えることでRを変化させることができ、Rが大きいほど吹きにくくなります。

- C

肺のコンプライアンス（伸展性、膨らみやすさ）を示す肺属性（単位：mL/cmH2O）。物理的には、圧力の変化に対する体積の変化です。直感的には、同じ風船の例を想像することができます。風船のラテックスの厚さを変えることでCを変化させることができます。Cが大きいほどラテックスが薄く、吹きやすいということになります。

- time_step

実際のタイムスタンプ

- u_in

吸気電磁弁の制御入力。範囲は0〜100です。

- u_out

呼気電磁弁の制御入力。0または1のいずれか。

- pressure

呼吸回路で測定された気道内圧。cmH2Oで測定されます。

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Count how many rows of breath 3 fall under each u_out value.
train.loc[train["breath_id"] == 3, "u_out"].value_counts()

In [None]:
# Per-column non-null counts for the rows of breath 1 where u_out is 0.
is_breath_1 = train["breath_id"] == 1
is_u_out_0 = train["u_out"] == 0
train[is_breath_1 & is_u_out_0].count()

- breath_id : 1回の呼吸を識別するID。同じbreath_idを持つ行が、その呼吸1回分の時系列を構成します。

In [None]:
# Summary statistics for every column of the training data.
train.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the training data.
train.info()

### 変数同士の相関係数(sns.heatmap)

In [None]:
# Compute the pairwise correlation matrix once and reuse it. The original
# recomputed it three times (data + both tick-label arguments) and made a full
# copy of the training frame even though nothing here mutates it.
corr = train.corr()

fig, ax = plt.subplots(figsize=(12, 10), dpi=80)
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    cmap=sns.diverging_palette(250, 5, as_cmap=True),
    center=0,  # keep zero correlation at the neutral colour of the diverging palette
    annot=True,
    ax=ax,
)
ax.set_title("Correlation between train columns")

plt.show()

### TrainデータとTestデータの被り(venn2)
- idの被りとbreath_idの被り

In [None]:
from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
from matplotlib_venn import venn3, venn3_circles
# One Venn diagram per key column, showing how many distinct values are
# shared between the train and test sets.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
for axis, key in zip(ax, ("id", "breath_id")):
    train_keys = set(train[key])
    test_keys = set(test[key])
    venn2(subsets=(train_keys, test_keys), set_labels=("train", "test"), ax=axis)
    axis.set_title(key)
plt.show()

### ヒストグラムと分布図(sns.histplot)
- 重複なしで10000件のデータに対して可視化

In [None]:
# Histogram with a KDE overlay for every column, drawn on a 10,000-row sample
# (sampled without replacement) to keep plotting fast.
cols = ["R", "C", "time_step", "u_in", "u_out", "pressure"]
# Fixed seed so the figure is reproducible across "Restart & Run All".
df = train[cols].sample(n=10000, random_state=0)

# tight_layout=True recomputes subplot spacing when the figure is drawn, so the
# manual subplots_adjust(hspace=...) call that used to follow was redundant.
fig, ax = plt.subplots(figsize=(16, 8), nrows=2, ncols=3, tight_layout=True)

# sns.distplot is deprecated; histplot(kde=True, stat="density") draws the same
# kind of figure (density-normalised histogram plus KDE curve).
for axis, col in zip(ax.flat, cols):
    sns.histplot(data=df, x=col, kde=True, stat="density", ax=axis)

plt.show()

### RとCとu_outの棒グラフ(sns.countplot)

In [None]:
# Bar charts of the row counts for each value of the categorical-like columns.
# `cols` must be defined BEFORE it is used to slice `train`: the original sliced
# first, so it relied on whatever `cols` a previously executed cell had left
# behind (and raised KeyError when that list did not contain R/C/u_out).
cols = ["R", "C", "u_out"]
df = train[cols]
fig, ax = plt.subplots(figsize=(15, 5), nrows=1, ncols=3)

for axis, col in zip(ax, cols):
    sns.countplot(x=col, data=df, ax=axis)

plt.show()

### 数値変数の箱髭図(sns.boxplot)

In [None]:
# Box plots for the continuous columns.
# R, C and u_out take only a few distinct values, so they are treated as
# categories elsewhere and left out here.
cols = ["time_step", "u_in", "pressure"]
df = train[cols]

# Single row of axes: tight_layout handles the spacing, so no manual
# subplots_adjust(hspace=...) is needed.
fig, ax = plt.subplots(figsize=(14, 5), nrows=1, ncols=3, tight_layout=True)
for axis, col in zip(ax, cols):
    # Pass the column as `x=` explicitly: a bare positional Series is read as
    # `x` by older seaborn releases but as `data` by newer ones, which changes
    # the orientation of the plot.
    sns.boxplot(x=df[col], ax=axis)

plt.show()

In [None]:
# Print the frequency table of each categorical-like column in turn.
for column in ("u_out", "C", "R"):
    print(train[column].value_counts())

### 可視化の方針
- カテゴリ変数(u_out,C,R)ごとの目的変数の分布
- RとCの種類


- カテゴリ変数ごとの目的変数の分布
 - u_outでの分布が少し変わっている

In [None]:
# Distribution of the target (pressure) split by each categorical-like column,
# drawn on a 10,000-row sample.
cols = ["R", "C", "u_out", "pressure"]
# Fixed seed so the facets are reproducible across runs.
df = train[cols].sample(n=10000, random_state=0)

for category in cols[:3]:
    # `height` is the current name of the facet-size argument; the old `size`
    # keyword was deprecated and is not accepted by recent seaborn releases.
    grid = sns.FacetGrid(df, col=category, hue=category, col_wrap=3, height=4)
    # histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
    grid.map_dataframe(sns.histplot, x="pressure", kde=True, stat="density")
    plt.show()


- breath_idごとのpressureの平均値、最小値、最大値の分布

In [None]:
# Per-breath mean / max / min of pressure, then the distribution of each statistic.
group_pressure = train.groupby("breath_id").agg({"pressure": ["mean", "max", "min"]}).reset_index()
# Flatten the two-level column index by concatenating the levels:
# ("pressure", "mean") -> "pressuremean"; ("breath_id", "") -> "breath_id".
group_pressure.columns = ["".join(levels) for levels in group_pressure.columns]

# Every column after breath_id is one of the aggregated statistics.
stat_cols = group_pressure.columns[1:]

fig, ax = plt.subplots(figsize=(15, 10), nrows=1, ncols=3)
for axis, col in zip(ax, stat_cols):
    # histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
    sns.histplot(data=group_pressure, x=col, kde=True, stat="density", ax=axis)
    axis.set_xlabel(col)

plt.show()
    
    

### R,C,u_out別のbreath_idでのpressureの時間的変化
- plotlyを用いて可視化

カテゴリは18種類

In [None]:
# Row counts for every (R, C, u_out) combination. groupby/count does not
# mutate its input, so no copy of the full training frame is needed.
train.groupby(["R", "C", "u_out"]).count()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Pressure over time for R == 5, one figure per C value.
for c in [10, 20, 50]:
    fig = go.Figure()
    df = train.query('R==5 & C==@c')
    # Take the first 10 breaths in file order (this is NOT a random sample).
    sample_num_id = df["breath_id"].unique()[:10]
    for breath_id in sample_num_id:
        # Select this breath's rows once instead of rebuilding the same
        # boolean mask for every column that is read.
        breath = df[df["breath_id"] == breath_id]
        R = breath["R"].iloc[0]
        C = breath["C"].iloc[0]
        fig.add_trace(go.Scatter(x=breath["time_step"],
                                 y=breath["pressure"],
                                 name=f'breath_id:{breath_id},R:{R},C:{C}'))
    fig.update_layout(title=f"R=5, C={c}", xaxis_title="time_step", yaxis_title="pressure")
    fig.show()

In [None]:
# Pressure over time for R == 20, one figure per C value.
for c in [10, 20, 50]:
    fig = go.Figure()
    df = train.query('R==20 & C==@c')
    # Take the first 10 breaths in file order (this is NOT a random sample).
    sample_num_id = df["breath_id"].unique()[:10]
    for breath_id in sample_num_id:
        # Select this breath's rows once instead of rebuilding the same
        # boolean mask for every column that is read.
        breath = df[df["breath_id"] == breath_id]
        R = breath["R"].iloc[0]
        C = breath["C"].iloc[0]
        fig.add_trace(go.Scatter(x=breath["time_step"],
                                 y=breath["pressure"],
                                 name=f'breath_id:{breath_id},R:{R},C:{C}'))
    fig.update_layout(title=f"R=20, C={c}", xaxis_title="time_step", yaxis_title="pressure")
    fig.show()

In [None]:
# Pressure over time for R == 50, one figure per C value.
# This cell used to be a byte-for-byte duplicate of the R == 20 cell, so the
# third R value in the data was never plotted; it now filters on R == 50.
for c in [10, 20, 50]:
    fig = go.Figure()
    df = train.query('R==50 & C==@c')
    # Take the first 10 breaths in file order (this is NOT a random sample).
    sample_num_id = df["breath_id"].unique()[:10]
    for breath_id in sample_num_id:
        # Select this breath's rows once instead of rebuilding the same
        # boolean mask for every column that is read.
        breath = df[df["breath_id"] == breath_id]
        R = breath["R"].iloc[0]
        C = breath["C"].iloc[0]
        fig.add_trace(go.Scatter(x=breath["time_step"],
                                 y=breath["pressure"],
                                 name=f'breath_id:{breath_id},R:{R},C:{C}'))
    fig.update_layout(title=f"R=50, C={c}", xaxis_title="time_step", yaxis_title="pressure")
    fig.show()

### 新しい特徴量
累積のu_in

In [None]:
# New feature: running total of u_in within each breath. `assign` returns a new
# frame, so `train` itself is left unchanged.
df_train = train.assign(
    u_in_cumsum=train.groupby("breath_id")["u_in"].cumsum()
)

In [None]:
# Display the training frame with the new u_in_cumsum column.
df_train