In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tick
import seaborn as sns
from dask import dataframe as dd

# Dark grey shades, darkest first. Index 0 is used below for axis labels, bar
# edges and main titles; index 1 for some titles and subtitles.
colors_dark = ["#313131", '#484848', '#AEAEAE'] 

In [None]:
# Load the training table from the ventilator-pressure-prediction input folder.
# Later cells read the R, C, u_in, time_step, breath_id and pressure columns.
df = pd.read_csv("../input/ventilator-pressure-prediction/train.csv")

In [None]:
def reformat_large_tick_values(tick_val, pos):
    """Format a number compactly, e.g. 1500000 -> '1.5M' and 2000 -> '2K'.

    The signature matches the callback expected by
    ``matplotlib.ticker.FuncFormatter``.

    Parameters
    ----------
    tick_val : int or float
        Value to format. Negative values are scaled by their magnitude, so
        -2500000 becomes '-2.5M'.
    pos : int
        Tick position passed by matplotlib; not used.

    Returns
    -------
    str
        The value rounded to one decimal, with a 'K', 'M' or 'B' suffix when
        its magnitude is at least a thousand, a million or a billion. A
        trailing '.0' is dropped.
    """
    magnitude = abs(tick_val)
    for divisor, suffix in ((1000000000, "B"), (1000000, "M"), (1000, "K")):
        if magnitude >= divisor:
            label = "{:}{}".format(round(tick_val / divisor, 1), suffix)
            break
    else:
        # Magnitude below one thousand, or NaN (every comparison is False).
        label = str(round(tick_val, 1))

    # Remove a redundant ".0" but keep whatever follows it (the suffix).
    dot = label.find(".")
    if dot != -1 and label[dot + 1] == "0":
        label = label[:dot] + label[dot + 2:]

    return label

# R 

In [None]:
# Bar chart of how often each R value occurs in the training data.
# The count column is named explicitly: pandas >= 2.0 names value_counts()
# results "count" instead of after the source column, so an unnamed to_frame()
# would have no "R" column there.
dfs = df.R.value_counts().to_frame(name="R")
x_labels = dfs.index.astype(str)

fig, ax = plt.subplots(figsize=(18, 8))

# Every bar starts out muted grey; the leading bars are highlighted below.
bars = ax.bar(x_labels, dfs["R"], alpha=0.3, color="#8E9798")

ax.set_xticks(x_labels)
ax.set_xticklabels(x_labels)
ax.grid(axis="y", alpha=0.5)
ax.set_axisbelow(True)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_xlabel("R", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("R Count", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.tick_params(labelsize=14)

for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

font = {
    'color':  'white',
    'weight': 'bold',
    'size': 24,
}

# One highlight color per leading bar. zip() stops at the shorter sequence,
# so the cell also works when there are fewer than three categories.
highlight_colors = ['#83C8CC', '#2A5D61', '#2A5D64']
for patch, face_color in zip(bars, highlight_colors):
    height = patch.get_height()
    # Write the formatted count in the middle of the bar.
    ax.text(
        x=patch.get_x() + 0.25, y=height / 2,
        s=reformat_large_tick_values(height, 0),
        fontdict=font,
        va="center",
    )
    patch.set_color(face_color)
    patch.set_alpha(1)
    patch.set_edgecolor(colors_dark[0])

plt.suptitle("R value counts", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("No significant changes were seen.", loc='center', x=0.50, y=1.03, fontsize=14, color=colors_dark[1])
plt.tight_layout()
plt.show()

# C 

In [None]:
# Bar chart of how often each C value occurs in the training data.
# The count column is named explicitly: pandas >= 2.0 names value_counts()
# results "count" instead of after the source column, so an unnamed to_frame()
# would have no "C" column there.
dfs = df.C.value_counts().to_frame(name="C")
x_labels = dfs.index.astype(str)

fig, ax = plt.subplots(figsize=(18, 8))

# Every bar starts out muted grey; the leading bars are highlighted below.
bars = ax.bar(x_labels, dfs["C"], alpha=0.3, color="#8E9798")

ax.set_xticks(x_labels)
ax.set_xticklabels(x_labels)
ax.grid(axis="y", alpha=0.5)
ax.set_axisbelow(True)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_xlabel("C", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("C Count", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.tick_params(labelsize=14)

for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

font = {
    'color':  'white',
    'weight': 'bold',
    'size': 24,
}

# One highlight color per leading bar. zip() stops at the shorter sequence,
# so the cell also works when there are fewer than three categories.
highlight_colors = ['#83C8CC', '#2A5D61', '#2A5D64']
for patch, face_color in zip(bars, highlight_colors):
    height = patch.get_height()
    # Write the formatted count in the middle of the bar.
    ax.text(
        x=patch.get_x() + 0.25, y=height / 2,
        s=reformat_large_tick_values(height, 0),
        fontdict=font,
        va="center",
    )
    patch.set_color(face_color)
    patch.set_alpha(1)
    patch.set_edgecolor(colors_dark[0])

plt.suptitle("C value counts", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("No significant changes were seen.", loc='center', x=0.50, y=1.03, fontsize=14, color=colors_dark[1])
plt.tight_layout()
plt.show()

# R and C 

In [None]:
# Combine R and C into a single "<R>_<C>" category; later cells group on it.
df["R_C"] = df.R.astype(str) + "_" + df.C.astype(str)

# The count column is named explicitly: pandas >= 2.0 names value_counts()
# results "count" instead of after the source column, so an unnamed to_frame()
# would have no "R_C" column there.
dfs = df.R_C.value_counts().to_frame(name="R_C")
x_labels = dfs.index.astype(str)

fig, ax = plt.subplots(figsize=(18, 8))

# Every bar starts out muted grey; the two leading bars are highlighted below.
bars = ax.bar(x_labels, dfs["R_C"], alpha=0.3, color="#8E9798")

ax.set_xticks(x_labels)
ax.set_xticklabels(x_labels)
ax.grid(axis="y", alpha=0.5)
ax.set_axisbelow(True)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_xlabel("R and C", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("R and C Count", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.tick_params(labelsize=14)

for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

font = {
    'color':  'white',
    'weight': 'bold',
    'size': 24,
}

# Annotate and highlight the two most frequent combinations.
highlight_colors = ['#83C8CC', '#2A5D61']
for patch, face_color in zip(bars, highlight_colors):
    height = patch.get_height()
    ax.text(
        x=patch.get_x() + 0.21, y=height / 2,
        s=reformat_large_tick_values(height, 0),
        fontdict=font,
        va="center",
    )
    patch.set_color(face_color)
    patch.set_alpha(1)
    patch.set_edgecolor(colors_dark[0])

plt.suptitle("R_C value counts", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.title("Read the count value from the data that combines each category.", loc='center', x=0.52, y=1.03, fontsize=14, color=colors_dark[1])
plt.tight_layout()
plt.show()

# How does u_in relate to R_C?

In [None]:
# Mean u_in per R_C category, largest first, compared with the overall mean.
# Selecting "u_in" before aggregating avoids averaging every other column of
# the frame only to discard the results.
dfs = df.groupby("R_C")[["u_in"]].mean().sort_values("u_in", ascending=False)
mean_u_in = df.u_in.mean()


fig, ax = plt.subplots(figsize=(18, 8))

# Every bar starts out muted grey; the two leading bars are highlighted below.
bars = ax.bar(dfs.index, dfs.u_in, alpha=0.3, color="#8E9798")

ax.set_xticks(dfs.index)
ax.set_xticklabels(dfs.index)
ax.grid(axis="y", alpha=0.5)
ax.set_axisbelow(True)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_xlabel("R_C", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("u_in values", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.tick_params(labelsize=14)

for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

font = {
    'color':  'white',
    'weight': 'bold',
    'size': 24,
}

# Annotate the two categories with the highest mean u_in.
highlight_colors = ['#83C8CC', '#2A5D61']
for patch, face_color in zip(bars, highlight_colors):
    height = patch.get_height()
    ax.text(
        x=patch.get_x() + 0.19, y=height / 2,
        s=reformat_large_tick_values(height, 0),
        fontdict=font,
        va="center",
    )

# Dashed reference line at the overall mean of u_in.
ax.axhline(y=mean_u_in, linestyle="--", color="#8E9798")

ax.text(
    # Just right of the last bar; equals the former fixed 8.7 for nine bars.
    x=len(dfs) - 0.3,
    y=mean_u_in,
    s="Average \n u_in",
    backgroundcolor="#2A5D61",
    color="white",
    fontweight='bold',
    fontsize=14
)

for patch, face_color in zip(bars, highlight_colors):
    patch.set_color(face_color)
    patch.set_alpha(1)
    patch.set_edgecolor(colors_dark[0])

plt.suptitle("Average value of u_in from each category", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.tight_layout()
plt.show()

In contrast to the graphs we've seen so far, the differences between categories are clear.  
They seem to be separated by peculiarities in some examples.

# What is the difference between the top and bottom R_C categories?

In [None]:
# Compare two R_C categories: for each one, take the breaths with the highest
# mean u_in and draw them as horizontal bars (largest at the top).
rc_left = "5_50"
rc_right = "50_10"
n_breaths = 20

# "u_in" is selected before aggregating so the mean is never taken over the
# string column R_C, which raises a TypeError on pandas >= 2.0.
df1 = (
    df[df.R_C == rc_left]
    .groupby("breath_id")[["u_in"]]
    .mean()
    .sort_values("u_in", ascending=False)
    .head(n_breaths)
    .sort_values("u_in")
)
df2 = (
    df[df.R_C == rc_right]
    .groupby("breath_id")[["u_in"]]
    .mean()
    .sort_values("u_in", ascending=False)
    .head(n_breaths)
    .sort_values("u_in")
)

fig, ax = plt.subplots(1, 2, figsize=(18, 14))

bars1 = ax[0].barh(y=df1.index.astype(str), width=df1.u_in, edgecolor=colors_dark[0], color="#83C8CC")
bars2 = ax[1].barh(y=df2.index.astype(str), width=df2.u_in, edgecolor=colors_dark[0], color="#2A5D61")

ax[0].grid(axis="x", alpha=0.5)
ax[0].xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax[0].set_axisbelow(True)
ax[0].tick_params(labelsize=12)
ax[0].set_title(f"The magnitude relationship of the average value \n of breath_id u_in in which r_c belongs to {rc_left}", loc='center', fontsize=14)
ax[0].set_xlabel("u_in", fontsize=14, fontweight='bold', color=colors_dark[0])
ax[0].set_ylabel("breath_id", fontsize=14, fontweight='bold', color=colors_dark[0])
ax[0].legend(["group breath id by u_in"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False)

ax[1].grid(axis="x", alpha=0.5)
ax[1].xaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax[1].set_axisbelow(True)
ax[1].tick_params(labelsize=12)
# The category in the title comes from rc_right so it always matches the data.
ax[1].set_title(f"The magnitude relation of the average value \n in u_in of breath_id in which r_c belongs to {rc_right}", loc='center', fontsize=14, color=colors_dark[0])
ax[1].set_xlabel("u_in", fontsize=14, fontweight='bold', color=colors_dark[0])
ax[1].legend(["group breath_id by u_in"], loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=1, frameon=False)

for panel in ax:
    for side in ('right', 'left', 'top'):
        panel.spines[side].set_visible(False)

plt.suptitle(f"Top {n_breaths} : Visible u_in from top and bottom classifications", fontsize=18, fontweight='bold', color=colors_dark[0])
plt.tight_layout()
plt.show()

As we have seen, there is a big difference between the two unique breath_ids, but the value taken by u_in is also small.

# Let's look at the range of u_in values, split along the time axis.

The table below shows the color assigned to each u_in bin.

In [None]:
# Legend table: one row per u_in bin with the color it gets in the chart below.
# `.cat.categories` lists all six bins in ascending order, whereas `.unique()`
# returns them in order of first appearance and omits bins that are empty.
a = pd.cut(df.u_in, bins=6).cat.categories
# Color names follow the order of the `u_in_color` palette used for plotting.
pd.DataFrame({"u_in Amounts": a, "color": ["Blue", "Green", "Light Blue", "Magenta", "Orange", "Red"]})

In [None]:
def split_time(x):
    """Map a time value to a coarse phase index.

    Returns 0 when ``x`` is below 1.0, 1 when it is below 1.5, and 2 otherwise.
    """
    for phase, upper_bound in enumerate((1.0, 1.5)):
        if x < upper_bound:
            return phase
    return 2
    
# Fill colors for the stacked segments, one per u_in class column.
u_in_color = ["#3442C1", '#33955A', '#49C8DA', '#CE6CB9', '#CEA76C', '#CE6C6C']

# Bucket each row by time phase and by u_in bin, then count rows per pair.
df["timestamp"] = df["time_step"].apply(split_time)
df["u_in_classes"] = pd.cut(df["u_in"], bins=6, labels=False)
dfs = pd.crosstab(df["timestamp"], df["u_in_classes"])
# Average cell count of the crosstab, drawn as a dashed reference line.
mean_u_in = dfs.mean().mean()

fig, ax = plt.subplots(figsize=(14, 8))

bar = dfs.plot.bar(
    ax=ax,
    stacked=True,
    color=u_in_color,
    rot=0,
    edgecolor=colors_dark[0],
    alpha=0.8,
)
line1 = ax.axhline(y=mean_u_in, color="#8E9798", linestyle="--")

reference_label_style = {
    "backgroundcolor": "#2A5D61",
    "color": "white",
    "fontweight": 'bold',
    "fontsize": 14,
}
ax.text(2.3, mean_u_in, "Average \nu_in counts", **reference_label_style)

axis_label_style = {"fontsize": 14, "fontweight": 'bold', "color": colors_dark[0]}
ax.set_xlabel("Time Step", **axis_label_style)
ax.set_ylabel("u_in classes counts", **axis_label_style)
ax.tick_params(labelsize=14)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.grid(axis="y", alpha=0.5)
ax.set_axisbelow(True)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=2, frameon=False, fontsize=12)

for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

# Footnote giving the observed time_step range.
plt.annotate(
    f'*time step from {df.time_step.min()} to {df.time_step.max()}',
    (0, 0),
    (650, -80),
    fontsize=11,
    xycoords='axes fraction',
    textcoords='offset points',
)
plt.suptitle("How does it change over time? ", fontsize=18, fontweight='bold', color=colors_dark[1])
plt.title(
    "In the initial stage, it takes various values, \nlarge and small, but the range of values becomes smaller with the passage of time.",
    loc='center',
    x=0.59,
    y=1.03,
    fontsize=14,
    color=colors_dark[0],
)
plt.tight_layout()
plt.show()

# Let's look at the target variable (pressure) in the same way.

In [None]:
# Legend table: one row per pressure bin with the color it gets in the chart
# below. `.cat.categories` lists all six bins in ascending order, whereas
# `.unique()` returns them in order of first appearance and omits empty bins.
a = pd.cut(df.pressure, bins=6).cat.categories
# Color names follow the order of the `u_in_color` palette used for plotting.
pd.DataFrame({"pressure Amounts": a, "color": ["Blue", "Green", "Light Blue", "Magenta", "Orange", "Red"]})

In [None]:
# Bucket pressure into six equal-width bins and count rows per
# (time phase, pressure bin) pair. Relies on the "timestamp" column and the
# `u_in_color` palette created by the u_in cell above.
df["pressure_classes"] = pd.cut(df.pressure, bins=6, labels=False)
dfs = pd.crosstab(df.timestamp, df.pressure_classes)
# Average cell count of the crosstab, drawn as a dashed reference line.
mean_pressure = dfs.mean().mean()


fig, ax = plt.subplots(figsize=(14, 8))

dfs.plot(kind='bar', ax=ax, stacked=True, color=u_in_color, rot=0, edgecolor=colors_dark[0], alpha=0.8)
ax.axhline(y=mean_pressure, linestyle="--", color="#8E9798")

ax.text(
    x=2.4,
    # Anchor the label at the height of the dashed line it describes
    # (previously this reused the u_in cell's `mean_u_in`).
    y=mean_pressure,
    s="Average \npressure counts",
    backgroundcolor="#2A5D61",
    color="white",
    fontweight='bold',
    fontsize=14
)

ax.grid(axis="y", alpha=0.5)
ax.yaxis.set_major_formatter(tick.FuncFormatter(reformat_large_tick_values))
ax.set_axisbelow(True)
ax.tick_params(labelsize=14)
ax.set_xlabel("Time Step", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.set_ylabel("pressure classes counts", fontsize=14, fontweight='bold', color=colors_dark[0])
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=5, borderpad=2, frameon=False, fontsize=12)

for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)

# Footnote giving the observed time_step range.
plt.annotate(f'*time step from {df.time_step.min()} to {df.time_step.max()}', (0,0), (650,-80), fontsize=11,xycoords='axes fraction', textcoords='offset points')
plt.suptitle("How does it change over time? ", fontsize=18, fontweight='bold', color=colors_dark[1])
plt.title("Like u_in,\n it initially detects a wide range of values, and the range of values gradually narrows.", loc='center', x=0.59, y=1.03, fontsize=14, color=colors_dark[0])
plt.tight_layout()
plt.show()