A first attempt at exploratory plotting with seaborn and matplotlib.

# Dependencies

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.patches as patches
import matplotlib.pyplot as plt

# Read CSVs and look at data

In [None]:
# Read both splits and convert the date_time column from text to datetime
# while loading; .assign() replaces the column in place position-wise.
train = pd.read_csv("../input/tabular-playground-series-jul-2021/train.csv").assign(
    date_time=lambda frame: pd.to_datetime(frame["date_time"])
)
test = pd.read_csv("../input/tabular-playground-series-jul-2021/test.csv").assign(
    date_time=lambda frame: pd.to_datetime(frame["date_time"])
)

# Show the first rows of the training split as the cell output.
train.head()

In [None]:
def _shape_summary(frame):
    """Return the one-line row / column / missing-value summary for a frame."""
    n_rows, n_cols = frame.shape
    # Count every NaN cell across all columns.
    n_missing = int(frame.isna().to_numpy().sum())
    return f"Number of rows: {n_rows};  Number of columns: {n_cols}; No of missing values: {n_missing}"


print("Training data info")
print(_shape_summary(train))
print()
print("Testing data info")
print(_shape_summary(test))

# Distributions and time-series plots of data

In [None]:
# Colours shared by every figure below.
background_color = "#F1F3F4"
orange = "#FF7124"
red = "#ED2939"
purple = "#9400D3"

# Default colour cycle: red first, purple second.
sns.set_palette([red, purple])

# Proxy patches for a figure-level legend (Train = red, Test = purple).
legend_handles = [
    patches.Patch(edgecolor="black", facecolor=swatch, label=split_label)
    for split_label, swatch in (("Train", red), ("Test", purple))
]

In [None]:
# Take the feature columns (deg_C through sensor_5) from each split and label
# every row with its origin. .assign() returns a new frame, so the label is
# not written into a .loc slice of train/test (assigning a column on such a
# slice can raise pandas' SettingWithCopyWarning).
train_comb = train.loc[:, "deg_C":"sensor_5"].assign(type="train")
test_comb = test.loc[:, "deg_C":"sensor_5"].assign(type="test")

# ignore_index=True gives the stacked frame a fresh 0..n-1 index instead of
# repeating the row labels of both splits.
train_test = pd.concat((train_comb, test_comb), ignore_index=True)

# Long format (type, metrics, values) so a single boxplot call draws every feature.
train_test_long = pd.melt(train_test, id_vars="type", var_name="metrics", value_name="values")

fig, ax = plt.subplots(1, 1, figsize=(16, 8), dpi=250, facecolor=background_color)
sns.despine()

# Boxes are drawn at zorder=2 so they sit above the grid lines (zorder=0).
ax_sns = sns.boxplot(x="metrics", y="values", hue="type", data=train_test_long, ax=ax, boxprops={"zorder": 2})
ax_sns.set_facecolor(background_color)
# Drop the per-axes legend; the figure-level legend below replaces it.
ax_sns.legend_.remove()
ax_sns.tick_params(labelsize=10, width=1, length=2.5)
ax_sns.set_xlabel("Features", fontsize=14, weight="bold")
ax_sns.set_ylabel("Values", fontsize=14, weight="bold")
ax_sns.grid(which="major", axis="y", zorder=0, color="#CCCCCC", linewidth=0.5)

# Title and legend share one anchor point in figure coordinates: the text
# starts at the anchor and the legend's upper-left corner is pinned to it.
fig.legend(handles=legend_handles, ncol=2, facecolor=background_color, edgecolor=background_color, fontsize=10, loc="upper left", bbox_to_anchor=(0.14, 0.93))
fig.text(0.14, 0.93, "Distributions of features with boxplots", fontsize=24, weight="bold")


plt.show()

In [None]:
# The first eight columns of train_test are plotted as features; the "type"
# column of the same frame is used as the hue.
features = train_test.columns[0:8]

fig, axs = plt.subplots(2, 4, figsize=(16, 8), dpi=250, facecolor=background_color)
# NOTE: the original cell had a bare `fig.tight_layout` here (attribute
# reference, never called, so a no-op). It is dropped rather than called so
# the hand-placed title/legend coordinates below keep matching the layout.
sns.despine()

# axs.flat walks the 2x4 grid row by row, which replaces the manual
# row/column index arithmetic.
for feature, ax in zip(features, axs.flat):
    sns.histplot(data=train_test, x=feature, hue="type", bins=28, multiple="dodge", ax=ax, legend=False, zorder=2)
    ax.set_facecolor(background_color)
    ax.tick_params(labelsize=10, width=1, length=2.5)
    # Same fixed y ticks on every panel (0 to 1000 in steps of 200).
    ax.set_yticks(list(range(0, 1001, 200)))
    ax.set_xlabel(feature, fontsize=10, weight="bold")
    ax.set_ylabel(None)
    ax.grid(which="major", axis="y", zorder=0, color="#CCCCCC", linewidth=0.3)

# Figure-level legend and title anchored at the same figure coordinates.
fig.legend(handles=legend_handles, ncol=2, facecolor=background_color, edgecolor=background_color, fontsize=10, loc="upper left", bbox_to_anchor=(0.14, 0.93))
fig.text(0.14, 0.93, "Distributions of features with histograms", fontsize=24, weight="bold")

plt.show()

In [None]:
fig, axs = plt.subplots(2, 4, figsize=(16, 8), dpi=250, facecolor=background_color)
# NOTE: the original cell had a bare `fig.tight_layout` here (attribute
# reference, never called, so a no-op). It is dropped rather than called so
# the hand-placed title/legend coordinates below keep matching the layout.
sns.despine()

# One KDE panel per feature; axs.flat walks the 2x4 grid row by row.
for feature, ax in zip(features, axs.flat):
    # common_norm=False: each split's density is normalised on its own.
    sns.kdeplot(data=train_test, x=feature, hue="type", ax=ax, legend=False, fill=True, alpha=0.6, common_norm=False, zorder=2)
    ax.set_facecolor(background_color)
    ax.tick_params(labelsize=10, width=1, length=2.5, left=False)
    ax.set_xlabel(feature, fontsize=10, weight="bold")
    # The y axis is left unlabelled: no axis label, no tick labels, no left ticks.
    ax.set_ylabel(None)
    ax.set_yticklabels([])

# Figure-level legend and title anchored at the same figure coordinates.
fig.legend(handles=legend_handles, ncol=2, facecolor=background_color, edgecolor=background_color, fontsize=10, loc="upper left", bbox_to_anchor=(0.14, 0.93))
fig.text(0.14, 0.93, "Normalized distributions of features with KDE plots", fontsize=24, weight="bold")

plt.show()

In [None]:
# Label each split's rows with their origin. .assign() returns a new frame, so
# the label is not written into a .loc slice of train/test (assigning a column
# on such a slice can raise pandas' SettingWithCopyWarning).
train_time = train.loc[:, "date_time":"deg_C"].assign(type="train")
test_time = test.loc[:, "date_time":"deg_C"].assign(type="test")

# Only the timestamp and the split label are needed for this plot.
# ignore_index=True avoids repeating the row labels of both splits.
train_test_time = pd.concat((train_time, test_time), ignore_index=True).loc[:, ["date_time", "type"]]

fig, ax = plt.subplots(1, 1, figsize=(16, 2), dpi=250, facecolor=background_color)
sns.despine(left=True)

# One bin per two rows of the combined frame; bars are drawn opaque and
# without outlines.
ax_sns = sns.histplot(data=train_test_time, x="date_time", hue="type", bins=int(len(train_test_time) / 2), ax=ax, legend=False, linewidth=0, alpha=1, zorder=2)
ax_sns.set_facecolor(background_color)
ax_sns.tick_params(labelsize=10, width=1, length=2.5, left=False)
ax_sns.set_xlabel("Date", fontsize=10, weight="bold")
# The y axis is left unlabelled: no axis label and no tick labels.
ax_sns.set_ylabel(None)
ax_sns.set_yticklabels([])

# Anchor is above 1.0 in figure coordinates, i.e. above the short figure.
fig.legend(handles=legend_handles, ncol=2, facecolor=background_color, edgecolor=background_color, fontsize=10, loc="upper left", bbox_to_anchor=(0.17, 1.07))
fig.text(0.17, 1.07, "Timespan of samples", fontsize=24, weight="bold")

plt.show()

In [None]:
fig, axs = plt.subplots(8, 1, figsize=(16, 16), dpi=250, facecolor=background_color)
# NOTE: the original cell had a bare `fig.tight_layout` here (attribute
# reference, never called, so a no-op). It is dropped rather than called so
# the hand-placed title/legend coordinates below keep matching the layout.
sns.despine()

# One row per feature: the train series is drawn first and the test series
# second, so they take the first and second colours of the active palette.
for feature, ax in zip(features, axs):
    sns.lineplot(x=train["date_time"], y=train[feature], ax=ax, ci=None, legend=False, zorder=2)
    sns.lineplot(x=test["date_time"], y=test[feature], ax=ax, ci=None, legend=False, zorder=2)
    ax.set_facecolor(background_color)
    ax.tick_params(labelsize=10, width=1, length=2.5)
    ax.set_xlabel(None)
    ax.set_ylabel(feature, fontsize=10, weight="bold")
    ax.grid(which="major", axis="y", zorder=0, color="#CCCCCC", linewidth=0.5)

# Figure-level legend and title anchored at the same figure coordinates.
fig.legend(handles=legend_handles, ncol=2, facecolor=background_color, edgecolor=background_color, fontsize=10, loc="upper left", bbox_to_anchor=(0.14, 0.91))
fig.text(0.14, 0.91, "Time series plot of features", fontsize=24, weight="bold")

plt.show()


In [None]:
# Columns 9, 10 and 11 of train are treated as the targets.
targets = train.columns[9:12]

fig, axs = plt.subplots(1, 3, figsize=(12, 4), dpi=250, facecolor=background_color)
# NOTE: the original cell had a bare `fig.tight_layout` here (attribute
# reference, never called, so a no-op). It is dropped rather than called so
# the hand-placed title coordinates below keep matching the layout.
sns.despine()

# One histogram panel per target, all in orange.
for target, ax in zip(targets, axs):
    sns.histplot(data=train, x=target, bins=28, ax=ax, legend=False, color=orange, zorder=2)
    ax.set_facecolor(background_color)
    ax.tick_params(labelsize=10, width=1, length=2.5)
    ax.set_xlabel(target, fontsize=10, weight="bold")
    # Same fixed y ticks on every panel (0 to 1600 in steps of 200).
    ax.set_yticks(list(range(0, 1601, 200)))
    ax.set_ylabel(None)
    ax.grid(which="major", axis="y", zorder=0, color="#CCCCCC", linewidth=0.3)

fig.text(0.14, 0.93, "Distributions of targets with histograms", fontsize=24, weight="bold")

plt.show()


In [None]:
fig, axs = plt.subplots(3, 1, figsize=(16, 8), dpi=250, facecolor=background_color)
# NOTE: the original cell had a bare `fig.tight_layout` here (attribute
# reference, never called, so a no-op). It is dropped rather than called so
# the hand-placed title coordinates below keep matching the layout.
sns.despine()

# One row per target, plotted against the training timestamps.
for target, ax in zip(targets, axs):
    sns.lineplot(x=train["date_time"], y=train[target], ax=ax, ci=None, legend=False, color=orange, zorder=2)
    ax.set_facecolor(background_color)
    ax.tick_params(labelsize=10, width=1, length=2.5)
    ax.set_xlabel(None)
    ax.set_ylabel(target, fontsize=10, weight="bold")
    ax.grid(which="major", axis="y", zorder=0, color="#CCCCCC", linewidth=0.5)

fig.text(0.14, 0.91, "Time series plot of targets", fontsize=24, weight="bold")

plt.show()

# Correlation plots

In [None]:
# Select the features by label, matching the deg_C..sensor_5 range used for
# the distribution plots. The previous positional slice train.columns[0:8]
# did not line up with that range (it appears to start at date_time and stop
# before sensor_5 - verify against train.columns).
features = train.loc[:, "deg_C":"sensor_5"].columns
train_corr = train[features].corr()
test_corr = test[features].corr()

fig, axs = plt.subplots(1, 2, figsize=(16, 8), dpi=250, facecolor=background_color)
# Hide the strict upper triangle; both matrices share the same shape, so one
# mask serves both heatmaps.
mask = ~np.tril(np.ones_like(train_corr, dtype=bool))

# Train on the left in red, test on the right in purple.
heatmap_panels = (
    (axs[0], train_corr, red, "Train features"),
    (axs[1], test_corr, purple, "Test features"),
)
for ax, corr_matrix, base_color, panel_label in heatmap_panels:
    ax_sns = sns.heatmap(corr_matrix, cmap=sns.dark_palette(base_color), square=True, linewidths=1, linecolor=background_color, mask=mask, ax=ax, cbar_kws={"shrink": 0.7}, annot=True, zorder=2)
    ax_sns.set_facecolor(background_color)
    ax_sns.tick_params(labelsize=10, width=1, length=2.5)
    ax_sns.set_xlabel(panel_label, fontsize=20, weight="bold")

fig.text(0.14, 0.85, "Correlation between features", fontsize=24, weight="bold")

plt.show()

In [None]:
# Pairwise correlation between the target columns of the training split.
target_corr = train[targets].corr()

fig, ax = plt.subplots(1, 1, figsize=(6, 6), dpi=250, facecolor=background_color)
# True strictly above the diagonal, so only the lower triangle and the
# diagonal are drawn.
mask = np.triu(np.ones(target_corr.shape, dtype=bool), k=1)

ax_sns = sns.heatmap(
    target_corr,
    mask=mask,
    ax=ax,
    cmap=sns.dark_palette(orange),
    annot=True,
    square=True,
    linewidths=1,
    linecolor=background_color,
    cbar_kws={"shrink": 0.7},
    zorder=2,
)
ax_sns.set_facecolor(background_color)
ax_sns.tick_params(labelsize=10, width=1, length=2.5)

# Negative x places the title left of the figure's own left edge.
fig.text(-0.06, 0.85, "Correlation between targets", fontsize=24, weight="bold")

plt.show()

In [None]:
fig, axs = plt.subplots(3, 1, figsize=(16, 8), dpi=250, facecolor=background_color)
# NOTE: the original cell had a bare `fig.tight_layout` here (attribute
# reference, never called, so a no-op); it is dropped.
sns.despine()

# Pick the feature columns by label instead of relying on the global
# `features`, which an earlier cell rebinds to the positional slice
# train.columns[0:8]; that slice does not match the deg_C..sensor_5 range
# (it appears to include date_time and miss sensor_5 - verify against
# train.columns).
corr_features = train.loc[:, "deg_C":"sensor_5"].columns

for idx, target in enumerate(targets):
    # Correlation of every feature with this target, one bar per feature.
    corr = train[corr_features].corrwith(train[target])
    ax_sns = sns.barplot(x=corr.index, y=corr, ax=axs[idx], color=orange, linewidth=0.8, edgecolor="black", zorder=2)
    ax_sns.set_facecolor(background_color)
    ax_sns.tick_params(labelsize=10, width=1, length=2.5)
    # Fixed ticks spanning the full correlation range.
    ax_sns.set_yticks([-1.0, -0.5, 0.0, 0.5, 1.0])
    ax_sns.set_ylabel(target, fontsize=10, weight="bold")

    # Print each bar's value just beyond its end: above positive bars,
    # below zero or negative ones.
    for p in ax_sns.patches:
        height = p.get_height()
        value = f"{height:0.2f}"
        x = p.get_x() + p.get_width() / 2
        y = height + (0.09 if height > 0 else -0.12)
        ax_sns.text(x, y, value, ha="center", va="center", fontsize=12)

    ax_sns.grid(which="major", axis="y", zorder=0, color="#CCCCCC", linewidth=0.5)