In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import n_colors
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode

In [None]:
# Single location for the competition files so both reads resolve against the
# same folder (the two calls used to spell it differently: one relative, one
# absolute).
# NOTE(review): assumes the notebook runs from a directory whose sibling
# `input/` folder holds the dataset (the Kaggle layout) - confirm when running
# elsewhere.
DATA_DIR = "../input/tabular-playground-series-jun-2022"

# Main competition table.
data = pd.read_csv(f"{DATA_DIR}/data.csv")
# Sample submission, indexed by its 'row-col' column.
sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv", index_col='row-col')

In [None]:
# Explore on an independent (deep) copy so the loaded `data` frame is never
# modified by the analysis cells.
# NOTE(review): `_` is also the name IPython uses for the last cell output; a
# descriptive name would be clearer, but every later cell reads `_`.
_ = data.copy(deep=True)

# 1. Basic Exploration

In [None]:
# Nullity matrix of the working frame: a visual overview of where values are
# missing. `msno` is imported in the notebook's top import cell.
msno.matrix(_)

In [None]:
# Preview the first five rows of the working frame.
_.head(n=5)

In [None]:
def missing(df):
    """Summarize missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``'Feature'`` (column
        name), ``'Missing Values'`` (count of nulls) and
        ``'Missing Percentage (%)'`` (count as a percentage of the number of
        rows, rounded to 2 decimals), sorted by ``'Feature'``. The index holds
        each column's original position in ``df``.
    """
    # Count nulls once and derive the percentage from the same Series.
    null_counts = df.isnull().sum()
    null_percent = (null_counts * 100 / df.shape[0]).round(2)

    # Build the table directly from the aligned values instead of
    # reset_index() + merge: that approach depended on the columns index being
    # unnamed (otherwise no 'index' column exists to rename) and on column
    # names being unique (otherwise the merge duplicates rows).
    summary = pd.DataFrame(
        {
            'Feature': null_counts.index,
            'Missing Values': null_counts.to_numpy(),
            'Missing Percentage (%)': null_percent.to_numpy(),
        }
    )
    return summary.sort_values(by='Feature')

In [None]:
# Missing-value summary for the working frame.
with_missing = missing(_)
# Keep only features that really contain missing cells. Filter on the raw
# count: the percentage is rounded to 2 decimals, so a feature with very few
# missing cells would show 0.0 % and be dropped by a percentage-based filter.
with_missing.loc[with_missing['Missing Values'] > 0]

In [None]:
# Print a concise structural summary of the working frame (column dtypes and
# memory usage).
_.info()

In [None]:
# (rows, columns) of the working frame; a bare last expression is displayed by
# the notebook, so no print() wrapper is needed.
_.shape

In [None]:
# Three-stop colormap used to shade the summary table.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "",
    ['#ECB390', '#FCF8E8', '#94B49F'],
)

# Summary statistics with one row per column; the `row_id` row is dropped
# (presumably an identifier rather than a feature - confirm).
stats = _.describe().T.drop('row_id').style.background_gradient(cmap=cmap)

# No `pd.option_context('display.max_rows', None)` here: option_context only
# takes effect inside a `with` block, so the bare call did nothing, and the
# Styler output is not truncated by `display.max_rows` in the first place.
stats

- The `F_2` features are the only integer-typed columns, while the rest are floats.
- No missing values in `F_2`.
- For the features that have missing values, the percentage missing (relative to all rows) ranges from **1.80%** to **1.84%**.
- The values of `F_2` and `F_4` are more varied and noisier than those of `F_1` and `F_3`.
- Some of the highest mean values came from `F_2`.
- Highest standard deviation and max value, and lowest min value in `F_4_11`.

# 2. Exploratory Data Analysis

In [None]:
# color palette for visualizations
# Colour palette for the visualizations.
colors = [
    '#FCF8E8',
    '#94B49F',
    '#ECB390',
    '#DF7861',
]
palette = sns.color_palette(palette=colors)

# Draw the palette as a strip of swatches and caption it.
sns.palplot(palette, size=2.5)

caption_style = {
    'font': 'monospace',
    'size': 24,
    'weight': 'normal',
}
plt.text(-0.5, -0.7, 'Color Palette', fontdict=caption_style)

plt.show()

In [None]:
# Three-stop colormap for the heatmap.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "",
    ['#ECB390', '#FCF8E8', '#94B49F'],
)

# Compute the correlation matrix once and reuse it for both the mask and the
# plot; it is expensive on the full frame.
full_corr = _.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(full_corr, dtype=bool))

# Draw on an explicit Axes instead of relying on pyplot's "current axes".
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(full_corr,
            mask=mask,
            cmap=cmap,
            square=True,
            ax=ax,
           )

Only the features belonging to groups `F_2` and `F_4` show significantly varying correlations. Let's zoom in on both groups to understand them better.

In [None]:
# Show every column name of the working frame; the F_2 / F_4 groups are
# selected by these names in the following cells.
_.columns

In [None]:
# The 25 columns of the F_2 group: 'F_2_0' ... 'F_2_24'.
f2 = _[[f'F_2_{idx}' for idx in range(25)]]

# Three-stop colormap for the heatmap.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "",
    ['#ECB390', '#FCF8E8', '#94B49F'],
)

# Compute the correlation matrix once and reuse it for the mask and the plot.
f2_corr = f2.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(f2_corr, dtype=bool))

# Draw on an explicit Axes instead of relying on pyplot's "current axes".
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(f2_corr,
            mask=mask,
            cmap=cmap,
            cbar=False,
            square=True,
            annot=True,
            linewidths=3,
            ax=ax,
           )

In [None]:
# The 15 columns of the F_4 group: 'F_4_0' ... 'F_4_14'.
f4 = _[[f'F_4_{idx}' for idx in range(15)]]

# Three-stop colormap for the heatmap.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "",
    ['#ECB390', '#FCF8E8', '#94B49F'],
)

# Compute the correlation matrix once and reuse it for the mask and the plot.
f4_corr = f4.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(f4_corr, dtype=bool))

# Draw on an explicit Axes instead of relying on pyplot's "current axes".
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(f4_corr,
            mask=mask,
            cmap=cmap,
            cbar=False,
            square=True,
            annot=True,
            linewidths=3,
            ax=ax,
           )

In [None]:
# Summary statistics as a plain frame for plotting: one row per column of the
# working frame (without `row_id`), with the column name in 'features'.
f = (
    _.describe()
    .T
    .drop('row_id')
    .reset_index()
    .rename(columns={'index': 'features'})
)

In [None]:
# Statistic column of `f` -> marker colour; one scatter trace per entry, drawn
# in this order.
stat_colors = {
    'mean': '#94B49F',
    'min': '#ECB390',
    'max': '#DF7861',
}

fig = go.Figure()
for stat_name, stat_color in stat_colors.items():
    fig.add_trace(go.Scatter(x=f['features'],
                             y=f[stat_name],
                             mode='markers',
                             marker_color=stat_color,
                             name=stat_name,
                            ))

# The title font is set through `title.font`; the legacy top-level `titlefont`
# attribute is deprecated.
fig.update_layout(title={'text': 'Visualizing the Statistics',
                         'font': {'size': 24,
                                  'family': 'Proxima Nova',
                                 },
                        },
                  plot_bgcolor='#FCF8E8',
                  hovermode="x unified",
                  width=800,
                 )
# Clear per-trace hover templates so the unified hover uses the default text.
fig.update_traces(hovertemplate=None)
fig.update_yaxes(showgrid=False, showline=False, showticklabels=True)
fig.update_xaxes(showgrid=False, showline=False, showticklabels=True)
fig.show()

- The min values of F_2 are in line with the mean values of F_1 and F_3.

In [None]:
# (column of `f`, legend name, marker colour) for each scatter trace, drawn in
# this order.
trace_specs = [
    ('50%', 'median', '#94B49F'),
    ('mean', 'mean', '#ECB390'),
]

fig = go.Figure()
for stat_column, legend_name, marker_color in trace_specs:
    fig.add_trace(go.Scatter(x=f['features'],
                             y=f[stat_column],
                             mode='markers',
                             marker_color=marker_color,
                             name=legend_name,
                            ))

# The title font is set through `title.font`; the legacy top-level `titlefont`
# attribute is deprecated.
fig.update_layout(title={'text': 'Skewness of the Features',
                         'font': {'size': 24,
                                  'family': 'Proxima Nova',
                                 },
                        },
                  plot_bgcolor='#FCF8E8',
                  hovermode="x unified",
                  width=800,
                 )
# Clear per-trace hover templates so the unified hover uses the default text.
fig.update_traces(hovertemplate=None)
fig.update_yaxes(showgrid=False, showline=False, showticklabels=True)
fig.update_xaxes(showgrid=False, showline=False, showticklabels=True)
fig.show()

Skewness can be indirectly depicted by the difference between the mean and median.
- Among F_1, `F_1_7`, `F_1_12`, and `F_1_13` follow a similar distribution characterized by **negative skewness.**
- Among F_3, `F_3_19` and `F_3_21` likewise show the same negative skewness.
- Among F_2, **medians** are about the same across:
 - `F_2_4`,`F_2_10`,`F_2_12`, and `F_2_14`.
 - `F_2_0`,`F_2_1`,`F_2_3`, `F_2_7`, `F_2_11`, `F_2_12`, `F_2_15`, `F_2_16`, `F_2_20`, `F_2_21`.

In [None]:
# 5 x 5 grid of count plots, one panel per F_2 column.
plt.figure(figsize=(25, 25))

for panel_no, column in enumerate(f2.columns, start=1):
    # Panels are numbered from 1, left to right and top to bottom.
    plt.subplot(5, 5, panel_no)
    panel = sns.countplot(
        data=_,
        x=_[column],
        palette=['#94B49F', '#ECB390', '#DF7861'],
    )
    # Drop the repeated y-axis label on every panel.
    panel.set(ylabel=None)

The `F_2` features are actually categorical in nature. Most of them are heavily skewed to the right.

In [None]:
# Colour ramp from green to orange with one colour per F_4 column (15 for the
# frame selected above), used for the violin outlines in the next cell.
# `n_colors` is imported in the notebook's top import cell.
colors = n_colors(
    'rgb(148, 180, 159)',
    'rgb(236, 179, 144)',
    len(f4.columns),
    colortype='rgb'
)

In [None]:
fig = go.Figure()

# One violin per F_4 column, each outlined with its own colour from `colors`.
for position, column in enumerate(f4.columns):
    fig.add_trace(go.Violin(
        x=_[column],
        name=column,
        line_color=colors[position],
    ))

# Horizontal, one-sided violins without individual points.
fig.update_traces(orientation='h',
                  side='positive',
                  width=3,
                  points=False)

# The title font is set through `title.font`; the legacy top-level `titlefont`
# attribute is deprecated.
fig.update_layout(xaxis_showgrid=False,
                  xaxis_zeroline=False,

                  title={'text': 'F_4 Distributions',
                         'font': {'size': 24,
                                  'family': 'Proxima Nova',
                                 },
                        },

                  template='simple_white',
                  paper_bgcolor='#FCF8E8',
                  plot_bgcolor='#FCF8E8',
                 )

fig.show()