In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import warnings
from tqdm import tqdm
from IPython.display import display
from cycler import cycler

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error

from mlxtend.regressor import StackingRegressor
from mlxtend.classifier import StackingClassifier

from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier

# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ignore every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting libraries.
warnings.filterwarnings(action='ignore')

In [None]:
def read_data(dir='../input/tabular-playground-series-jun-2022'):
    """Load the competition CSV files found in ``dir``.

    Parameters
    ----------
    dir : str
        Directory that contains ``data.csv`` and ``sample_submission.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data, sample_submission)``, in that order.
    """
    file_names = ('data.csv', 'sample_submission.csv')
    return tuple(pd.read_csv(f'{dir}/{file_name}') for file_name in file_names)

def create_submission_file(df, cols=['id', 'target'], file_name='submission.csv'):
    """Write the selected columns of ``df`` to ``submissions/<file_name>`` as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the columns listed in ``cols``.
    cols : list of str
        Columns to export, in output order. The default list is only read,
        never mutated.
    file_name : str
        Name of the CSV file, relative to the ``submissions`` directory.

    Raises
    ------
    KeyError
        If one of ``cols`` is missing from ``df``.
    """
    # Local import keeps this function self-contained inside the notebook.
    import os

    target_path = f'submissions/{file_name}'
    # to_csv does not create missing parent directories, so a fresh working
    # directory without "submissions/" used to fail with an OSError.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    df[cols].to_csv(target_path, index=False)
    
# Load the data frame and the sample submission from the default input directory.
data_df, submission_df = read_data()

## Introduction

This notebook walks through the dataset and performs a high-level exploration of its features. The motivation behind this notebook is to understand the dataset at a high level before starting to work on the modeling part.

This notebook does the following:

- Explores the different features and their distributions.
- Explores the correlation between different features.
- Checks the number of null values for each feature and for each row.

## Bird's-Eye View of the Dataset

In [None]:
# Report the dimensions of both frames, then preview the first ten data rows.
for frame in (data_df, submission_df):
    print(frame.shape)
display(data_df.head(10))

In [None]:
# Print pandas' concise summary of the frame (columns and their dtypes).
data_df.info()

In [None]:
# Raise the column display limit so the wide summary table is not truncated.
pd.set_option('display.max_columns', 500)
# Mean / min / max / std of every column except the first one.
display(data_df.iloc[:, 1:].agg(['mean', 'min', 'max', 'std']))

In [None]:
# Raise the row display limit so the transposed table (one row per column) is not truncated.
pd.set_option('display.max_rows', 500)
# Same statistics as the wide table, with one row per column for easier scanning.
display(data_df.iloc[:, 1:].agg(['mean', 'min', 'max', 'std']).T)

## Feature Level Data Analysis

### Number of rows with null values for each column

In [None]:
# Count the nulls of every column in a single vectorised pass. The previous
# version filtered the whole frame once per column just to read its row count,
# which copies all columns of the matching rows on every iteration.
null_counts = data_df.isna().sum()
total_rows = data_df.shape[0]

# Columns are split into two lists that later cells print.
cols_with_null_values = []
cols_without_null_values = []
for col, null_rows in null_counts.items():
    # Plain Python int keeps the printed text identical to the original output.
    null_rows = int(null_rows)
    if null_rows > 0:
        cols_with_null_values.append(col)
        print(f"{col} column has {null_rows} null rows which is {null_rows * 100 / total_rows}% of the data.")
    else:
        cols_without_null_values.append(col)

In [None]:
# Summarise how many columns contain nulls and list both groups.
# NOTE(review): the "- 1" presumably discounts the row_id column — confirm.
column_total = data_df.shape[1] - 1
null_column_total = len(cols_with_null_values)
print(f"Out of {column_total} columns, {null_column_total} columns has null values.\n")
for heading, column_names in (
    ("Columns without null values:\n\n", cols_without_null_values),
    ("Columns with null values:\n\n", cols_with_null_values),
):
    print(heading, column_names, "\n")

### 55 float features

#### Features starting with `F_1_*`

In [None]:
# Use the red/blue palette for the plots below.
sns.set_palette('RdBu')

# float64 columns whose name starts with F_1_, in frame order.
feature_f_1_columns = [
    name
    for name, dtype in data_df.dtypes.items()
    if dtype == 'float64' and name.startswith('F_1_')
]

print(f"{len(feature_f_1_columns)} columns are float columns starting with F_1_*.")

# 5 x 3 grid of panels; zip() pairs each feature with one panel and stops at
# whichever of the two runs out first.
fig, axs = plt.subplots(nrows=5, ncols=3, figsize=(20, 12))

for column, panel in zip(feature_f_1_columns, axs.flat):
    values = data_df[column]
    panel.hist(values, bins=100, density=True)
    panel.set_title(f"{column}: mean={values.mean():.1f}, std={values.std():.1f}")

# The layout is tightened first; the figure title is then placed at y=1.02.
plt.tight_layout()
plt.suptitle(
    "Histograms of the float features starting with F_1_*",
    fontsize=20,
    y=1.02,
)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the F_1_* features,
# with the colour scale centred on zero.
plt.figure(figsize=(12, 8))
sns.heatmap(
    data_df[feature_f_1_columns].corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.suptitle('Correlation in F_1_* features', fontsize=20, y=1.02)
plt.show()

**Observations**

- All features apart from `F_1_7, F_1_12, F_1_13` are normally distributed, are centered around 0 (`Mean = 0`) and have `Standard Deviation = 1`.
- `F_1_7, F_1_12, F_1_13` features have `Standard Deviation = 0.7` and `Mean = -0.1`.

#### Features starting with `F_3_*`

In [None]:
# Use the red/blue palette for the plots below.
sns.set_palette('RdBu')

# float64 columns whose name starts with F_3_, in frame order.
feature_f_3_columns = [
    name
    for name, dtype in data_df.dtypes.items()
    if dtype == 'float64' and name.startswith('F_3_')
]

print(f"{len(feature_f_3_columns)} columns are float columns starting with F_3_*.")

# 5 x 5 grid of panels; zip() pairs each feature with one panel and stops at
# whichever of the two runs out first.
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(30, 16))

for column, panel in zip(feature_f_3_columns, axs.flat):
    values = data_df[column]
    panel.hist(values, bins=100, density=True)
    panel.set_title(f"{column}: mean={values.mean():.1f}, std={values.std():.1f}")

# Here the figure title is added before the layout is tightened.
plt.suptitle(
    "Histograms of the float features starting with F_3_*",
    fontsize=20,
    y=1.02,
)
plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the F_3_* features,
# with the colour scale centred on zero.
plt.figure(figsize=(20, 8))
sns.heatmap(
    data_df[feature_f_3_columns].corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.suptitle('Correlation in F_3_* features', fontsize=20, y=1.02)
plt.show()

**Observations**

- All features apart from `F_3_19, F_3_21` are normally distributed, are centered around 0 (`Mean = 0`) and have `Standard Deviation = 1`.
- `F_3_19, F_3_21` features have `Standard Deviation = 0.7` and `Mean = -0.1`.

#### Features starting with `F_4_*`

In [None]:
# Use the red/blue palette for the plots below.
sns.set_palette('RdBu')

# float64 columns whose name starts with F_4_, in frame order.
feature_f_4_columns = [
    name
    for name, dtype in data_df.dtypes.items()
    if dtype == 'float64' and name.startswith('F_4_')
]

print(f"{len(feature_f_4_columns)} columns are float columns starting with F_4_*.")

# 5 x 3 grid of panels; zip() pairs each feature with one panel and stops at
# whichever of the two runs out first.
fig, axs = plt.subplots(nrows=5, ncols=3, figsize=(20, 12))

for column, panel in zip(feature_f_4_columns, axs.flat):
    values = data_df[column]
    panel.hist(values, bins=100, density=True)
    panel.set_title(f"{column}: mean={values.mean():.1f}, std={values.std():.1f}")

# The layout is tightened first; the figure title is then placed at y=1.02.
plt.tight_layout()
plt.suptitle(
    "Histograms of the float features starting with F_4_*",
    fontsize=20,
    y=1.02,
)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the F_4_* features,
# with the colour scale centred on zero.
plt.figure(figsize=(12, 8))
sns.heatmap(
    data_df[
        feature_f_4_columns
    ].corr(),
    cmap='coolwarm',
    center=0,
    annot=True,
    fmt='.2f'
)
# Title placement and size now match the F_1_* and F_3_* correlation heatmaps;
# the bare suptitle() call used the smaller default font at the default position.
plt.suptitle(
    'Correlation in F_4_* features',
    y=1.02,
    fontsize=20
)
plt.show()

**Observations**

- The `F_4_*` features have the most variation.
- Features `F_4_1, F_4_4, F_4_5, F_4_7, F_4_12, F_4_13` have `Mean = 0.3 or -0.3` and `Standard Deviation = 2.4`.
- The other features have `Mean` values of `-0.2, -0.1, 0.0, 0.3, 0.6` and `Standard Deviation` values of `0.7, 0.8, 2.3, 5.0`.

### Int Features `F_2_*`

In [None]:
# int64 columns whose name starts with F_2_, in frame order.
feature_f_2_columns = [
    f
    for f in data_df
    if data_df[f].dtype == 'int64' and f.startswith('F_2_')
]
print(f"{len(feature_f_2_columns)} columns are int columns.")

# One bar chart per feature on a 5 x 5 grid: bar positions are the distinct
# values of the column, bar heights are how often each value occurs.
figure = plt.figure(figsize=(30, 16))
for i, f in enumerate(feature_f_2_columns):
    plt.subplot(5, 5, i + 1)
    ax = plt.gca()
    vc = data_df[f].value_counts()
    ax.bar(vc.index, vc)
    ax.set_xlabel(f'{f}: mean={data_df[f].mean():.1f}, std={data_df[f].std():.1f}')
    # Restrict the x ticks to integer positions.
    ax.xaxis.set_major_locator(
        MaxNLocator(integer=True)
    )
# The title previously said "float features" (copied from the float cells),
# but this cell plots the int64 F_2_* columns.
plt.suptitle(
    'Histograms of the int features starting with F_2_*',
    y=1.02,
    fontsize=20
)
figure.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the F_2_* features,
# with the colour scale centred on zero.
plt.figure(figsize=(20, 8))
sns.heatmap(
    data_df[
        feature_f_2_columns
    ].corr(),
    cmap='coolwarm',
    center=0,
    annot=True,
    fmt='.2f'
)
# Title placement and size now match the F_1_* and F_3_* correlation heatmaps;
# the bare suptitle() call used the smaller default font at the default position.
plt.suptitle(
    'Correlation in F_2_* features',
    y=1.02,
    fontsize=20
)
plt.show()

**Observations**

- `Mean` is in the `[0.7, 3.3]` range.
- `Standard Deviation` is in the `[1.0, 2.0]` range.

## Features with Null Values

- Find the rows where only 1 feature is null
- Find the rows where more than 1 feature is null

In [None]:
# All float64 columns (row_id is excluded by name as well).
float_features = [
    name
    for name, dtype in data_df.dtypes.items()
    if dtype == 'float64' and name != 'row_id'
]

# For the rows that contain at least one null, count the nulls among the float
# features and show how many rows have each count.
null_row_mask = data_df.isnull().any(axis=1)
display(
    data_df.loc[null_row_mask, float_features]
    .isnull()
    .sum(axis=1)
    .value_counts()
)

In [None]:
# Reset seaborn's theme before drawing the count plot.
sns.set()

fig = plt.figure(figsize=(24, 12))

sns.set_style("darkgrid")

# One bar per "number of nulls in the row", restricted to rows with at least one null.
ax = sns.countplot(
    x=data_df[data_df.isnull().any(axis=1)][float_features].isnull().sum(axis=1), 
    palette='RdBu'
)
# Label every bar with its count. Depending on the seaborn version the bars
# are grouped into one container or into one container per bar, so labelling
# only ax.containers[0] can leave all but the first bar without a count.
for container in ax.containers:
    ax.bar_label(
        container,
        padding=2
    )
fig.suptitle(
    'Number of null values per row',
    horizontalalignment='center', verticalalignment='bottom', fontsize=15,
    y=.9
);

**Observations**

- There are `635226` rows with null values out of `1000000` total rows.
- Most of the rows have either 1 or 2 null values. 2 rows have 9 null values as well.

## Credits

- This notebook would not have been possible without all the great people who have shared amazing notebooks in the previous Playground Series competitions.
- I have learned a lot from reading and reproducing [@ambrosm](https://www.kaggle.com/ambrosm)'s notebooks from this year's Playground Series competitions. The float and integer feature exploration is based on @ambrosm's [May Playground EDA Notebook](https://www.kaggle.com/code/ambrosm/tpsmay22-eda-which-makes-sense); do check it out for more interesting techniques to explore this type of dataset.


**Thanks for reading this notebook; feedback and comments are welcome. Enjoy this competition and keep on learning...**