In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random
from pathlib import Path
from IPython.display import Markdown, display

from scipy.stats import norm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Set 'ggplot2' as the default plotly template for this session.
pio.templates.default = 'ggplot2'

In [None]:
# Directory holding the competition files; the sample submission is read from it.
dpath = Path('../input/tabular-playground-series-feb-2021')
sample_submission_path = dpath / 'sample_submission.csv'
sample_sub = pd.read_csv(sample_submission_path)

In [None]:
# Train and test are stacked into one frame for the EDA. The id column carries no
# information here, so it is dropped. ignore_index=True gives every row a unique
# label; without it the row labels of train.csv and test.csv would repeat.
full = pd.concat(
    [pd.read_csv(dpath / 'train.csv'), pd.read_csv(dpath / 'test.csv')],
    ignore_index=True,
).drop(columns=['id'])

# A 10% sample for testing code on a reduced amount of data.
# The fixed random_state makes the sample identical on every run.
sample = full.sample(frac=0.1, random_state=42)

In [None]:
# Column names of the continuous features (cont0 .. cont13)
# and of the categorical features (cat0 .. cat9).
cont_cols = ['cont' + str(idx) for idx in range(14)]
cat_cols = ['cat' + str(idx) for idx in range(10)]

First things first: there are neither NAs in the features nor duplicates in the dataset (only the `target` is missing, namely for the rows that come from the test set).

In [None]:
# Number of missing values per column; only columns with at least one NA are listed.
na_counts = full.isna().sum().to_frame(name='NA')
na_counts.query('NA > 0')

# Duplicates and duplicated categories

In [None]:
# Number of rows that repeat an earlier row in every column.
duplicated_rows = full.duplicated()
duplicated_rows.sum()

But when only the categorical columns are considered, almost every row has a duplicate.

In [None]:

# Share of rows whose combination of categorical values occurs more than once.
shares_categories = full[cat_cols].duplicated(keep=False)
shares_categories.sum() / len(full)

A brief look at the rows sorted by the duplicated categories reveals that even if all categories are identical, the target differs. (There are no duplicates in the continuous variables.)

In [None]:
# Number of distinct levels for each categorical column, shown as a bar chart.
count_categories = full[cat_cols].nunique()

bar_labels = {'index': '', 'value': 'count'}
fig = px.bar(
    count_categories,
    title='Number of different levels per category',
    labels=bar_labels,
)
fig.show()

In [None]:
# Render a Markdown note that compares the number of possible category combinations
# (the product of the per-column level counts) with the number of rows in `full`.
Markdown(f"""
Given the number of possible combinations of categories {np.prod(count_categories.values)} and the number of samples {len(full)}, it could be that duplicated categories are not happen by random.
""")

In [None]:
# Rows whose categorical combination occurs more than once, sorted so that identical
# combinations end up next to each other; only the first 20 rows are shown.
has_cat_duplicate = full[cat_cols].duplicated(keep=False)
full.loc[has_cat_duplicate].sort_values(by=cat_cols).head(20)

However, when counting how often duplicates occur, the distribution looks ok.

In [None]:
# Build one combined key per row from all categorical columns and store it as the
# `CombCat` column, which is grouped on below.
# NOTE(review): assumes the categorical columns hold strings, so the row-wise sum
# concatenates them - confirm against the data.
full['CombCat'] = full[cat_cols].sum(axis=1)

# Number of rows per combination, shown as a histogram with a logarithmic count axis.
count_cat_combs = full.groupby('CombCat').size()
count_cat_combs.hist(bins=100, log=True);

# Target Distribution

The `target` distribution looks bimodal, somewhat like a double Gaussian. Sometimes the reason for a bimodal distribution is a hidden variable which separates the distribution into two
2-dim Gaussians which don't overlap.

In [None]:
# Histogram of the target over 100 bins (the trailing semicolon hides the Axes repr).
target_values = full['target']
target_values.hist(bins=100);

When splitting by the categorical features, the heights of both `gaussians` change depending on the categorical variable. Unluckily, none of them splits the data into two Gaussians.

*Notes*:
* Maybe one can replace each category with the mean and std of a double-Gaussian fit for each categorical variable.

In [None]:
# One figure per categorical column: overlaid, normalised target histograms,
# one for each level of that column.
for cat_col in cat_cols:
    target_by_level = full.groupby(cat_col)['target']
    target_by_level.hist(bins=100, density=True, alpha=0.4)
    plt.show()

Even when taking the 5 most common combinations of categories, there is no clean split of the distribution.

In [None]:
# Overlay the target histograms of the five most frequent category combinations.
common_comb = count_cat_combs.nlargest(5).index
in_common_comb = full['CombCat'].isin(common_comb)
full[in_common_comb].groupby('CombCat')['target'].hist(bins=100, alpha=0.3);

In [None]:
# Mean, median and standard deviation of the target for each selected combination.
common_rows = full[full['CombCat'].isin(common_comb)]
common_rows.groupby('CombCat')['target'].agg(['mean', 'median', 'std'])

None of the categories creates easily separable Gaussians. All centers lie close to each other, at least with respect to their width.

In [None]:

# For every categorical column, fit a normal distribution to the target of each level
# and print the fitted parameters per level.
# `target` is selected before `apply`, so the applied function never sees the grouping
# column (pandas deprecated applying over it) and `dropna` only removes missing targets.
for cat in cat_cols:
    print(full.groupby(cat)['target'].apply(lambda t: norm.fit(t.dropna())))

In [None]:
# Fit a normal distribution to the target of every (cat0, cat1) combination.
# `target` is selected before `apply`, so the applied function never sees the grouping
# columns (pandas deprecated applying over them) and `dropna` only removes missing targets.
full.groupby(['cat0', 'cat1'])['target'].apply(lambda t: norm.fit(t.dropna()))

# Continuous variables

None of the continuous variables has considerable outliers. Given that |z| < 3 covers about 99.7% of a Gaussian, the data is more centered than one would expect for a Gaussian.
However, the target shows a few outliers; there is even a 0.0 within the data.

*Notes*:
* Filter the outliers in `target`, at least the very extreme ones.

In [None]:
# Absolute z-score above which a value is treated as an outlier.
z_thr = 3

# For every continuous column (and the target itself) plot the column against the
# target. Points whose absolute z-score exceeds the threshold are drawn in red.
for col in cont_cols + ['target']:
    z_score = ((full[col] - full[col].mean()) / full[col].std(ddof=0)).abs()
    outlier = z_score > z_thr
    # The column name is interpolated so that every figure is identifiable;
    # the title also reports the number and the share of outliers.
    title = f'{col} - outlier: {outlier.sum()} ({outlier.mean()})'
    ax = full[~outlier].plot.scatter(x=col, y='target', alpha=0.2, title=title)
    # Only add the red layer when there is something to draw.
    if outlier.any():
        full[outlier].plot.scatter(x=col, y='target', alpha=0.2, ax=ax, color='red', title=title)
    plt.show()

In [None]:
# Rows with a target value below 2.
low_target = full['target'] < 2
full[low_target]

# Gaussian Mixture Model

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Rows with a known target, restricted to the continuous features and the target.
has_target = full['target'].notna()
train = full.loc[has_target, cont_cols + ['target']].copy(deep=True)

In [None]:
# Separate features and target, then hold out a third of the rows for evaluation.
# The fixed random_state makes the split, and every score derived from it, reproducible.
y = train['target']
x = train.drop(columns=['target'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [None]:
# Fit a 9-component Gaussian mixture on the continuous features.
# GaussianMixture is unsupervised: `fit` ignores `y`, so the target is not passed.
# random_state fixes the initialisation, so repeated runs give the same components.
m = GaussianMixture(n_components=9, random_state=42)
m.fit(x_train.values);

A dumb (naive) Gaussian Mixture Model does not work; it's a pity.

In [None]:
# GaussianMixture.predict returns component indices, not target values, so they cannot
# be scored against the target directly. Each component is mapped to the mean training
# target of the rows assigned to it, and that mean is used as the prediction.
train_components = m.predict(x_train.values)
component_target_mean = y_train.groupby(train_components).mean()

test_components = pd.Series(m.predict(x_test.values), index=y_test.index)
# Components that received no training row fall back to the overall training mean.
y_pred = test_components.map(component_target_mean).fillna(y_train.mean())

# Root mean squared error on the held-out rows.
np.sqrt(mean_squared_error(y_test, y_pred))

# Histograms of continuous variables

In [None]:
# One histogram (80 bins) per continuous column, followed by one for the target.
hist_columns = cont_cols + ['target']
for column in hist_columns:
    full[column].hist(bins=80)
    plt.show()

# 2D Histograms: continuous variables vs. target

All two-dimensional histograms look mirrored around some axis.

In [None]:
# 2D histogram of every continuous column against the target.
# Rows with missing values are dropped once, before the loop, instead of twice per column.
labelled = full.dropna()
for col in cont_cols:
    plt.hist2d(x=labelled[col], y=labelled['target'], bins=100)
    # Axis labels make the otherwise identical-looking figures identifiable.
    plt.xlabel(col)
    plt.ylabel('target')
    plt.show()

In [None]:
# 2D histogram of every continuous column against the target, one figure per level
# of cat2. The rows without missing values are grouped once, before the loops.
labelled_by_cat2 = full.dropna().groupby('cat2')
for col in cont_cols:
    for level, group in labelled_by_cat2:
        plt.hist2d(x=group[col], y=group['target'], bins=100)
        # Without the level in the title the figures cannot be told apart.
        plt.title(f'{col} vs. target (cat2 = {level})')
        plt.xlabel(col)
        plt.ylabel('target')
        plt.show()

# split train variable

In [None]:
# Rows with a known target, with all feature columns. The helper column `CombCat` is
# removed when present; errors='ignore' keeps the cell runnable when it does not exist.
train = full[full['target'].notna()].drop(columns=['CombCat'], errors='ignore').copy(deep=True)

In [None]:
# Fit a two-component Gaussian mixture to the target alone, then print the two
# component means and their midpoint.
# random_state fixes the initialisation, so the fitted means are reproducible.
m = GaussianMixture(n_components=2, random_state=42)
m.fit(train['target'].values.reshape(-1, 1))
print(m.means_)
print(np.sum(m.means_) / 2.0)

In [None]:
# Flag each row by whether its target lies below the midpoint of the two mixture
# means (1 = below, 0 = otherwise).
means_midpoint = np.sum(m.means_) / 2.0
below_midpoint = train['target'] < means_midpoint
train['SET'] = below_midpoint.astype(int)

In [None]:
# Number of rows in each SET group.
set_groups = train.groupby('SET')
set_groups.size()

# Train on target < 7.44...

At least training a simple RandomForest does not help to distinguish if target is < 7.44 or not.

*Notes*:
* Train better classifier on target < 7.44...

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# One-hot encode the non-numeric columns.
train = pd.get_dummies(train)

# Features are everything except the raw target and the SET label derived from it.
train_x = train.drop(columns=['target', 'SET'])
train_y = train['SET']

# 70/30 split. The fixed random_state makes the split and the reported accuracies reproducible.
train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

In [None]:
# Random forest that predicts the SET label from the features.
# random_state fixes the bootstrap samples and feature choices, so repeated runs
# build the same forest and report the same accuracies.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
rf.fit(train_x, train_y)

In [None]:
# Accuracy on the training split first, then on the held-out split.
for features, labels in ((train_x, train_y), (test_x, test_y)):
    print(accuracy_score(labels, rf.predict(features)))

# Correlation Matrix

As can be seen from the correlation matrix below, most of the continuous features are not correlated much with the `target`.
`cont13` is almost not correlated with the target at all.

In [None]:
# Correlation heatmap of the numeric columns only. String columns cannot be correlated,
# and DataFrame.corr raises on them in recent pandas versions instead of skipping them.
corr_matrix = full.select_dtypes(include='number').corr()
fig = px.imshow(corr_matrix)
# Put the column labels above the matrix.
fig.update_xaxes(side="top")
fig.show()