# Description

**Kaggle description for this dataset**: The original dataset deals with identifying spam emails via various
features extracted from the email. Although the features are anonymized, they have properties relating to real-world features.

**Notebook description**: This is mostly an EDA notebook, adding a simple classification model and prediction at the end.

In [None]:
import datatable as dt # for quicker loading of dataframes
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# Data overview

In [None]:
# Read the training CSV with datatable's fread, then convert the result to a pandas DataFrame.
train_df = dt.fread('/kaggle/input/tabular-playground-series-nov-2021/train.csv').to_pandas()

In [None]:
# Print a per-column summary of the training frame.
train_df.info()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

All columns are numeric (float), except for the target.

Id column is not needed, so it will be dropped

In [None]:
# Remove the row identifier column, which is not needed for the analysis,
# then display the resulting frame.
train_df = train_df.drop(columns=['id'])
train_df

Display an overview of all data

In [None]:
# Transposed summary statistics (one row per column), styled for quick scanning:
# a blue gradient over the cells plus in-cell bars for the mean, std and max columns.
stats_styler = train_df.describe().T.style.background_gradient(cmap='Blues')
stats_styler = stats_styler.bar(subset=['mean'], color='lightgreen')
stats_styler = stats_styler.bar(subset=['std'], color='#ee1f5f')
stats_styler.bar(subset=['max'], color='#FFA07A')

Check for null/missing values

In [None]:
# Number of columns that contain at least one missing value.
columns_with_nan = train_df.isna().any(axis=0)
columns_with_nan.sum()

Check for duplicates

In [None]:
# Count fully duplicated rows. `duplicated()` flags each row that repeats an
# earlier one, so summing the flags gives the number of duplicates (0 = none).
# Reducing with `.any()` first would collapse everything to a single boolean,
# and the cell could then only ever show 0 or 1.
train_df.duplicated().sum()

Target is well balanced:

In [None]:
# Bar chart of how many rows fall into each target class.
sns.countplot(x=train_df['target']);

# Data correlations and distributions

Get a subsample of data to proceed faster with the analysis

In [None]:
# Draw a reproducible (fixed seed) random subset of 25,000 rows so that the
# analysis on it runs faster than on the full training frame.
train_sample = train_df.sample(n=25000, random_state=42)
train_sample

Evaluate Pearson correlation (on a subset of 25000 samples)

In [None]:
# Pairwise Pearson correlation between the columns of the subsample.
corr_all = train_sample.corr(method='pearson')
corr_all

In [None]:
sns.set(style="white", font_scale=1)
# Hide the upper triangle (diagonal included): the correlation matrix is
# symmetric, so that half only repeats the lower triangle.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr_all, dtype=bool))
f, ax = plt.subplots(figsize=(24, 18))
f.suptitle("Correlation Matrix", fontsize = 10)
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # custom diverging colormap
# Draw on the axes created above; vmax caps the colour scale at 0.3.
sns.heatmap(corr_all, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax);

There is little correlation between the features.

There appears to be some correlation between some features and the target, however it is minimal and the graphic above might lead to some confusion. See below:

In [None]:
# The ten largest correlations in the target column of the correlation matrix.
corr_all['target'].sort_values(ascending=False).head(10)

In [None]:
# The ten smallest (most negative) correlations in the target column.
corr_all['target'].sort_values(ascending=True).head(10)

In [None]:
# Bar chart of every column's correlation with the target, computed on the subsample.
target_corr = train_sample.corrwith(train_sample['target'])
target_corr.plot.bar(
    figsize=(20, 5),
    title='Correlation with Target Variable',
    fontsize=10,
    rot=90,
    grid=True,
);

Data distribution (on a subset of 25000 samples)

In [None]:
# 10x10 grid of kernel density estimates, one panel per feature f0..f99.
fig, axes = plt.subplots(nrows=10, ncols=10, figsize=(20, 15))
axes = axes.ravel()
for idx, ax in enumerate(axes):
    feature = f'f{idx}'
    sns.kdeplot(data=train_sample, x=feature, fill=True, ax=ax)
    # Strip ticks and axis labels; the panel title identifies the feature.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')
    ax.set_title(feature, loc='right', fontsize=12)
fig.tight_layout()
plt.show()

Many of these distributions seem to be bimodal

Data distribution considering the target:

In [None]:
# Same 10x10 density grid, with each feature's density split by target class.
fig, axes = plt.subplots(nrows=10, ncols=10, figsize=(20, 15))
axes = axes.ravel()
for idx, ax in enumerate(axes):
    column = f'f{idx}'
    show_legend = idx == 0  # draw the legend on the first panel only
    sns.kdeplot(data=train_sample, x=column, hue='target', fill=True,
                legend=show_legend, ax=ax)
    # Strip ticks and axis labels; the panel title identifies the feature.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')
    ax.set_title(column, loc='right', fontsize=12)
fig.tight_layout()
plt.show()

Box plot distributions

In [None]:
# 20x5 grid of box plots, one panel per feature f0..f99.
fig, axes = plt.subplots(nrows=20, ncols=5, figsize=(10, 20))
axes = axes.ravel()
for idx, ax in enumerate(axes):
    col_name = f'f{idx}'
    sns.boxplot(data=train_sample, x=col_name, ax=ax)
    # Strip ticks and axis labels; the panel title identifies the feature.
    ax.set(xticks=[], yticks=[], xlabel='', ylabel='')
    ax.set_title(col_name, loc='right', fontsize=12)
fig.tight_layout()
plt.show()

# Dimensionality reduction analysis

Will try with Linear Discriminant Analysis, given that many of the features seem to follow a normal distribution

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the label column from the feature matrix.
y = train_df['target']
X = train_df.drop(columns=['target'])

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# LDA estimator configured to project the data onto a single discriminant component.
clf = LinearDiscriminantAnalysis(n_components=1)

In [None]:
# Fit LDA on the training split, then project that same split onto the
# discriminant axis.
clf.fit(X_train, y_train)
X_r = clf.transform(X_train)

In [None]:
# Plot the projected values along a horizontal line (y fixed at 0), coloured by class.
plt.figure(figsize=(20, 10))
baseline = np.zeros(len(X_r))
plt.scatter(X_r, y=baseline, c=y_train)

At first sight, it seems like there might be some good data separation; let's try box plots to get a better idea

In [None]:
# Split the projected training values by class label.
is_positive = y_train == True
is_negative = y_train == False
X_r_positive, X_r_negative = X_r[is_positive], X_r[is_negative]

In [None]:
# Box plots of the projected values: positive class first, negative class second.
plt.figure(figsize=(10, 10))
sns.boxplot(data=[X_r_positive, X_r_negative])

Class boundary is around 0

# Obtaining a prediction and submission

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Mean accuracy on the training and validation splits.
print(clf.score(X_train, y_train))
print(clf.score(X_valid, y_valid))
# ROC AUC is a ranking metric and needs continuous scores. Hard labels from
# `predict` describe only a single operating point and misstate the AUC, so
# score with the probability in column 1, as the ROC curve cell does.
print(roc_auc_score(y_valid, clf.predict_proba(X_valid)[:, 1]))

In [None]:
from sklearn.metrics import roc_curve
# ROC curve on the validation split, scored with the column-1 predicted probability.
valid_scores = clf.predict_proba(X_valid)[:, 1]
fpr, tpr, _ = roc_curve(y_valid, valid_scores, pos_label=1)
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate');

In [None]:
# Load the test CSV with datatable, convert to pandas, then drop its id column.
test_df = dt.fread('/kaggle/input/tabular-playground-series-nov-2021/test.csv').to_pandas()
test_df = test_df.drop(columns='id')

In [None]:
# Predicted class probabilities for every test row; column index 1 is displayed.
test_predictions = clf.predict_proba(test_df)
test_predictions[:, 1]

In [None]:
# Fill the sample submission's target column with the column-1 probabilities,
# write it to submission.csv and preview the first rows.
sub_df = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")
sub_df = sub_df.assign(target=test_predictions[:, 1])
sub_df.to_csv("submission.csv", index=False)
sub_df.head()

Evaluation using only the bimodal and non-bimodal features

In [None]:
# Hand-picked indices of the features treated as bimodal, and the matching
# column names ('f<index>').
bimodals = [
    1, 3, 5, 6, 7, 8, 10, 11, 13, 14,
    15, 17, 18, 22, 25, 26, 29, 34, 37, 38,
    40, 41, 43, 45, 47, 50, 54, 55, 57, 65,
    66, 67, 70, 71, 74, 77, 80, 82, 85, 86,
    91, 96, 97,
]
bimodal_features = [f'f{index}' for index in bimodals]

In [None]:
# Repeat the 80/20 split with the same seed, using only the bimodal feature columns.
X_bimodal = X[bimodal_features]
X_train, X_valid, y_train, y_valid = train_test_split(
    X_bimodal, y, test_size=0.2, random_state=42
)

In [None]:
# Refit the same LDA estimator on the current training split.
clf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training split, then on the validation split.
for split_X, split_y in ((X_train, y_train), (X_valid, y_valid)):
    print(clf.score(split_X, split_y))

In [None]:
# Indices in 0..99 that are NOT in the bimodal list. The name previously held
# the bimodal set itself, the opposite of what it says. Sorting makes the
# column order explicit instead of relying on set iteration order.
non_bimodals = sorted(set(range(100)) - set(bimodals))
# Matching column names ('f<index>') for the non-bimodal features.
non_bimodal_features = [f'f{i}' for i in non_bimodals]

With only non bimodal features, around a 58% score is obtained

What about trying to remove outliers from the training data?

Using Local Outlier Factor, it removed around 2000 samples. Results didn't improve.

In [None]:
# Disabled experiment, kept for reference: filter the training split with
# Local Outlier Factor, refit the classifier and re-score it.
# %%time
# from sklearn.neighbors import LocalOutlierFactor

# lof = LocalOutlierFactor()
# yhat = lof.fit_predict(X_train)

# mask = yhat != -1

# X_train_filtered, y_train_filtered = X_train[mask], y_train[mask]

# print(X_train_filtered.shape, y_train_filtered.shape)
# clf.fit(X_train_filtered, y_train_filtered)

# print(clf.score(X_train_filtered, y_train_filtered))
# print(clf.score(X_valid, y_valid))
# print(roc_auc_score(y_valid, clf.predict(X_valid)))