In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib
import matplotlib.pyplot as plt # plotting
%matplotlib inline 
print("matplotlib version: {}". format(matplotlib.__version__))

import seaborn as sns
print("seaborn version: {}". format(sns.__version__))

import sklearn # machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About
This notebook is my EDA for the Tabular Playground Series May 2021. No modelling yet.

Let's look at training and testing data.

In [None]:
# Load the competition files (paths are relative to the notebook's working directory).
df_train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
# Submission template; the baseline cells further down fill its Class_1 ... Class_4 columns.
sample_submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')
# Train and test stacked with a fresh 0..n-1 index, because sometimes it is convenient to work
# on both sets together. pd.concat is used because DataFrame.append was removed in pandas 2.0;
# columns missing from one frame (presumably `target` for the test rows) are filled with NaN.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Preview the first rows of the training data (head() defaults to 5 rows).
df_train.head()

In [None]:
# Preview the first rows of the test data (head() defaults to 5 rows).
df_test.head()

In [None]:
# Summary statistics of the training data, one row per numeric column (the id row is removed).
# Bars visualise mean and std; a colour gradient highlights the maxima.
(
    df_train
    .describe()
    .T
    .drop(index="id")
    .style
    .bar(subset=["mean", "std"])
    .background_gradient(subset=["max"])
)

In [None]:
# Summary statistics of the test data, one row per numeric column (the id row is removed).
# Bars visualise mean and std; a colour gradient highlights the maxima.
(
    df_test
    .describe()
    .T
    .drop(index="id")
    .style
    .bar(subset=["mean", "std"])
    .background_gradient(subset=["max"])
)

In [None]:
# Number of rows with any value below zero; .shape is (matching rows, columns).
# `target` is dropped before the comparison (the notebook text describes it as a string column).
# The axis is passed by keyword: the positional form `.any(1)` is rejected by pandas >= 2.0.
train_without_target = df_train.drop(columns=["target"])
display(df_train[(train_without_target < 0).any(axis=1)].shape)
df_test[(df_test < 0).any(axis=1)].shape

In [None]:
# check the target variable: number of training rows per distinct target value
df_train.target.value_counts()

What do we have here? **Training data** with 50 anonymized numerical features and 100000 training examples, identified with an id. There are no missing values. Data type is integer. 

The **target** is categorical with four values. Its data type is object (str). Class_2 is dominant with more than half of the examples. Class_1 has the fewest examples (less than 10%). We need to account for this class imbalance when modelling. 

Most features have a minimum value of 0. Features 19, 30, 31, 32, 35, 38, 39 and 42 have negative values. There are in total 227 training examples with one or more negative values, which is 0.23%.  The maximum value for any feature is 66.

The features differ in terms of their mean and standard deviation. So depending on my choice of model later on, mean normalization will be necessary. 

The **test data** consists of 50000 examples. There are in total 109 test examples with negative values, which is 0.22%. Features 14, 19, 30, 31, 32, 38 and 39 have negative values. This is different from the training set!

Note to myself: the percentage of rows with negative values is very low. Check model performance with and without the negative rows.

Let's look at the **distributions**.

In [None]:
# NOTE(review): scratch calculation — presumably rescales the 109 negative-value test rows
# (50000 test rows) to the size of the training set (100000 rows) for comparison with the
# 227 negative-value training rows; confirm or remove.
109*2

In [None]:
# code for mean normalization. all features are centered around 0 and have variance in the same order
#scaler = StandardScaler() 
#df_temp = pd.DataFrame(data=scaler.fit_transform(df_train.drop(["target"],axis=1)),columns=df_train.drop(["target"],axis=1).columns)
#df_temp.describe().transpose()

In [None]:
# One histogram per feature of the training data (id and target are left out).
fsize = (18, 18)
train_feature_columns = df_train.drop(columns=["id", "target"])
train_feature_columns.hist(figsize=fsize)
# DataFrame.hist leaves its figure as the current one, so the title goes on that figure.
plt.gcf().suptitle("Distributions in training data (df_train)", fontsize=14)
plt.show()

In [None]:
# One histogram per feature of the test data (id is left out).
fsize = (18, 18)
test_feature_columns = df_test.drop(columns=["id"])
test_feature_columns.hist(figsize=fsize)
# DataFrame.hist leaves its figure as the current one, so the title goes on that figure.
plt.gcf().suptitle("Distributions in test data (df_test)", fontsize=14)
plt.show()

We can see here that all distributions are right skewed. For feature_38 there is a visible difference in distribution in train and test data. 

In [None]:
# choose feature for a closer look
current_feature = "feature_8"
current_df = df_train
print(current_feature)

# The size is set BEFORE the figure is created. Previously the figure was built with whatever
# `fsize` an earlier cell had left behind, so the first run and a re-run produced different sizes.
fsize = (16, 12)
fig, (ax0, ax1) = plt.subplots(2, 1, figsize=fsize)  # two panels stacked vertically
sns.histplot(x=current_feature, data=current_df, ax=ax0)  # top panel: histogram
sns.boxplot(x=current_feature, data=current_df, ax=ax1)  # bottom panel: boxplot, shows gaps in the upper range
plt.show()

# how often each distinct value of the feature occurs
print(current_df[current_feature].value_counts())

In [None]:
# let's check if there are as many unique feature values as the range of values
df_features = df_train.drop(["id","target"],axis=1) # use df_all, df_test here depending on what you want to see

feature_range = df_features.max() - df_features.min()  # max - min per feature
no_unique_values = df_features.nunique()  # distinct values actually present per feature
# A gap-free integer feature takes feature_range + 1 distinct values, so the two columns above
# are off by one and cannot be compared directly. This column counts the integers inside
# [min, max] that never occur (0 means no gaps).
# Assumes integer-valued features, as stated in the notebook text — confirm for other data.
no_unused_values = feature_range + 1 - no_unique_values

pd.DataFrame(data={
    "feature_range": feature_range,
    "no_unique_values": no_unique_values,
    "no_unused_values": no_unused_values,
})

Observations:

- There are no binary features

- There are features with few unique values (low cardinality), and with many unique values (high cardinality). Is it safe to guess that features with "few" unique values are categorical features and those with many are continuous features???

- When looking at the training and testing data to check if the feature values are "continuous" (i.e. without gaps), I found this to be the case for most features. However, there are features where the values go like ...27, 28, 29, 31 (feature_1) or 31, 32, 33, 34, 36, 39 (feature_8). This can be best seen using the sns boxplot.

 In some cases the unused feature values are present in the test set (like for feature_1), in other cases only partly (like for feature_8). 

 I'm not sure what to make of this observation. I wonder if a feature like feature_8 is a real continuous feature. Because, if it were categorical, there should not be any missing labels...?

- About the features with negative values: Either they are categorical features that have been label encoded "by hand" to produce the negative values, or they are continuous features. However, I still could not come up with a feature of an eCommerce product that has multiple negative values. I considered that a negative value just means "missing data". But again, this does not explain multiple negative values for a feature. 

Thanks to OmarVivas who informed us about **duplicates** in this data set. Let's check.

There are really 4 duplicates in the training set with different targets. I will remove these rows before training.

There are a further 6 rows from the training set that are also present in the test set. Remember to check those rows in the prediction. 

In [None]:
# look only in training data: rows whose feature values (everything except id and target)
# also occur in another training row
train_feature_values = df_train.drop(columns=["id", "target"])
# keep="first" marks the later occurrence of each pair, keep="last" marks the earlier one
for kept_occurrence in ("first", "last"):
    display(df_train[train_feature_values.duplicated(keep=kept_occurrence)])
# 44423 - 13230, Class_4 vs Class_2
# 73244 - 25648, Class_2 vs Class_4
# 80571 - 44248, Class_4 vs Class_2
# 89009 - 87104, Class_1 vs Class_3

In [None]:
# look in combined data, keep=False flags every member of a duplicated group
# Row labels of the duplicate pairs listed in the training-only check; they are removed here
# so that only the remaining duplicated rows are shown.
train_only_duplicate_rows = [13230, 25648, 44248, 87104, 44423, 73244, 80571, 89009]
is_duplicated_row = df_all.drop(columns=["id", "target"]).duplicated(keep=False)
duplicates = df_all[is_duplicated_row].drop(index=train_only_duplicate_rows)
duplicates

In [None]:
# class_3 36458 -> 143136
# class_2 31717 -> 132016
# class_4 23272 -> 143298
# Class_2 62190 -> 114770
# Class_4 81438 -> 120754
# Class_4 63143 -> 101173

Baseline submission:

In [None]:
# Baseline submission: the sample submission with one row overridden to a hard Class_3 prediction.
# NOTE(review): the original comment said "predict everything Class_2", but no statement in this
# cell sets every row to Class_2; the recorded score below presumably belongs to an earlier
# version of the cell — confirm before relying on it.
submission_1 = sample_submission.copy()  # work on a copy so the template stays unchanged
# A single .loc[row, columns] assignment is used: the chained form .loc[row][col] = value
# assigns into a temporary row object and is not guaranteed to change the frame.
# NOTE(review): 36458 is used as a row label of the submission frame, while the duplicate notes
# list 36458 as a training row whose test counterpart is 143136 — confirm the intended row.
submission_1.loc[36458, ["Class_1", "Class_2", "Class_3", "Class_4"]] = [0, 0, 1, 0]
submission_1.to_csv('submission_1.csv', index=False)
#-> public leaderboard score: 14.62209

In [None]:
# share of training rows per target value (normalize=True returns fractions instead of counts)
df_train.target.value_counts(normalize=True)

In [None]:
# predict like train set probabilities: every row gets the same constant value per class
# (hand-entered values; note that they sum to 0.98)
class_priors = {
    "Class_1": 0.08,
    "Class_2": 0.57,
    "Class_3": 0.21,
    "Class_4": 0.12,
}
for class_column, prior in class_priors.items():
    sample_submission[class_column] = prior  # overwrites the whole column in place
sample_submission.to_csv('submission_2.csv', index=False)
#-> public leaderboard score: 1.11369