In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# October 2021 Tabular Playground
This notebook walks through an end-to-end data science workflow using the dataset from the October 2021 Tabular Playground, and achieves competitive performance.
* [**Author**](https://www.linkedin.com/in/chi-wang-22a337207/)
* [**Dataset**](https://www.kaggle.com/c/tabular-playground-series-oct-2021/data)

# Tips
* It's a good strategy to compress the dataset by data type transformation. Eg. **Float64 --> Float16**
* Release variables that occupy huge memory. **del xxx; gc.collect()**
* Generating new features based on row-wise statistics **(min/max/mean/std/mode)** can help improve the results.

# Issues
* Could try dimensionality reduction (PCA, correlation analysis) to speed up training.
* Stacking is worth trying.

# Table of Contents
1. [Data Overview](#1)
    * [1. Load Data](#1.1)
    * [2. Data Type](#1.2)
    * [3. Statistical View](#1.3)
2. [Data Preprocessing](#2)
    * [1. Drop Irrelevant Columns](#2.1)
    * [2. Missing Value Detection](#2.2)
    * [3. New Feature Generation](#2.3)
3. [Data Analysis](#3)
    * [1. What is the distribution of label? ](#3.1)
    * [2. What is the distribution of numerical features on target? ](#3.2)
    * [3. What is the distribution of categorical features on target? ](#3.3)
4. [Modelling](#4)
    * [1. Train Test Split ](#4.1)
    * [2. Train Models ](#4.2)
        * [1. XGBoost ](#4.2.1)
        * [2. CatBoost ](#4.2.2)
        * [3. LightGBM ](#4.2.3)
    * [3. Model Comparison ](#4.3)
    * [4. Best Model Explanation ](#4.4)
    * [5. Parameter/Feature Tuning ](#4.5)
5. [Prediction](#5)
    * [1. Load Data](#5.1)
    * [2. Drop Irrelevant Columns](#5.2)
    * [3. New Feature Generation](#5.3)
    * [4. Make Prediction](#5.4)
    * [5. Save the Prediction to CSV file](#5.5)

<a id="1"></a>
# 1. Data Overview

In [None]:
# Import packages
import time
import gc

## Basic data processing
import numpy as np
import pandas as pd

## Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

## Modelling
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, VotingClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, plot_importance

## Model Explanatory
import shap  # package used to calculate Shap values
import eli5

## Settings
# Raise pandas' display limits so wide frames are shown without truncation.
pd.options.display.max_columns = 500       # columns rendered when a frame is displayed
pd.options.display.max_info_columns = 150  # columns listed individually by DataFrame.info()

<a id="1.1"></a>
## 1.1. Load Data

In [None]:
# Read the competition's training set, then summarise entries, dtypes and memory usage.
train_csv_path = "../input/tabular-playground-series-oct-2021/train.csv"
data_df = pd.read_csv(train_csv_path)
data_df.info()

In [None]:
# Preview the first five rows (5 is the default of head()).
data_df.head()

<a id="1.2"></a>
## 1.2. Data Type

> [NOIR](https://www.questionpro.com/blog/nominal-ordinal-interval-ratio/): Nominal, Ordinal, Interval, Ratio.  

As the features in this dataset have been anonymized, we just assume that the data type of each feature is what it looks like.
* numerical --> Ratio
* categorical --> Nominal/Ordinal

In [None]:
# Split the columns by stored dtype: float64 columns are treated as numerical,
# int64 columns as categorical ("id" and "target" are filtered out in a later cell).
numerical_features, categorical_features = (
    data_df.select_dtypes(include=dtype_name).columns
    for dtype_name in ("float64", "int64")
)
numerical_features.size, categorical_features.size

In [None]:
# Show the representable range of the two compact dtypes chosen for down-casting.
for dtype_limits in (np.iinfo(np.int8), np.finfo(np.float16)):
    print(dtype_limits)

In [None]:
# Shrink the data types to save memory usage.
# At this point `categorical_features` holds every int64 column, including "id".
# "id" is assumed to be a row identifier whose values exceed the int8 range
# (-128..127); casting it would silently wrap the values around, so it is
# left out of the int8 conversion.
compact_int_columns = [column for column in categorical_features if column != "id"]
data_df[numerical_features] = data_df[numerical_features].astype("float16")
data_df[compact_int_columns] = data_df[compact_int_columns].astype("int8")
data_df.info()

In [None]:
# Keep only genuine features: remove the identifier and the label from the list
# (errors="ignore" keeps the cell re-runnable once they are already gone).
categorical_features = categorical_features.drop(["id", "target"], errors="ignore")

<a id="1.3"></a>
## 1.3. Statistical View 

In [None]:
# Describe the label as an "object" column so pandas reports count / unique / top / freq
# rather than numeric moments (nominal data can be treated as "object" for simplicity).
target_as_object = data_df["target"].astype("object")
target_as_object.describe()

In [None]:
# Summary statistics of the numerical feature columns.
data_df[numerical_features].describe()

As declared in this playground, all the features are scaled to **[0, 1]**.

In [None]:
# Summary of the categorical feature columns, described as "object" to get
# count / unique / top / freq.
data_df[categorical_features].astype("object").describe()

All the categorical features contain only two values (**binary**). Most of them are quite **unbalanced**.

<a id="2"></a>
# 2. Data Preprocessing

<a id="2.1"></a>
## 2.1. Drop Irrelevant Columns

In [None]:
# Columns that carry no information for analysis or modelling:
#   id: id is useless for analysis and modeling.
irrelevant_columns = ['id']
data_preprocessed_df = data_df.drop(columns=irrelevant_columns)

<a id="2.2"></a>
## 2.2. Missing Value Detection

In [None]:
# Treat empty strings and single blanks as missing values.
for blank in ("", " "):
    data_preprocessed_df.replace(blank, float("NaN"), inplace=True)

# Percentage of missing entries per column, sorted in descending order and
# wrapped in a one-column dataframe; the final sum is 0 when nothing is missing.
row_count = len(data_preprocessed_df)
count_missing_value = data_preprocessed_df.isna().sum() / row_count * 100
count_missing_value_df = count_missing_value.sort_values(ascending=False).to_frame(name='Missing%')
count_missing_value_df.sum()

There are **no missing values** in this dataset.

<a id="2.3"></a>
## 2.3. New Feature Generation
* Numerical feature: min,max,mean,std
* Categorical feature: sum, mode

In [None]:
def feature_generator(dataset, numerical_features, categorical_features):
    '''
    Description: Generate new features by several row-wise statistic methods.
        The new columns (n_min, n_max, n_std, n_mean, c_sum, c_mode) are
        written into `dataset` in place.
    Args:
        dataset: The chosen dataset (pandas DataFrame), modified in place
        numerical_features: The numerical features in a list
        categorical_features: The categorical features in a list
    Return: None
    '''
    # Select the inputs once; the columns added below are not part of either list.
    numerical = dataset[numerical_features]
    categorical = dataset[categorical_features]

    # Numerical feature
    dataset['n_min'] = numerical.min(axis=1)
    dataset['n_max'] = numerical.max(axis=1)
    dataset['n_std'] = numerical.std(axis=1)
    dataset['n_mean'] = numerical.mean(axis=1)

    # Categorical feature
    dataset['c_sum'] = categorical.sum(axis=1)
    # mode(axis=1) returns a DataFrame with one column per tied mode. Assigning
    # that frame to a single column fails as soon as any row has a tie, so keep
    # only the first (smallest) mode of every row.
    dataset['c_mode'] = categorical.mode(axis=1).iloc[:, 0]

In [None]:
# Append the row-wise statistics to the training frame (the function works in place)
# and record the names of the added columns for the plots further down.
feature_generator(
    dataset=data_preprocessed_df,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)
new_features = ['n_min', 'n_max', 'n_std', 'n_mean', 'c_sum', 'c_mode']

In [None]:
# Work on an independent (deep) copy during the analysis phase.
data_best_df = data_preprocessed_df.copy(deep=True)

<a id="3"></a>
# 3. Data Analysis

<a id="3.1"></a>
## 3.1. What is the distribution of the label?

In [None]:
# Count the number of rows per target value (0/1).
target_counts = data_best_df["target"].value_counts()
# Name the counts column explicitly: pandas < 2.0 names the value_counts() result
# after the source column ("target"), pandas >= 2.0 names it "count", so relying
# on the default name makes the ["target"] lookups below fail on newer versions.
target_counts_df = target_counts.rename("target").to_frame()

# Visualize the distribution of the target(label):
# a bar chart of the counts on the left, a pie chart of the shares on the right.
target_fig = make_subplots(
    rows=1, cols=2, 
    specs=[[{"type": "xy"}, {"type": "domain"}]])

target_fig.add_trace(go.Bar(x=target_counts_df.index, 
                           y=target_counts_df["target"],
                           text=target_counts_df["target"],
                           textposition='outside',
                           showlegend=False),
                           1, 1)

target_fig.add_trace(go.Pie(labels=target_counts_df.index, 
                           values=target_counts_df["target"],
                           showlegend=True),
                           1, 2)

target_fig.update_layout(
                  height=600, 
                  width=1000,
                  title={
                  'text': "The distribution of target",
                  'font': {'size': 24},
                  'y':0.95,
                  'x':0.5,
                  'xanchor': 'center',
                  'yanchor': 'top'},
                  xaxis1_title = 'target', 
                  yaxis1_title = 'Counts',
                  legend_title_text="target"
                 )
# Treat the 0/1 labels as categories so the x axis shows exactly two ticks.
target_fig.update_xaxes(type='category')
target_fig.show()

The label is quite balanced.

<a id="3.2"></a>
## 3.2. What is the distribution of numerical features on target?

In [None]:
# One box plot per numerical feature, split by target, on a 40 x 6 grid
# (axes.flat walks the grid row by row).
f, axes = plt.subplots(40, 6, figsize=(30, 200))
for feature, ax in zip(numerical_features, axes.flat):
    sns.boxplot(x='target', y=feature, data=data_best_df, ax=ax).set_title(feature)

It seems the potential **important features** are: f1, f3, f7, f8, f19, f24, f40, f53, f54, f65, f92, f93, f112  
It seems the potential ***significant features*** are: **f44, f56, f58, f69, f139, f146, f150, f179, f181**

In [None]:
# Box plots of the six generated features, split by target, on a 3 x 2 grid.
f, axes = plt.subplots(3, 2, figsize=(15, 15))
for feature, ax in zip(new_features, axes.flat):
    sns.boxplot(x='target', y=feature, data=data_best_df, ax=ax).set_title(feature)

It seems that **n_mean** and **c_sum** are important features.

<a id="3.3"></a>
## 3.3. What is the distribution of categorical features on target?

In [None]:
# Count plots of target per categorical feature value on an 8 x 6 grid;
# at most the first 45 panels are filled, the rest of the grid stays empty.
f, axes = plt.subplots(8, 6, figsize=(30, 40))
for feature, ax in zip(categorical_features, axes.ravel()[:45]):
    sns.countplot(x="target", hue=feature, data=data_best_df, ax=ax).set_title(feature)

It seems that **f22** is an important feature.

In [None]:
# Count plot of target for each value of the generated feature c_mode.
c_mode_ax = sns.countplot(data=data_best_df, x="target", hue="c_mode")
c_mode_ax.set_title("c_mode")

Overall:   
important features:  f1, f3, f7, f8, f19, f24, f40, f53, f54, f65, f92, f93, f112, n_mean  
significant features: **c_sum, f22, f44, f56, f58, f69, f139, f146, f150, f179, f181**

In [None]:
#signi_feature = ['f1', 'f3', 'f7', 'f8', 'f19', 'f24', 'f40', 'f53', 'f54', 'f65', 'f92', 'f93', 'f112', 'n_mean', 'c_sum', 'f22', 'f44', 'f56', 'f58', 'f69', 'f139', 'f146', 'f150', 'f179', 'f181','target']

<a id="4"></a>
# 4. Modelling

In [None]:
# Work on an independent (deep) copy during the modelling phase.
data_modelling_df = data_best_df.copy(deep=True)

In [None]:
# Drop the frames that are not used from here on, then run a garbage
# collection pass to free their memory.
del data_df
del data_preprocessed_df
del data_best_df
gc.collect()

<a id="4.1"></a>
## 4.1. Train Test Split

In [None]:
# Separate features from the label, then hold out 25% of the rows for evaluation
# (fixed random_state for a reproducible split).
X = data_modelling_df.drop(columns="target")
Y = data_modelling_df["target"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [None]:
# X and Y now hold everything needed; drop the combined frame and run a
# garbage collection pass.
del data_modelling_df
gc.collect()

<a id="4.2"></a>
## 4.2. Train Models
> Let's use three state-of-the-art ensemble models to make predictions

* [XGBoost](https://xgboost.readthedocs.io/en/latest/)
* [CatBoost](https://catboost.ai/)
* [LightGBM](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html)

Thanks to the authors of the following kernels for providing the parameter sets.
* https://www.kaggle.com/mehrankazeminia/1-tps-oct-21-lgbm-auc-evaluation/notebook#notebook-container
* https://www.kaggle.com/stevenrferrer/tps-oct-2021-baseline-lgbm-xgb-cb

<a id="4.2.1"></a>
### 4.2.1 XGBoost

In [None]:
# Start the wall-clock timer for this model
start_time = time.time()

# NOTE(review): "use_label_encoder", "gpu_id", "gpu_hist" and "predictor" are
# reported as deprecated by recent XGBoost releases -- confirm against the installed version.
xgb_params = dict(
    random_state=0,
    n_estimators=10000,
    learning_rate=0.008,
    eval_metric="auc",
    objective="binary:logistic",
    use_label_encoder=False,
    booster="gbtree",
    # GPU
    gpu_id=0,
    tree_method="gpu_hist",
    predictor="gpu_predictor",
)

xgbc = XGBClassifier(**xgb_params)
xgbc.fit(x_train, y_train, verbose=False)

# Elapsed time covers parameter setup and training only, not the scoring below
xgbc_time = time.time() - start_time

# Score the hold-out split with the predicted probability of class 1
predictions = xgbc.predict_proba(x_test)[:, 1]
auc_xgbc = roc_auc_score(y_true=y_test, y_score=predictions)
print(f'AUC: {auc_xgbc}')

<a id="4.2.2"></a>
### 4.2.2 CatBoost

In [None]:
# Start the wall-clock timer for this model
start_time = time.time()

catb_params = dict(
    random_seed=0,
    iterations=10000,
    learning_rate=0.008,
    eval_metric="AUC",
    verbose=0,
    # GPU
    task_type="GPU",
    devices="0",
)

catbc = CatBoostClassifier(**catb_params)
catbc.fit(x_train, y_train, verbose=False)

# Elapsed time covers parameter setup and training only, not the scoring below
catbc_time = time.time() - start_time

# Score the hold-out split with the predicted probability of class 1
predictions = catbc.predict_proba(x_test)[:, 1]
auc_catbc = roc_auc_score(y_true=y_test, y_score=predictions)
print(f'AUC: {auc_catbc}')

<a id="4.2.3"></a>
### 4.2.3 LightGBM

In [None]:
# Start time
start_time = time.time()

lgbc_params = {
    "n_estimators":10000, 
    "learning_rate":0.008, 
    "objective":'binary',                      
    "metric":'auc',                       
    "reg_alpha":10,
    "reg_lambda":0.1,                     
    "num_leaves":31,
    "max_depth":-1,
    "subsample":0.6,
    "subsample_freq":1, 
    "colsample_bytree":0.4,
    "min_child_weight":256,
    "min_child_samples":20, 
    "random_state":0,
    # Silence LightGBM's training log. This is set on the estimator because
    # fit() no longer accepts a `verbose` argument in LightGBM >= 4.0
    # (passing it there raises a TypeError).
    "verbose": -1,
    # GPU
    "device": "gpu"
}

lgbc = LGBMClassifier(**lgbc_params)

# No eval_set is supplied, so eval_metric is not evaluated during training here.
lgbc.fit(x_train, y_train, eval_metric='auc')

# Calculate the training time (parameter setup + fit, excluding the scoring below)
lgbc_time = time.time() - start_time

# Score the hold-out split with the predicted probability of class 1
predictions = lgbc.predict_proba(x_test)[:,1]
auc_lgbc = roc_auc_score(y_test, predictions)
print(f'AUC: {auc_lgbc}')

<a id="4.3"></a>
## 4.3. Model Comparison

In [None]:
# One row per model with its hold-out AUC and training time, best AUC first.
model_comparison = pd.DataFrame(
    {
        'AUC': [auc_xgbc, auc_catbc, auc_lgbc],
        'Time': [xgbc_time, catbc_time, lgbc_time],
    },
    index=["XGboost", "CatBoost", "LGBM"],
).sort_values(by="AUC", ascending=False)
model_comparison

<a id="4.4"></a>
## 4.4. Best Model Explanation

In [None]:
# Plot the 20 most important features of the fitted LightGBM model.
# `plot_importance` is already imported from lightgbm in the notebook's import
# cell, so no additional mid-notebook import is needed.
plot_importance(lgbc, max_num_features=20, figsize=(10, 8))

The result partially matches the findings from Section 3 (Data Analysis).

<a id="4.5"></a>
## 4.5. Parameter/Feature Tuning  

In [None]:
# The hold-out evaluation is finished; drop the split data and run a
# garbage collection pass.
del x_train
del x_test
del y_train
del y_test
gc.collect()

In [None]:
# Soft-voting ensemble of CatBoost and LightGBM: fit() trains clones of both
# estimators on the complete training data (X, Y) and averages their predicted
# probabilities at prediction time.
ensemble_members = [('CatBoost', catbc), ('LGBoost', lgbc)]
voting_clas = VotingClassifier(estimators=ensemble_members, voting='soft', n_jobs=-1)
votingC = voting_clas.fit(X, Y)

In [None]:
# The ensemble is fitted; drop the training data and run a garbage collection pass.
del X
del Y
gc.collect()

<a id="5"></a>
# 5. Prediction

<a id="5.1"></a>
## 5.1. Load Data

In [None]:
# Read the competition's test set, then summarise entries, dtypes and memory usage.
test_csv_path = "../input/tabular-playground-series-oct-2021/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.info()

In [None]:
# Apply the same compact dtypes as for the training data.
for columns, compact_dtype in ((numerical_features, "float16"), (categorical_features, "int8")):
    test_df[columns] = test_df[columns].astype(compact_dtype)
test_df.info()

In [None]:
# Preview the first five rows of the test set.
test_df.head(5)

<a id="5.2"></a>
## 5.2. Drop Irrelevant Columns

In [None]:
# Remove the same irrelevant columns that were dropped from the training data.
test_preprocessed_df = test_df.drop(columns=irrelevant_columns)

In [None]:
# Keep the "id" column; it is needed again when the submission is assembled.
id_df = test_df.loc[:, 'id']

In [None]:
# The raw test frame is not used from here on; drop it and run a garbage collection pass.
del test_df
gc.collect()

<a id="5.3"></a>
## 5.3. New Feature Generation

In [None]:
# Add the same generated features to the test frame (the function works in place).
feature_generator(
    dataset=test_preprocessed_df,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

<a id="5.4"></a>
## 5.4. Make Prediction

In [None]:
# Predicted probability of class 1 from the voting ensemble, placed next to the
# saved ids (both share the default 0..n-1 row index, so the rows line up).
predictions = votingC.predict_proba(test_preprocessed_df)[:, 1]
predictions_df = pd.DataFrame({'target': predictions})
submission_df = id_df.to_frame().join(predictions_df)

<a id="5.5"></a>
## 5.5. Save the Prediction to CSV file

In [None]:
# Write the id/target pairs to the submission file, without the row index.
submission_df.to_csv(path_or_buf='submission.csv', index=False)

# Thanks for reading, Have a good day ~