In [None]:
import warnings

warnings.filterwarnings('ignore', 'SettingWithCopyWarning')
warnings.filterwarnings("ignore", 'Creating legend with loc="best" can be slow with large amounts of data.')

# Tabular Playground Series - Oct 2021

![](https://storage.googleapis.com/kaggle-competitions/kaggle/28009/logos/header.png?)

# Imports

In [None]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datatable as dt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn import metrics, model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from IPython.display import display, Markdown, Latex

# Configuration

In [None]:
# matplotlib
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)  
plt.rc('xtick', labelsize=10)  
plt.rc('ytick', labelsize=10)

# seaborn
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")

In [None]:
class Cfg:
    """Notebook-wide configuration: file paths, sampling sizes and column names.

    `FEATURES` is only an initial guess; it is overwritten from the loaded
    training frame further down the notebook, where `NUM_FEATURES` and
    `BINARY_FEATURES` are added as well.
    """
    # seed shared by sampling, splitting and the seeded estimators
    RANDOM_STATE = 2021

    # competition input files and the name of the file written at the end
    TRAIN_DATA = '../input/tabular-playground-series-oct-2021/train.csv'
    TEST_DATA = '../input/tabular-playground-series-oct-2021/test.csv'
    SUBMISSION = '../input/tabular-playground-series-oct-2021/sample_submission.csv'
    SUBMISSION_FILE = 'submission.csv'

    # fraction of the modelling sample held out for validation
    TEST_SIZE = 0.4
    # default fraction of rows drawn by `get_sample_data`
    SAMPLE_FRAC = 0.03
    # number of features plotted per EDA loop (285 would plot every feature)
    N_FEATURE = 5 # 285

    INDEX = 'id'
    TARGET = 'target'
    # f0 .. f284: the original range(1, 285) skipped `f0` and produced only
    # 284 names, contradicting the 285 features described in the overview.
    FEATURES = ['f{}'.format(i) for i in range(285)]

    @staticmethod
    def set_seed():
        """Seed Python's `random` module and NumPy's global RNG with `RANDOM_STATE`."""
        random.seed(Cfg.RANDOM_STATE)
        np.random.seed(Cfg.RANDOM_STATE)

Cfg.set_seed()

# Import data

In [None]:
def read_data(
    train_file:str=Cfg.TRAIN_DATA, 
    test_file:str=Cfg.TEST_DATA
) -> 'tuple[pd.DataFrame, pd.DataFrame]':
    """Load the train and test CSV files and down-cast their columns.

    Both files are read with `datatable`, converted to pandas and indexed by
    `Cfg.INDEX`. Feature columns that were loaded as float64 are cast to
    float32, feature columns that were loaded as bool are cast to `np.short`,
    and the training target is cast to `np.short`.

    Parameters
    ----------
    train_file : str
        Path of the training CSV (must contain `Cfg.TARGET`).
    test_file : str
        Path of the test CSV.

    Returns
    -------
    (train_df, test_df) : tuple of pd.DataFrame
    """
    # read csv files
    train_df = dt.fread(train_file).to_pandas().set_index(Cfg.INDEX)
    test_df = dt.fread(test_file).to_pandas().set_index(Cfg.INDEX)

    # Determine data types from the feature columns only. Dropping the target
    # up front (and tolerating its absence) avoids a KeyError when the target
    # was not read as bool, and keeps it out of the list applied to test_df.
    feature_dtypes = train_df.dtypes.drop(Cfg.TARGET, errors='ignore')
    num_cols = feature_dtypes[feature_dtypes == 'float64'].index.to_list()
    binary_cols = feature_dtypes[feature_dtypes == 'bool'].index.to_list()

    # reduce memory usage
    train_df[num_cols] = train_df[num_cols].astype(np.float32, copy=False)
    train_df[binary_cols] = train_df[binary_cols].astype(np.short, copy=False)
    train_df[Cfg.TARGET] = train_df[Cfg.TARGET].astype(np.short, copy=False)

    test_df[num_cols] = test_df[num_cols].astype(np.float32, copy=False)
    test_df[binary_cols] = test_df[binary_cols].astype(np.short, copy=False)

    return train_df, test_df

In [None]:
%%time
train_data, test_data = read_data()

In [None]:
# Replace the placeholder feature list with the columns actually loaded
# (every column of the training frame except the target).
Cfg.FEATURES = train_data.dtypes.index.drop(Cfg.TARGET).to_list()

# Split the features by the dtypes assigned in `read_data`: float32 columns
# are treated as numerical, `short` columns (minus the target) as binary.
Cfg.NUM_FEATURES = train_data.dtypes[train_data.dtypes == 'float32'].index.to_list()
Cfg.BINARY_FEATURES = train_data.dtypes[train_data.dtypes == 'short'].index.drop(Cfg.TARGET).to_list()

In [None]:
# memory usage
memory_usage = train_data.memory_usage(deep=True) / 1024 ** 2
print('Memory (train): {:.2f} MB'.format(memory_usage.sum()))

memory_usage = test_data.memory_usage(deep=True) / 1024 ** 2
print('Memory (test) : {:.2f} MB'.format(memory_usage.sum()))

In [None]:
print('Features: {}'.format(len(Cfg.FEATURES)))
print('Numerical features: {}'.format(len(Cfg.NUM_FEATURES)))
print('Categorical features: {}'.format(len(Cfg.BINARY_FEATURES)))

In [None]:
train_data.head()

In [None]:
test_data.head()

## Missing values

In [None]:
pd.DataFrame({
    'data_set': ['train', 'test'],
    'missing_values': [
        train_data.isna().sum().sum(), 
        test_data.isna().sum().sum()
    ]
}).set_index('data_set')

## Data overview

* The training data contains 1000000 rows.

* The test data contains 500000 rows.

* There are 285 features `f0` - `f284`
    * 240 numerical features
    * 45 categorical features (All binary - 1/0).


* Neither data set contains missing values.

* The target variable `target` is binary (1/0)

* The distribution of `target` is balanced.

# Exploratory data analysis (EDA)

In [None]:
def get_sample_data(
    data,
    split_target=True,
    frac=Cfg.SAMPLE_FRAC, 
    random_state=Cfg.RANDOM_STATE):
    """Draw a random subset of rows from `data`.

    The original implementation ignored `data` and always sampled the global
    `train_data`; it also passed index *labels* to `.iloc`, which is only
    correct when the index happens to equal the row positions. Sampling the
    frame directly avoids both problems.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to sample from. Must contain `Cfg.FEATURES` and `Cfg.TARGET`
        when `split_target` is True.
    split_target : bool
        If True return `(X, y)`, otherwise return the sampled frame as is.
    frac : float
        Fraction of rows to draw.
    random_state : int
        Seed forwarded to `DataFrame.sample`.

    Returns
    -------
    (pd.DataFrame, pd.Series) if `split_target` else pd.DataFrame
    """
    sample = data.sample(frac=frac, random_state=random_state)

    if split_target:
        return sample[Cfg.FEATURES], sample[Cfg.TARGET]

    return sample

In [None]:
# Per-column summary statistics (without the row count) plus the variance,
# which the variance-threshold plots further down read from the 'var' row.
stat_data = train_data.describe().drop('count')
stat_data.loc['var'] = stat_data.loc['std'] ** 2

# `Styler.bar` expects a valid colour, not a colormap name; the original
# 'Bules' was a typo and is not a colour, so a light blue hex value is used.
stat_data.T.style.bar(
    subset=['mean'], 
    color='#9ecae1'
).background_gradient(subset=['50%'], cmap='Blues')

## Target `target`

In [None]:
def plot_count(
    data:pd.DataFrame, 
    feature:str, 
    title='Countplot',
    ax=None):
    """Draw a count plot of one column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the column. The original code ignored this argument and
        always plotted the global `train_data`.
    feature : str
        Column whose value counts are shown.
    title : str
        Axes title.
    ax : matplotlib Axes, optional
        Target axes; a new 5x5 inch figure is created when omitted.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))

    sns.countplot(
        data=data,
        x=feature, 
        palette='Blues_r',
        ax=ax
    )

    ax.set_title(title)

    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Count')

    return ax

In [None]:
plot_count(train_data, Cfg.TARGET);

## Numerical features

In [None]:
def plot_pdf(
    data:pd.DataFrame, 
    feature:str, 
    title='Histplot',
    bins=70,
    ax=None):
    """Plot a histogram of `feature`, coloured by the target class.

    Despite the name, the bars show raw counts (the seaborn default), not a
    normalised density. A dotted red vertical line marks the feature mean.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing `feature` and `Cfg.TARGET`.
    feature : str
        Column to plot.
    title : str
        Axes title.
    bins : int
        Number of histogram bins.
    ax : matplotlib Axes, optional
        Target axes; a new figure is created when omitted.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # histogram split by target class
    sns.histplot(
        data=data[[feature, Cfg.TARGET]],
        x=feature,
        hue=Cfg.TARGET,
        bins=bins,
        palette='Blues_r',
        legend=True,
        kde=False,
        ax=ax
    )

    # The x-axis transform lets the line span the full axes height (0..1)
    # regardless of the count scale.
    mean = np.mean(data[feature])
    ax.vlines(
        mean, 0, 1, 
        transform=ax.get_xaxis_transform(), 
        color='red', ls=':')

    ax.set_title(title)

    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Count')

    return ax

In [None]:
def plot_boxplot(
    data:pd.DataFrame, 
    feature:str, 
    title='Boxplot',
    ax=None):
    """Draw a box plot of `feature` for each target class.

    The original never forwarded `ax` to seaborn, so the plot landed on
    whatever axes matplotlib considered current instead of the requested one.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing `feature` and `Cfg.TARGET`.
    feature : str
        Column shown on the y axis.
    title : str
        Axes title.
    ax : matplotlib Axes, optional
        Target axes; a new figure is created when omitted.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    sns.boxplot(
        x=Cfg.TARGET, 
        y=feature,
        palette='Blues_r',
        data=data,
        ax=ax
    )

    ax.set_title(title)

    ax.set_xlabel('Target {}'.format(Cfg.TARGET))
    ax.set_ylabel('Feature {}'.format(feature))

    return ax

In [None]:
X_data = get_sample_data(train_data, split_target=False)

# Statistics listed under each feature heading, in display order.
stat_keys = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
format_str = '* mean: {}\n* std: {}\n* min: {}\n* 25%: {}\n* 50%: {}\n* 75%: {}\n* max: {}'

for feature in Cfg.NUM_FEATURES[:Cfg.N_FEATURE]:
    display(Markdown('### Feature `{}`'.format(feature)))

    # Summary numbers come from the full training set; the plots below use
    # the sample drawn above.
    info = np.round(train_data[feature].describe(), 4)
    display(Markdown(format_str.format(*[info[key] for key in stat_keys])))

    fig, ax = plt.subplots(1, 2, figsize=(13, 5))

    plot_pdf(X_data, feature, ax=ax[0])
    plot_boxplot(X_data, feature, ax=ax[1])

    plt.show()

## Binary features

In [None]:
def plot_heatmap(
    data:pd.DataFrame, 
    feature:str, 
    title='Heatmap',
    ax=None):
    """Draw the joint relative frequencies of `feature` and the target.

    Each cell holds the share of all rows of `data` that fall into that
    (feature value, target value) combination.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing `feature` and `Cfg.TARGET`.
    feature : str
        Column cross-tabulated against the target.
    title : str
        Axes title.
    ax : matplotlib Axes, optional
        Target axes; a new 5x5 inch figure is created when omitted.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))

    n_rows = len(data)
    sns.heatmap(
        # Cfg.TARGET replaces the hard-coded 'target', matching the other helpers
        pd.crosstab(data[feature], data[Cfg.TARGET]) / n_rows,
        cmap='Blues_r',
        annot=True, 
        ax=ax
    )

    ax.set_title(title)
    return ax

In [None]:
for feature in Cfg.BINARY_FEATURES[0:Cfg.N_FEATURE]:
    display(Markdown('### Feature `{}`'.format(feature)))
            
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))

    plot_count(train_data, feature, ax=ax[0])
    plot_heatmap(train_data, feature, ax=ax[1])
    
    plt.show()

# Feature selection (FS) 

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.feature_selection import f_classif, chi2
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import make_column_selector, make_column_transformer

In [None]:
def plot_feature_importances(
    feature_imp, 
    feature_names,
    title='Feature importance',
    num=20, 
    ax=None
):
    """Draw a horizontal bar chart of the `num` largest values.

    Parameters
    ----------
    feature_imp : array-like
        One score per feature (importances, variances, ...).
    feature_names : sequence of str
        Names in the same order as `feature_imp`.
    title : str
        Axes title.
    num : int
        Number of top-ranked features to show.
    ax : matplotlib Axes, optional
        Target axes; a new figure is created when omitted.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # Pair values and names strictly by position. Without this, a pandas
    # Series passed as `feature_imp` would be aligned by its index instead.
    df = pd.DataFrame({
        'feature': list(feature_names),
        'value': np.asarray(feature_imp)
    }).sort_values('value', ascending=False).head(num)

    sns.barplot(
        x='value', 
        y='feature', 
        palette='Blues_r',
        data=df,
        ax=ax
    ) 

    ax.set_title(title)
    ax.set_ylabel("Features")

    return ax

## Variance Thresholding

In [None]:
 def plot_features_high_variance(
     stat_data, 
     features, 
     threshold, 
     title='Features hight variance'
 ):
     """Show how feature variances are distributed and which are largest.

     Left panel: histogram of the 'var' row of `stat_data` with a dotted red
     line at `threshold`. Right panel: bar chart of the 20 features with the
     largest variance.

     The whole cell keeps the one-space offset of its first line (IPython
     strips a common leading indent) but now indents the body consistently.

     Parameters
     ----------
     stat_data : pd.DataFrame
         Statistics table with one column per feature and a 'var' row.
     features : list of str
         Feature names to rank in the right panel.
     threshold : float
         Variance value marked in the left panel.
     title : str
         Title of the right panel.
     """
     fig, ax = plt.subplots(1, 2, figsize=(20, 5))

     # Histogram of the variances. `palette` and `legend` were dropped: with
     # no `hue` variable they have no effect.
     sns.histplot(
         data=stat_data.loc['var'],
         bins=50,
         ax=ax[0]
     )
     ax[0].set_title('Distribution of feature variances')
     ax[0].set_xlabel('Variance')

     # Threshold line spanning the full axes height.
     ax[0].vlines(
         threshold, 0, 1, 
         transform=ax[0].get_xaxis_transform(), 
         color='red', ls=':')

     plot_feature_importances(
         stat_data.T.loc[features]['var'], 
         feature_names=features,
         title=title,
         num=20, 
         ax=ax[1])

     plt.show()

In [None]:
# Variance level marked on the histogram of the numerical features.
num_threshold = 0.0025

plot_features_high_variance(
    stat_data.T.loc[Cfg.NUM_FEATURES].sort_values('var').T, 
    features=Cfg.NUM_FEATURES, 
    threshold=num_threshold, 
    # title typo fixed ("hight")
    title='Numerical features with highest variance'
)

In [None]:
# Variance level marked on the histogram of the binary features.
bin_threshold = 0.175

plot_features_high_variance(
    stat_data.T.loc[Cfg.BINARY_FEATURES].sort_values('var').T, 
    features=Cfg.BINARY_FEATURES, 
    threshold=bin_threshold, 
    # title typo fixed ("hight")
    title='Binary features with highest variance'
)

## Recursive feature elimination (RFE)

In [None]:
X_data, y_data = get_sample_data(train_data, frac=0.001)

In [None]:
# Recursive feature elimination with cross-validation: a random forest ranks
# the features, 3 features are removed per iteration and every subset size is
# scored with 2-fold stratified CV.
# NOTE(review): scoring is 'accuracy' although `roc_auc_score` is imported at
# the top of the notebook -- confirm which metric the selection should use.
rf = RandomForestClassifier(random_state=Cfg.RANDOM_STATE)
rfe = RFECV(
    estimator=rf, 
    cv=StratifiedKFold(2),
    scoring='accuracy',
    min_features_to_select=1,
    step=3, 
    verbose=0
)

In [None]:
%%time

rfe.fit(X_data, y_data);
print('Optimal number of features: {}'.format(rfe.n_features_))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 5))

# Mean CV score per evaluated subset size. `grid_scores_` was deprecated in
# scikit-learn 1.0 and later removed; `cv_results_` replaces it. Fall back to
# the old attribute only when the new one is missing.
if hasattr(rfe, 'cv_results_'):
    rfe_scores = np.asarray(rfe.cv_results_['mean_test_score'])
else:
    rfe_scores = np.asarray(rfe.grid_scores_)

# Feature count behind each score. With step > 1 the subset sizes are not
# 1, 2, 3, ... so they are taken from `cv_results_` when available and
# otherwise rebuilt from the elimination schedule.
if hasattr(rfe, 'cv_results_') and 'n_features' in rfe.cv_results_:
    rfe_n_features = np.asarray(rfe.cv_results_['n_features'])
else:
    n_total = len(rfe.support_)
    # a fractional step means "share of all features", as in scikit-learn
    if 0.0 < rfe.step < 1.0:
        rfe_step = int(max(1, rfe.step * n_total))
    else:
        rfe_step = int(rfe.step)
    rfe_n_features = np.array(sorted(
        set(range(n_total, rfe.min_features_to_select, -rfe_step))
        | {rfe.min_features_to_select}
    ))

# Safety net: keep the plot working if the schedule could not be matched.
if len(rfe_n_features) != len(rfe_scores):
    rfe_n_features = np.arange(1, len(rfe_scores) + 1)

sns.lineplot(
    x=rfe_n_features,
    y=rfe_scores,
    ax=ax
)

ax.set_title('Recursive feature elimination')
ax.set_xlabel('Number of features selected')
ax.set_ylabel('Cross validation score (accuracy)')

plt.show()

In [None]:
from sklearn.compose import make_column_selector, ColumnTransformer

# Transformer that drops every column RFECV rejected (`support_` is False)
# and passes the remaining columns through unchanged. Columns are addressed
# by name, taken from the frame RFECV was fitted on.
feature_selector = make_column_transformer(
    ('drop', X_data.columns[~rfe.support_]), 
    remainder='passthrough'
)

# Linear discriminant analysis (LDA)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Draw 20% of the training rows for the LDA projection. The sampled frame is
# used directly: the original fed the sampled index *labels* into `.iloc`,
# which is only correct while the `id` index equals the row positions.
lda_sample = train_data.sample(frac=0.2, random_state=Cfg.RANDOM_STATE)
idx = lda_sample.index

X_data = lda_sample[Cfg.FEATURES]
y_data = lda_sample[Cfg.TARGET]

In [None]:
# One LDA object is refitted three times; only the projected data of each fit
# is kept, so after this cell `lda` holds the fit on the binary features.
lda = LinearDiscriminantAnalysis()

lda_data = lda.fit_transform(X_data[Cfg.FEATURES], y_data)
num_lda_data = lda.fit_transform(X_data[Cfg.NUM_FEATURES], y_data)
bin_lda_data = lda.fit_transform(X_data[Cfg.BINARY_FEATURES], y_data)

In [None]:
fig, ax = plt.subplots(3, 1, figsize=(20, 9))

# One panel per feature group: (panel title, LDA projection of that group).
lda_panels = [
    ('All features', lda_data),
    ('Numerical features', num_lda_data),
    ('Binary features', bin_lda_data),
]

for panel_ax, (panel_title, projection) in zip(ax, lda_panels):
    # All points sit on y=0, so the hidden y axis carries no information.
    sns.scatterplot(
        x=projection.reshape(-1),
        y=0,
        hue=y_data,
        ax=panel_ax,
        alpha=0.4
    )
    panel_ax.set_title(panel_title)
    panel_ax.get_yaxis().set_visible(False)

plt.tight_layout()
plt.show()

## Correlation

Work in progress.

# Principal component analysis (PCA)

In [None]:
from sklearn.decomposition import PCA, KernelPCA

In [None]:
X_data, y_data = get_sample_data(train_data, frac=0.01)

In [None]:
%%time

# Kernel PCA (polynomial kernel) on the RFE-selected columns. The number of
# components equals the number of features RFECV kept.
# NOTE(review): the plotting cell below reads columns 'pc1'..'pc6', so this
# only works when `rfe.n_features_` is at least 6 -- confirm.
n_components=rfe.n_features_
kpca = make_pipeline(
    feature_selector,
    KernelPCA(
        n_components=n_components,
        kernel='poly',
        gamma=15,
        random_state=Cfg.RANDOM_STATE)
)

# Project the sample and name the resulting columns pc1, pc2, ...
components = kpca.fit_transform(X_data, y_data)
pca_data = pd.DataFrame(
    components, 
    columns=['pc{}'.format(i) for i in range(1, n_components + 1)]
)

# `.values` attaches the target by position; `pca_data` has a fresh
# RangeIndex while `y_data` keeps the sampled index.
pca_data[Cfg.TARGET] = y_data.values

In [None]:
def plot_pca(data, x, y, ax=None):
    """Scatter two component columns against each other, coloured by target.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the columns `x`, `y` and `Cfg.TARGET`.
    x, y : str
        Column names for the horizontal and vertical axis.
    ax : matplotlib Axes, optional
        Target axes; a new figure is created when omitted.

    Returns
    -------
    The axes that was drawn on (returned for consistency with the other
    plotting helpers; the original returned nothing).
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    sns.scatterplot(
        data=data,
        x=x, 
        y=y,
        hue=Cfg.TARGET,
        legend='brief',
        alpha=0.2,
        ax=ax
    )

    return ax

In [None]:
fig, ax = plt.subplots(1, 5, figsize=(28, 5))

# Panel k plots component k against component k + 1 (pc1/pc2 ... pc5/pc6).
for panel_no, panel_ax in enumerate(ax, start=1):
    plot_pca(
        pca_data,
        'pc{}'.format(panel_no),
        'pc{}'.format(panel_no + 1),
        ax=panel_ax
    )

plt.tight_layout()
plt.show()

# Feature Engineering

Work in progress.

# Feature importance

In [None]:
X_data, y_data = get_sample_data(train_data, frac=0.01)

In [None]:
# Fit a random forest on the sample and take the importances of the whole
# forest (averaged over all trees). The original read them from
# `rf.estimators_[0]`, i.e. from the first tree only.
rf = RandomForestClassifier(random_state=0).fit(X_data, y_data)
feature_imp = rf.feature_importances_

In [None]:
fig, ax = plt.subplots(figsize=(20, 8))
plot_feature_importances(feature_imp, feature_names=Cfg.FEATURES, num=35, ax=ax)

fig.tight_layout()
plt.show()

# Modeling

In [None]:
import lightgbm as lgb

from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
def plot_model_proba(proba, ax=None):
    """Plot a histogram (with KDE overlay) of predicted probabilities.

    Parameters
    ----------
    proba : array-like
        Predicted probabilities of the positive class.
    ax : matplotlib Axes, optional
        Target axes; a new 8x5 inch figure is created when omitted.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(8, 5))

    # `palette` and `legend` were dropped: with no `hue` they have no effect.
    sns.histplot(
        data=proba,
        bins=100,
        kde=True,
        ax=ax
    )

    # Labels fixed: both were misspelt, and the y axis shows bin counts
    # (the histplot default), not probabilities.
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('Count')

    return ax

In [None]:
def plot_roc(model, X_val, y_val, ax=None):
    """Plot the ROC curve of a fitted classifier on validation data.

    Parameters
    ----------
    model : fitted estimator
    X_val, y_val : validation features and labels
    ax : matplotlib Axes, optional
        Target axes; a new figure is created when omitted.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # `metrics.plot_roc_curve` was deprecated in scikit-learn 1.0 and later
    # removed. Use the display class when it is available and fall back to
    # the old function on older installations.
    roc_display = getattr(metrics, 'RocCurveDisplay', None)
    if hasattr(roc_display, 'from_estimator'):
        roc_display.from_estimator(model, X_val, y_val, ax=ax)
    else:
        metrics.plot_roc_curve(model, X_val, y_val, ax=ax)

    return ax

In [None]:
from sklearn import metrics

def plot_confusion_matrix(model, X_val, y_val, ax=None):
    """Plot the row-normalised confusion matrix of a fitted classifier.

    Parameters
    ----------
    model : fitted estimator
    X_val, y_val : validation features and labels
    ax : matplotlib Axes, optional
        Target axes; a new figure is created when omitted.

    Returns
    -------
    The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # `metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
    # later removed. Use the display class when it is available and fall
    # back to the old function on older installations.
    cm_display = getattr(metrics, 'ConfusionMatrixDisplay', None)
    plot_fn = getattr(cm_display, 'from_estimator', None)
    if plot_fn is None:
        plot_fn = metrics.plot_confusion_matrix

    plot_fn(
        model, 
        X_val, 
        y_val, 
        cmap=plt.cm.Blues,
        normalize='true', 
        ax=ax
    ) 

    return ax

In [None]:
def display_model_result(model, X_val, y_val, y_pred, y_pred_proba):
    """Show the standard evaluation of one fitted model.

    Draws three panels side by side (ROC curve, histogram of predicted
    probabilities, confusion matrix) and then prints the classification
    report for `y_pred` against `y_val`.
    """
    fig, (roc_ax, proba_ax, cm_ax) = plt.subplots(1, 3, figsize=(20, 5))

    plot_roc(model, X_val, y_val, ax=roc_ax)
    plot_model_proba(y_pred_proba, ax=proba_ax)
    plot_confusion_matrix(model, X_val, y_val, ax=cm_ax)

    plt.show()

    report = classification_report(y_val, y_pred)
    print(report)

In [None]:
X_data, y_data = get_sample_data(train_data, frac=0.1)

In [None]:
# Split the sampled data into training and validation sets; Cfg.TEST_SIZE is
# the share of rows that goes to validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=Cfg.TEST_SIZE, 
    random_state=Cfg.RANDOM_STATE
)

In [None]:
print(f'train size: {X_train.shape[0]} rows')
print(f'val size  : {X_val.shape[0]} rows')

In [None]:
preprocess = make_pipeline(
    feature_selector,
    #PCA(n_components=200)
)

### Model `LinearDiscriminant`

In [None]:
lda_model = make_pipeline(
    feature_selector,
    #preprocess,
    LinearDiscriminantAnalysis()
)

In [None]:
%%time

y_lda_pred = lda_model.fit(X_train, y_train).predict(X_val)
y_lda_pred_proba = lda_model.predict_proba(X_val)[:, 1]

display_model_result(lda_model, X_val, y_val, y_lda_pred, y_lda_pred_proba)

### Model `DecisionTree`

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_model = make_pipeline(
    feature_selector,
    #preprocess,
    DecisionTreeClassifier(max_depth=5)
)

In [None]:
%%time

y_dt_pred = dt_model.fit(X_train, y_train).predict(X_val)
y_dt_pred_proba = dt_model.predict_proba(X_val)[:, 1]

display_model_result(dt_model, X_val, y_val, y_dt_pred, y_dt_pred_proba)

### Model `RandomForest` 

In [None]:
rf_model = make_pipeline(
    feature_selector,
    #preprocess,
    RandomForestClassifier(
        max_depth=5, 
        n_estimators=20
    )
)

In [None]:
 %%time

y_rf_pred = rf_model.fit(X_train, y_train).predict(X_val)
y_rf_pred_proba = rf_model.predict_proba(X_val)[:, 1]

display_model_result(rf_model, X_val, y_val, y_rf_pred, y_rf_pred_proba)

### Model `AdaBoost` 

In [None]:
ada_model = make_pipeline(
    feature_selector,
    #preprocess,
    AdaBoostClassifier()
)

In [None]:
%%time

y_ada_pred = ada_model.fit(X_train, y_train).predict(X_val)
y_ada_pred_proba = ada_model.predict_proba(X_val)[:, 1]

display_model_result(ada_model, X_val, y_val, y_ada_pred, y_ada_pred_proba)

### Model `LGBM` 

In [None]:
lgbm_model = make_pipeline(
    feature_selector,
    #preprocess,
    lgb.LGBMClassifier(
        learning_rate=0.05,
        n_estimators=1000,
        reg_lambda = 1
    )
)

In [None]:
%%time

y_lgbm_pred = lgbm_model.fit(X_train, y_train).predict(X_val)
y_lgbm_pred_proba = lgbm_model.predict_proba(X_val)[:, 1]

display_model_result(lgbm_model, X_val, y_val, y_lgbm_pred, y_lgbm_pred_proba)

### Model `XGB`

In [None]:
xgb_model = make_pipeline(
    feature_selector,
    #preprocess,
    XGBClassifier(
        n_estimators=100,
        use_label_encoder=False,
        eval_metric='rmse',
        random_state=Cfg.RANDOM_STATE
    )
)

In [None]:
%%time

y_xgb_pred = xgb_model.fit(X_train, y_train).predict(X_val)
y_xgb_pred_proba = xgb_model.predict_proba(X_val)[:, 1]

display_model_result(xgb_model, X_val, y_val, y_xgb_pred, y_xgb_pred_proba)

In [None]:
# Base learners of the stacked ensemble. They mirror the individual models
# fitted above, except that LightGBM uses 100 trees here instead of 1000.
# NOTE(review): XGBoost is given eval_metric='rmse' although this is a binary
# classifier -- confirm whether 'logloss' or 'auc' was intended.
estimators = [
    ('dt', DecisionTreeClassifier(max_depth=5)),
    ('lda', LinearDiscriminantAnalysis()),
    ('rf', RandomForestClassifier(
            max_depth=5, 
            n_estimators=20)),
    ('ada', AdaBoostClassifier()),
    ('lgbm', lgb.LGBMClassifier(
        learning_rate=0.05,
        n_estimators=100,
        reg_lambda = 1)),
    ('xgb', XGBClassifier(
        n_estimators=100,
        use_label_encoder=False,
        eval_metric='rmse',
        random_state=Cfg.RANDOM_STATE))
]
    
# Final pipeline: drop the RFE-rejected columns, then stack the base
# learners' class probabilities (3-fold CV) under an LDA meta-classifier.
model = make_pipeline(
    feature_selector,
    #preprocess,
    StackingClassifier(
        estimators=estimators, 
        final_estimator=LinearDiscriminantAnalysis(),
        cv=3,
        n_jobs=-1,
        stack_method='predict_proba',
        verbose=0
    )
)


In [None]:
%%time

y_pred = model.fit(X_train, y_train).predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]

display_model_result(model, X_val, y_val, y_pred, y_pred_proba)

# Submission

In [None]:
y_pred_submission = model.predict_proba(test_data)[:, 1]

In [None]:
submission_data = pd.DataFrame({
    Cfg.INDEX: test_data.index,
    Cfg.TARGET: y_pred_submission,
}).set_index(Cfg.INDEX)

submission_data

In [None]:
# save submission file
submission_data.to_csv(Cfg.SUBMISSION_FILE)