# Importing libraries, basic functions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [None]:
def show_me_the_clock():
    """Return the current wall-clock time at UTC+9 as a 'YYYY-MM-DD HH:MM:SS' string.

    Used as a timestamp prefix for the progress messages printed while the
    dataset is being loaded.
    """
    # Function-scope import: the notebook's import cell only brings in
    # `datetime` and `timedelta` from the datetime module.
    from datetime import timezone

    # datetime.utcnow() is deprecated (Python 3.12+) and returns a naive value.
    # Asking for "now" directly in a fixed UTC+9 zone yields the same clock reading.
    utc_plus_9 = timezone(timedelta(hours=9))
    return datetime.now(utc_plus_9).strftime('%Y-%m-%d %H:%M:%S')

def load_dataset(train_or_test='train'):
    """Read one CSV of the competition data, tame its features and return numpy arrays.

    Parameters
    ----------
    train_or_test : str, default 'train'
        Base name of the CSV file under
        ``../input/tabular-playground-series-nov-2021/``.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        When ``train_or_test == 'train'``: the tamed feature matrix and the
        values of the ``target`` column.
    X : numpy.ndarray
        For any other value: the tamed feature matrix only.
    """
    file_path = f'../input/tabular-playground-series-nov-2021/{train_or_test}.csv'
    df = pd.read_csv(file_path)
    print(show_me_the_clock(), f'Dataset loaded, shape={df.shape}')
    df = tame_data(df)
    print(show_me_the_clock(), f'Dataset tamed, shape={df.shape}')

    # Anything other than the training file is returned as features only.
    if train_or_test != 'train':
        return df.to_numpy()

    # pop() removes the label column from the frame and hands it back in one step,
    # so the remaining columns are exactly the features.
    y = df.pop('target').to_numpy()
    X = df.to_numpy()
    print(show_me_the_clock(), f'Dataset prepared, X.shape={X.shape}, y.shape={y.shape}')
    return X, y

def tame_data(df):
    """Return a copy of ``df`` with features f0..f99 robust-scaled and compressed.

    For each column ``f0`` .. ``f99``:
      1. centre on the median and divide by the inter-quartile range (q3 - q1);
      2. squash towards the median with ``sign(x) * log1p(log1p(|x|))``.

    The ``id`` column is then dropped and every remaining column except
    ``target`` gets a ``_tamed`` suffix. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``id`` or any of ``f0`` .. ``f99`` is missing from ``df``.
    """
    df = df.copy()
    for i in range(100):
        col = 'f%d' % i
        q1, q2, q3 = df[col].quantile([0.25, 0.5, 0.75]).to_numpy()

        # normalization by median and IQR
        # NOTE: a feature whose IQR is 0 yields inf/NaN here; this is not guarded.
        scaled = (df[col] - q2) / (q3 - q1)

        # concentrating data toward median -- evaluated on the whole column at once
        # instead of calling a Python lambda for every single element
        df[col] = np.sign(scaled) * np.log1p(np.log1p(np.abs(scaled)))

    df = df.drop(columns='id')
    df = df.rename(columns=lambda name: name + '_tamed' if name != 'target' else name)
    return df

# Data preparation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for evaluation. The seed is fixed so that the split,
# and with it every metric plotted below, is identical after "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(*load_dataset('train'), test_size=0.2, random_state=42)

# Training and testing simple models
**(1) Logistic Regression - L1 regularized**

All of these models exhibit low performance, and their metrics are similar.

Some features (e.g. f34, f27, f43) are found to be influential in each of these models.

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve
from sklearn.linear_model import LogisticRegression

# Sweep the inverse regularisation strength C of an L1-penalised logistic regression.
# Every grid column belongs to one C: coefficients (top row), ROC curves (middle row)
# and the precision/recall curve of the held-out split (bottom row).
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(24, 12))
for C, ax_coef, ax_roc, ax_pr in zip([0.0001, 0.0003, 0.001, 0.003, 0.01], *axs):
    # 'saga' shuffles the samples while optimising; pinning random_state makes the
    # fitted coefficients (and therefore the plots) repeatable between runs.
    model = LogisticRegression(penalty='l1', C=C, solver='saga', random_state=0)
    model.fit(X_train, y_train)

    # Hard labels and positive-class probabilities on the held-out and training splits.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred_train = model.predict(X_train)
    y_proba_train = model.predict_proba(X_train)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    acc_train = accuracy_score(y_train, y_pred_train)
    auc = roc_auc_score(y_test, y_proba)
    auc_train = roc_auc_score(y_train, y_proba_train)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    # Top row: one bar per coefficient, each annotated with its feature index.
    height = model.coef_.ravel()
    # Bar positions follow the actual number of coefficients instead of a hard-coded 100.
    ax_coef.bar(x=np.arange(height.size), height=height, label='Coefficients')
    # NOTE(review): every bar is annotated, so the sort only fixes the drawing order;
    # possibly only the largest few were meant to be labelled -- confirm.
    for h_ind in np.argsort(-np.abs(height)):
        ax_coef.text(h_ind, height[h_ind], f'{h_ind}')
    ax_coef.set_title(f'Linear model(L1 reg, C={C})', fontsize=16)
    ax_coef.legend()

    # Middle row: ROC curves; roc_curve returns (fpr, tpr, thresholds), only the first two are drawn.
    ax_roc.plot(*roc_curve(y_train, y_proba_train)[:2], label=f'ROC curve(train)\naccuracy={acc_train:.3f}\nauc={auc_train:.3f}', color='green')
    ax_roc.plot(*roc_curve(y_test, y_proba)[:2], label=f'ROC curve(test)\naccuracy={acc:.3f}\nauc={auc:.3f}', color='red')
    ax_roc.set_xlim([0, 1])
    ax_roc.set_ylim([0, 1])
    ax_roc.set_xlabel('FPR')
    ax_roc.set_ylabel('TPR')
    ax_roc.legend()

    # Bottom row: precision_recall_curve returns (precision, recall, thresholds),
    # so precision ends up on the x axis and recall on the y axis.
    ax_pr.plot(*precision_recall_curve(y_test, y_proba)[:2], label='PR curve', color='green')
    ax_pr.text(0.2, 0.2, f'precision={prec:.3f}\nrecall={rec:.3f}', fontsize=12)
    ax_pr.set_xlim([0, 1])
    ax_pr.set_ylim([0, 1])
    ax_pr.set_xlabel('precision')
    ax_pr.set_ylabel('recall')
    ax_pr.legend()

plt.tight_layout()
plt.show()
plt.close('all')

**(2) Logistic Regression - L2 regularized**

In [None]:
# L2-penalised logistic regression, one grid column per value of C.
# Rows of the grid: coefficients, ROC curves (train vs. test), PR curve of the test split.
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(24, 12))
for C, ax_coef, ax_roc, ax_pr in zip([1e-7, 1e-6, 1e-5, 1e-4, 1e-3], *axs):
    model = LogisticRegression(penalty='l2', C=C).fit(X_train, y_train)

    # predictions: hard labels and probability of the positive class
    y_pred, y_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]
    y_pred_train, y_proba_train = model.predict(X_train), model.predict_proba(X_train)[:, 1]

    # scores on the held-out split ...
    acc, auc = accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_proba)
    prec, rec = precision_score(y_test, y_pred), recall_score(y_test, y_pred)
    # ... and on the training split
    acc_train = accuracy_score(y_train, y_pred_train)
    auc_train = roc_auc_score(y_train, y_proba_train)

    # --- coefficients -------------------------------------------------------
    height = model.coef_.ravel()
    ax_coef.bar(x=list(range(100)), height=height, label='Coefficients')
    by_magnitude = np.argsort(-np.abs(height))
    for h_ind, h_val in zip(by_magnitude, height[by_magnitude]):
        ax_coef.text(h_ind, h_val, f'{h_ind}')
    ax_coef.set_title(f'Linear model(L2 reg, C={C})', fontsize=16)
    ax_coef.legend()

    # --- ROC ------------------------------------------------------------------
    fpr_train, tpr_train = roc_curve(y_train, y_proba_train)[:2]
    fpr_test, tpr_test = roc_curve(y_test, y_proba)[:2]
    ax_roc.plot(fpr_train, tpr_train, color='green',
                label=f'ROC curve(train)\naccuracy={acc_train:.3f}\nauc={auc_train:.3f}')
    ax_roc.plot(fpr_test, tpr_test, color='red',
                label=f'ROC curve(test)\naccuracy={acc:.3f}\nauc={auc:.3f}')
    ax_roc.set(xlim=[0, 1], ylim=[0, 1], xlabel='FPR', ylabel='TPR')
    ax_roc.legend()

    # --- precision / recall (precision on x, recall on y) --------------------
    pr_precision, pr_recall = precision_recall_curve(y_test, y_proba)[:2]
    ax_pr.plot(pr_precision, pr_recall, color='green', label='PR curve')
    ax_pr.text(0.2, 0.2, f'precision={prec:.3f}\nrecall={rec:.3f}', fontsize=12)
    ax_pr.set(xlim=[0, 1], ylim=[0, 1], xlabel='precision', ylabel='recall')
    ax_pr.legend()

plt.tight_layout()
plt.show()
plt.close('all')

**(3) Decision tree**

a. varying "max_depth"

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier

# Decision tree, sweeping max_depth with the other hyper-parameters held constant.
# Rows of the grid: feature importances, ROC curves, PR curves (train = green, test = red).
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(24, 12))
for max_depth, ax_fi, ax_roc, ax_pr in zip([5, 10, 15, 20, 25], *axs):
    tree_params = dict(max_depth=max_depth, min_samples_split=2, min_samples_leaf=1, max_features=None)
    model = DecisionTreeClassifier(**tree_params)
    model.fit(X_train, y_train)

    # held-out split
    y_pred, y_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]
    acc, auc = accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_proba)
    prec, rec = precision_score(y_test, y_pred), recall_score(y_test, y_pred)

    # training split
    y_pred_train, y_proba_train = model.predict(X_train), model.predict_proba(X_train)[:, 1]
    acc_train, auc_train = accuracy_score(y_train, y_pred_train), roc_auc_score(y_train, y_proba_train)
    prec_train, rec_train = precision_score(y_train, y_pred_train), recall_score(y_train, y_pred_train)

    # feature importances, every bar tagged with its feature index
    fi = model.feature_importances_
    ax_fi.bar(np.arange(100), fi, label='feature importances')
    ranking = np.argsort(-fi)
    for f_num, importance in zip(ranking, fi[ranking]):
        ax_fi.text(f_num, importance, f'{f_num}')
    ax_fi.set_title(f'Decision tree, max_depth={max_depth}', fontsize=16)
    ax_fi.set(xlabel='features', ylabel='importance')
    ax_fi.legend()

    # ROC curves
    fpr_train, tpr_train = roc_curve(y_train, y_proba_train)[:2]
    fpr_test, tpr_test = roc_curve(y_test, y_proba)[:2]
    ax_roc.plot(fpr_train, tpr_train, color='green',
                label=f'ROC curve(train)\naccuracy={acc_train:.3f}\nauc={auc_train:.3f}')
    ax_roc.plot(fpr_test, tpr_test, color='red',
                label=f'ROC curve(test)\naccuracy={acc:.3f}\nauc={auc:.3f}')
    ax_roc.set(xlim=[0, 1], ylim=[0, 1], xlabel='FPR', ylabel='TPR')
    ax_roc.legend()

    # PR curves (precision on x, recall on y)
    p_train, r_train = precision_recall_curve(y_train, y_proba_train)[:2]
    p_test, r_test = precision_recall_curve(y_test, y_proba)[:2]
    ax_pr.plot(p_train, r_train, color='green',
               label=f'PR curve(train)\nprecision={prec_train:.3f}\nrecall={rec_train:.3f}')
    ax_pr.plot(p_test, r_test, color='red',
               label=f'PR curve(test)\nprecision={prec:.3f}\nrecall={rec:.3f}')
    ax_pr.set(xlim=[0, 1], ylim=[0, 1], xlabel='precision', ylabel='recall')
    ax_pr.legend()

plt.tight_layout()
plt.show()
plt.close('all')

b. varying "min_samples_split"

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier

# Decision tree at max_depth=10, sweeping min_samples_split (given as fractions of the sample count).
# Rows of the grid: feature importances, ROC curves, PR curves (train = green, test = red).
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(24, 12))
for min_samples_split, ax_fi, ax_roc, ax_pr in zip([1e-3, 1e-2, 1e-1, 2e-1, 3e-1], *axs):
    tree_params = dict(max_depth=10, min_samples_split=min_samples_split, min_samples_leaf=1, max_features=None)
    model = DecisionTreeClassifier(**tree_params)
    model.fit(X_train, y_train)

    # held-out split
    y_pred, y_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]
    acc, auc = accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_proba)
    prec, rec = precision_score(y_test, y_pred), recall_score(y_test, y_pred)

    # training split
    y_pred_train, y_proba_train = model.predict(X_train), model.predict_proba(X_train)[:, 1]
    acc_train, auc_train = accuracy_score(y_train, y_pred_train), roc_auc_score(y_train, y_proba_train)
    prec_train, rec_train = precision_score(y_train, y_pred_train), recall_score(y_train, y_pred_train)

    # feature importances, every bar tagged with its feature index
    fi = model.feature_importances_
    ax_fi.bar(np.arange(100), fi, label='feature importances')
    ranking = np.argsort(-fi)
    for f_num, importance in zip(ranking, fi[ranking]):
        ax_fi.text(f_num, importance, f'{f_num}')
    ax_fi.set_title(f'Decision tree, min_samples_split={min_samples_split}', fontsize=16)
    ax_fi.set(xlabel='features', ylabel='importance')
    ax_fi.legend()

    # ROC curves
    fpr_train, tpr_train = roc_curve(y_train, y_proba_train)[:2]
    fpr_test, tpr_test = roc_curve(y_test, y_proba)[:2]
    ax_roc.plot(fpr_train, tpr_train, color='green',
                label=f'ROC curve(train)\naccuracy={acc_train:.3f}\nauc={auc_train:.3f}')
    ax_roc.plot(fpr_test, tpr_test, color='red',
                label=f'ROC curve(test)\naccuracy={acc:.3f}\nauc={auc:.3f}')
    ax_roc.set(xlim=[0, 1], ylim=[0, 1], xlabel='FPR', ylabel='TPR')
    ax_roc.legend()

    # PR curves (precision on x, recall on y)
    p_train, r_train = precision_recall_curve(y_train, y_proba_train)[:2]
    p_test, r_test = precision_recall_curve(y_test, y_proba)[:2]
    ax_pr.plot(p_train, r_train, color='green',
               label=f'PR curve(train)\nprecision={prec_train:.3f}\nrecall={rec_train:.3f}')
    ax_pr.plot(p_test, r_test, color='red',
               label=f'PR curve(test)\nprecision={prec:.3f}\nrecall={rec:.3f}')
    ax_pr.set(xlim=[0, 1], ylim=[0, 1], xlabel='precision', ylabel='recall')
    ax_pr.legend()

plt.tight_layout()
plt.show()
plt.close('all')

c. varying "min_samples_leaf"

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier

# Decision tree at max_depth=10 and min_samples_split=1e-3, sweeping min_samples_leaf
# (given as fractions of the sample count).
# Rows of the grid: feature importances, ROC curves, PR curves (train = green, test = red).
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(24, 12))
for min_samples_leaf, ax_fi, ax_roc, ax_pr in zip([1e-5, 1e-4, 1e-3, 1e-2, 1e-1], *axs):
    tree_params = dict(max_depth=10, min_samples_split=1e-3, min_samples_leaf=min_samples_leaf, max_features=None)
    model = DecisionTreeClassifier(**tree_params)
    model.fit(X_train, y_train)

    # held-out split
    y_pred, y_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]
    acc, auc = accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_proba)
    prec, rec = precision_score(y_test, y_pred), recall_score(y_test, y_pred)

    # training split
    y_pred_train, y_proba_train = model.predict(X_train), model.predict_proba(X_train)[:, 1]
    acc_train, auc_train = accuracy_score(y_train, y_pred_train), roc_auc_score(y_train, y_proba_train)
    prec_train, rec_train = precision_score(y_train, y_pred_train), recall_score(y_train, y_pred_train)

    # feature importances, every bar tagged with its feature index
    fi = model.feature_importances_
    ax_fi.bar(np.arange(100), fi, label='feature importances')
    ranking = np.argsort(-fi)
    for f_num, importance in zip(ranking, fi[ranking]):
        ax_fi.text(f_num, importance, f'{f_num}')
    ax_fi.set_title(f'Decision tree, min_samples_leaf={min_samples_leaf}', fontsize=16)
    ax_fi.set(xlabel='features', ylabel='importance')
    ax_fi.legend()

    # ROC curves
    fpr_train, tpr_train = roc_curve(y_train, y_proba_train)[:2]
    fpr_test, tpr_test = roc_curve(y_test, y_proba)[:2]
    ax_roc.plot(fpr_train, tpr_train, color='green',
                label=f'ROC curve(train)\naccuracy={acc_train:.3f}\nauc={auc_train:.3f}')
    ax_roc.plot(fpr_test, tpr_test, color='red',
                label=f'ROC curve(test)\naccuracy={acc:.3f}\nauc={auc:.3f}')
    ax_roc.set(xlim=[0, 1], ylim=[0, 1], xlabel='FPR', ylabel='TPR')
    ax_roc.legend()

    # PR curves (precision on x, recall on y)
    p_train, r_train = precision_recall_curve(y_train, y_proba_train)[:2]
    p_test, r_test = precision_recall_curve(y_test, y_proba)[:2]
    ax_pr.plot(p_train, r_train, color='green',
               label=f'PR curve(train)\nprecision={prec_train:.3f}\nrecall={rec_train:.3f}')
    ax_pr.plot(p_test, r_test, color='red',
               label=f'PR curve(test)\nprecision={prec:.3f}\nrecall={rec:.3f}')
    ax_pr.set(xlim=[0, 1], ylim=[0, 1], xlabel='precision', ylabel='recall')
    ax_pr.legend()

plt.tight_layout()
plt.show()
plt.close('all')

d. varying "max_features"

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier

# Decision tree at max_depth=10, min_samples_split=1e-3 and min_samples_leaf=1e-3, sweeping max_features.
# Rows of the grid: feature importances, ROC curves, PR curves (train = green, test = red).
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(24, 12))
for max_features, ax_fi, ax_roc, ax_pr in zip([8, 16, 32, 64, 82], *axs):
    # Every max_features value here is below the feature count, so each split looks at a
    # random subset of features. Seeding the tree keeps run-to-run noise out of the comparison.
    model = DecisionTreeClassifier(max_depth=10,
                                   min_samples_split=1e-3,
                                   min_samples_leaf=1e-3,
                                   max_features=max_features,
                                   random_state=0)
    model.fit(X_train, y_train)

    # Metrics on the held-out split.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    # The same metrics on the training split, to expose over-fitting.
    y_pred_train = model.predict(X_train)
    y_proba_train = model.predict_proba(X_train)[:, 1]
    acc_train = accuracy_score(y_train, y_pred_train)
    auc_train = roc_auc_score(y_train, y_proba_train)
    prec_train = precision_score(y_train, y_pred_train)
    rec_train = recall_score(y_train, y_pred_train)

    # Top row: one bar per feature importance, each annotated with its feature index.
    fi = model.feature_importances_
    # Bar positions follow the actual number of features instead of a hard-coded 100.
    ax_fi.bar(np.arange(fi.size), fi, label='feature importances')
    for f_num in np.argsort(-fi):
        ax_fi.text(f_num, fi[f_num], f'{f_num}')
    ax_fi.set_title(f'Decision tree, max_features={max_features}', fontsize=16)
    ax_fi.set_xlabel('features')
    ax_fi.set_ylabel('importance')
    ax_fi.legend()

    # Middle row: roc_curve returns (fpr, tpr, thresholds); only the first two are drawn.
    ax_roc.plot(*roc_curve(y_train, y_proba_train)[:2], label=f'ROC curve(train)\naccuracy={acc_train:.3f}\nauc={auc_train:.3f}', color='green')
    ax_roc.plot(*roc_curve(y_test, y_proba)[:2], label=f'ROC curve(test)\naccuracy={acc:.3f}\nauc={auc:.3f}', color='red')
    ax_roc.set_xlim([0, 1])
    ax_roc.set_ylim([0, 1])
    ax_roc.set_xlabel('FPR')
    ax_roc.set_ylabel('TPR')
    ax_roc.legend()

    # Bottom row: precision_recall_curve returns (precision, recall, thresholds),
    # so precision is on the x axis and recall on the y axis.
    ax_pr.plot(*precision_recall_curve(y_train, y_proba_train)[:2], label=f'PR curve(train)\nprecision={prec_train:.3f}\nrecall={rec_train:.3f}', color='green')
    ax_pr.plot(*precision_recall_curve(y_test, y_proba)[:2], label=f'PR curve(test)\nprecision={prec:.3f}\nrecall={rec:.3f}', color='red')
    ax_pr.set_xlim([0, 1])
    ax_pr.set_ylim([0, 1])
    ax_pr.set_xlabel('precision')
    ax_pr.set_ylabel('recall')
    ax_pr.legend()

plt.tight_layout()
plt.show()
plt.close('all')

e. What if the criterion is replaced by "entropy"?

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier

# Same max_features sweep as the previous cell, but splitting on entropy instead of the default criterion.
# Rows of the grid: feature importances, ROC curves, PR curves (train = green, test = red).
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(24, 12))
for max_features, ax_fi, ax_roc, ax_pr in zip([8, 16, 32, 64, 82], *axs):
    # Every max_features value here is below the feature count, so each split looks at a
    # random subset of features. Seeding the tree keeps run-to-run noise out of the comparison.
    model = DecisionTreeClassifier(criterion='entropy',
                                   max_depth=10,
                                   min_samples_split=1e-3,
                                   min_samples_leaf=1e-3,
                                   max_features=max_features,
                                   random_state=0)
    model.fit(X_train, y_train)

    # Metrics on the held-out split.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    # The same metrics on the training split, to expose over-fitting.
    y_pred_train = model.predict(X_train)
    y_proba_train = model.predict_proba(X_train)[:, 1]
    acc_train = accuracy_score(y_train, y_pred_train)
    auc_train = roc_auc_score(y_train, y_proba_train)
    prec_train = precision_score(y_train, y_pred_train)
    rec_train = recall_score(y_train, y_pred_train)

    # Top row: one bar per feature importance, each annotated with its feature index.
    fi = model.feature_importances_
    # Bar positions follow the actual number of features instead of a hard-coded 100.
    ax_fi.bar(np.arange(fi.size), fi, label='feature importances')
    for f_num in np.argsort(-fi):
        ax_fi.text(f_num, fi[f_num], f'{f_num}')
    # The title names the criterion so these panels can be told apart from the previous cell's.
    ax_fi.set_title(f'Decision tree (entropy), max_features={max_features}', fontsize=16)
    ax_fi.set_xlabel('features')
    ax_fi.set_ylabel('importance')
    ax_fi.legend()

    # Middle row: roc_curve returns (fpr, tpr, thresholds); only the first two are drawn.
    ax_roc.plot(*roc_curve(y_train, y_proba_train)[:2], label=f'ROC curve(train)\naccuracy={acc_train:.3f}\nauc={auc_train:.3f}', color='green')
    ax_roc.plot(*roc_curve(y_test, y_proba)[:2], label=f'ROC curve(test)\naccuracy={acc:.3f}\nauc={auc:.3f}', color='red')
    ax_roc.set_xlim([0, 1])
    ax_roc.set_ylim([0, 1])
    ax_roc.set_xlabel('FPR')
    ax_roc.set_ylabel('TPR')
    ax_roc.legend()

    # Bottom row: precision_recall_curve returns (precision, recall, thresholds),
    # so precision is on the x axis and recall on the y axis.
    ax_pr.plot(*precision_recall_curve(y_train, y_proba_train)[:2], label=f'PR curve(train)\nprecision={prec_train:.3f}\nrecall={rec_train:.3f}', color='green')
    ax_pr.plot(*precision_recall_curve(y_test, y_proba)[:2], label=f'PR curve(test)\nprecision={prec:.3f}\nrecall={rec:.3f}', color='red')
    ax_pr.set_xlim([0, 1])
    ax_pr.set_ylim([0, 1])
    ax_pr.set_xlabel('precision')
    ax_pr.set_ylabel('recall')
    ax_pr.legend()

plt.tight_layout()
plt.show()
plt.close('all')

**(4) kernel SVM**

This takes too long, even with a much smaller training set.

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve
from sklearn.svm import SVC

# RBF-kernel SVC, one grid column per value of C.
# The model is fitted on every 100th training row only, to keep the fit tractable.
X_train_sub, y_train_sub = X_train[::100], y_train[::100]

# Two rows: ROC curves (top) and PR curves (bottom). There is no coefficient row,
# because an RBF-kernel SVC exposes no per-feature `coef_` to plot.
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(24, 8))
for C, ax_roc, ax_pr in zip([1e-4, 1e-2, 1e0, 1e2, 1e4], *axs):
    # probability=True calibrates probabilities on internally shuffled folds;
    # random_state pins that shuffling so predict_proba is repeatable.
    model = SVC(C=C, kernel='rbf', probability=True, random_state=0)
    model.fit(X_train_sub, y_train_sub)

    # Metrics on the held-out split.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    # "Train" metrics are computed on the rows the model was actually fitted on.
    # Scoring the full X_train would mostly measure unseen rows under a "train" label.
    y_pred_train = model.predict(X_train_sub)
    y_proba_train = model.predict_proba(X_train_sub)[:, 1]
    acc_train = accuracy_score(y_train_sub, y_pred_train)
    auc_train = roc_auc_score(y_train_sub, y_proba_train)
    prec_train = precision_score(y_train_sub, y_pred_train)
    rec_train = recall_score(y_train_sub, y_pred_train)

    # Top row: roc_curve returns (fpr, tpr, thresholds); only the first two are drawn.
    ax_roc.plot(*roc_curve(y_train_sub, y_proba_train)[:2], label=f'ROC curve(train)\naccuracy={acc_train:.3f}\nauc={auc_train:.3f}', color='green')
    ax_roc.plot(*roc_curve(y_test, y_proba)[:2], label=f'ROC curve(test)\naccuracy={acc:.3f}\nauc={auc:.3f}', color='red')
    ax_roc.set_title(f'SVC with rbf, C={C}', fontsize=16)
    ax_roc.set_xlim([0, 1])
    ax_roc.set_ylim([0, 1])
    ax_roc.set_xlabel('FPR')
    ax_roc.set_ylabel('TPR')
    ax_roc.legend()

    # Bottom row: precision_recall_curve returns (precision, recall, thresholds),
    # so precision is on the x axis and recall on the y axis.
    ax_pr.plot(*precision_recall_curve(y_train_sub, y_proba_train)[:2], label=f'PR curve(train)\nprecision={prec_train:.3f}\nrecall={rec_train:.3f}', color='green')
    ax_pr.plot(*precision_recall_curve(y_test, y_proba)[:2], label=f'PR curve(test)\nprecision={prec:.3f}\nrecall={rec:.3f}', color='red')
    ax_pr.set_xlim([0, 1])
    ax_pr.set_ylim([0, 1])
    ax_pr.set_xlabel('precision')
    ax_pr.set_ylabel('recall')
    ax_pr.legend()

plt.tight_layout()
plt.show()
plt.close('all')