![](https://storage.googleapis.com/kaggle-competitions/kaggle/28009/logos/header.png?)

# Setup

In [None]:
import warnings
warnings.filterwarnings('ignore', 'SettingWithCopyWarning')

In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from xgboost import XGBClassifier

from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn import metrics, model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.mixture import GaussianMixture

from IPython.display import display, Markdown, Latex

In [None]:
# seaborn first: sns.set() applies a full theme and resets matplotlib's
# rcParams (font sizes included), so it must run before the explicit
# matplotlib overrides below or those overrides are lost.
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")

# matplotlib overrides applied on top of the seaborn theme
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

In [None]:
class Cfg:
    """Notebook-wide configuration: seeds, file locations, split sizes and column names."""

    # --- reproducibility ---
    RANDOM_STATE = 2021

    # --- input / output files ---
    TRAIN_DATA = '../input/tabular-playground-series-nov-2021/train.csv'
    TEST_DATA = '../input/tabular-playground-series-nov-2021/test.csv'
    SUBMISSION = '../input/tabular-playground-series-nov-2021/sample_submission.csv'
    SUBMISSION_FILE = 'submission.csv'

    # --- sampling / splitting ---
    TEST_SIZE = 0.4
    SAMPLE_FRAC = 0.03
    N_FEATURE = 5 # 285

    # --- column names ---
    INDEX = 'id'
    TARGET = 'target'
    FEATURES = [f'f{i}' for i in range(100)]

    @staticmethod
    def set_seed():
        """Seed Python's `random` module and NumPy's global RNG with RANDOM_STATE."""
        for seeder in (random.seed, np.random.seed):
            seeder(Cfg.RANDOM_STATE)


Cfg.set_seed()

# Read data

In [None]:
def read_data(
    train_file:str=Cfg.TRAIN_DATA,
    test_file:str=Cfg.TEST_DATA
) -> (pd.DataFrame, pd.DataFrame):
    """Load the train and test CSV files.

    Both files are indexed by `Cfg.INDEX` and cast to float32; the target
    column of the training frame is then cast to `np.short`.

    Returns:
        (train_df, test_df)
    """
    def _load(path):
        # one CSV -> frame indexed by id, all columns as float32
        return pd.read_csv(path).set_index(Cfg.INDEX).astype(np.float32)

    train_df = _load(train_file)
    test_df = _load(test_file)

    # the label is an integer class, not a float measurement
    train_df[Cfg.TARGET] = train_df[Cfg.TARGET].astype(np.short, copy=False)

    return train_df, test_df

In [None]:
%%time
# Load both data sets using the default file paths of read_data.
train_data, test_data = read_data()

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Report the number of rows in each data set.
for label, frame in (('Train', train_data), ('Test', test_data)):
    print(f'{label} data: {len(frame)} rows')

### Notice

* The training data contains 600,000 rows.

* The test data contains 540,000 rows.

* There are 100 features `f0` - `f99`

* The target variable `target` is binary (1/0)

# Missing values

In [None]:
# Total number of NaN cells per data set, shown as a one-column table.
missing_values = [frame.isna().sum().sum() for frame in (train_data, test_data)]

pd.DataFrame(
    {'missing_values': missing_values},
    index=pd.Index(['train', 'test'], name='data_set'))

### Notice

* There are no missing values in either data set.

# Exploratory data analysis (EDA)

In [None]:
def get_sample_data(
    data,
    split_target=True,
    features=Cfg.FEATURES,
    target=Cfg.TARGET,
    frac=Cfg.SAMPLE_FRAC,
    random_state=Cfg.RANDOM_STATE):
    """Select a random subset of rows from `data`.

    Args:
        data: frame to sample from.
        split_target: if True return `(X, y)`, otherwise the sampled frame.
        features: columns returned as `X` when `split_target` is True.
        target: column returned as `y` when `split_target` is True.
        frac: fraction of rows to draw (1 returns all rows, shuffled).
        random_state: seed passed to `DataFrame.sample`.

    Returns:
        `(data[features], data[target])` restricted to the sampled rows, or
        the sampled rows of `data` when `split_target` is False.
    """
    # Use the sampled frame directly. The previous version fed index *labels*
    # to the positional `.iloc`, which is only correct when the index happens
    # to be 0..n-1, and returned rows of the global `train_data` instead of
    # `data` in the non-split branch.
    sample = data.sample(frac=frac, random_state=random_state)

    if split_target:
        return sample[features], sample[target]

    return sample

In [None]:
# Summary statistics per column, plus the variance derived from the std row.
stat_data = train_data.describe().drop('count')
stat_data.loc['var'] = stat_data.T['std']**2

# `color` must be a valid color name; the previous value 'Bules' was a typo
# and is not one, so the bars had no usable color.
stat_data.T.style.bar(
    subset=['mean'],
    color='lightblue'
).background_gradient(subset=['50%'], cmap='Blues')

## Target variable

In [None]:
def plot_count(
    data:pd.DataFrame,
    feature:str,
    title='Countplot',
    ax=None):
    """Draw a count plot of `feature`.

    Args:
        data: frame containing `feature`.
        feature: column whose value counts are plotted.
        title: axes title.
        ax: axes to draw on; a new 5x5 figure is created when omitted.

    Returns:
        The axes the plot was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))

    # plot the frame that was passed in (previously the global `train_data`
    # was used, so the `data` argument was ignored)
    sns.countplot(
        data=data,
        x=feature,
        ax=ax)

    ax.set_title(title)

    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Count')

    return ax

In [None]:
# Class balance of the target; the trailing `;` suppresses the Axes repr.
plot_count(train_data, Cfg.TARGET, title='Target countplot');

### Notice

* The distribution of the target is balanced.

## Features `f0` - `f99`

In [None]:
def plot_pdf(
    data:pd.DataFrame,
    feature:str,
    target=Cfg.TARGET,
    title='Histplot',
    bins=70,
    ax=None):
    """Plot a histogram with KDE overlay of `feature`, optionally split by `target`.

    A dotted red vertical line marks the mean of `feature`.

    Args:
        data: frame containing `feature` (and `target` when given).
        feature: column to plot.
        target: column used as hue; pass None for a single distribution.
        title: axes title.
        bins: number of histogram bins.
        ax: axes to draw on; a new figure is created when omitted.

    Returns:
        The axes the plot was drawn on.
    """
    # identity check: `== None` is evaluated element-wise on array-likes
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # plot pdf
    sns.histplot(
        data=data,
        x=feature,
        hue=target,
        bins=bins,
        legend=True,
        kde=True,
        ax=ax)

    # vertical marker at the mean; y-extent 0..1 is in axes coordinates
    mean = np.mean(data[feature])
    ax.vlines(
        mean, 0, 1,
        transform=ax.get_xaxis_transform(),
        color='red', ls=':')

    ax.set_title(title)

    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Count')

    return ax

In [None]:
def plot_boxplot(
    data:pd.DataFrame,
    feature:str,
    title='Boxplot',
    ax=None):
    """Draw a box plot of `feature` grouped by the target column.

    Args:
        data: frame containing `feature` and `Cfg.TARGET`.
        feature: column shown on the y axis.
        title: axes title.
        ax: axes to draw on; a new figure is created when omitted.

    Returns:
        The axes the plot was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # Draw on the requested axes. Previously `ax` was not forwarded, so the
    # plot went to whatever axes was current and `ax` was overwritten.
    sns.boxplot(
        x=Cfg.TARGET,
        y=feature,
        data=data,
        ax=ax
    )

    ax.set_title(title)

    ax.set_xlabel('Target {}'.format(Cfg.TARGET))
    ax.set_ylabel('Feature {}'.format(feature))

    return ax

In [None]:
def plot_ecdf(
    data:pd.DataFrame,
    feature:str,
    title='Empirical distribution',
    ax=None):
    """Plot the empirical CDF of `feature` separately for target 0 and target 1.

    Args:
        data: frame containing `feature` and `Cfg.TARGET`.
        feature: column whose ECDF is plotted.
        title: axes title.
        ax: axes to draw on; a new figure is created when omitted.

    Returns:
        The axes the plot was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # one curve per class, labelled so the two lines can be told apart
    for target_value in (0, 1):
        values = data[data[Cfg.TARGET] == target_value][feature]
        ecdf = ECDF(values)
        ax.plot(ecdf.x, ecdf.y, label='{} = {}'.format(Cfg.TARGET, target_value))

    ax.legend()

    ax.set_title(title)
    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('ecdf')

    return ax

In [None]:
# For every feature: a markdown heading, a bullet list of summary statistics,
# and a row of three plots (histogram/KDE, ECDF, box plot by target).
stat_keys = ('mean', 'std', 'min', '25%', '50%', '75%', 'max')
plotters = (plot_pdf, plot_ecdf, plot_boxplot)

for feature in Cfg.FEATURES:
    display(Markdown('### Feature `{}`'.format(feature)))

    info = np.round(train_data[feature].describe(), 4)

    # one "* name: value" bullet per statistic
    summary_md = '\n'.join('* {}: {}'.format(key, info[key]) for key in stat_keys)
    display(Markdown(summary_md))

    fig, ax = plt.subplots(1, 3, figsize=(20, 5))
    for plotter, axis in zip(plotters, ax):
        plotter(train_data, feature, ax=axis)

    plt.show()

### Summary

* There are some features with a bimodal distribution, e.g. `f1`, `f3`, etc.
* Some distributions have a low variance, e.g. `f2`, `f4`, etc. Features with low variance usually provide less information. 

## Features with bimodal distribution


In [None]:
# Features with a bimodal distribution.
# The list is hard-coded; presumably it was picked by eye from the per-feature
# plots above — TODO confirm.
BIMODAL_DIST = [
    'f1', 'f3', 'f5', 'f6', 'f7', 'f8', 'f10', 'f11', 
    'f13', 'f14', 'f15', 'f17', 'f18', 'f22', 'f25', 
    'f26', 'f29', 'f34', 'f37', 'f38', 'f40', 'f41', 
    'f43', 'f45', 'f47', 'f50', 'f54', 'f55', 'f57', 
    'f65', 'f66', 'f67', 'f70', 'f71', 'f74', 'f77', 
    'f80', 'f82', 'f85', 'f86', 'f91', 'f96', 'f97'
]

In [None]:
# Per-feature summary statistics of the bimodal features (one row per feature).
bimodal_df = train_data[BIMODAL_DIST]
bimodal_stats = bimodal_df.describe().transpose()

# mean and spread of the per-feature means
feature_means = bimodal_stats['mean']
mu = np.round(feature_means.mean(), 3)
std = np.round(feature_means.std(), 3)

print(f'The average mean of all bimodal features is {mu} with a standard deviation of {std}.')

In [None]:
# Distribution of the per-feature mean and std across the bimodal features.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

for axis, stat in zip(ax, ('mean', 'std')):
    plot_pdf(
        bimodal_stats, stat,
        target=None, bins=12,
        title='{} of bimodale features'.format(stat),
        ax=axis)

plt.show()

In [None]:
def plot_model_proba(proba, ax=None):
    """Plot a 100-bin histogram with KDE overlay of predicted probabilities.

    Args:
        proba: array-like of predicted probabilities.
        ax: axes to draw on; a new 8x5 figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(8, 5))

    sns.histplot(
        data=proba,
        legend=True,
        bins=100,
        kde=True,
        ax=ax
    )

    # histplot is called without `stat`, so the y axis shows counts;
    # the previous labels were misspelled and called the y axis a probability
    ax.set_xlabel('Predicted probability')
    ax.set_ylabel('Count')

In [None]:
def plot_roc(model, X_val, y_val, ax=None):
    """Draw the ROC curve of a fitted `model` on the validation data.

    Args:
        model: fitted classifier.
        X_val: validation features.
        y_val: validation labels.
        ax: axes to draw on; a new figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    # `metrics.plot_roc_curve` was removed in scikit-learn 1.2; its replacement
    # `RocCurveDisplay.from_estimator` exists since 1.0. Use the new API when
    # available and fall back to the old helper on older installations.
    display_cls = getattr(metrics, 'RocCurveDisplay', None)
    if display_cls is not None and hasattr(display_cls, 'from_estimator'):
        display_cls.from_estimator(model, X_val, y_val, ax=ax)
    else:
        metrics.plot_roc_curve(model, X_val, y_val, ax=ax)

In [None]:
from sklearn import metrics

def plot_confusion_matrix(model, X_val, y_val, ax=None):
    """Draw the row-normalized confusion matrix of a fitted `model`.

    Args:
        model: fitted classifier.
        X_val: validation features.
        y_val: validation labels.
        ax: axes to draw on; a new figure is created when omitted.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    plot_kwargs = dict(
        cmap=plt.cm.Blues,
        normalize='true',  # normalize over the true labels (rows)
        ax=ax
    )

    # `metrics.plot_confusion_matrix` was removed in scikit-learn 1.2; its
    # replacement `ConfusionMatrixDisplay.from_estimator` exists since 1.0.
    display_cls = getattr(metrics, 'ConfusionMatrixDisplay', None)
    if display_cls is not None and hasattr(display_cls, 'from_estimator'):
        display_cls.from_estimator(model, X_val, y_val, **plot_kwargs)
    else:
        metrics.plot_confusion_matrix(model, X_val, y_val, **plot_kwargs)

In [None]:
def display_model_result(model, X_val, y_val, y_pred, y_pred_proba=np.array([])):
    """Show ROC curve, confusion matrix and classification report for a model.

    When `y_pred_proba` is non-empty, a third panel with the histogram of the
    predicted probabilities is added.

    Args:
        model: fitted classifier.
        X_val: validation features.
        y_val: validation labels.
        y_pred: predicted labels for `X_val`.
        y_pred_proba: optional predicted probabilities for `X_val`.
    """
    has_proba = len(y_pred_proba) != 0

    # two panels by default, three when probabilities are supplied
    if has_proba:
        n_to_show, figsize = 3, (15, 5)
    else:
        n_to_show, figsize = 2, (10, 5)

    fig, ax = plt.subplots(1, n_to_show, figsize=figsize)

    plot_roc(model, X_val, y_val, ax=ax[0])
    plot_confusion_matrix(model, X_val, y_val, ax=ax[1])
    if has_proba:
        plot_model_proba(y_pred_proba, ax=ax[2])

    plt.tight_layout()
    plt.show()

    print(classification_report(y_val, y_pred))

In [None]:
# Use all rows (frac=1) but only the bimodal feature columns.
X_data, y_data = get_sample_data(train_data, features=BIMODAL_DIST, frac=1)

# Hold out Cfg.TEST_SIZE of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=Cfg.TEST_SIZE, 
    random_state=Cfg.RANDOM_STATE)

In [None]:
%%time

# Standardize the inputs, then fit a regularized logistic regression.
scaler = StandardScaler()
classifier = LogisticRegression(C=0.2, solver='liblinear')
model = make_pipeline(scaler, classifier)

model.fit(X_train, y_train)

# hard labels and positive-class probabilities on the validation split
y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]

display_model_result(model, X_val, y_val, y_pred, y_pred_proba)

### Notice

* There are 43 features with bimodal distribution.
* These features result in an accuracy = 0.68 for the target variable.
* The average mean of all bimodal features is 2.555 with a standard deviation of 0.076.



## Features with low variance

Features with low variance usually provide less information. 

In [None]:
# All features that are not in BIMODAL_DIST, in their original order.
# A list comprehension replaces `list(set(...) - set(...))`: set iteration
# order for strings can change between interpreter runs, which made the column
# order (and so the fitted model) non-reproducible.
_bimodal_features = set(BIMODAL_DIST)
LOW_VARIANCE_FEATURES = [f for f in Cfg.FEATURES if f not in _bimodal_features]

In [None]:
# Use all rows (frac=1) but only the non-bimodal feature columns.
X_data, y_data = get_sample_data(train_data, features=LOW_VARIANCE_FEATURES, frac=1)

# Hold out Cfg.TEST_SIZE of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=Cfg.TEST_SIZE, 
    random_state=Cfg.RANDOM_STATE)

In [None]:
%%time

# Same model as for the bimodal features: standardize, then logistic regression.
scaler = StandardScaler()
classifier = LogisticRegression(C=0.2, solver='liblinear')
model = make_pipeline(scaler, classifier)

model.fit(X_train, y_train)

# hard labels and positive-class probabilities on the validation split
y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]

display_model_result(model, X_val, y_val, y_pred, y_pred_proba)

### Notice

* There are 57 remaining features, i.e. those without a bimodal distribution.
* These features result in an accuracy = 0.58 for the target variable

## Gaussian mixture

A bimodal distribution most commonly arises as a mixture of two different unimodal distributions (see: https://www.wikiwand.com/en/Multimodal_distribution). 

In [None]:
def get_gaussian_mixture(data):
    """Fit a two-component Gaussian mixture and compare it with the target.

    Args:
        data: frame holding the feature columns plus the `Cfg.TARGET` column.

    Returns:
        Frame indexed by `id` with columns
        `gm_proba` (probability of component 0, rounded to 4 decimals),
        `gm` (1 where `gm_proba > 0.5`, else 0) and `target`.
    """
    # Fit on the features only. The target column was previously part of the
    # fitted data, so the label leaked into the clustering it is compared with.
    features = data.drop(columns=[Cfg.TARGET])

    gm = GaussianMixture(n_components=2, random_state=Cfg.RANDOM_STATE).fit(features)
    gm_proba = np.round(gm.predict_proba(features)[:, 0], 4)

    df = pd.DataFrame({
        'id': data.index,
        'gm_proba': gm_proba,
        # builtin `int` replaces the alias `np.int`, removed in NumPy 1.24
        'gm': (gm_proba > 0.5).astype(int),
        'target': data[Cfg.TARGET]
    }).set_index('id')

    return df

In [None]:
%%time

# Fit the mixture on the bimodal features; the target column is passed along
# so the result can be compared with it.
bimodal_data = get_gaussian_mixture(train_data[BIMODAL_DIST + [Cfg.TARGET]])
bimodal_data.head()

In [None]:
# Treat the mixture component label `gm` as a prediction of the target.
print(classification_report(bimodal_data['target'], bimodal_data['gm']))

In [None]:
# Joint relative frequencies of mixture component (rows) and target (columns).
N = bimodal_data.shape[0]
joint_freq = pd.crosstab(bimodal_data['gm'], bimodal_data['target']) / N

fig, ax = plt.subplots(1, 1, figsize=(4, 4))
sns.heatmap(joint_freq, cmap='Blues_r', annot=True, ax=ax)

plt.tight_layout()
plt.show()

### Notice

* Unfortunately, it is not possible to make a prediction based on the gaussian mixture feature `gm`.

## Correlation

Now we will identify the features that have a high correlation with the target variable.

In [None]:
# Absolute correlation of every column with the target, estimated on a 1% sample.
# `random_state` is passed explicitly so the cell gives the same result when
# re-run on its own instead of depending on the state of the global RNG.
high_corr = (
    train_data
    .sample(frac=0.01, random_state=Cfg.RANDOM_STATE)
    .corr()
    .abs()[[Cfg.TARGET]]
)
high_corr.columns = ['corr']

# keep the ten strongest entries (the target itself is included in this list)
high_corr = high_corr.sort_values(by='corr', ascending=False).head(10)

high_corr

In [None]:
# Pairwise scatter plots of the columns most correlated with the target,
# drawn on a 0.1% sample (seeded so the cell is reproducible on its own).
df = train_data.sample(frac=0.001, random_state=Cfg.RANDOM_STATE)[high_corr.index]
g = sns.pairplot(
    data=df,
    hue=Cfg.TARGET,
    corner=True)

g.fig.set_size_inches(15, 15)

# lay out the pairplot's own figure; `fig` referred to a figure created by an
# earlier cell, not to this plot
g.fig.tight_layout()
plt.show()

# Feature importance

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.inspection import permutation_importance

In [None]:
# Work on a 10% sample of the training data for the feature-importance experiments.
X_data, y_data = get_sample_data(train_data, frac=0.1)
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=Cfg.TEST_SIZE, 
    random_state=Cfg.RANDOM_STATE)

print(f'train size: {X_train.shape[0]} rows')
print(f'val size  : {X_val.shape[0]} rows')

## Baseline model

In [None]:
# Small, seeded random forest fitted on the training split; it is reused below
# for permutation importance and as the RFECV estimator.
baseline_model = RandomForestClassifier(
    n_estimators=30, 
    random_state=Cfg.RANDOM_STATE).fit(X_train, y_train)

## Permutation importance

In [None]:
%%time
# Permutation importance on the validation split, 10 shuffles per feature.
result = permutation_importance(baseline_model, X_val, y_val, n_repeats=10, random_state=Cfg.RANDOM_STATE)

In [None]:
# Table of mean permutation importance (and its std) per feature, strongest first.
n_to_show = 20

feature_index = pd.Index(X_val.columns.tolist(), name='feature')
importance_df = pd.DataFrame(
    {'weight': result.importances_mean, 'std': result.importances_std},
    index=feature_index,
).sort_values(by='weight', ascending=False)

importance_df.head(n_to_show)

In [None]:
# Horizontal bar chart of the `n_to_show` most important features.
df = importance_df.head(n_to_show)

fig, ax = plt.subplots(1, 1, figsize=(15, 8))
sns.barplot(data=df, x='weight', y=df.index, palette='Blues_r', ax=ax)

ax.set(
    title='Permutation importance',
    xlabel="Weights",
    ylabel="Features",
)

plt.show()

## Recursive feature elimination (RFE)

In [None]:
# Recursive feature elimination with 2-fold stratified CV, scored by accuracy.
# Three features are dropped per iteration (step=3) until only
# `min_features_to_select` remain.
rfe = RFECV(
    estimator=baseline_model, 
    cv=StratifiedKFold(2),
    scoring='accuracy',
    min_features_to_select=1,
    step=3, 
    verbose=0
)

In [None]:
%%time

# Fit on the whole 10% sample (RFECV performs its own cross-validation splits).
rfe.fit(X_data, y_data);
print('Optimal number of features: {}'.format(rfe.n_features_))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 5))

# Mean cross-validation score per elimination step. `grid_scores_` was removed
# in scikit-learn 1.2 in favour of `cv_results_`; support both.
if hasattr(rfe, 'cv_results_'):
    rfe_scores = np.asarray(rfe.cv_results_['mean_test_score'])
else:
    rfe_scores = np.asarray(rfe.grid_scores_)

# Number of features evaluated at each step, in ascending order. With step > 1
# this is not 1, 2, 3, ... (which the plot used before), so rebuild the
# sequence the elimination walks through: start with all features and remove
# `step` per iteration until `min_features_to_select` is reached.
n_total = rfe.support_.shape[0]
if rfe.step >= 1:
    rfe_step = int(rfe.step)
else:
    rfe_step = int(max(1, rfe.step * n_total))  # fractional step

n_features_grid = [n_total]
while n_features_grid[-1] > rfe.min_features_to_select:
    remaining = n_features_grid[-1]
    n_features_grid.append(
        remaining - min(rfe_step, remaining - rfe.min_features_to_select))
n_features_grid = n_features_grid[::-1]

# fall back to a plain step counter if the reconstruction does not line up
if len(n_features_grid) != len(rfe_scores):
    n_features_grid = list(range(1, len(rfe_scores) + 1))

sns.lineplot(
    x=n_features_grid,
    y=rfe_scores,
    ax=ax
)

ax.set_title('Recursive feature elimination')
ax.set_xlabel('Number of features selected')
ax.set_ylabel('Cross validation score (accuracy)')

plt.show()

# Linear discriminant analysis (LDA)

In [None]:
# Project all features onto the LDA axis and keep it next to the target.
X_data = train_data[Cfg.FEATURES]
y_data = train_data[Cfg.TARGET]

lda = LinearDiscriminantAnalysis()
lda_projection = lda.fit_transform(X_data, y_data)

lda_data = pd.DataFrame({
    'lda': lda_projection.reshape(-1),  # flatten to a 1-D column
    'target': y_data
})

In [None]:
# One-dimensional scatter of the LDA projection, colored by target.
fig, ax = plt.subplots(1, 1, figsize=(20, 3))

# y is the constant 0, so all points lie on one horizontal line;
# a 60% sample is plotted (no random_state is passed to `sample`).
sns.scatterplot(
    data=lda_data.sample(frac=0.6),
    x='lda',
    y=0,
    hue='target',
    ax=ax,
    alpha=0.4
)
ax.set_title('LDA')
# the y axis carries no information here, so hide it
ax.get_yaxis().set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of the LDA projection: all rows, target 0 only, target 1 only.
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

lda_subsets = (
    lda_data,
    lda_data[lda_data[Cfg.TARGET] == 0],
    lda_data[lda_data[Cfg.TARGET] == 1],
)
for axis, subset in zip(ax, lda_subsets):
    plot_pdf(subset, 'lda', ax=axis)

plt.tight_layout()
plt.show()

# Principal component analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

In [None]:
def plot_pca(data, x, y, ax=None):
    """Scatter two columns of `data` against each other, colored by target.

    Args:
        data: frame containing the columns `x`, `y` and `Cfg.TARGET`.
        x: column shown on the x axis.
        y: column shown on the y axis.
        ax: axes to draw on; a new figure is created when omitted.
    """
    # identity check: `== None` is evaluated element-wise on array-likes
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    sns.scatterplot(
        data=data,
        x=x,
        y=y,
        hue=Cfg.TARGET,
        legend='brief',
        alpha=0.4,
        ax=ax)

In [None]:
# PCA on a 20% sample: standardize, then keep `n_components` components.
X_data, y_data = get_sample_data(train_data, frac=0.2)

n_components = 100
scaler = StandardScaler()
decomposition = PCA(n_components=n_components, random_state=Cfg.RANDOM_STATE)
pca = make_pipeline(scaler, decomposition)

# columns pc1 .. pc<n_components>
component_names = [f'pc{i}' for i in range(1, n_components + 1)]
pca_df = pd.DataFrame(pca.fit_transform(X_data, y_data), columns=component_names)

# `.values` drops the id index so the target aligns by position
pca_df[Cfg.TARGET] = y_data.values

In [None]:
# Scatter plots of consecutive principal components: (pc1, pc2) ... (pc5, pc6).
fig, ax = plt.subplots(1, 5, figsize=(28, 5))

for i, axis in enumerate(ax, start=1):
    plot_pca(pca_df, f'pc{i}', f'pc{i + 1}', ax=axis)

plt.tight_layout()
plt.show()

In [None]:
# Explained variance per component (left) and its cumulative sum (right).
variance_ratio = pca['pca'].explained_variance_ratio_
component_numbers = range(1, n_components+1)

panels = (
    (variance_ratio, 'Explained variance ratio', "Variance ratio"),
    (np.cumsum(variance_ratio), 'Cumulative explained variance ratio', "Cumulative ratio"),
)

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for axis, (values, title, ylabel) in zip(ax, panels):
    sns.lineplot(x=component_numbers, y=values, ax=axis)
    axis.set_title(title)
    axis.set_xlabel("# of components")
    axis.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

# Feature engineering

In [None]:
def add_agg_features(data):
    """Return a copy of `data` with row-wise aggregates of the feature columns.

    The columns `min`, `max`, `var`, `std`, `sum` and `mean` are appended, each
    computed across `Cfg.FEATURES` for every row of the input frame.
    """
    feature_block = data[Cfg.FEATURES]
    aggregators = (
        ('min', np.min),
        ('max', np.max),
        ('var', np.var),
        ('std', np.std),
        ('sum', np.sum),
        ('mean', np.mean),
    )

    df = data.copy()
    for column, reducer in aggregators:
        # axis=1: aggregate across the feature columns of each row
        df.loc[:, column] = reducer(feature_block, axis=1)

    return df

In [None]:
def add_gaussian_mixture(data):
    """Return a copy of `data` with a `gm` column from a two-component mixture.

    A Gaussian mixture is fitted on the `BIMODAL_DIST` columns of `data` on
    every call, and `gm` holds the probability of component 0 for each row.
    """
    df = data.copy()
    bimodal_block = df[BIMODAL_DIST]

    mixture = GaussianMixture(n_components=2, random_state=Cfg.RANDOM_STATE)
    mixture.fit(bimodal_block)

    component_proba = mixture.predict_proba(bimodal_block)
    df.loc[:, 'gm'] = component_proba[:, 0]

    return df

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class GaussianMixtureFeature(BaseEstimator, TransformerMixin):
    """Append a `gm` column: probability of component 0 of a 2-component mixture.

    The mixture is fitted once in `fit` on the `BIMODAL_DIST` columns and only
    applied in `transform`. A stateless `FunctionTransformer` would refit the
    mixture on whatever frame is transformed, i.e. on the validation and test
    data at prediction time, where the component order may also differ from
    the one learned on the training data.
    """

    def __init__(self, random_state=Cfg.RANDOM_STATE):
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit the mixture on the bimodal columns of `X`; `y` is ignored."""
        self.gm_ = GaussianMixture(
            n_components=2,
            random_state=self.random_state).fit(X[BIMODAL_DIST])
        return self

    def transform(self, X):
        """Return a copy of `X` with the `gm` column added."""
        df = X.copy()
        df.loc[:, 'gm'] = self.gm_.predict_proba(df[BIMODAL_DIST])[:, 0]
        return df


# Aggregated row statistics -> mixture feature -> standardization.
# NOTE: this single pipeline object is shared by all models below, so every
# `fit` refits it in place.
feature_engineering = make_pipeline(
    FunctionTransformer(add_agg_features),
    GaussianMixtureFeature(),
    StandardScaler()
)

# Modeling

In [None]:
# Hold out Cfg.TEST_SIZE of the full training set for validation.
# (A preceding `get_sample_data(train_data, frac=1)` call was dropped: its
# result was never used, so it only produced an unused shuffled copy of the
# whole data set.)
X_train, X_val, y_train, y_val = train_test_split(
    train_data[Cfg.FEATURES],
    train_data[Cfg.TARGET],
    test_size=Cfg.TEST_SIZE,
    random_state=Cfg.RANDOM_STATE)

print(f'train size: {X_train.shape[0]} rows')
print(f'val size  : {X_val.shape[0]} rows')

## Model `LogisticRegression`

In [None]:
%%time

# Logistic regression on top of the shared feature-engineering pipeline.
lr_classifier = LogisticRegression(C=0.2, solver='liblinear')
lr_model = make_pipeline(feature_engineering, lr_classifier)

lr_model.fit(X_train, y_train)

# hard labels and positive-class probabilities on the validation split
y_pred = lr_model.predict(X_val)
y_pred_proba = lr_model.predict_proba(X_val)[:, 1]

display_model_result(lr_model, X_val, y_val, y_pred, y_pred_proba)

## Model `LinearDiscriminant`

In [None]:
%%time

# Linear discriminant analysis on top of the shared feature-engineering pipeline.
lda_classifier = LinearDiscriminantAnalysis()
lda_model = make_pipeline(feature_engineering, lda_classifier)

lda_model.fit(X_train, y_train)

# hard labels and positive-class probabilities on the validation split
y_pred = lda_model.predict(X_val)
y_pred_proba = lda_model.predict_proba(X_val)[:, 1]

display_model_result(lda_model, X_val, y_val, y_pred, y_pred_proba)

## Model `SGD`

In [None]:
%%time

from sklearn.linear_model import SGDClassifier

# SGD-trained logistic regression on the shared feature-engineering pipeline.
# `random_state` is set so the result does not depend on the state of the
# global RNG (consistent with the seeded models elsewhere in the notebook).
# NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3
# removed the old name; switch the value when upgrading.
sgd_model = make_pipeline(
    feature_engineering,
    SGDClassifier(loss='log', random_state=Cfg.RANDOM_STATE))

y_pred = sgd_model.fit(X_train, y_train).predict(X_val)
y_pred_proba = sgd_model.predict_proba(X_val)[:, 1]

display_model_result(sgd_model, X_val, y_val, y_pred, y_pred_proba)

## Model `Ridge`

In [None]:
%%time

from sklearn.linear_model import RidgeClassifier

# Ridge classifier on top of the shared feature-engineering pipeline.
ridge_classifier = RidgeClassifier()
ridge_model = make_pipeline(feature_engineering, ridge_classifier)

ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_val)

# no probabilities are passed, so only two panels are shown
display_model_result(ridge_model, X_val, y_val, y_pred)

## Model `DecisionTree`

In [None]:
%%time

# Shallow decision tree on top of the shared feature-engineering pipeline.
# `random_state` is set so the result does not depend on the state of the
# global RNG (consistent with the seeded baseline forest).
dt_model = make_pipeline(
    feature_engineering,
    DecisionTreeClassifier(
        max_depth=5,
        random_state=Cfg.RANDOM_STATE)
)

y_pred = dt_model.fit(X_train, y_train).predict(X_val)
y_pred_proba = dt_model.predict_proba(X_val)[:, 1]

display_model_result(dt_model, X_val, y_val, y_pred, y_pred_proba)

## Model `RandomForest`

In [None]:
%%time

# Small random forest on top of the shared feature-engineering pipeline.
# `random_state` is set so the result does not depend on the state of the
# global RNG (consistent with the seeded baseline forest).
rf_model = make_pipeline(
    feature_engineering,
    RandomForestClassifier(
        max_depth=5,
        n_estimators=20,
        random_state=Cfg.RANDOM_STATE)
)

y_pred = rf_model.fit(X_train, y_train).predict(X_val)
y_pred_proba = rf_model.predict_proba(X_val)[:, 1]

display_model_result(rf_model, X_val, y_val, y_pred, y_pred_proba)

## Model `LGBM`

In [None]:
%%time

# LightGBM classifier on top of the shared feature-engineering pipeline.
lgbm_classifier = lgb.LGBMClassifier(
    learning_rate=0.05,
    n_estimators=1000,
    reg_lambda = 1)
lgbm_model = make_pipeline(feature_engineering, lgbm_classifier)

lgbm_model.fit(X_train, y_train)

# hard labels and positive-class probabilities on the validation split
y_pred = lgbm_model.predict(X_val)
y_pred_proba = lgbm_model.predict_proba(X_val)[:, 1]

display_model_result(lgbm_model, X_val, y_val, y_pred, y_pred_proba)

## Model `XGB`

In [None]:
%%time

# XGBoost classifier on top of the shared feature-engineering pipeline.
# `eval_metric` is set to log loss, a classification metric; the previous
# value 'rmse' is a regression metric and did not fit a binary classifier.
xgb_model = make_pipeline(
    feature_engineering,
    XGBClassifier(
        n_estimators=100,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=Cfg.RANDOM_STATE)
)

y_pred = xgb_model.fit(X_train, y_train).predict(X_val)
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]

display_model_result(xgb_model, X_val, y_val, y_pred, y_pred_proba)

## Model `AdaBoost`

In [None]:
%%time

# AdaBoost on standardized raw features (the shared feature-engineering
# pipeline is not used for this model).
ada_model = make_pipeline(
    StandardScaler(),
    AdaBoostClassifier())

y_pred = ada_model.fit(X_train, y_train).predict(X_val)
y_pred_proba = ada_model.predict_proba(X_val)[:, 1]

# show the results of `ada_model`; previously `xgb_model` was passed, so the
# ROC curve and confusion matrix belonged to the XGBoost model
display_model_result(ada_model, X_val, y_val, y_pred, y_pred_proba)

## Stacking model 

In [None]:
# Base learners of the stack: four linear models.
estimators = [
    ('lda', LinearDiscriminantAnalysis()),
    ('lr',  LogisticRegression(C=0.2, solver='liblinear')),
    ('sgd',  SGDClassifier(loss='log')),
    ('ridge', RidgeClassifier())
]

# A logistic regression combines the base learners' 3-fold CV predictions.
stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=3,
    verbose=0)

model = make_pipeline(feature_engineering, stacking_classifier)

In [None]:
%%time

# Fit the stacking pipeline and evaluate it on the validation split.
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]

display_model_result(model, X_val, y_val, y_pred, y_pred_proba)

# Submission

In [None]:
# Positive-class probability for every test row; `model` is whatever was last
# assigned to that name (the stacking pipeline when the notebook runs top to bottom).
y_pred_submission = model.predict_proba(test_data)[:, 1]

In [None]:
# Submission table: one predicted probability per test id.
submission_index = pd.Index(test_data.index, name=Cfg.INDEX)
submission_data = pd.DataFrame(
    {Cfg.TARGET: y_pred_submission},
    index=submission_index)

submission_data

In [None]:
# save submission file (the id index is written as the first column)
submission_data.to_csv(Cfg.SUBMISSION_FILE)

Thank you for reading.