# Imports

In [1]:
import pandas as pd 
import os
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression

## Import local module

In [2]:
import os
import sys
# Prepend the local package directory (resolved against the current working
# directory) to the module search path so the local module can be imported.
sys.path.insert(0, os.path.abspath('../skl_sampling_bayesian_transformer'))

In [3]:
import sampling_bayesian_encoder

## Suppress warnings

In [4]:
import warnings
# Ignore every warning category from here on, keeping the benchmark output compact.
# NOTE(review): this also hides any solver or deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')


# Benchmark

In [5]:
# Load the train/test CSVs, split off the label, and drop the non-feature columns.
train_raw = pd.read_csv('csv/input/cat-in-the-dat/train.csv')
test_raw = pd.read_csv('csv/input/cat-in-the-dat/test.csv')

# Keep the label as its own Series before it is removed from the feature frame.
target = train_raw['target']
train_raw = train_raw.drop(columns=['target', 'id'])
test_raw = test_raw.drop(columns='id')

# Every remaining training column is a candidate feature.
all_features = train_raw.columns.tolist()

In [6]:
# Columns with more than this many distinct training values count as high-cardinality.
HIGH_CARDINALITY_THRESHOLD = 100

high_cardinal = [
    c for c in all_features
    if train_raw[c].nunique() > HIGH_CARDINALITY_THRESHOLD
]
# Keep the original column order. A set difference would yield an arbitrary
# order that can change between kernel sessions (string hashing is randomised),
# which makes the notebook output non-reproducible.
low_cardinal = [c for c in all_features if c not in high_cardinal]

In [7]:
low_cardinal

['nom_2',
 'ord_2',
 'nom_4',
 'ord_3',
 'ord_0',
 'bin_0',
 'bin_3',
 'bin_4',
 'ord_1',
 'month',
 'day',
 'nom_0',
 'bin_1',
 'ord_4',
 'bin_2',
 'nom_3',
 'nom_1']

In [8]:
high_cardinal

['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_5']

In [9]:
# Run the benchmark on the high-cardinality columns only.
features = high_cardinal

In [10]:
# Training frame restricted to the selected feature columns.
train = train_raw[features]

In [11]:
# Baseline encoders, each instantiated with its default settings.
encoder_list = [
    make_encoder()
    for make_encoder in (
        OrdinalEncoder,
        WOEEncoder,
        TargetEncoder,
        MEstimateEncoder,
        JamesSteinEncoder,
        LeaveOneOutEncoder,
        CatBoostEncoder,
    )
]
# Encoder under test, from the local module.
# NOTE(review): n_draws is presumably the number of samples drawn per row — confirm against the module.
sampling_encoder = sampling_bayesian_encoder.SamplingBayesianEncoder(n_draws=10)

## Logistic regression

In [12]:
def score_all_encoders_lr(train, target, encoders, sampling_encoder):
    """Print the hold-out AUC of every encoder on one shared train/validation split.

    The data is split once (80/20, fixed seed) so all encoders are compared on
    the same rows. Each entry of ``encoders`` is scored with
    ``print_score_lr_encoder``; ``sampling_encoder`` is scored last with
    ``print_score_lr_sampling``.

    Parameters
    ----------
    train : feature frame to split.
    target : labels aligned with ``train``.
    encoders : iterable of encoders exposing ``fit_transform`` / ``transform``.
    sampling_encoder : encoder handed to ``print_score_lr_sampling``.
    """
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        train, target, test_size=0.2, random_state=97
    )
    split = (fit_X, holdout_X, fit_y, holdout_y)

    for baseline in encoders:
        print_score_lr_encoder(baseline, *split)
    print_score_lr_sampling(sampling_encoder, *split)


def print_score_lr_encoder(encoder, X_train, X_val, y_train, y_val):
    """Fit ``encoder`` plus a logistic regression and print the validation AUC.

    The encoder is fitted on the training rows only and then applied to the
    validation rows. The printed label is the encoder's string representation
    up to the first ``(``.

    Parameters
    ----------
    encoder : object exposing ``fit_transform(X, y)`` and ``transform(X)``.
    X_train, X_val : raw feature frames for fitting and validation.
    y_train, y_val : matching labels.
    """
    encoder_name = str(encoder).partition('(')[0]
    # Print the label first so it is visible while the fit is still running.
    print("Test {} : ".format(encoder_name), end=" ")

    encoded_train = encoder.fit_transform(X_train, y_train)
    encoded_val = encoder.transform(X_val)

    classifier = LogisticRegression(C=0.1, solver="lbfgs", max_iter=1000)
    classifier.fit(encoded_train, y_train)

    # Column 1 of predict_proba is the positive-class probability.
    positive_proba = classifier.predict_proba(encoded_val)[:, 1]
    print("score: ", auc(y_val, positive_proba))


def print_score_lr_sampling(encoder, X_train, X_val, y_train, y_val):
    """Fit the wrapped sampling encoder + logistic regression and print the validation AUC.

    The encoder and classifier are combined through
    ``sampling_bayesian_encoder.EncoderWrapper``, which is fitted on the raw
    training rows and asked for predictions on the raw validation rows.

    Parameters
    ----------
    encoder : encoder handed to ``EncoderWrapper``.
    X_train, X_val : raw feature frames for fitting and validation.
    y_train, y_val : matching labels.
    """
    encoder_name = str(encoder).partition('(')[0]
    print("Test {} : ".format(encoder_name), end=" ")

    # No max_iter is passed here, so the classifier runs with the library's
    # default iteration limit (the other encoders are scored with max_iter=1000).
    classifier = LogisticRegression(C=0.1, solver="lbfgs")
    pipeline = sampling_bayesian_encoder.EncoderWrapper(encoder, classifier)
    pipeline.fit(X_train, y_train)

    # The wrapper's output goes straight into the metric without column slicing.
    # NOTE(review): presumably a 1-D vector of positive-class scores — confirm against the module.
    holdout_scores = pipeline.predict_proba(X_val)
    print(" Sampling bayesian score: ", auc(y_val, holdout_scores))

In [None]:
# Score every baseline encoder, then the sampling encoder, on one shared 80/20 hold-out split.
score_all_encoders_lr(train, target, encoder_list, sampling_encoder)

Test OrdinalEncoder :  score:  0.5047167727113783
Test WOEEncoder :  score:  0.6511724089376556
Test TargetEncoder :  score:  0.6502783426722238
Test MEstimateEncoder :  score:  0.6503511216531279
Test JamesSteinEncoder :  score:  0.6447155893417477
Test LeaveOneOutEncoder :  score:  0.6707955636756081
Test CatBoostEncoder :  score:  0.6666014837276445
Test SamplingBayesianEncoder :  

## Logistic regression with cross validation

In [None]:
def run_cv_encoding_wraper(train, test, target, sampling_encoder, lr_params, n_splits=5):
    """Cross-validate the wrapped sampling encoder + logistic regression.

    One ``EncoderWrapper`` (encoder plus ``LogisticRegression(**lr_params)``) is
    built and re-fitted on the development rows of every fold. Out-of-fold
    predictions are collected for the training rows and the per-fold test
    predictions are averaged.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training features.
    test : pandas.DataFrame
        Raw test features, predicted once per fold.
    target : pandas.Series or array-like
        Labels aligned row-by-row with ``train``.
    sampling_encoder : encoder handed to ``sampling_bayesian_encoder.EncoderWrapper``.
    lr_params : dict
        Keyword arguments for ``LogisticRegression``.
    n_splits : int, default 5
        Number of (unshuffled) K-fold splits.

    Returns
    -------
    dict
        ``'label'`` (encoder name), ``'train'`` (out-of-fold predictions),
        ``'test'`` (fold-averaged test predictions) and ``'cv'`` (list of
        per-fold AUC scores).
    """
    label = str(sampling_encoder).split('(')[0]
    kf = KFold(n_splits=n_splits)
    lr = LogisticRegression(**lr_params)
    wrapper = sampling_bayesian_encoder.EncoderWrapper(sampling_encoder, lr)

    # KFold yields positional indices. Select the target by position as well so
    # it stays aligned with ``train.iloc`` even when ``target`` is a Series
    # whose index is not the default 0..n-1 range.
    target_at = target.iloc if hasattr(target, 'iloc') else target

    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0]))
    for i, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):
        print('Started {} fold {}/{}'.format(label, i, n_splits))
        dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
        dev_y, val_y = target_at[dev_index], target_at[val_index]
        wrapper.fit(dev_X, dev_y)
        # The wrapper's predictions are used directly as scores.
        # NOTE(review): presumably 1-D positive-class probabilities — confirm against the module.
        pred_val_y = wrapper.predict_proba(val_X)
        pred_test_y = wrapper.predict_proba(test)

        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        cv_score = auc(val_y, pred_val_y)
        cv_scores.append(cv_score)
        print(label + ' cv score {}: {}'.format(i, cv_score))

    print('{} cv scores : {}'.format(label, cv_scores))
    print('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv std score : {}'.format(label, np.std(cv_scores)))
    # Average the accumulated test predictions over the folds actually run.
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label, 'train': pred_train, 'test': pred_full_test, 'cv': cv_scores}
    return results


def run_cv_lr(train, test, target, encoder, lr_params, n_splits=5):
    """Cross-validate a logistic regression on features produced by ``encoder``.

    In every fold the encoder is re-fitted on the development rows only, then
    applied to the validation rows and to the test set. Out-of-fold predictions
    are collected for the training rows and the per-fold test predictions are
    averaged.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training features.
    test : pandas.DataFrame
        Raw test features, encoded and predicted once per fold.
    target : pandas.Series or array-like
        Labels aligned row-by-row with ``train``.
    encoder : object exposing ``fit_transform(X, y)`` and ``transform(X)``.
    lr_params : dict
        Keyword arguments for ``LogisticRegression``.
    n_splits : int, default 5
        Number of (unshuffled) K-fold splits.

    Returns
    -------
    dict
        ``'label'`` (encoder name), ``'train'`` (out-of-fold predictions),
        ``'test'`` (fold-averaged test predictions) and ``'cv'`` (list of
        per-fold AUC scores).
    """
    label = str(encoder).split('(')[0]
    kf = KFold(n_splits=n_splits)
    model = LogisticRegression(**lr_params)

    # KFold yields positional indices. Select the target by position as well so
    # it stays aligned with ``train.iloc`` even when ``target`` is a Series
    # whose index is not the default 0..n-1 range.
    target_at = target.iloc if hasattr(target, 'iloc') else target

    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0]))
    for i, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):
        print('Started {} fold {}/{}'.format(label, i, n_splits))
        dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
        dev_y, val_y = target_at[dev_index], target_at[val_index]
        # Fit the encoder on the development rows only, then apply it to the
        # validation rows, so validation labels do not influence the encoding.
        dev_X_enc = encoder.fit_transform(dev_X, dev_y)
        val_X_enc = encoder.transform(val_X)
        model.fit(dev_X_enc, dev_y)
        # Column 1 of predict_proba is the positive-class probability.
        pred_val_y = model.predict_proba(val_X_enc)[:, 1]
        test_enc = encoder.transform(test)
        pred_test_y = model.predict_proba(test_enc)[:, 1]

        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        cv_score = auc(val_y, pred_val_y)
        cv_scores.append(cv_score)
        print(label + ' cv score {}: {}'.format(i, cv_score))

    print('{} cv scores : {}'.format(label, cv_scores))
    print('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv std score : {}'.format(label, np.std(cv_scores)))
    # Average the accumulated test predictions over the folds actually run.
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label, 'train': pred_train, 'test': pred_full_test, 'cv': cv_scores}
    return results


In [None]:
# Shared logistic-regression settings for every cross-validated run.
lr_params = dict(solver='lbfgs', C=0.1)
train, test = train_raw[features], test_raw[features]

# One result dict per baseline encoder, in encoder_list order.
results = []
for encoder in encoder_list:
    results.append(run_cv_lr(train, test, target, encoder, lr_params))

# Use a fresh sampling encoder for the cross-validated run, appended last.
sampling_encoder = sampling_bayesian_encoder.SamplingBayesianEncoder(n_draws=10)
result = run_cv_encoding_wraper(train, test, target, sampling_encoder, lr_params)
results.append(result)

# Summarise: one row per encoder with the mean and spread of its fold scores.
results = pd.DataFrame(results)
results['cv_mean'] = results['cv'].map(np.mean)
results['cv_std'] = results['cv'].map(np.std)
print(results[['label', 'cv_mean', 'cv_std']].head(9))