<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/subscriber_prediction_hackathon/notebooks/04_sklearn_spot_checking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [93]:
import os
import gc
import time
import warnings

gc.enable()
warnings.filterwarnings('ignore')

In [94]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

In [95]:
import sklearn.ensemble as ele
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB, ComplementNB
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.metrics import log_loss

In [96]:
# Single seed reused for NumPy's global RNG and for every `random_state=` below.
SEED = 2311
# NOTE(review): hash randomization is normally fixed when the interpreter starts,
# so setting PYTHONHASHSEED here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [97]:
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/subscriber_prediction_hackathon/data'

train = pd.read_csv(f'{DATA_URL}/processed/train.csv') #processed dataset from notebook 00
test = pd.read_csv(f'{DATA_URL}/processed/test.csv')
sample_sub = pd.read_csv(f'{DATA_URL}/raw/submission.csv')

In [98]:
TARGET = 'y_bool'

In [99]:
# Every column of the test frame is treated as a model feature.
features = list(test.columns)
# Hand-maintained list of the numerical columns.
num_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
# Everything that is not listed as numerical is handled as categorical.
cat_features = [f for f in features if f not in num_features]

In [100]:
# Store categorical columns as int8: the ColumnTransformers in this notebook select
# categorical vs. numerical columns purely by this dtype (dtype_include/exclude='int8').
# NOTE(review): assumes every categorical code fits in the int8 range — confirm.
train[cat_features] = train[cat_features].astype('int8')
test[cat_features] = test[cat_features].astype('int8')

In [101]:
#feature sets
original_features = ['age', 'job', 'marital', 'education', 'default', 'balance',
                     'housing', 'loan', 'contact', 'day', 'month', 'duration', 
                     'campaign', 'pdays', 'previous', 'poutcome']

cat_only_features = ['age_bins', 'job_groups', 'marital', 'education', 'default',
                     'balance_bins', 'housing', 'loan', 'contact', 'day_bins', 
                     'month_bins', 'duration_bins', 'campaign_bins', 'pdays_bins',
                     'pdays_bool', 'previous_bins', 'previous_bool', 'poutcome']

In [102]:
# Mount Google Drive (Colab only) so submission files persist across sessions.
from google.colab import drive
drive.mount('/content/drive')

# Submissions of this notebook are grouped under a per-notebook folder on Drive.
NOTEBOOK = '04'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/machinehack/subscriber_prediction_hackathon/submissions/nb_{NOTEBOOK}'
os.makedirs(SUBMISSION_PATH, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Cross-validation

In [103]:
def cross_validate_predict(feature_set, model):
    """Run 5-fold stratified cross-validation and predict on the test set.

    For each fold the model is re-fitted on the training part, scored with
    log-loss on the validation part, and used to predict the test set.
    Per-fold and average scores (with timings) are printed.

    Parameters
    ----------
    feature_set : list of str
        Column names selected from the global ``train`` and ``test`` frames.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is fitted
        in place, so after the call it holds the fit from the last fold.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold predictions (second column of ``predict_proba``), keyed
        by the row position in ``train`` and sorted by that position.
    test_preds : pd.DataFrame
        One column per fold (``fold0`` ... ``fold4``) with test-set
        predictions, plus a ``mean`` column averaging them row-wise.
    """
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores = [] #scores on validation set

    X = train[feature_set]
    y = train[TARGET]
    X_test = test[feature_set]

    cv_start = time.time()
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        fold_start = time.time()
        # cv.split yields positional indices, so both X and y must be sliced
        # with .iloc; .loc would only be correct for a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        val_preds = model.predict_proba(X_val)[:, 1]
        # Fold timing covers fitting and validation prediction only.
        fold_end = time.time()

        oof_preds.update(dict(zip(val_idx, val_preds)))
        test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]

        score = log_loss(y_val, val_preds)
        scores.append(score)
        print(f'Fold #{fold}: Score = {score:.5f} '
              f'[{fold_end - fold_start:.2f} secs]')
        _ = gc.collect()

    cv_end = time.time()
    print(f'\nAvg. Score = {np.mean(scores):.5f} +/- {np.std(scores):.5f} '
          f'[{cv_end - cv_start:.2f} secs]\n')

    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)

    return oof_preds, test_preds

In [104]:
def create_submission_files(test_preds, model_name, feature_set):
    """Write one submission CSV per column of ``test_preds``.

    Each file is a copy of ``sample_sub`` whose TARGET column is set to the
    predictions of one column, saved as
    ``{SUBMISSION_PATH}/{model_name}/{feature_set}_{column}.csv``.
    The model folder is created when it does not exist yet.
    """
    out_dir = f'{SUBMISSION_PATH}/{model_name}'
    os.makedirs(out_dir, exist_ok=True)

    for pred_col in test_preds.columns:
        submission = sample_sub.copy()
        submission[TARGET] = test_preds[pred_col]
        submission.to_csv(f'{out_dir}/{feature_set}_{pred_col}.csv', index=False)

In [105]:
oof = pd.DataFrame() #Out-of-fold predictions

# Linear

### log_loss (Logistic Regression)

In [106]:
# Shared preprocessing, reused by several later pipelines: int8 (categorical) columns
# are scaled by their max absolute value, all other columns are standardized.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal_cat', MaxAbsScaler(), make_column_selector(dtype_include='int8')),
        ('numerical', StandardScaler(), make_column_selector(dtype_exclude='int8'))
    ],
    remainder='passthrough'
)

# Logistic regression trained with SGD and an elastic-net penalty.
classifier = SGDClassifier(
    loss='log', # spelling for scikit-learn < 1.1; use 'log_loss' on >= 1.1 ('log' was deprecated in 1.1 and removed in 1.3)
    penalty='elasticnet',
    max_iter= 5000,
    alpha=0.01,
    learning_rate='optimal',
    early_stopping=True,
    n_iter_no_change=25,
    n_jobs=-1,  # NOTE(review): presumably a no-op for a binary target (used for one-vs-all) — confirm
    random_state=SEED
)

model = make_pipeline(preprocessor, classifier)

oof['sgdlog_all'], tp_sgdlog_all = cross_validate_predict(features, model)

Fold #0: Score = 0.58180 [0.40 secs]
Fold #1: Score = 0.58038 [0.42 secs]
Fold #2: Score = 0.57988 [0.41 secs]
Fold #3: Score = 0.58161 [0.41 secs]
Fold #4: Score = 0.58228 [0.42 secs]

Avg. Score = 0.58119 +/- 0.00091 [2.59 secs]



In [107]:
oof['sgdlog_og'], tp_sgdlog_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.58192 [0.30 secs]
Fold #1: Score = 0.58056 [0.32 secs]
Fold #2: Score = 0.57988 [0.32 secs]
Fold #3: Score = 0.58189 [0.34 secs]
Fold #4: Score = 0.58234 [0.32 secs]

Avg. Score = 0.58132 +/- 0.00094 [2.13 secs]



In [108]:
oof['sgdlog_cat'], tp_sgdlog_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.58176 [0.32 secs]
Fold #1: Score = 0.58143 [0.33 secs]
Fold #2: Score = 0.58082 [0.33 secs]
Fold #3: Score = 0.58146 [0.35 secs]
Fold #4: Score = 0.58166 [0.32 secs]

Avg. Score = 0.58143 +/- 0.00033 [2.17 secs]



In [109]:
# create_submission_files(tp_sgdlog_all, 'sgdlog', 'all')
# create_submission_files(tp_sgdlog_og, 'sgdlog', 'og')
# create_submission_files(tp_sgdlog_cat, 'sgdlog', 'cat')

### modified_huber loss

In [110]:
# Same SGD settings as the logistic variant, but with the modified Huber loss.
classifier = SGDClassifier(
    loss='modified_huber',
    penalty='elasticnet',
    max_iter= 5000,
    alpha=0.01,
    learning_rate='optimal',
    early_stopping=True,
    n_iter_no_change=25,
    n_jobs=-1,
    random_state=SEED
)

# Reuses the `preprocessor` ColumnTransformer defined in the logistic-regression cell.
model = make_pipeline(preprocessor, classifier)

In [111]:
oof['sgdhub_all'], tp_sgdhub_all = cross_validate_predict(features, model)

Fold #0: Score = 0.58137 [0.39 secs]
Fold #1: Score = 0.58176 [0.38 secs]
Fold #2: Score = 0.58001 [0.41 secs]
Fold #3: Score = 0.58395 [0.67 secs]
Fold #4: Score = 0.58204 [0.84 secs]

Avg. Score = 0.58183 +/- 0.00127 [3.43 secs]



In [112]:
oof['sgdhub_og'], tp_sgdhub_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.58149 [0.72 secs]
Fold #1: Score = 0.58080 [0.57 secs]
Fold #2: Score = 0.58051 [0.61 secs]
Fold #3: Score = 0.58426 [0.78 secs]
Fold #4: Score = 0.58328 [0.79 secs]

Avg. Score = 0.58207 +/- 0.00146 [4.37 secs]



In [113]:
oof['sgdhub_cat'], tp_sgdhub_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.58172 [0.69 secs]
Fold #1: Score = 0.58116 [0.60 secs]
Fold #2: Score = 0.57999 [0.70 secs]
Fold #3: Score = 0.58264 [0.78 secs]
Fold #4: Score = 0.58325 [0.75 secs]

Avg. Score = 0.58175 +/- 0.00114 [4.50 secs]



In [114]:
# create_submission_files(tp_sgdhub_all, 'sgdhub', 'all')
# create_submission_files(tp_sgdhub_og, 'sgdhub', 'og')
# create_submission_files(tp_sgdhub_cat, 'sgdhub', 'cat')

# Tree

### DecisionTree

In [115]:
# Single decision tree with default hyperparameters (only the seed is fixed),
# behind the shared `preprocessor`.
# NOTE(review): the log-loss of ~13.8 printed for this model suggests hard 0/1
# probabilities from an unconstrained tree — presumably limiting depth or leaf
# size would be needed for a meaningful log-loss comparison; confirm.
model = make_pipeline(
    preprocessor, 
    DecisionTreeClassifier(random_state=SEED)
)

In [116]:
oof['dt_all'], tp_dt_all = cross_validate_predict(features, model)

Fold #0: Score = 13.54430 [0.53 secs]
Fold #1: Score = 14.00482 [0.42 secs]
Fold #2: Score = 14.00482 [0.46 secs]
Fold #3: Score = 13.89791 [0.49 secs]
Fold #4: Score = 13.86503 [0.58 secs]

Avg. Score = 13.86338 +/- 0.16913 [3.35 secs]



In [117]:
oof['dt_og'], tp_dt_og = cross_validate_predict(original_features, model)

Fold #0: Score = 13.81568 [0.42 secs]
Fold #1: Score = 13.28937 [0.39 secs]
Fold #2: Score = 14.13640 [0.34 secs]
Fold #3: Score = 13.71699 [0.37 secs]
Fold #4: Score = 14.01304 [0.31 secs]

Avg. Score = 13.79430 +/- 0.29198 [2.53 secs]



In [118]:
oof['dt_cat'], tp_dt_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 13.94300 [0.09 secs]
Fold #1: Score = 14.48530 [0.09 secs]
Fold #2: Score = 13.68873 [0.09 secs]
Fold #3: Score = 13.79365 [0.10 secs]
Fold #4: Score = 13.82754 [0.09 secs]

Avg. Score = 13.94764 +/- 0.28081 [0.87 secs]



In [119]:
# create_submission_files(tp_dt_all, 'dt', 'all')
# create_submission_files(tp_dt_og, 'dt', 'og')
# create_submission_files(tp_dt_cat, 'dt', 'cat')

### ExtraTree

In [120]:
model = make_pipeline(
    preprocessor, 
    ExtraTreeClassifier(random_state=SEED)
)

In [121]:
oof['et_all'], tp_et_all = cross_validate_predict(features, model)

Fold #0: Score = 13.47851 [0.06 secs]
Fold #1: Score = 13.40450 [0.06 secs]
Fold #2: Score = 13.68409 [0.06 secs]
Fold #3: Score = 13.71699 [0.06 secs]
Fold #4: Score = 13.91436 [0.06 secs]

Avg. Score = 13.63969 +/- 0.18148 [0.73 secs]



In [122]:
oof['et_og'], tp_et_og = cross_validate_predict(original_features, model)

Fold #0: Score = 13.70054 [0.05 secs]
Fold #1: Score = 13.93903 [0.05 secs]
Fold #2: Score = 13.44561 [0.05 secs]
Fold #3: Score = 13.63476 [0.06 secs]
Fold #4: Score = 14.10350 [0.05 secs]

Avg. Score = 13.76469 +/- 0.23152 [0.71 secs]



In [123]:
oof['et_cat'], tp_et_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 13.57342 [0.04 secs]
Fold #1: Score = 13.70447 [0.04 secs]
Fold #2: Score = 13.82023 [0.04 secs]
Fold #3: Score = 13.31158 [0.04 secs]
Fold #4: Score = 13.65625 [0.04 secs]

Avg. Score = 13.61319 +/- 0.17065 [0.64 secs]



In [124]:
# create_submission_files(tp_et_all, 'et', 'all')
# create_submission_files(tp_et_og, 'et', 'og')
# create_submission_files(tp_et_cat, 'et', 'cat')

In [125]:
tree_mean = oof[['dt_all', 'dt_og', 'dt_cat', 'et_all', 'et_og', 'et_cat']].mean(axis=1)

In [126]:
log_loss(train[TARGET], tree_mean)

2.1939642756632245

In [127]:
tree_mode = oof[['dt_all', 'dt_og', 'dt_cat', 'et_all', 'et_og', 'et_cat']].mode(axis=1)[0]

In [128]:
log_loss(train[TARGET], tree_mode)

10.644572585354481

# Naive Bayes

### GaussianNB

In [129]:
model = make_pipeline(
    preprocessor,
    GaussianNB()
)

In [130]:
oof['gssnb_all'], tp_gssnb_all = cross_validate_predict(features, model)

Fold #0: Score = 0.71694 [0.04 secs]
Fold #1: Score = 0.64457 [0.04 secs]
Fold #2: Score = 0.67824 [0.04 secs]
Fold #3: Score = 0.74986 [0.04 secs]
Fold #4: Score = 0.72367 [0.05 secs]

Avg. Score = 0.70266 +/- 0.03700 [0.66 secs]



In [131]:
oof['gssnb_og'], tp_gssnb_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.66764 [0.04 secs]
Fold #1: Score = 0.62500 [0.04 secs]
Fold #2: Score = 0.62145 [0.03 secs]
Fold #3: Score = 0.69109 [0.03 secs]
Fold #4: Score = 0.70017 [0.03 secs]

Avg. Score = 0.66107 +/- 0.03269 [0.62 secs]



In [132]:
oof['gssnb_cat'], tp_gssnb_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.62237 [0.03 secs]
Fold #1: Score = 0.59207 [0.03 secs]
Fold #2: Score = 0.64144 [0.03 secs]
Fold #3: Score = 0.62181 [0.03 secs]
Fold #4: Score = 0.60422 [0.03 secs]

Avg. Score = 0.61638 +/- 0.01693 [0.56 secs]



In [133]:
# create_submission_files(tp_gssnb_all, 'gssnb', 'all')
# create_submission_files(tp_gssnb_og, 'gssnb', 'og')
# create_submission_files(tp_gssnb_cat, 'gssnb', 'cat')

### CategoricalNB

In [134]:
# CategoricalNB pipeline: int8 columns are ordinal-encoded, any other column is standardized.
# NOTE(review): this pipeline is only run on `cat_only_features` in this notebook;
# standardized numerical columns would presumably be rejected by CategoricalNB
# (negative / non-integer values) — confirm before using it with other feature sets.
# NOTE(review): OrdinalEncoder with default settings presumably raises on categories
# unseen in the training fold — confirm.
model = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('ordinal_cat', OrdinalEncoder(), make_column_selector(dtype_include='int8')),
            ('numerical', StandardScaler(), make_column_selector(dtype_exclude='int8'))
        ],
        remainder='passthrough'
    ),
    CategoricalNB()
)

In [135]:
oof['catnb_cat'], tp_catnb_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.58380 [0.07 secs]
Fold #1: Score = 0.58176 [0.06 secs]
Fold #2: Score = 0.57980 [0.06 secs]
Fold #3: Score = 0.58440 [0.06 secs]
Fold #4: Score = 0.58018 [0.06 secs]

Avg. Score = 0.58199 +/- 0.00186 [0.80 secs]



In [136]:
# create_submission_files(tp_catnb_cat, 'catnb', 'cat')

### ComplementNB

In [137]:
# ComplementNB pipeline: int8 columns are ordinal-encoded, other columns are min-max scaled.
# NOTE(review): ComplementNB presumably requires non-negative inputs; validation/test
# values below the training-fold minimum would be scaled below 0 by MinMaxScaler — confirm.
model = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('ordinal_cat', OrdinalEncoder(), make_column_selector(dtype_include='int8')),
            ('numerical', MinMaxScaler(), make_column_selector(dtype_exclude='int8'))
        ],
        remainder='passthrough'
    ),
    ComplementNB()
)

In [138]:
oof['cmpnb_all'], tp_cmpnb_all = cross_validate_predict(features, model)

Fold #0: Score = 0.69555 [0.08 secs]
Fold #1: Score = 0.69083 [0.09 secs]
Fold #2: Score = 0.69435 [0.07 secs]
Fold #3: Score = 0.69461 [0.08 secs]
Fold #4: Score = 0.69300 [0.07 secs]

Avg. Score = 0.69367 +/- 0.00164 [1.04 secs]



In [139]:
oof['cmpnb_og'], tp_cmpnb_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.69267 [0.05 secs]
Fold #1: Score = 0.69268 [0.06 secs]
Fold #2: Score = 0.69253 [0.06 secs]
Fold #3: Score = 0.69306 [0.05 secs]
Fold #4: Score = 0.69298 [0.05 secs]

Avg. Score = 0.69278 +/- 0.00020 [0.86 secs]



In [140]:
oof['cmpnb_cat'], tp_cmpnb_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.69469 [0.06 secs]
Fold #1: Score = 0.69105 [0.06 secs]
Fold #2: Score = 0.69419 [0.06 secs]
Fold #3: Score = 0.69372 [0.06 secs]
Fold #4: Score = 0.69269 [0.06 secs]

Avg. Score = 0.69327 +/- 0.00129 [0.89 secs]



In [141]:
# create_submission_files(tp_cmpnb_all, 'cmpnb', 'all')
# create_submission_files(tp_cmpnb_og, 'cmpnb', 'og')
# create_submission_files(tp_cmpnb_cat, 'cmpnb', 'cat')

# Neural network

In [151]:
# MLP pipeline: both the int8 (categorical) and the remaining (numerical) columns
# are scaled by their maximum absolute value before reaching the network.
model = make_pipeline(
    ColumnTransformer(
        transformers=[
            ('ordinal_cat', MaxAbsScaler(), make_column_selector(dtype_include='int8')),
            ('numerical', MaxAbsScaler(), make_column_selector(dtype_exclude='int8'))
        ],
        remainder='passthrough'
    ),
    MLPClassifier(
        learning_rate_init=0.001,
        early_stopping=True,
        n_iter_no_change=25,
        max_iter=1000,
        # Seed the network explicitly, like every other stochastic model in this
        # notebook; otherwise results depend on the global NumPy RNG state and
        # therefore on which cells were executed before this one.
        random_state=SEED
    )
)

In [152]:
oof['mlp_all'], tp_mlp_all = cross_validate_predict(features, model)

Fold #0: Score = 0.58447 [3.53 secs]
Fold #1: Score = 0.58215 [6.56 secs]
Fold #2: Score = 0.58342 [3.47 secs]
Fold #3: Score = 0.58408 [3.42 secs]
Fold #4: Score = 0.58466 [3.49 secs]

Avg. Score = 0.58376 +/- 0.00091 [21.08 secs]



In [153]:
oof['mlp_og'], tp_mlp_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.58314 [3.17 secs]
Fold #1: Score = 0.58485 [3.23 secs]
Fold #2: Score = 0.58286 [3.15 secs]
Fold #3: Score = 0.58475 [3.12 secs]
Fold #4: Score = 0.58729 [3.15 secs]

Avg. Score = 0.58458 +/- 0.00158 [16.41 secs]



In [154]:
oof['mlp_cat'], tp_mlp_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.58469 [7.88 secs]
Fold #1: Score = 0.58315 [3.20 secs]
Fold #2: Score = 0.57961 [3.23 secs]
Fold #3: Score = 0.58475 [3.23 secs]
Fold #4: Score = 0.58368 [3.34 secs]

Avg. Score = 0.58318 +/- 0.00188 [21.56 secs]



In [155]:
# create_submission_files(tp_mlp_all, 'mlp', 'all')
# create_submission_files(tp_mlp_og, 'mlp', 'og')
# create_submission_files(tp_mlp_cat, 'mlp', 'cat')

# Ensemble

### AdaBoost

In [193]:
model = ele.AdaBoostClassifier(random_state=SEED)

In [194]:
oof['ada_all'], tp_ada_all = cross_validate_predict(features, model)

Fold #0: Score = 0.68843 [1.84 secs]
Fold #1: Score = 0.68839 [1.00 secs]
Fold #2: Score = 0.68836 [1.07 secs]
Fold #3: Score = 0.68840 [1.65 secs]
Fold #4: Score = 0.68847 [1.02 secs]

Avg. Score = 0.68841 +/- 0.00004 [7.59 secs]



In [195]:
oof['ada_og'], tp_ada_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.68844 [0.81 secs]
Fold #1: Score = 0.68840 [0.82 secs]
Fold #2: Score = 0.68835 [0.83 secs]
Fold #3: Score = 0.68844 [0.81 secs]
Fold #4: Score = 0.68843 [0.80 secs]

Avg. Score = 0.68841 +/- 0.00004 [4.92 secs]



In [196]:
oof['ada_cat'], tp_ada_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.68847 [0.61 secs]
Fold #1: Score = 0.68843 [0.63 secs]
Fold #2: Score = 0.68839 [0.64 secs]
Fold #3: Score = 0.68847 [0.62 secs]
Fold #4: Score = 0.68842 [0.65 secs]

Avg. Score = 0.68844 +/- 0.00003 [3.99 secs]



In [197]:
# create_submission_files(tp_ada_all, 'ada', 'all')
# create_submission_files(tp_ada_og, 'ada', 'og')
# create_submission_files(tp_ada_cat, 'ada', 'cat')

### Bagging

In [210]:
# Bagging ensemble of 150 estimators (base estimator left at the library default);
# each one is trained on half of the samples and half of the features.
# No preprocessing pipeline is used here: the raw feature columns are passed as-is.
model = ele.BaggingClassifier(
    n_estimators=150,
    max_samples=0.5,
    max_features=0.5,
    n_jobs=-1, 
    random_state=SEED
)

In [211]:
oof['bag_all'], tp_bag_all = cross_validate_predict(features, model)

Fold #0: Score = 0.58804 [5.94 secs]
Fold #1: Score = 0.58971 [5.48 secs]
Fold #2: Score = 0.58997 [5.56 secs]
Fold #3: Score = 0.58799 [5.36 secs]
Fold #4: Score = 0.59124 [5.57 secs]

Avg. Score = 0.58939 +/- 0.00124 [32.36 secs]



In [212]:
oof['bag_og'], tp_bag_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.58534 [4.87 secs]
Fold #1: Score = 0.58888 [6.19 secs]
Fold #2: Score = 0.58947 [4.77 secs]
Fold #3: Score = 0.58938 [6.39 secs]
Fold #4: Score = 0.59042 [5.27 secs]

Avg. Score = 0.58870 +/- 0.00175 [32.29 secs]



In [213]:
oof['bag_cat'], tp_bag_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.58285 [2.74 secs]
Fold #1: Score = 0.58545 [3.74 secs]
Fold #2: Score = 0.58276 [4.15 secs]
Fold #3: Score = 0.58609 [3.58 secs]
Fold #4: Score = 0.58483 [2.30 secs]

Avg. Score = 0.58440 +/- 0.00136 [20.59 secs]



In [214]:
# create_submission_files(tp_bag_all, 'bag', 'all')
# create_submission_files(tp_bag_og, 'bag', 'og')
# create_submission_files(tp_bag_cat, 'bag', 'cat')

### ExtraTrees

In [233]:
# Extra-trees ensemble: 200 trees capped at depth 7, trained in parallel on all cores.
# No preprocessing pipeline is used here: the raw feature columns are passed as-is.
model = ele.ExtraTreesClassifier(
    n_estimators=200,
    max_depth=7,
    n_jobs=-1,
    random_state=SEED
)

In [234]:
oof['ets_all'], tp_ets_all = cross_validate_predict(features, model)

Fold #0: Score = 0.58067 [1.60 secs]
Fold #1: Score = 0.58058 [1.59 secs]
Fold #2: Score = 0.57910 [1.60 secs]
Fold #3: Score = 0.58095 [1.60 secs]
Fold #4: Score = 0.58105 [1.59 secs]

Avg. Score = 0.58047 +/- 0.00071 [9.54 secs]



In [235]:
oof['ets_og'], tp_ets_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.58159 [1.40 secs]
Fold #1: Score = 0.58110 [1.38 secs]
Fold #2: Score = 0.58060 [1.40 secs]
Fold #3: Score = 0.58170 [1.39 secs]
Fold #4: Score = 0.58217 [1.40 secs]

Avg. Score = 0.58143 +/- 0.00054 [8.43 secs]



In [236]:
oof['ets_cat'], tp_ets_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.58082 [1.49 secs]
Fold #1: Score = 0.58112 [1.50 secs]
Fold #2: Score = 0.57925 [1.49 secs]
Fold #3: Score = 0.58128 [1.49 secs]
Fold #4: Score = 0.58160 [1.49 secs]

Avg. Score = 0.58081 +/- 0.00082 [8.89 secs]



In [237]:
# create_submission_files(tp_ets_all, 'ets', 'all')
# create_submission_files(tp_ets_og, 'ets', 'og')
# create_submission_files(tp_ets_cat, 'ets', 'cat')

### GradientBoosting

In [252]:
# Gradient boosting with shallow (depth-2) trees and default number of boosting stages.
# NOTE(review): n_iter_no_change=25 presumably turns on early stopping against an
# internal validation split held out from each training fold — confirm.
model = ele.GradientBoostingClassifier(
    max_depth=2,
    n_iter_no_change=25,
    random_state=SEED
)

In [253]:
oof['gb_all'], tp_gb_all = cross_validate_predict(features, model)

Fold #0: Score = 0.58044 [2.29 secs]
Fold #1: Score = 0.58024 [0.88 secs]
Fold #2: Score = 0.57997 [1.51 secs]
Fold #3: Score = 0.58216 [1.18 secs]
Fold #4: Score = 0.58033 [0.78 secs]

Avg. Score = 0.58063 +/- 0.00078 [7.13 secs]



In [254]:
oof['gb_og'], tp_gb_og = cross_validate_predict(original_features, model)

Fold #0: Score = 0.58113 [1.37 secs]
Fold #1: Score = 0.58080 [0.67 secs]
Fold #2: Score = 0.58003 [0.87 secs]
Fold #3: Score = 0.58234 [0.96 secs]
Fold #4: Score = 0.58035 [0.54 secs]

Avg. Score = 0.58093 +/- 0.00080 [4.89 secs]



In [255]:
oof['gb_cat'], tp_gb_cat = cross_validate_predict(cat_only_features, model)

Fold #0: Score = 0.58114 [1.14 secs]
Fold #1: Score = 0.58042 [1.03 secs]
Fold #2: Score = 0.57895 [0.52 secs]
Fold #3: Score = 0.58276 [1.09 secs]
Fold #4: Score = 0.58091 [0.33 secs]

Avg. Score = 0.58083 +/- 0.00123 [4.59 secs]



In [256]:
# create_submission_files(tp_gb_all, 'gb', 'all')
# create_submission_files(tp_gb_og, 'gb', 'og')
# create_submission_files(tp_gb_cat, 'gb', 'cat')