In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import pickle

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
import sklearn
import catboost
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.pyll import scope 

HYPEROPT_ALGO = tpe.suggest

In [4]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [49]:
# DIRECTIVES
MOMENT_FEATURES = True
KNN_FEATURES = False
LOGREG_FEATURES = False
RF_FEATURES = False
ADA_FEATURES = False
NB_FEATURES = False
SVC_FEATURES = False
MLP_FEATURES = False
XGB_FEATURES = False

FT_IMP_SELECTION = False

N_CATBOOST_SEARCH = 200
N_XGBOOST_SEARCH = 5000
N_RF_SEARCH = 500

TRAIN_PATH = 'data/train.csv'
TEST_PATH = 'data/test.csv'
USERS_PATH = 'data/users.csv'

In [50]:
column_ls = ['subject_line_length',
       'last_open_day', 'last_login_day', 'last_checkout_day',
       'open_count_last_10_days', 'open_count_last_30_days',
       'open_count_last_60_days', 'login_count_last_10_days',
       'login_count_last_30_days', 'login_count_last_60_days',
       'checkout_count_last_10_days', 'checkout_count_last_30_days',
       'checkout_count_last_60_days']
users_cols = ['attr_1', 'attr_2', 'attr_3', 'age', 'domain']

agg_1_cols = ['last_open_day', 'last_login_day', 'last_checkout_day',
       'open_count_last_10_days', 'open_count_last_30_days',
       'open_count_last_60_days', 'login_count_last_10_days',
       'login_count_last_30_days', 'login_count_last_60_days',
       'checkout_count_last_10_days', 'checkout_count_last_30_days',
       'checkout_count_last_60_days']
agg_2_cols = ['weekday', 'attr_1', 'attr_2', 'attr_3', 'age', 'domain']

In [51]:
train_df = pd.read_csv(TRAIN_PATH, na_values=['Never open', 'Never checkout', 'Never login'])
test_df = pd.read_csv(TEST_PATH, na_values=['Never open', 'Never checkout', 'Never login'])
users_df = pd.read_csv(USERS_PATH)

In [52]:
train_user_df = train_df.set_index('user_id').join(users_df.set_index('user_id'))
test_user_df = test_df.set_index('user_id').join(users_df.set_index('user_id'))

In [53]:
# agg_1_cols = ['last_open_day', 'last_login_day']
# agg_2_cols = ['weekday', 'domain']

In [54]:
def preprocess(df):
    df['weekday'] = pd.to_datetime(df.grass_date).dt.weekday
#     df['dayofyear'] = pd.to_datetime(df.grass_date).dt.dayofyear
#     df['daysinmonth'] = pd.to_datetime(df.grass_date).dt.daysinmonth
#     df['is_leap_year'] = pd.to_datetime(df.grass_date).dt.is_leap_year.astype('int')
#     df['is_month_end'] = pd.to_datetime(df.grass_date).dt.is_month_end.astype('int')
#     df['is_month_start'] = pd.to_datetime(df.grass_date).dt.is_month_start.astype('int')
#     df['is_quarter_end'] = pd.to_datetime(df.grass_date).dt.is_quarter_end.astype('int')
#     df['is_quarter_start'] = pd.to_datetime(df.grass_date).dt.is_quarter_start.astype('int')
    df['month'] = pd.to_datetime(df.grass_date).dt.month
#     df['weekofyear'] = pd.to_datetime(df.grass_date).dt.weekofyear
    
    for col in column_ls+users_cols:
        na_count = pd.isnull(df[col]).sum()
        if na_count > 0:
            na_col = col+'_isnull'
            df[na_col] = pd.isnull(df[col]).astype('int')
        if col in users_cols:
            df[col] = df[col].fillna(-1)
    
    if MOMENT_FEATURES:
        for agg_1_col in agg_1_cols:
            for agg_2_col in agg_2_cols:
                for agg_op in ['mean', 'std', 'min', 'max', 'median']:
                    agg_name = agg_1_col + '__' + agg_2_col + '__' + agg_op
                    if agg_op == 'mean':
                        agg = df.groupby([agg_2_col])[agg_1_col].mean().rename(agg_name)
                        df[agg_name] = df[[agg_1_col, agg_2_col]].set_index(agg_2_col).join(agg)[agg_name].tolist()
                    elif agg_op == 'std':
                        agg = df.groupby([agg_2_col])[agg_1_col].std().rename(agg_name)
                        df[agg_name] = df[[agg_1_col, agg_2_col]].set_index(agg_2_col).join(agg)[agg_name].tolist()
                    elif agg_op == 'min':
                        agg = df.groupby([agg_2_col])[agg_1_col].min().rename(agg_name)
                        df[agg_name] = df[[agg_1_col, agg_2_col]].set_index(agg_2_col).join(agg)[agg_name].tolist()
                    elif agg_op == 'max':
                        agg = df.groupby([agg_2_col])[agg_1_col].max().rename(agg_name)
                        df[agg_name] = df[[agg_1_col, agg_2_col]].set_index(agg_2_col).join(agg)[agg_name].tolist()
                    elif agg_op == 'median':
                        agg = df.groupby([agg_2_col])[agg_1_col].median().rename(agg_name)
                        df[agg_name] = df[[agg_1_col, agg_2_col]].set_index(agg_2_col).join(agg)[agg_name].tolist()
    
    df = df.fillna(9999)
    df = df.drop(columns=['row_id', 'grass_date'])
    df = df.reset_index(drop=True)
    df = pd.get_dummies(df, columns=['domain'])
    df = df.drop(columns=['domain_other'])
    return df

In [55]:
train = preprocess(train_user_df)
test = preprocess(test_user_df)

In [56]:
train.shape

(73539, 397)

In [57]:
test.shape

(55970, 396)

# FT Engr

In [58]:
X_train_df = train[train['month'] != 9].drop(columns=['open_flag', 'month'])
y_train = train[train['month'] != 9].open_flag
X_val_df = train[train['month'] == 9].drop(columns=['open_flag', 'month'])
y_val = train[train['month'] == 9].open_flag
X_test_df = test.drop(columns='month')

In [59]:
print('X_train', X_train_df.shape)
print('y_train', y_train.shape)
print('X_val', X_val_df.shape)
print('y_val', y_val.shape)
print('X_test', X_test_df.shape)

X_train (66659, 395)
y_train (66659,)
X_val (6880, 395)
y_val (6880,)
X_test (55970, 395)


In [60]:
from sklearn.preprocessing import StandardScaler
scl = StandardScaler()
X_train = scl.fit_transform(X_train_df)
X_val = scl.transform(X_val_df)
X_test = scl.transform(X_test_df)

In [61]:
X_train.shape

(66659, 395)

In [62]:
y_train.shape

(66659,)

In [63]:
if FT_IMP_SELECTION:
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    
    ft_imp = pd.DataFrame({
        'col': X_train_df.columns,
        'imp': clf.feature_importances_
    })

    ft_sort = ft_imp.sort_values(by='imp', ascending=False).reset_index(drop=True)
    ft_sort['cumsum'] = ft_sort.imp.cumsum()
    ft = ft_sort[ft_sort['cumsum'] < 0.905].col.tolist()
    print(ft)
    
    X_train_df = train[train['month'] != 9].drop(columns=['open_flag', 'month'])[ft]
    X_val_df = train[train['month'] == 9].drop(columns=['open_flag', 'month'])[ft]
    X_test_df = test.drop(columns='month')[ft]

    scl = StandardScaler()
    X_train = scl.fit_transform(X_train_df)
    X_val = scl.transform(X_val_df)
    X_test = scl.transform(X_test_df)


In [64]:
print('X_train', X_train_df.shape)
print('y_train', y_train.shape)
print('X_val', X_val_df.shape)
print('y_val', y_val.shape)
print('X_test', X_test_df.shape)

X_train (66659, 395)
y_train (66659,)
X_val (6880, 395)
y_val (6880,)
X_test (55970, 395)


In [65]:
new_ft = dict()
val_ft = dict()
test_ft = dict()

if KNN_FEATURES:
    from sklearn.neighbors import KNeighborsClassifier
    knn_models = dict()

    for k in tqdm([5, 50, 500, 5000]):
        knn_col = 'knn' + str(k)
        knn_models[knn_col] = KNeighborsClassifier(n_neighbors=k, n_jobs=-1).fit(X_train, y_train)
    
    for k in tqdm(knn_models):
        model = knn_models[k]
        preds = model.predict(X_train)
        new_ft[k] = preds
    
    for k in tqdm(knn_models):
        model = knn_models[k]
        preds = model.predict(X_val)
        val_ft[k] = preds

    for k in tqdm(knn_models):
        model = knn_models[k]
        preds = model.predict(X_test)
        test_ft[k] = preds

In [66]:
if LOGREG_FEATURES:
    # LogisticRegression
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(max_iter=2000)
    model.fit(X_train, y_train)
    model_name = 'LogisticRegression'

    y_train_pred = model.predict(X_train)
    new_ft[model_name] = y_train_pred

    y_val_pred = model.predict(X_val)
    val_ft[model_name] = y_val_pred

    y_test_pred = model.predict(X_test)
    test_ft[model_name] = y_test_pred

In [67]:
if RF_FEATURES:
    # Random forest
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier()
    model_name = 'RandomForestClassifier'

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    new_ft[model_name] = y_train_pred

    y_val_pred = model.predict(X_val)
    val_ft[model_name] = y_val_pred

    y_test_pred = model.predict(X_test)
    test_ft[model_name] = y_test_pred

In [68]:
if ADA_FEATURES:
    # AdaBOOST
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier()
    model_name = 'AdaBoostClassifier'

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    new_ft[model_name] = y_train_pred

    y_val_pred = model.predict(X_val)
    val_ft[model_name] = y_val_pred

    y_test_pred = model.predict(X_test)
    test_ft[model_name] = y_test_pred

In [69]:
if SVC_FEATURES:
    from sklearn.svm import SVC
    model = SVC(kernel="linear")
    model_name = 'SVC'

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    new_ft[model_name] = y_train_pred

    y_val_pred = model.predict(X_val)
    val_ft[model_name] = y_val_pred

    y_test_pred = model.predict(X_test)
    test_ft[model_name] = y_test_pred

In [70]:
if NB_FEATURES:
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model_name = 'GaussianNB'

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    new_ft[model_name] = y_train_pred

    y_val_pred = model.predict(X_val)
    val_ft[model_name] = y_val_pred

    y_test_pred = model.predict(X_test)
    test_ft[model_name] = y_test_pred

In [71]:
if MLP_FEATURES:
    from sklearn.neural_network import MLPClassifier

    model = MLPClassifier(alpha=0.1, max_iter=1000, early_stopping=True, n_iter_no_change=50)
    model_name = 'MLPClassifier'

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    new_ft[model_name] = y_train_pred

    y_val_pred = model.predict(X_val)
    val_ft[model_name] = y_val_pred

    y_test_pred = model.predict(X_test)
    test_ft[model_name] = y_test_pred

In [72]:
if XGB_FEATURES:
    from xgboost import XGBClassifier

    model = XGBClassifier()
    model_name = 'XGBClassifier'

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    new_ft[model_name] = y_train_pred

    y_val_pred = model.predict(X_val)
    val_ft[model_name] = y_val_pred

    y_test_pred = model.predict(X_test)
    test_ft[model_name] = y_test_pred

In [73]:
if len(new_ft) + len(val_ft) + len(test_ft) > 0:
    for col, lst in new_ft.items():
        X_train = np.append(X_train, lst.reshape(-1,1), axis=1)

    for col, lst in val_ft.items():
        X_val = np.append(X_val, lst.reshape(-1,1), axis=1)

    for col, lst in test_ft.items():
        X_test = np.append(X_test, lst.reshape(-1,1), axis=1)

In [74]:
print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_val', X_val.shape)
print('y_val', y_val.shape)
print('X_test', X_test.shape)

X_train (66659, 395)
y_train (66659,)
X_val (6880, 395)
y_val (6880,)
X_test (55970, 395)


# SKlearn

In [75]:
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef

In [76]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=2000)

In [77]:
clf = logreg.fit(X_train, y_train)

In [78]:
y_pred = clf.predict(X_val)

In [79]:
matthews_corrcoef(y_val, y_pred)

0.47579915097071235

# CatBoost

In [151]:
from catboost import CatBoostClassifier, Pool

In [152]:
from skopt.searchcv import BayesSearchCV

from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer

In [153]:
train_data = Pool(X_train, y_train)
val_data = Pool(X_val, y_val)

In [154]:
base_model = CatBoostClassifier(task_type = "GPU", verbose=False)
base_model.fit(X_train,y_train)
preds_class = base_model.predict(val_data)
print(matthews_corrcoef(y_val, preds_class))

0.5109690977839323


In [156]:
D_sub = catboost.Pool(X_test, None)

In [157]:
D_train = catboost.Pool(X_train, y_train)
D_test = catboost.Pool(X_val, y_val)

In [158]:
space = {
    'iterations': hp.uniform('iterations', 100, 5000),
    'learning_rate': hp.loguniform('learning_rate', -2.0, 0),
    'depth': hp.quniform("depth", 3, 8, 1),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 3, 50),
    'border_count': hp.uniform ('border_count', 1, 64),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 1.0),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0.9, 1.0),
       }

def get_catboost_params(space):
    params = dict()
    params['iterations'] = int(space['iterations'])
    params['learning_rate'] = space['learning_rate']
    params['depth'] = int(space['depth'])
    params['l2_leaf_reg'] = space['l2_leaf_reg']
    params['border_count'] = int(space['border_count'])
    params['bagging_temperature'] = space['bagging_temperature']
    params['scale_pos_weight'] = space['scale_pos_weight']
    return params

In [159]:
def objective(space):
    params = get_catboost_params(space)
    model = catboost.CatBoostClassifier(iterations=params['iterations'],
                                        learning_rate=params['learning_rate'],
                                        depth=int(params['depth']),
                                        loss_function='Logloss',
                                        use_best_model=True,
                                        task_type="GPU",
                                        eval_metric='MCC',
                                        l2_leaf_reg=params['l2_leaf_reg'],
                                        early_stopping_rounds=500,
                                        border_count=params['border_count'],
                                        bagging_temperature=params['bagging_temperature'],
                                        scale_pos_weight=params['scale_pos_weight'],
                                        od_type="Iter",
                                        verbose=False
                                        )
    
    model.fit(D_train, eval_set=D_test, verbose=False)
    y_pred = model.predict_proba(D_test.get_features())
    test_loss = sklearn.metrics.log_loss(D_test.get_label(), y_pred, labels=[0, 1])
    acc = sklearn.metrics.accuracy_score(D_test.get_label(), np.argmax(y_pred, axis=1))
    auc = sklearn.metrics.roc_auc_score(D_test.get_label(), y_pred[:,1])
    mcc = matthews_corrcoef(D_test.get_label(), np.argmax(y_pred, axis=1))

    return{'loss':-mcc, 'status': STATUS_OK }

In [161]:
trials = Trials()
best_cb = hyperopt.fmin(fn=objective,
                     space=space,
                     algo=HYPEROPT_ALGO,
                     max_evals=N_CATBOOST_SEARCH,
                     trials=trials,
                     verbose=True)

100%|██████████| 200/200 [05:41<00:00,  1.71s/trial, best loss: -0.5356284078763499]


In [162]:
print(best_cb)

{'bagging_temperature': 0.7497169755340393, 'border_count': 46.22616874915195, 'depth': 3.0, 'iterations': 1376.429093384665, 'l2_leaf_reg': 18.272379993248038, 'learning_rate': 0.7063005433614764, 'scale_pos_weight': 0.9314224762928681}


In [163]:
best_cb.update({'border_count': int(best_cb['border_count'])})
best_cb.update({'iterations': int(best_cb['iterations'])})

In [165]:
model = catboost.CatBoostClassifier(loss_function='Logloss',
                                    use_best_model=True,
                                    task_type="GPU",
                                    eval_metric='MCC',
                                    early_stopping_rounds=500,
                                    od_type="Iter",
                                    verbose=2000,
                                    **best_cb
                                    )
model.fit(D_train, eval_set=D_test, verbose=2000)

0:	learn: 0.4940484	test: 0.5035394	best: 0.5035394 (0)	total: 4.15ms	remaining: 5.71s
bestTest = 0.5327667006
bestIteration = 34
Shrink model to first 35 iterations.


<catboost.core.CatBoostClassifier at 0x7f05857a85c0>

In [166]:
pred = model.predict_proba(D_test.get_features())
print("auc = ", sklearn.metrics.roc_auc_score(D_test.get_label(), pred[:,1]))
print("mcc = ", sklearn.metrics.matthews_corrcoef(D_test.get_label(), np.argmax(pred, axis=1)))
print("loss = ", sklearn.metrics.log_loss(D_test.get_label(), pred, labels=[0, 1]))
mcc_cb = sklearn.metrics.matthews_corrcoef(D_test.get_label(), np.argmax(pred, axis=1))

auc =  0.8889065114100247
mcc =  0.5356284078763499
loss =  0.26688234670453126


In [169]:
with open(f'cb_{mcc_cb}.pkl', 'wb') as f:
    pickle.dump(model, f)

In [170]:
sub =model.predict_proba(X_test)
sub_label = np.argmax(sub, axis=1)
sub_df = pd.read_csv('data/sample_submission_0_1.csv')
sub_df['open_flag'] = sub_label
sub_df.to_csv(f'sub_cb_val_{mcc_cb}.csv', index=False)

# Tuning RF

In [171]:
rf_space = {
    'max_depth': hp.uniform('max_depth', 3, 30),
    'min_samples_split': hp.uniform('min_samples_split', 2, 100),
    'max_leaf_nodes': hp.uniform('max_leaf_nodes', 5, 25),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 5, 50),
    'n_estimators': hp.uniform('n_estimators', 50, 1000),
    'max_samples': hp.uniform('max_samples', 0.0, 1.0),
    'max_features': hp.uniform('max_features', 0.0, 1.0)
       }

def get_rf_params(rf_space):
    params = dict()
    params['max_depth'] = int(rf_space['max_depth'])
    params['min_samples_split'] = int(rf_space['min_samples_split'])
    params['max_leaf_nodes'] = int(rf_space['max_leaf_nodes'])
    params['min_samples_leaf'] = int(rf_space['min_samples_leaf'])
    params['n_estimators'] = int(rf_space['n_estimators'])
    params['max_samples'] = rf_space['max_samples']
    params['max_features'] = rf_space['max_features']
    return params

In [193]:
obj_call_count = 0
cur_best_loss = np.inf

def objective(rf_space):
    params = get_rf_params(rf_space)
    model = RandomForestClassifier(max_depth=params['max_depth'],
                                    min_samples_split=params['min_samples_split'],
                                    max_leaf_nodes=params['max_leaf_nodes'],
                                    min_samples_leaf=params['min_samples_leaf'],
                                    n_estimators=params['n_estimators'],
                                    max_samples=params['max_samples'],
                                    max_features=params['max_features'],
                                    n_jobs=-1,
                                    random_state=42
                                    )
    
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)
    pred_label = np.where(y_pred > 0.5, 1, 0)
    test_loss = sklearn.metrics.log_loss(y_val, y_pred, labels=[0, 1])
    acc = sklearn.metrics.accuracy_score(y_val, np.argmax(y_pred, axis=1))
    auc = sklearn.metrics.roc_auc_score(y_val, y_pred[:,1])
    mcc = matthews_corrcoef(y_val, np.argmax(y_pred, axis=1))

    return{'loss':-mcc, 'status': STATUS_OK }

In [173]:
trials = Trials()
best_rf = hyperopt.fmin(fn=objective,
                     space=rf_space,
                     algo=HYPEROPT_ALGO,
                     max_evals=N_RF_SEARCH,
                     trials=trials,
                     verbose=True)

100%|██████████| 500/500 [16:04<00:00,  1.93s/trial, best loss: -0.5075210345604432]


In [185]:
best_rf['max_depth'] = int(best_rf['max_depth'])
best_rf['min_samples_split'] = int(best_rf['min_samples_split'])
best_rf['max_leaf_nodes'] = int(best_rf['max_leaf_nodes'])
best_rf['min_samples_leaf'] = int(best_rf['min_samples_leaf'])
best_rf['n_estimators'] = int(best_rf['n_estimators'])

In [188]:
model = RandomForestClassifier(n_jobs=-1, **best_rf)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=7, max_features=0.9331650332166556,
                       max_leaf_nodes=16, max_samples=0.20107640093697762,
                       min_samples_leaf=28, min_samples_split=41,
                       n_estimators=258, n_jobs=-1)

In [189]:
pred = model.predict_proba(X_val)
print("auc = ", sklearn.metrics.roc_auc_score(y_val, pred[:,1]))
print("mcc = ", sklearn.metrics.matthews_corrcoef(y_val, np.argmax(pred, axis=1)))
print("loss = ", sklearn.metrics.log_loss(y_val, pred, labels=[0, 1]))
mcc_rf = sklearn.metrics.matthews_corrcoef(y_val, np.argmax(pred, axis=1))

auc =  0.8818327143143945
mcc =  0.5015213252825286
loss =  0.27871095530737533


In [190]:
with open(f'rf_{mcc_rf}.pkl', 'wb') as f:
    pickle.dump(model, f)

In [191]:
sub =model.predict_proba(X_test)
sub_label = np.argmax(sub, axis=1)
sub_df = pd.read_csv('data/sample_submission_0_1.csv')
sub_df['open_flag'] = sub_label
sub_df.to_csv(f'sub_rf_val_{mcc_rf}.csv', index=False)

# Tuning XGBoost

In [80]:
from xgboost import XGBClassifier

In [81]:
def build_xgb(space):
    model = XGBClassifier(
        objective='binary:logistic',
        booster='gbtree',
        tree_method='gpu_hist',
        nthread=-1,
        n_estimators=space['n_estimators'],
        eta=space['eta'],
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        gamma=space['gamma'],
        colsample_bytree=space['colsample_bytree'],
        scale_pos_weight=space['scale_pos_weight']
    )
    return model

xgb_space = {
    'booster': hp.choice('booster', ['gbtree', 'gblinear']),
    'n_estimators': scope.int(hp.uniform('n_estimators', 100, 1000)),
    'eta': hp.loguniform('eta', -5.0, 3),
    'max_depth': scope.int(hp.uniform('max_depth', 2, 20)),
    'min_child_weight': scope.int(hp.uniform('min_child_weight', 1, 50)),
    'gamma': hp.uniform('gamma', 0.0, 50.0),
    'subsample': hp.uniform('subsample', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.0, 1.0),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0.0, 1.0),
    }

def get_xgb_params(space):
    params = dict()
    params['n_estimators'] = space['n_estimators']
    params['eta'] = space['eta']
    params['max_depth'] = space['max_depth']
    params['min_child_weight'] = space['min_child_weight']
    params['gamma'] = space['gamma']
    params['subsample'] = space['subsample']
    params['colsample_bytree'] = space['colsample_bytree']
    params['scale_pos_weight'] = space['scale_pos_weight']
    return params

In [82]:
def objective(space):
    params = get_xgb_params(space)
    model = build_xgb(params)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)
    pred_label = np.where(y_pred > 0.5, 1, 0)
    test_loss = sklearn.metrics.log_loss(y_val, y_pred, labels=[0, 1])
    acc = sklearn.metrics.accuracy_score(y_val, np.argmax(y_pred, axis=1))
    auc = sklearn.metrics.roc_auc_score(y_val, y_pred[:,1])
    mcc = sklearn.metrics.matthews_corrcoef(y_val, np.argmax(y_pred, axis=1))
    
    return{'loss':-mcc, 'status': STATUS_OK }

In [83]:
trials = Trials()
best_xgb = hyperopt.fmin(fn=objective,
                     space=xgb_space,
                     algo=HYPEROPT_ALGO,
                     max_evals=N_XGBOOST_SEARCH,
                     trials=trials,
                     verbose=True)

100%|██████████| 5000/5000 [6:06:54<00:00,  4.40s/trial, best loss: -0.5463900126078486]   


In [84]:
print(best_xgb)

{'colsample_bytree': 0.14070641610979773, 'eta': 1.1182226516769986, 'gamma': 5.336064981920235, 'max_depth': 13.011760013638906, 'min_child_weight': 19.393325052586604, 'scale_pos_weight': 0.9823427013639542, 'subsample': 0.10261415984429015}

'-0.5429062310404649'

{'booster': 1, 'colsample_bytree': 0.8117933531497206, 'eta': 0.08840198872497015, 'gamma': 0.9580402932450544, 'max_depth': 2.30386257875966, 'min_child_weight': 20.02693734064389, 'n_estimators': 968.33514077476, 'scale_pos_weight': 0.9831028495929607, 'subsample': 0.6146205973547038}


'-0.5429062310404649'

In [86]:
best_xgb['n_estimators'] = int(best_xgb['n_estimators'])

In [87]:
model = build_xgb(best_xgb)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8117933531497206,
              eta=0.08840198872497015, gamma=0.9580402932450544, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.0884019881, max_delta_step=0, max_depth=2,
              min_child_weight=20, missing=nan,
              monotone_constraints='(0,0,0,0,0,...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=968, n_jobs=-1, nthread=-1, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=0.9831028495929607, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [88]:
pred = model.predict_proba(X_val)
print("auc = ", sklearn.metrics.roc_auc_score(y_val, pred[:,1]))
print("mcc = ", sklearn.metrics.matthews_corrcoef(y_val, np.argmax(pred, axis=1)))
print("loss = ", sklearn.metrics.log_loss(y_val, pred, labels=[0, 1]))
mcc_xgb = sklearn.metrics.matthews_corrcoef(y_val, np.argmax(pred, axis=1))

auc =  0.8866876631198555
mcc =  0.5463900126078486
loss =  0.2719779053870805


In [89]:
with open(f'xgb_{mcc_xgb}.pkl', 'wb') as f:
    pickle.dump(model, f)

In [90]:
sub =model.predict_proba(X_test)
sub_label = np.argmax(sub, axis=1)
sub_df = pd.read_csv('data/sample_submission_0_1.csv')
sub_df['open_flag'] = sub_label
sub_df.to_csv(f'sub_xgb_val_{mcc_xgb}.csv', index=False)

# LightGBM

In [None]:
import lightgbm as lgb

In [None]:
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

In [None]:
baseline_params = {'boosting_type': 'gbdt',
                  'max_depth' : -1,
                  'objective': 'binary',
                  'nthread': -1, # Updated from nthread
                  'num_leaves': 64,
                  'learning_rate': 0.05,
                  'max_bin': 512,
                  'subsample_for_bin': 200,
                  'subsample': 1,
                  'subsample_freq': 1,
                  'colsample_bytree': 0.8,
                  'reg_alpha': 5,
                  'reg_lambda': 10,
                  'min_split_gain': 0.5,
                  'min_child_weight': 1,
                  'min_child_samples': 5,
                  'scale_pos_weight': 1,
                  'num_class' : 1,
                  'metric' : 'binary_error'}

In [None]:
def build_lgb(params):
    model = lgb.LGBMClassifier(boosting_type= 'gbdt',
                                objective = 'binary',
                                n_jobs = -1, # Updated from 'nthread'
                                silent = True,
                                #device='gpu',
                                max_depth = params['max_depth'],
                                max_bin = params['max_bin'],
                                subsample_for_bin = params['subsample_for_bin'],
                                subsample = params['subsample'],
                                subsample_freq = params['subsample_freq'],
                                min_split_gain = params['min_split_gain'],
                                min_child_weight = params['min_child_weight'],
                                min_child_samples = params['min_child_samples'],
                                scale_pos_weight = params['scale_pos_weight'],
                                
                                learning_rate=params['learning_rate'],
                                n_estimators=params['n_estimators'],
                                num_leaves=params['num_leaves'],
                                colsample_bytree=params['colsample_bytree'],
                                reg_alpha=params['reg_alpha'],
                                reg_lambda=params['reg_lambda']
                              )

    return model

lgb_space = {
    'max_depth': scope.int(hp.uniform('max_depth', 2, 20)),
    'max_bin': scope.int(hp.uniform('max_bin', 10, 500)),
    'subsample_for_bin': scope.int(hp.uniform('subsample_for_bin', 10, 500)),
    'subsample': hp.uniform('subsample', 0.1, 1.0),
    'subsample_freq': hp.choice('subsample_freq', [0, 1]),
    'min_split_gain': hp.uniform('min_split_gain', 0.1, 1.0),
    'min_child_weight': scope.int(hp.uniform('min_child_weight', 1, 20)),
    'min_child_samples': scope.int(hp.uniform('min_child_samples', 1, 20)),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0.1, 1.0),
    
    'learning_rate': hp.loguniform('learning_rate', -5.0, 1),
    'n_estimators': scope.int(hp.uniform('n_estimators', 10, 1000)),
    'num_leaves': scope.int(hp.uniform('num_leaves', 2, 100)),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1.0),
    'reg_alpha': hp.uniform('reg_alpha', 0.1, 10.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.1, 10.0)
    
    }

def get_lgb_params(space):
#     print(space)
    params = dict()
    params['max_depth'] = space['max_depth']
    params['max_bin'] = space['max_bin']
    params['subsample_for_bin'] = space['subsample_for_bin']
    params['subsample'] = space['subsample']
    params['subsample_freq'] = space['subsample_freq']
    params['min_split_gain'] = space['min_split_gain']
    params['min_child_weight'] = space['min_child_weight']
    params['min_child_samples'] = space['min_child_samples']
    params['scale_pos_weight'] = space['scale_pos_weight']

    params['learning_rate']=space['learning_rate']
    params['n_estimators']=space['n_estimators']
    params['num_leaves']=space['num_leaves']
    params['colsample_bytree']=space['colsample_bytree']
    params['reg_alpha']=space['reg_alpha']
    params['reg_lambda']=space['reg_lambda']
    return params

In [None]:
obj_call_count = 0
cur_best_loss = np.inf

def objective(space):
    params = get_lgb_params(space)
    model = build_lgb(params)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)
    pred_label = np.where(y_pred > 0.5, 1, 0)
    test_loss = sklearn.metrics.log_loss(y_val, y_pred, labels=[0, 1])
    acc = sklearn.metrics.accuracy_score(y_val, np.argmax(y_pred, axis=1))
    auc = sklearn.metrics.roc_auc_score(y_val, y_pred[:,1])
    mcc = matthews_corrcoef(y_val, np.argmax(y_pred, axis=1))

    return{'loss':-mcc, 'status': STATUS_OK }

In [None]:
trials = Trials()
best_lgb = hyperopt.fmin(fn=objective,
                     space=lgb_space,
                     algo=HYPEROPT_ALGO,
                     max_evals=1000,
                     trials=trials,
                     verbose=True)

In [None]:
print(best_lgb)

In [None]:
best_lgb['max_depth'] = int(best_lgb['max_depth'])
best_lgb['max_bin'] = int(best_lgb['max_bin'])
best_lgb['subsample_for_bin'] = int(best_lgb['subsample_for_bin'])
best_lgb['min_child_weight'] = int(best_lgb['min_child_weight'])
best_lgb['min_child_samples'] = int(best_lgb['min_child_samples'])
best_lgb['n_estimators']=int(best_lgb['n_estimators'])
best_lgb['num_leaves']=int(best_lgb['num_leaves'])

In [None]:
model = build_lgb(best_lgb)
model.fit(X_train, y_train)

In [None]:
pred = model.predict_proba(X_val)
print("auc = ", sklearn.metrics.roc_auc_score(y_val, pred[:,1]))
print("mcc = ", sklearn.metrics.matthews_corrcoef(y_val, np.argmax(pred, axis=1)))
print("loss = ", sklearn.metrics.log_loss(y_val, pred, labels=[0, 1]))
mcc_lgb = sklearn.metrics.matthews_corrcoef(y_val, np.argmax(pred, axis=1))

In [None]:
with open(f'lgb_{mcc_lgb}.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
sub =model.predict_proba(X_test)
sub_label = np.argmax(sub, axis=1)
sub_df = pd.read_csv('data/sample_submission_0_1.csv')
sub_df['open_flag'] = sub_label
sub_df.to_csv(f'sub_lgb_val_{mcc_lgb}.csv', index=False)

# Ensemble (incomplete)

In [199]:
model_name = 'CatBoost'

y_train_pred = model.predict(X_train)
new_ft[model_name] = y_train_pred

y_val_pred = model.predict(X_val)
val_ft[model_name] = y_val_pred

y_test_pred = model.predict(X_test)
test_ft[model_name] = y_test_pred

In [225]:
en_train = pd.DataFrame(new_ft)
en_val = pd.DataFrame(val_ft)
en_test = pd.DataFrame(test_ft)

In [226]:
en_train.corr()

Unnamed: 0,knn5,knn50,knn500,knn5000,LogisticRegression,RandomForestClassifier,AdaBoostClassifier,SVC,GaussianNB,MLPClassifier,CatBoost,XGBClassifier
knn5,1.0,0.71623,0.663867,0.551709,0.665778,0.605036,0.696128,0.629544,0.62505,0.717149,0.72709,0.722538
knn50,0.71623,1.0,0.868556,0.70952,0.844415,0.494196,0.820278,0.809794,0.702634,0.86612,0.819446,0.733129
knn500,0.663867,0.868556,1.0,0.782221,0.889732,0.453222,0.790409,0.877775,0.6754,0.826961,0.768528,0.677816
knn5000,0.551709,0.70952,0.782221,1.0,0.735098,0.374631,0.642194,0.815064,0.528501,0.671502,0.634526,0.555555
LogisticRegression,0.665778,0.844415,0.889732,0.735098,1.0,0.462473,0.814124,0.898788,0.71588,0.833682,0.770239,0.683712
RandomForestClassifier,0.605036,0.494196,0.453222,0.374631,0.462473,1.0,0.502344,0.432547,0.468817,0.506217,0.5407,0.649491
AdaBoostClassifier,0.696128,0.820278,0.790409,0.642194,0.814124,0.502344,1.0,0.758159,0.728224,0.865417,0.850873,0.752298
SVC,0.629544,0.809794,0.877775,0.815064,0.898788,0.432547,0.758159,1.0,0.647443,0.785304,0.730776,0.643281
GaussianNB,0.62505,0.702634,0.6754,0.528501,0.71588,0.468817,0.728224,0.647443,1.0,0.717257,0.680485,0.641304
MLPClassifier,0.717149,0.86612,0.826961,0.671502,0.833682,0.506217,0.865417,0.785304,0.717257,1.0,0.870604,0.766336


In [227]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf = RandomForestClassifier()
clf.fit(en_train, y_train)

RandomForestClassifier()

In [228]:
en_ft_imp = pd.DataFrame({
    'col': en_train.columns,
    'imp': clf.feature_importances_
}).sort_values('imp', ascending=False)

In [229]:
en_ft_imp

Unnamed: 0,col,imp
5,RandomForestClassifier,0.681678
11,XGBClassifier,0.118329
0,knn5,0.082807
10,CatBoost,0.038984
9,MLPClassifier,0.036688
6,AdaBoostClassifier,0.021585
8,GaussianNB,0.014709
1,knn50,0.003946
4,LogisticRegression,0.000446
2,knn500,0.00039


In [211]:
pred = clf.predict_proba(en_val)


In [221]:
print("auc = ", sklearn.metrics.roc_auc_score(y_val, pred[:,1]))
print("mcc = ", sklearn.metrics.matthews_corrcoef(y_val, np.argmax(pred, axis=1)))
print("loss = ", sklearn.metrics.log_loss(y_val, pred, labels=[0, 1]))


auc =  0.869682041758683
mcc =  0.49024669104854995
loss =  0.30488173033899657


In [195]:
# pred = en_val.mean(axis=1).tolist()

pred_label = (en_val.mean(axis=1) > 0.5).astype('int').tolist()
print("auc = ", sklearn.metrics.roc_auc_score(y_val, pred))
print("mcc = ", sklearn.metrics.matthews_corrcoef(y_val, pred_label))
print("loss = ", sklearn.metrics.log_loss(y_val, pred, labels=[0, 1]))

auc =  0.7080238238640166
mcc =  0.480781309298576
loss =  3.624581234815747


In [202]:
pred_label = en_val['CatBoost'].tolist()
print("mcc = ", sklearn.metrics.matthews_corrcoef(y_val, pred_label))


mcc =  0.5224347157193547


In [203]:
print(metrics.classification_report(y_val, pred_label))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      5860
           1       0.75      0.44      0.56      1020

    accuracy                           0.90      6880
   macro avg       0.83      0.71      0.75      6880
weighted avg       0.89      0.90      0.88      6880



# Submission

In [363]:
X_train.shape

(66659, 14)

In [364]:
X_test.shape

(55970, 14)

In [365]:
sub =model.predict_proba(X_test)
sub_label = np.argmax(sub, axis=1)
sub_df = pd.read_csv('data/sample_submission_0_1.csv')
sub_df['open_flag'] = sub_label
sub_df.to_csv(f'sub_val_{mcc}.csv', index=False)

In [366]:
len(sub_label)

55970