In [None]:
!pip install bayesian-optimization
!pip install keras-tuner
!pip install catboost
!pip install vecstack

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from os import path
import pickle
import random

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from bayes_opt import BayesianOptimization

from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss

from sklearn.feature_selection import SelectPercentile

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow import keras
import kerastuner as kt

from itertools import combinations
from scipy.stats.mstats import gmean

In [None]:
# Load the training features, the competition test features and the training labels.
# NOTE(review): the "_selectp" suffix presumably means these are feature-selected
# matrices — confirm against the notebook that produced them.
# NOTE(review): absolute Google Drive paths; no drive-mount cell is visible in this chunk.
train_f = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/X_train_selectp.csv')
test_f = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/X_test_selectp.csv')
y_target = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/y_train.csv')

In [None]:
# Stratified 70/30 split of the labelled training data. X_test / y_test are a local
# hold-out used for scoring below; the competition test features live in test_f.
X_train, X_test, y_train, y_test = train_test_split(train_f, y_target.LABEL, random_state=0, stratify=y_target.LABEL, test_size=.3)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the six class labels to integer ids 0..5 (needed by the sparse
# categorical cross-entropy loss of the neural networks below).
le = LabelEncoder().fit(['F20','F30','F40','M20','M30','M40'])
y_train_le = le.transform(y_train)

# Hold back 30% of the training split for validation, stratified on the
# original string labels.
X_train2, X_val, y_train2, y_val = train_test_split(
    X_train,
    y_train_le,
    test_size=.3,
    stratify=y_train,
    random_state=0,
)

In [None]:
# Baseline: 4-fold stratified cross-validated log loss of a default logistic regression.
model = LogisticRegression(random_state=0)
# random_state only has an effect when shuffling; scikit-learn >= 0.24 raises a
# ValueError for shuffle=False combined with a random_state, so shuffle explicitly.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
# greater_is_better=False makes the scorer return the NEGATED log loss
# (closer to 0 is better); needs_proba=True feeds it predict_proba output.
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
score = cross_val_score(model, train_f, y_target.LABEL, scoring=myscore, cv=skf)
score.mean()

### Model Tuning

In [None]:
# Negated log-loss scorer (BayesianOptimization maximises, so higher = better)
# and the shuffled 4-fold splitter shared by every evaluation in this search.
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)

# Continuous search box; integer-valued hyperparameters are cast inside lgbm_opt.
pbounds = {
    'n_estimators': (50, 800),
    'learning_rate': (0.001, 1.5),
    'max_depth': (2, 32),
    'num_leaves': (2, 64),
    'subsample': (0.5, 0.95),
    'colsample_bytree': (0.5, 0.95),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 50),
    'reg_alpha': (0.001, 50),
}


def lgbm_opt(n_estimators, learning_rate, max_depth, num_leaves,
             subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective for the Bayesian search: mean CV score of one LightGBM setting.

    All arguments arrive as floats sampled from ``pbounds``. ``n_estimators``,
    ``max_depth`` and ``num_leaves`` are rounded to the nearest integer,
    ``max_bin`` is truncated, and the two sampling fractions are clipped to
    [0, 1].

    Returns the mean of the 4-fold negated log loss on the full training set.
    """
    def _clip_unit(value):
        # Clamp a sampling fraction into the closed interval [0, 1].
        return max(min(value, 1), 0)

    lgbm = LGBMClassifier(
        random_state=0,
        n_estimators=int(round(n_estimators)),
        learning_rate=learning_rate,
        max_depth=int(round(max_depth)),
        num_leaves=int(round(num_leaves)),
        subsample=_clip_unit(subsample),
        colsample_bytree=_clip_unit(colsample_bytree),
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        max_bin=int(max_bin),
        n_jobs=-1,
    )
    fold_scores = cross_val_score(lgbm, train_f, y_target.LABEL, scoring=myscore, cv=skf)
    return np.mean(fold_scores)


# 10 random probes followed by 10 guided iterations.
BO_lgbm = BayesianOptimization(lgbm_opt, pbounds, random_state=0)
BO_lgbm.maximize(init_points=10, n_iter=10)

In [None]:
# Best point found by the Bayesian search (raw floats, not yet cast to integers).
max_params_lgbm = BO_lgbm.max['params']
max_params_lgbm

In [None]:
# Best LightGBM hyperparameters, pinned as literals so the model can be refit
# without repeating the search.
max_params_lgbm = {'colsample_bytree': 0.6864183475316015,
 'learning_rate': 0.09715709702682776,
 'max_bin': 349.3113384913097,
 'max_depth': 18.998043626197255,
 'n_estimators': 249.04211820458406,
 'num_leaves': 34.44137931493538,
 'reg_alpha': 4.697931597411326,
 'reg_lambda': 28.79774883131341,
 'subsample': 0.9181832889092962}

# Cast the integer-valued hyperparameters exactly the way lgbm_opt did while
# scoring them: round-to-nearest for n_estimators / max_depth / num_leaves and
# truncation for max_bin. Plain int() would silently turn the tuned max_depth
# of 18.998 (evaluated as 19 during the search) into 18.
max_params_lgbm['n_estimators'] = int(round(max_params_lgbm['n_estimators']))
max_params_lgbm['max_depth'] = int(round(max_params_lgbm['max_depth']))
max_params_lgbm['max_bin'] = int(max_params_lgbm['max_bin'])
max_params_lgbm['num_leaves'] = int(round(max_params_lgbm['num_leaves']))

In [None]:
# Refit LightGBM with the selected hyperparameters on the 70% training split
# (string class labels).
lgbm = LGBMClassifier(random_state=0, **max_params_lgbm, n_jobs=-1)
lgbm.fit(X_train,y_train)

In [None]:
# Persist the fitted LightGBM model to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/lgbm_selectp.pickle','wb') as f:
    pickle.dump(lgbm, f)

In [None]:
# Default-parameter logistic regression fitted on the 70% training split.
lr = LogisticRegression(random_state=0)
lr.fit(X_train,y_train)

In [None]:
# Persist the fitted logistic regression to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/lr_selectp.pickle','wb') as f:
    pickle.dump(lr, f)

In [None]:
# Best point of the AdaBoost Bayesian search.
# NOTE(review): BO_ada is not defined in any cell visible here — this cell fails
# on a fresh kernel unless the AdaBoost tuning cell exists elsewhere.
max_params_ada = BO_ada.max['params']
max_params_ada

In [None]:
# n_estimators must be an integer; int() truncates.
# NOTE(review): confirm this matches the rounding used by the (not visible) AdaBoost objective.
max_params_ada['n_estimators'] = int(max_params_ada['n_estimators'])

In [None]:
# Fit AdaBoost with the tuned parameters but a hard-coded learning_rate of 1e-10.
# NOTE(review): if max_params_ada also contains 'learning_rate', this call raises
# TypeError (duplicate keyword argument) — verify the search space.
ada = AdaBoostClassifier(**max_params_ada, learning_rate=0.0000000001, random_state=0)
ada.fit(X_train,y_train)

In [None]:
# Persist the fitted AdaBoost model on Drive, next to the other models: the
# Ensemble section reloads it from exactly this path, and a relative path would
# only write to the ephemeral working directory.
with open('/content/drive/MyDrive/D&A_ML_Competition/ada_selectp.pickle','wb') as f:
    pickle.dump(ada, f)

In [None]:
# Negated log-loss scorer and the shuffled 4-fold splitter used by the search.
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
pbounds = { 'n_estimators': (50,250),
            'max_depth': (5,15),
            'max_features': (0.8,0.95),
            'min_samples_leaf': (1, 5)}

def rf_opt(n_estimators, max_depth, max_features, min_samples_leaf):
    """Objective for the Bayesian search: mean CV score of one random-forest setting.

    Arguments arrive as floats sampled from ``pbounds``; the integer-valued
    ones are rounded to the nearest integer, ``max_features`` is used as the
    fraction of features considered at each split.

    Returns the mean 4-fold negated log loss on the full training set.
    """
    params = {
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        # Previously sampled but never handed to the model, so the optimiser
        # was fitting noise along this axis while the final refit DID use it.
        'max_features' : max_features,
        'min_samples_leaf' : int(round(min_samples_leaf))
    }

    rf = RandomForestClassifier(**params, n_jobs=2, random_state=50)

    # Use the module-level shuffled splitter. The former local
    # StratifiedKFold(shuffle=False, random_state=50) shadowed it and is
    # rejected with ValueError by scikit-learn >= 0.24.
    score = cross_val_score(rf, train_f, y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)

    return np.mean(score)


# 10 random probes followed by 10 guided iterations.
BO_rf = BayesianOptimization(f = rf_opt, pbounds = pbounds, random_state=0)
BO_rf.maximize(init_points=10, n_iter=10)

In [None]:
# Best point found by the random-forest search (raw floats).
max_params_rf = BO_rf.max['params']
max_params_rf

In [None]:
# Cast with round-to-nearest, exactly as rf_opt did when it scored these values;
# truncating with int() could refit a different model than the one selected
# (e.g. a sampled max_depth of 9.8 was evaluated as 10, not 9).
max_params_rf['n_estimators'] = int(round(max_params_rf['n_estimators']))
max_params_rf['max_depth'] = int(round(max_params_rf['max_depth']))
max_params_rf['min_samples_leaf'] = int(round(max_params_rf['min_samples_leaf']))

In [None]:
# Refit the random forest with every key of the best search point
# (including max_features) on the 70% training split.
rf = RandomForestClassifier(**max_params_rf, n_jobs=-1, random_state=50)
rf.fit(X_train,y_train)

In [None]:
# Persist the fitted random forest on Drive like the other models; a relative
# path only writes to the runtime's working directory.
# NOTE(review): update any loader that still expects the relative 'rf_selectp.pickle'.
with open('/content/drive/MyDrive/D&A_ML_Competition/rf_selectp.pickle','wb') as f:
    pickle.dump(rf, f)

In [None]:
# Negated log-loss scorer and the shuffled 4-fold splitter used by the search.
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
# Keys must be real XGBClassifier parameter names: the best point is later
# unpacked straight into XGBClassifier(**params). 'colsample' is not one of
# them (the column-subsampling ratio is 'colsample_bytree'), so it was being
# ignored by the booster.
pbounds = { 'learning_rate': (0.01, 1.5),
            'n_estimators': (50, 100),
            'max_depth': (5,15),
            'subsample': (0.8,0.95),
            'colsample_bytree': (0.75,0.95),
            'gamma': (0, 5)}

def xgb_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, gamma):
    """Objective for the Bayesian search: mean CV score of one XGBoost setting.

    Arguments arrive as floats sampled from ``pbounds``; ``n_estimators`` and
    ``max_depth`` are rounded to the nearest integer, the rest are passed
    through unchanged.

    Returns the mean 4-fold negated log loss on the full training set.
    """
    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'gamma': gamma
    }

    xgb = XGBClassifier(**params, n_jobs=-1, random_state=777)

    # Use the module-level shuffled splitter. The former local
    # StratifiedKFold(shuffle=False, random_state=50) shadowed it and is
    # rejected with ValueError by scikit-learn >= 0.24.
    score = cross_val_score(xgb,train_f, y_target.LABEL, scoring=myscore, cv=skf)

    return np.mean(score)

# 10 random probes followed by 10 guided iterations.
BO_xgb = BayesianOptimization(f = xgb_opt, pbounds = pbounds, random_state=0)
BO_xgb.maximize(init_points=10, n_iter=10)

In [None]:
# Best point found by the XGBoost search (raw floats).
max_params_xgb = BO_xgb.max['params']
max_params_xgb

In [None]:
# Cast with round-to-nearest, exactly as xgb_opt did when it scored these values;
# truncating with int() could refit a different model than the one selected.
max_params_xgb['n_estimators'] = int(round(max_params_xgb['n_estimators']))
max_params_xgb['max_depth'] = int(round(max_params_xgb['max_depth']))

In [None]:
# Refit XGBoost with every key of the best search point on the 70% training split.
# NOTE(review): the keys are whatever the search space used; verify each one is a
# real XGBClassifier parameter name (column subsampling is 'colsample_bytree'),
# otherwise the booster does not apply it.
xgb = XGBClassifier(**max_params_xgb, n_jobs=-1, random_state=777)
xgb.fit(X_train,y_train)

In [None]:
# Persist the fitted XGBoost model to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/xgb_selectp.pickle','wb') as f:
    pickle.dump(xgb, f)

In [None]:
# Negated log-loss scorer and the shuffled 4-fold splitter used by the search.
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
pbounds = { 'C': (0.0000000000000001,10),
            'degree': (1,15)}

def svc_opt(C, degree):
    """Objective for the Bayesian search: mean CV score of one SVC setting.

    ``C`` is passed through; ``degree`` is rounded to the nearest integer.
    Returns the mean 4-fold negated log loss on the full training set.

    NOTE(review): ``degree`` only influences the 'poly' kernel; with SVC's
    default kernel it has no effect on the fitted model, so this axis of the
    search is currently inert.
    """
    params = {
        'C' : C,
        'degree' : int(round(degree))
    }

    # probability=True is required: the scorer calls predict_proba, which SVC
    # only provides when probability estimates are enabled (at the cost of an
    # internal cross-validated calibration, i.e. slower fits).
    svc = SVC(**params, probability=True, random_state=0)

    # Use the module-level shuffled splitter. The former local
    # StratifiedKFold(shuffle=False, random_state=50) shadowed it and is
    # rejected with ValueError by scikit-learn >= 0.24.
    score = cross_val_score(svc, train_f, y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)

    return np.mean(score)


# 10 random probes followed by 10 guided iterations.
BO_svc = BayesianOptimization(f = svc_opt, pbounds = pbounds, random_state=0)
BO_svc.maximize(init_points=10, n_iter=10)

In [None]:
# 4-fold negated log loss of a default Gaussian naive Bayes on the full training
# set; the per-fold scores are the cell output (nothing is stored).
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)
gnb = GaussianNB()
cross_val_score(gnb, train_f, y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1, verbose=1)

In [None]:
# CatBoost with default hyperparameters, fitted on the 70% training split.
catb = CatBoostClassifier(random_state=0)
catb.fit(X_train, y_train)

In [None]:
# Persist the fitted CatBoost model to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/catb_selectp.pickle','wb') as f:
    pickle.dump(catb, f)

In [None]:
# Class probabilities on the local hold-out (displayed only, not stored).
catb.predict_proba(X_test)

In [None]:
# Hold-out log loss of CatBoost (lower is better).
log_loss(y_test,catb.predict_proba(X_test))

In [None]:
# Grid search over LDA's n_components, scored by negated log loss.
# NOTE(review): n_components controls the dimensionality of LDA.transform();
# presumably it does not change predict_proba, in which case every grid point
# scores the same — confirm before relying on the "best" value.
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
pbounds = { 'n_components': [1,2,3,4,5]}
lda = LinearDiscriminantAnalysis()
# random_state is only valid together with shuffle=True; scikit-learn >= 0.24
# raises ValueError for shuffle=False with a random_state.
skf = StratifiedKFold(n_splits=4 , shuffle=True, random_state=50)
grid_cv_lda = GridSearchCV(lda, pbounds, scoring=myscore, cv=skf, n_jobs=-1)
grid_cv_lda.fit(X_train, y_train)

In [None]:
# NOTE(review): despite the name, this holds the refitted best ESTIMATOR
# (best_estimator_), not a parameter dict (that would be best_params_).
max_params_lda = grid_cv_lda.best_estimator_
max_params_lda

In [None]:
# Integer-encode the class labels and carve a 30% validation set out of the
# training split for the neural networks below.
# NOTE(review): this repeats an identical cell near the top of the notebook.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(['F20','F30','F40','M20','M30','M40'])
y_train_le = le.transform(y_train)
X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train_le, random_state=0, stratify=y_train, test_size=.3)

In [None]:
# Seed TensorFlow, NumPy and Python's RNG before this search run.
tf.random.set_seed(0)
np.random.seed(1)
random.seed(2)


def model_fn(hp):
    """Build and compile an MLP classifier from a Keras Tuner hyperparameter set.

    Search space: 2-4 hidden blocks, each a ReLU Dense layer (16-128 units in
    steps of 16) followed by Dropout (rate 0 to 0.5 in steps of 0.25), and an
    Adam learning rate chosen from {1e-2, 1e-3, 1e-4}. The head is a 6-way
    softmax trained with sparse categorical cross-entropy (integer class ids).
    """
    inputs = keras.Input(shape=(X_train2.shape[1],))
    hidden = inputs
    n_blocks = hp.Int('num_layers', 2, 4)
    for idx in range(n_blocks):
        width = hp.Int(f'unit_{idx}', 16, 128, step=16)
        hidden = keras.layers.Dense(width, activation='relu')(hidden)
        rate = hp.Float(f'dropout_{idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = keras.layers.Dropout(rate)(hidden)
    outputs = keras.layers.Dense(6, activation='softmax')(hidden)
    model = keras.Model(inputs, outputs)
    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(step_size),
    )
    return model


# Hyperband search minimising validation loss; overwrite=True discards any
# previous results stored in the 'dnn_tuning' directory.
tuner = kt.Hyperband(
    model_fn,
    objective=kt.Objective('val_loss', direction="min"),
    max_epochs=10,
    hyperband_iterations=2,
    overwrite=True,
    directory='dnn_tuning',
)

tuner.search(
    X_train2,
    y_train2,
    validation_data=(X_val, y_val),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)],
)

In [None]:
# Top-ranked model from the search above.
model = tuner.get_best_models(1)[0]
model.summary()

In [None]:
# Re-seed, rebuild a fresh (untrained) network from the best hyperparameters and
# train it for up to 500 epochs.
tf.random.set_seed(0)
np.random.seed(1)
random.seed(2)

best_hps = tuner.get_best_hyperparameters(1)[0]
model = tuner.hypermodel.build(best_hps)
# EarlyStopping stops after 10 epochs without improvement; ModelCheckpoint keeps
# only the epoch with the lowest val_loss on disk.
hist = model.fit(X_train2, y_train2, validation_data=(X_val, y_val), epochs=500,
                 callbacks=[tf.keras.callbacks.EarlyStopping(patience=10),
                    tf.keras.callbacks.ModelCheckpoint(filepath='/content/drive/MyDrive/D&A_ML_Competition/model.keras',monitor='val_loss',mode='min',save_best_only=True)])

# Visualize training history
plt.plot(hist.history["loss"], label="train")
plt.plot(hist.history["val_loss"], label="validation")
plt.legend()
plt.xlabel('epoch')
plt.title("Loss")
plt.show()

In [None]:
# Reload the checkpoint with the lowest validation loss written during training.
model = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model.keras')

In [None]:
# Hold-out log loss of the network. y_test holds the string labels; the
# probability columns follow the LabelEncoder ids used for training.
# NOTE(review): relies on log_loss ordering the labels the same way — confirm.
log_loss(y_test,model.predict(X_test))

In [None]:
# Seed TensorFlow, NumPy and Python's RNG before this search run.
tf.random.set_seed(99)
np.random.seed(9)
random.seed(16)


def model_fn(hp):
    """Build and compile an MLP classifier from a Keras Tuner hyperparameter set.

    Search space: 2-4 hidden blocks, each a ReLU Dense layer (16-128 units in
    steps of 16) followed by Dropout (rate 0 to 0.5 in steps of 0.25), and an
    Adam learning rate chosen from {1e-2, 1e-3, 1e-4}. The head is a 6-way
    softmax trained with sparse categorical cross-entropy (integer class ids).
    """
    inputs = keras.Input(shape=(X_train2.shape[1],))
    hidden = inputs
    n_blocks = hp.Int('num_layers', 2, 4)
    for idx in range(n_blocks):
        width = hp.Int(f'unit_{idx}', 16, 128, step=16)
        hidden = keras.layers.Dense(width, activation='relu')(hidden)
        rate = hp.Float(f'dropout_{idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = keras.layers.Dropout(rate)(hidden)
    outputs = keras.layers.Dense(6, activation='softmax')(hidden)
    model = keras.Model(inputs, outputs)
    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(step_size),
    )
    return model


# Hyperband search minimising validation loss; overwrite=True discards any
# previous results stored in the 'dnn_tuning' directory.
tuner = kt.Hyperband(
    model_fn,
    objective=kt.Objective('val_loss', direction="min"),
    max_epochs=10,
    hyperband_iterations=2,
    overwrite=True,
    directory='dnn_tuning',
)

tuner.search(
    X_train2,
    y_train2,
    validation_data=(X_val, y_val),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)],
)

In [None]:
# Top-ranked model from the search above.
model = tuner.get_best_models(1)[0]
model.summary()

In [None]:
# Re-seed, rebuild a fresh (untrained) network from the best hyperparameters and
# train it for up to 500 epochs.
tf.random.set_seed(99)
np.random.seed(9)
random.seed(16)

best_hps = tuner.get_best_hyperparameters(1)[0]
model = tuner.hypermodel.build(best_hps)
# EarlyStopping stops after 10 epochs without improvement; ModelCheckpoint keeps
# only the epoch with the lowest val_loss on disk.
hist = model.fit(X_train2, y_train2, validation_data=(X_val, y_val), epochs=500,
                 callbacks=[tf.keras.callbacks.EarlyStopping(patience=10),
                    tf.keras.callbacks.ModelCheckpoint(filepath='/content/drive/MyDrive/D&A_ML_Competition/model1.keras',monitor='val_loss',mode='min',save_best_only=True)])

# Visualize training history
plt.plot(hist.history["loss"], label="train")
plt.plot(hist.history["val_loss"], label="validation")
plt.legend()
plt.xlabel('epoch')
plt.title("Loss")
plt.show()

In [None]:
# Reload the checkpoint with the lowest validation loss written during training.
model = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model1.keras')

In [None]:
# Hold-out log loss of the reloaded network (lower is better).
log_loss(y_test,model.predict(X_test))

In [None]:
# Seed TensorFlow, NumPy and Python's RNG before this search run.
tf.random.set_seed(11)
np.random.seed(12)
random.seed(99)


def model_fn(hp):
    """Build and compile an MLP classifier from a Keras Tuner hyperparameter set.

    Search space: 2-4 hidden blocks, each a ReLU Dense layer (16-128 units in
    steps of 16) followed by Dropout (rate 0 to 0.5 in steps of 0.25), and an
    Adam learning rate chosen from {1e-2, 1e-3, 1e-4}. The head is a 6-way
    softmax trained with sparse categorical cross-entropy (integer class ids).
    """
    inputs = keras.Input(shape=(X_train2.shape[1],))
    hidden = inputs
    n_blocks = hp.Int('num_layers', 2, 4)
    for idx in range(n_blocks):
        width = hp.Int(f'unit_{idx}', 16, 128, step=16)
        hidden = keras.layers.Dense(width, activation='relu')(hidden)
        rate = hp.Float(f'dropout_{idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = keras.layers.Dropout(rate)(hidden)
    outputs = keras.layers.Dense(6, activation='softmax')(hidden)
    model = keras.Model(inputs, outputs)
    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(step_size),
    )
    return model


# Hyperband search minimising validation loss; overwrite=True discards any
# previous results stored in the 'dnn_tuning' directory.
tuner = kt.Hyperband(
    model_fn,
    objective=kt.Objective('val_loss', direction="min"),
    max_epochs=10,
    hyperband_iterations=2,
    overwrite=True,
    directory='dnn_tuning',
)

tuner.search(
    X_train2,
    y_train2,
    validation_data=(X_val, y_val),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)],
)

In [None]:
# Top-ranked model from the search above.
model = tuner.get_best_models(1)[0]
model.summary()

In [None]:
# Re-seed, rebuild a fresh (untrained) network from the best hyperparameters and
# train it for up to 500 epochs.
tf.random.set_seed(11)
np.random.seed(12)
random.seed(99)

best_hps = tuner.get_best_hyperparameters(1)[0]
model = tuner.hypermodel.build(best_hps)
# EarlyStopping stops after 10 epochs without improvement; ModelCheckpoint keeps
# only the epoch with the lowest val_loss on disk.
hist = model.fit(X_train2, y_train2, validation_data=(X_val, y_val), epochs=500,
                 callbacks=[tf.keras.callbacks.EarlyStopping(patience=10),
                    tf.keras.callbacks.ModelCheckpoint(filepath='/content/drive/MyDrive/D&A_ML_Competition/model2.keras',monitor='val_loss',mode='min',save_best_only=True)])

# Visualize training history
plt.plot(hist.history["loss"], label="train")
plt.plot(hist.history["val_loss"], label="validation")
plt.legend()
plt.xlabel('epoch')
plt.title("Loss")
plt.show()

In [None]:
# NOTE(review): the visible training cell checkpoints to 'model2.keras'; no visible
# cell writes 'model2_good.keras' (presumably renamed by hand on Drive), so this
# cell is not reproducible from the notebook alone.
model = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model2_good.keras')

In [None]:
# Hold-out log loss of the reloaded network (lower is better).
log_loss(y_test,model.predict(X_test))

In [None]:
# Seed TensorFlow, NumPy and Python's RNG before this search run.
tf.random.set_seed(990916)
np.random.seed(991112)
random.seed(210526)


def model_fn(hp):
    """Build and compile an MLP classifier from a Keras Tuner hyperparameter set.

    Search space: 2-4 hidden blocks, each a ReLU Dense layer (16-128 units in
    steps of 16) followed by Dropout (rate 0 to 0.5 in steps of 0.25), and an
    Adam learning rate chosen from {1e-2, 1e-3, 1e-4}. The head is a 6-way
    softmax trained with sparse categorical cross-entropy (integer class ids).
    """
    inputs = keras.Input(shape=(X_train2.shape[1],))
    hidden = inputs
    n_blocks = hp.Int('num_layers', 2, 4)
    for idx in range(n_blocks):
        width = hp.Int(f'unit_{idx}', 16, 128, step=16)
        hidden = keras.layers.Dense(width, activation='relu')(hidden)
        rate = hp.Float(f'dropout_{idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = keras.layers.Dropout(rate)(hidden)
    outputs = keras.layers.Dense(6, activation='softmax')(hidden)
    model = keras.Model(inputs, outputs)
    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(step_size),
    )
    return model


# Hyperband search minimising validation loss; overwrite=True discards any
# previous results stored in the 'dnn_tuning' directory.
tuner = kt.Hyperband(
    model_fn,
    objective=kt.Objective('val_loss', direction="min"),
    max_epochs=10,
    hyperband_iterations=2,
    overwrite=True,
    directory='dnn_tuning',
)

tuner.search(
    X_train2,
    y_train2,
    validation_data=(X_val, y_val),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)],
)

In [None]:
# Top-ranked model from the search above.
model = tuner.get_best_models(1)[0]
model.summary()

In [None]:
# Save the tuner's best model as-is (no separate long retraining for this run).
model.save('/content/drive/MyDrive/D&A_ML_Competition/model3.keras')

In [None]:
# Hold-out log loss of this network (lower is better).
log_loss(y_test,model.predict(X_test))

In [None]:
# Seed TensorFlow, NumPy and Python's RNG before this search run.
tf.random.set_seed(21)
np.random.seed(20)
random.seed(55)


def model_fn(hp):
    """Build and compile an MLP classifier from a Keras Tuner hyperparameter set.

    Search space: 2-4 hidden blocks, each a ReLU Dense layer (16-128 units in
    steps of 16) followed by Dropout (rate 0 to 0.5 in steps of 0.25), and an
    Adam learning rate chosen from {1e-2, 1e-3, 1e-4}. The head is a 6-way
    softmax trained with sparse categorical cross-entropy (integer class ids).
    """
    inputs = keras.Input(shape=(X_train2.shape[1],))
    hidden = inputs
    n_blocks = hp.Int('num_layers', 2, 4)
    for idx in range(n_blocks):
        width = hp.Int(f'unit_{idx}', 16, 128, step=16)
        hidden = keras.layers.Dense(width, activation='relu')(hidden)
        rate = hp.Float(f'dropout_{idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = keras.layers.Dropout(rate)(hidden)
    outputs = keras.layers.Dense(6, activation='softmax')(hidden)
    model = keras.Model(inputs, outputs)
    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(step_size),
    )
    return model


# Hyperband search minimising validation loss; overwrite=True discards any
# previous results stored in the 'dnn_tuning' directory.
tuner = kt.Hyperband(
    model_fn,
    objective=kt.Objective('val_loss', direction="min"),
    max_epochs=10,
    hyperband_iterations=2,
    overwrite=True,
    directory='dnn_tuning',
)

tuner.search(
    X_train2,
    y_train2,
    validation_data=(X_val, y_val),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)],
)

In [None]:
# Top-ranked model from the search above.
model = tuner.get_best_models(1)[0]
model.summary()

In [None]:
# Save the tuner's best model as-is (no separate long retraining for this run).
model.save('/content/drive/MyDrive/D&A_ML_Competition/model4.keras')

In [None]:
# Hold-out log loss of this network (lower is better).
log_loss(y_test,model.predict(X_test))

In [None]:
# Seed TensorFlow, NumPy and Python's RNG before this search run.
tf.random.set_seed(160616)
np.random.seed(210526)
random.seed(190813)


def model_fn(hp):
    """Build and compile an MLP classifier from a Keras Tuner hyperparameter set.

    Search space: 2-4 hidden blocks, each a ReLU Dense layer (16-128 units in
    steps of 16) followed by Dropout (rate 0 to 0.5 in steps of 0.25), and an
    Adam learning rate chosen from {1e-2, 1e-3, 1e-4}. The head is a 6-way
    softmax trained with sparse categorical cross-entropy (integer class ids).
    """
    inputs = keras.Input(shape=(X_train2.shape[1],))
    hidden = inputs
    n_blocks = hp.Int('num_layers', 2, 4)
    for idx in range(n_blocks):
        width = hp.Int(f'unit_{idx}', 16, 128, step=16)
        hidden = keras.layers.Dense(width, activation='relu')(hidden)
        rate = hp.Float(f'dropout_{idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = keras.layers.Dropout(rate)(hidden)
    outputs = keras.layers.Dense(6, activation='softmax')(hidden)
    model = keras.Model(inputs, outputs)
    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(step_size),
    )
    return model


# Hyperband search minimising validation loss; overwrite=True discards any
# previous results stored in the 'dnn_tuning' directory.
tuner = kt.Hyperband(
    model_fn,
    objective=kt.Objective('val_loss', direction="min"),
    max_epochs=10,
    hyperband_iterations=2,
    overwrite=True,
    directory='dnn_tuning',
)

tuner.search(
    X_train2,
    y_train2,
    validation_data=(X_val, y_val),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)],
)

In [None]:
# Top-ranked model from the search above.
model = tuner.get_best_models(1)[0]
model.summary()

In [None]:
# Save the tuner's best model as-is (no separate long retraining for this run).
model.save('/content/drive/MyDrive/D&A_ML_Competition/model5.keras')

In [None]:
# Hold-out log loss of this network (lower is better).
log_loss(y_test,model.predict(X_test))

In [None]:
# Seed TensorFlow, NumPy and Python's RNG before this search run.
tf.random.set_seed(23503)
np.random.seed(2138056)
random.seed(21867)


def model_fn(hp):
    """Build and compile an MLP classifier from a Keras Tuner hyperparameter set.

    Search space: 2-4 hidden blocks, each a ReLU Dense layer (16-128 units in
    steps of 16) followed by Dropout (rate 0 to 0.5 in steps of 0.25), and an
    Adam learning rate chosen from {1e-2, 1e-3, 1e-4}. The head is a 6-way
    softmax trained with sparse categorical cross-entropy (integer class ids).
    """
    inputs = keras.Input(shape=(X_train2.shape[1],))
    hidden = inputs
    n_blocks = hp.Int('num_layers', 2, 4)
    for idx in range(n_blocks):
        width = hp.Int(f'unit_{idx}', 16, 128, step=16)
        hidden = keras.layers.Dense(width, activation='relu')(hidden)
        rate = hp.Float(f'dropout_{idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = keras.layers.Dropout(rate)(hidden)
    outputs = keras.layers.Dense(6, activation='softmax')(hidden)
    model = keras.Model(inputs, outputs)
    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(step_size),
    )
    return model


# Hyperband search minimising validation loss; overwrite=True discards any
# previous results stored in the 'dnn_tuning' directory.
tuner = kt.Hyperband(
    model_fn,
    objective=kt.Objective('val_loss', direction="min"),
    max_epochs=10,
    hyperband_iterations=2,
    overwrite=True,
    directory='dnn_tuning',
)

tuner.search(
    X_train2,
    y_train2,
    validation_data=(X_val, y_val),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=1)],
)

In [None]:
# Top-ranked model from the search above.
model = tuner.get_best_models(1)[0]
model.summary()

In [None]:
# Save the tuner's best model as-is (no separate long retraining for this run).
model.save('/content/drive/MyDrive/D&A_ML_Competition/model6.keras')

In [None]:
# Hold-out log loss of this network (lower is better).
log_loss(y_test,model.predict(X_test))

In [None]:
# Alias for whichever network `model` currently refers to (same object, not a copy).
dnn = model

### Real DNN

In [None]:
def model_fn(hp):
    """Build and compile an MLP classifier from a Keras Tuner hyperparameter set.

    Wider search space than the earlier variants: 2-4 hidden blocks of
    Dense (16-256 units in steps of 16) + Dropout (rate 0 to 0.5 in steps of
    0.25), one activation shared by all hidden layers ('relu', 'elu' or
    'selu'), and the optimizer chosen by name ('adam', 'nadam' or 'rmsprop').
    The head is a 6-way softmax trained with sparse categorical cross-entropy.
    """
    net_in = keras.Input(shape=(X_train2.shape[1],))
    hidden = net_in
    depth = hp.Int('num_layers', 2, 4)
    for idx in range(depth):
        width = hp.Int(f'unit_{idx}', 16, 256, step=16)
        # Same hyperparameter name on every iteration -> a single shared choice.
        act = hp.Choice('act', ['relu','elu','selu'])
        hidden = keras.layers.Dense(width, activation=act)(hidden)
        rate = hp.Float(f'dropout_{idx}', 0, 0.5, step=0.25, default=0.5)
        hidden = keras.layers.Dropout(rate)(hidden)
    net_out = keras.layers.Dense(6, activation='softmax')(hidden)
    model = keras.Model(net_in, net_out)
    optimizer_name = hp.Choice('optimizer', ['adam','nadam', 'rmsprop'])
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer_name)
    return model

In [None]:
# Run N independent Hyperband searches that differ only in their RNG seeds and
# save the best model of each run to Drive as dnn_models/model{i}.keras.
N = 10
preds = []

for i in tqdm(range(N)):
    # Per-iteration seeds for TensorFlow, NumPy and Python's RNG.
    tf.random.set_seed(i*3)
    np.random.seed(i*7+5)
    random.seed(2**i)
    # overwrite=True: every iteration starts a fresh search in the same directory.
    tuner = kt.Hyperband(model_fn,
                     objective=kt.Objective('val_loss', direction="min"), 
                     max_epochs=10,
                     hyperband_iterations=2,
                     overwrite=True,
                     directory='dnn_tuning')
    tuner.search(X_train2, y_train2, validation_data=(X_val, y_val), 
             callbacks=[tf.keras.callbacks.EarlyStopping(patience=2)], verbose=0)
    model = tuner.get_best_models(1)[0]
    # NOTE(review): flatten() collapses the (samples, 6) probability matrix into
    # 1-D, so per-class columns must be recovered with reshape(-1, 6) before use.
    preds.append(model.predict(X_test).flatten())
    # NOTE(review): assumes the dnn_models/ directory already exists on Drive.
    model.save(f'/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model{i}.keras')

In [None]:
# Reload the competition test features (same file as loaded at the top of the notebook).
test_f = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/X_test_selectp.csv')

In [None]:
# Load 17 saved networks: dnn0-dnn9 are model0..model9 from the seed sweep,
# dnn10-dnn16 are separately named files.
# NOTE(review): the model_2248 / model1_2265 / ... files are not written by any
# visible cell — presumably the earlier model*.keras files copied/renamed by hand
# (the numeric suffix looks like a score); confirm their provenance.
dnn0 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model0.keras')
dnn1 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model1.keras')
dnn2 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model2.keras')
dnn3 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model3.keras')
dnn4 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model4.keras')
dnn5 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model5.keras')
dnn6 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model6.keras')
dnn7 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model7.keras')
dnn8 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model8.keras')
dnn9 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model9.keras')
dnn10 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model_2248.keras')
dnn11 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model1_2265.keras')
dnn12= keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model2_2190.keras')
dnn13 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model3_2189.keras')
dnn14 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model4_2191.keras')
dnn15 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model5_2186.keras')
dnn16 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/model6_2180.keras')

In [None]:
# Class-probability predictions of all 17 loaded networks (dnn0 .. dnn16) on the
# local hold-out, stored as the module-level variables dnn0_val .. dnn16_val.
for _k in range(17):
    globals()[f'dnn{_k}_val'] = globals()[f'dnn{_k}'].predict(X_test)

In [None]:
# Build a per-model summary table: mean pairwise correlation of the predicted
# probabilities (averaged over the 6 classes) and hold-out log loss.
# Count the consecutively numbered dnn{j}_val arrays instead of hard-coding the
# model count: the previous range(16) silently left dnn16 out of the comparison.
n_dnns = 0
while f'dnn{n_dnns}_val' in globals():
    n_dnns += 1
dnn_names = [f'dnn{j}' for j in range(n_dnns)]
dnn_vals = [globals()[f'{name}_val'] for name in dnn_names]

proba_corrs = []
for i in range(6):
    # One column per model holding its predicted probability for class i.
    preds = pd.DataFrame(np.column_stack([val[:, i] for val in dnn_vals]), columns=dnn_names)
    # Mean correlation of each model with all models (itself included).
    proba_corrs.append(preds.corr().mean())

dnns = pd.DataFrame(
    {'corr': np.mean(proba_corrs, axis=0),
     'score': [log_loss(y_test, val) for val in dnn_vals]},
    index=dnn_names,
)

In [None]:
# Scatter of mean correlation vs. log loss, one labelled point per model.
g = sns.scatterplot(x="corr", y="score", data=dnns, s=40, color='red')
# Iterate rows by label: integer [] lookups on a string-indexed Series rely on
# pandas' deprecated positional fallback.
for name, row in dnns.iterrows():
     g.text(row['corr'], row['score'],
            name, horizontalalignment='left',
            size='medium', color='black', weight='semibold')
plt.show()

In [None]:
# Print the hold-out log loss of every dnn{i}_val in order. Walk the
# consecutively numbered arrays rather than a hard-coded range(16), which
# skipped the 17th model (dnn16).
i = 0
while f'dnn{i}_val' in globals():
    print(log_loss(y_test, globals()[f'dnn{i}_val']))
    i += 1

In [None]:
# Exhaustive search for the best averaging ensemble among the candidate networks,
# scored by hold-out log loss.
proba_list = [('dnn6',dnn6_val), ('dnn4',dnn4_val), ('dnn7',dnn7_val), ('dnn0',dnn0_val), ('dnn12',dnn12_val), ('dnn13',dnn13_val), ('dnn14',dnn14_val), ('dnn2',dnn2_val), ('dnn5',dnn5_val), ('dnn3',dnn3_val), ('dnn1',dnn1_val), ('dnn10',dnn10_val), ('dnn11',dnn11_val)]
min_score = 100
# p selects the averaging rule: 0 -> geometric mean, otherwise the power mean
# with exponent p (p == 1 is the plain arithmetic mean).
for p in tqdm([0, 1]):
    # Subset sizes 2 .. len(proba_list) inclusive; the previous upper bound of
    # len(proba_list) never evaluated the blend of ALL candidates.
    for i in range(2, len(proba_list) + 1):
        for comb_ in combinations(proba_list,i):
            comb = [proba for n, proba in comb_]
            if p == 0:
                preds_mean = gmean(list(comb), axis=0)
            else:
                preds_mean = (np.sum(np.array(comb)**p, axis=0)/len(comb))**(1/p)
            score = log_loss(y_test, preds_mean)
            if score < min_score:
                best_avg_ensemble = (p, [n for n, proba in comb_], score)
                min_score = score

p, models, score = best_avg_ensemble
print('\np={}\n{}\n{}'.format(p, '&'.join(best_avg_ensemble[1]), score))

In [None]:
# Predictions on the competition test features for the selected sweep models
# (dnn2 and dnn5 are intentionally absent here: no dnn2_pred / dnn5_pred exists).
dnn0_pred = dnn0.predict(test_f)
dnn1_pred = dnn1.predict(test_f)
dnn3_pred = dnn3.predict(test_f)
dnn4_pred = dnn4.predict(test_f)
dnn6_pred = dnn6.predict(test_f)
dnn7_pred = dnn7.predict(test_f)

In [None]:
# Predictions on the competition test features for three of the separately named models.
dnn12_pred = dnn12.predict(test_f)
dnn13_pred = dnn13.predict(test_f)
dnn14_pred = dnn14.predict(test_f)

In [None]:
# Cache the test-set predictions computed above (dnn0, 1, 3, 4, 6) on Drive.
# - Only these indices have a dnn{i}_pred variable; range(7) raised KeyError
#   on the missing dnn2_pred.
# - np.save appends '.npy' to names that lack it, so the former '..._pred.csv'
#   target produced '..._pred.csv.npy', which the reload cell (expecting
#   '..._pred.npy') could not find.
for i in (0, 1, 3, 4, 6):
    np.save(f'/content/drive/MyDrive/D&A_ML_Competition/dnn_models/dnn{i}_pred.npy', globals()[f'dnn{i}_pred'])

In [None]:
# Cache dnn7's test-set predictions as a .npy file on Drive.
np.save('/content/drive/MyDrive/D&A_ML_Competition/dnn_models/dnn7_pred.npy', dnn7_pred)

In [None]:
# Reload the cached test-set predictions into dnn{i}_pred variables.
# Only dnn0, 1, 3, 4, 6 and 7 were ever predicted and saved; range(8) also asked
# for dnn2 / dnn5 files that were never written.
for i in (0, 1, 3, 4, 6, 7):
    pred_file = f'/content/drive/MyDrive/D&A_ML_Competition/dnn_models/dnn{i}_pred.npy'
    # Files saved under a '..._pred.csv' name end up as '..._pred.csv.npy'
    # (np.save appends the extension); fall back to that legacy name if needed.
    legacy_file = f'/content/drive/MyDrive/D&A_ML_Competition/dnn_models/dnn{i}_pred.csv.npy'
    if not path.exists(pred_file) and path.exists(legacy_file):
        pred_file = legacy_file
    globals()[f'dnn{i}_pred'] = np.load(pred_file)

In [None]:
test = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv', encoding='UTF-8')

# Predict on the test data: geometric mean of the selected networks'
# class probabilities.
clfs_predict = [dnn0_pred,dnn1_pred,dnn3_pred,dnn4_pred,dnn6_pred,dnn7_pred,dnn12_pred,dnn13_pred,dnn14_pred]
pred = pd.DataFrame(gmean(clfs_predict, axis=0))

# Tidy the result and export it: unique client ids in order of first
# appearance, followed by one probability column per class.
# NOTE(review): assumes the prediction rows follow that same client order — confirm.
result = pd.concat(
    [test.CLNT_ID.drop_duplicates().reset_index(drop=True), pred],
    axis=1,
)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_gmean_dnns_yj_211121_2.csv',index=False)

In [None]:
# Load the five pickled models per algorithm (files ..._oof0 .. ..._oof4) and
# predict class probabilities on the competition test features with each.
# The file names must interpolate the loop index: the former literal {0}
# loaded the fold-0 model five times.
# NOTE(review): pickle.load executes arbitrary code — only load files you created.
for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/lr_selectp_oof{i}.pickle','rb') as f:
        globals()[f'lr{i}'] = pickle.load(f)
lrs = [globals()[f'lr{i}'] for i in range(5)]
# Predict with the logistic-regression models themselves (this used to iterate
# `catbs`, which is only defined further down).
lr_probas = [model.predict_proba(test_f) for model in lrs]

for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/lgbm_selectp_oof{i}.pickle','rb') as f:
        globals()[f'lgbm{i}'] = pickle.load(f)
lgbms = [globals()[f'lgbm{i}'] for i in range(5)]
lgbm_probas = [model.predict_proba(test_f) for model in lgbms]

for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/catb_selectp_oof{i}.pickle','rb') as f:
        globals()[f'catb{i}'] = pickle.load(f)
catbs = [globals()[f'catb{i}'] for i in range(5)]
catb_probas = [model.predict_proba(test_f) for model in catbs]

In [None]:
# Collapse each model family into one probability matrix with a geometric mean:
# the nine selected networks, and the five fold models of each classical algorithm.
# NOTE(review): geometric means of probability rows are not renormalised here, so
# rows need not sum to exactly 1.
clfs_predict = [dnn0_pred,dnn1_pred,dnn3_pred,dnn4_pred,dnn6_pred,dnn7_pred,dnn12_pred,dnn13_pred,dnn14_pred]
dnns_pred = gmean(clfs_predict, axis=0)

lr_pred = gmean(lr_probas, axis=0)
lgbm_pred = gmean(lgbm_probas, axis=0)
catb_pred = gmean(catb_probas, axis=0)

In [None]:
# Equal-weight geometric mean of the LR, LightGBM, CatBoost and DNN blends.
# `test` (the raw L.POINT test table) is expected to be loaded by an earlier cell.

# Predict on the test data.
clfs_predict = [lr_pred, lgbm_pred, catb_pred, dnns_pred]
blended = gmean(clfs_predict, axis=0)
pred = pd.DataFrame(blended)

# Clean up the result and export it.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oofgmean_gmean_lrlgbmcatbdnns_yj_211122.csv',index=False)

In [None]:
# Two-level blend: first LightGBM with CatBoost, then that blend with the DNN blend.

# Predict on the test data.
clfs_predict = [lgbm_pred, catb_pred]
tree_blend = gmean(clfs_predict, axis=0)
pred = pd.DataFrame(gmean([tree_blend, dnns_pred], axis=0))

# Clean up the result and export it.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oofgmean_gmean_lgbmcatb_dnns_yj_211122.csv',index=False)

In [None]:
# Weighted geometric mean: LightGBM and CatBoost are listed twice, the DNN blend once (weights 2:2:1).

# Predict on the test data.
clfs_predict = [lgbm_pred, catb_pred, lgbm_pred, catb_pred, dnns_pred]
blended = gmean(clfs_predict, axis=0)
pred = pd.DataFrame(blended)

# Clean up the result and export it.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oofgmean_gmean_lgbm2catb2dnns1_yj_211122.csv',index=False)

In [None]:
# Weighted geometric mean: the DNN blend is listed twice, LightGBM and CatBoost once each (weights 1:1:2).

# Predict on the test data.
clfs_predict = [lgbm_pred, catb_pred, dnns_pred, dnns_pred]
blended = gmean(clfs_predict, axis=0)
pred = pd.DataFrame(blended)

# Clean up the result and export it.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oofgmean_gmean_lgbm1catb1dnns2_yj_211122.csv',index=False)

### Ensemble

In [None]:
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Restore the tuned base models that were pickled to Drive.
ada = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/ada_selectp.pickle')
lgbm = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/lgbm_selectp.pickle')
lr = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/lr_selectp.pickle')
rf = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/rf_selectp.pickle')
xgb = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/xgb_selectp.pickle')
catb = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/catb_selectp.pickle')

In [None]:
# Score every tuned model on the hold-out split.
# Each clfs_tuned entry is (model name, estimator, hold-out log loss, predicted probabilities).
models = [lr, rf, xgb, lgbm, catb]
clfs_tuned = []
for x in tqdm(models):
    # (a stray trailing `f` on this line used to make the whole cell a SyntaxError)
    proba = x.predict_proba(X_test)
    clfs_tuned.append((type(x).__name__, x, log_loss(y_test, proba), proba))

# The Keras model returns probabilities from predict(); run it once and reuse the result.
dnn_proba = dnn.predict(X_test)
clfs_tuned.append(('dnn', dnn, log_loss(y_test, dnn_proba), dnn_proba))

In [None]:
# For every class, build the correlation matrix between the models' predicted probabilities.
# The class count is taken from the prediction arrays so no class is left out
# (the loop used to stop at 5 although the problem has 6 classes).
n_classes = clfs_tuned[0][3].shape[1]
ensemble_results = []
for i in tqdm(range(n_classes)):
    # One Series per model, labelled "<model name> \n(<log loss>)" for the plots.
    class_preds = [
        pd.Series(proba[:,i], name=f'{name} \n({clf_score:.4f})')
        for name, clf, clf_score, proba in clfs_tuned
    ]
    globals()[f'pred_results_{i}'] = class_preds
    globals()[f'ensemble_results_{i}'] = pd.concat(class_preds, axis=1).corr()
    ensemble_results.append(globals()[f'ensemble_results_{i}'])

In [None]:
# Average the per-class model-vs-model correlation matrices over ALL classes and plot the result.
# The class count comes from the prediction arrays (the loop used to cover only 5 of the 6 classes).
n_classes = clfs_tuned[0][3].shape[1]
class_corrs = []
for i in range(n_classes):
    # One Series per model, labelled "<model name> \n(<log loss>)".
    class_preds = [
        pd.Series(proba[:,i], name=f'{name} \n({clf_score:.4f})')
        for name, clf, clf_score, proba in clfs_tuned
    ]
    globals()[f'pred_results_{i}'] = class_preds
    globals()[f'ensemble_results_{i}'] = pd.concat(class_preds, axis=1).corr()
    class_corrs.append(globals()[f'ensemble_results_{i}'])
ensemble_results = sum(class_corrs) / n_classes

# Plot a heatmap to inspect the correlation between the models' predictions.
plt.figure(figsize = (8,6))
g = sns.heatmap(ensemble_results, annot=True, cmap='Blues')
g.set_title("Correlation between models")
plt.show()

In [None]:
# Scatter each model's mean agreement with the other models against its own hold-out log loss.
#
# ensemble_results is already the class-averaged correlation matrix (it is what the heatmap shows),
# so it is used directly instead of being correlated a second time. The diagonal (self-correlation = 1)
# is removed before averaging over the remaining models.
n_models = ensemble_results.shape[0]
corr = (ensemble_results.sum() - 1) / (n_models - 1)
names = corr.index

# Every label has the form "<model name> \n(<score>)"; recover the score from the trailing parentheses.
# The score is the log loss stored in clfs_tuned, not an AUC.
losses = np.array(names.str.extract(r'\(([^()]+)\)\s*$', expand=False)).astype(float)
df = pd.DataFrame({'model': names, 'logloss': losses, 'cor': corr})

plt.figure(figsize=(8,6))
g = sns.scatterplot(x="cor", y="logloss", data=df, s=40, color='red')
# Iterate rows positionally; df is indexed by model label, so integer label lookups are not reliable.
for row in df.itertuples(index=False):
    g.text(row.cor+0.003, row.logloss-0.003,
           row.model, horizontalalignment='left',
           size='medium', color='black', weight='semibold')

plt.xlim((df.cor.min()-0.01,df.cor.max()+0.01))
plt.ylim((df.logloss.min()-0.01,df.logloss.max()+0.01))
plt.xlabel('Mean Agreement')
plt.ylabel('Log loss')
plt.grid()
plt.show()

In [None]:
# Exhaustive search for the best averaging ensemble on the hold-out split:
# every subset of at least two models is blended with a power mean of exponent p
# (p == 0 means geometric mean) and scored by log loss.
proba_list = [(name, proba) for name, clf, score, proba in clfs_tuned]
min_score = 100
for p in tqdm([0, 1, 2.56]):
    # +1 so that the subset containing every model is evaluated too (it used to be skipped).
    for i in range(2, len(proba_list) + 1):
        for comb_ in combinations(proba_list,i):
            comb = [proba for n, proba in comb_]
            if p == 0:
                preds_mean = gmean(comb, axis=0)
            else:
                preds_mean = (np.sum(np.array(comb)**p, axis=0)/len(comb))**(1/p)
            score = log_loss(y_test, preds_mean)
            if score < min_score:
                best_avg_ensemble = (p, [n for n, proba in comb_], score)
                min_score = score

# NOTE: this rebinds `models` to the list of winning model NAMES (it held estimators before).
p, models, score = best_avg_ensemble
print('\np={}\n{}\n{}'.format(p, '&'.join(best_avg_ensemble[1]), score))

In [None]:
# Reload the test feature matrix and collect class probabilities from the three chosen models.
test_f = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/X_test_selectp.csv')
models = [lr, lgbm, catb]
clfs_predict = [clf.predict_proba(test_f) for clf in models]

In [None]:
# Geometric-mean blend of the predictions currently held in clfs_predict, written as a submission.
test = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv', encoding='UTF-8')

# Predict on the test data.
blended = gmean(clfs_predict, axis=0)
pred = pd.DataFrame(blended)

# Clean up the result and export it.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_gmean_yj_211117.csv',index=False)

In [None]:
# Single-model submission: LightGBM class probabilities on the test features.
test = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv', encoding='UTF-8')

# Predict on the test data.
lgbm_test_proba = lgbm.predict_proba(test_f)
pred = pd.DataFrame(lgbm_test_proba)

# Clean up the result and export it.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_lgbm_yj_211115.csv',index=False)

In [None]:
# Geometric-mean blend of LightGBM, CatBoost and the single DNN, written as a submission.
test_f = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/X_test_selectp.csv')
models = [lgbm, catb]
clfs_predict = [clf.predict_proba(test_f) for clf in models]
# The Keras model has no predict_proba; its predict() output is appended as the third member.
clfs_predict.append(dnn.predict(test_f))

# Predict on the test data.
blended = gmean(clfs_predict, axis=0)
pred = pd.DataFrame(blended)

# Clean up the result and export it.
# `test` (the raw L.POINT test table) is expected to be loaded by an earlier cell.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_gmean_lgbmcatbdnn_yj_211120.csv',index=False)

#### dnn

In [None]:
# Load the seven saved Keras models from Drive.
# NOTE(review): the numeric suffix in each file name looks like a validation score — confirm.
dnn1 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model_2248.keras')
dnn2 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model1_2265.keras')
dnn3 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model2_2190.keras')
dnn4 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model3_2189.keras')
dnn5 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model4_2191.keras')
dnn6 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model5_2186.keras')
dnn7 = keras.models.load_model('/content/drive/MyDrive/D&A_ML_Competition/model6_2180.keras')

In [None]:
# Predict on the hold-out split (X_test) with each loaded DNN.
# NOTE: these rebind the dnnN_pred names, which earlier cells used for arrays loaded from disk.
dnn1_pred = dnn1.predict(X_test)
dnn2_pred = dnn2.predict(X_test)
dnn3_pred = dnn3.predict(X_test)
dnn4_pred = dnn4.predict(X_test)
dnn5_pred = dnn5.predict(X_test)
dnn6_pred = dnn6.predict(X_test)
dnn7_pred = dnn7.predict(X_test)

In [None]:
# Search for the best averaging ensemble among the first three DNNs on the hold-out split:
# every subset of at least two models is blended with a power mean of exponent p
# (p == 0 means geometric mean) and scored by log loss.
proba_list = [('dnn1',dnn1_pred),('dnn2',dnn2_pred),('dnn3',dnn3_pred)]
min_score = 100
for p in tqdm([0, 1]):
    # +1 so that the blend of all three models is evaluated too (only pairs were tried before).
    for i in range(2, len(proba_list) + 1):
        for comb_ in combinations(proba_list,i):
            comb = [proba for n, proba in comb_]
            if p == 0:
                preds_mean = gmean(comb, axis=0)
            else:
                preds_mean = (np.sum(np.array(comb)**p, axis=0)/len(comb))**(1/p)
            score = log_loss(y_test, preds_mean)
            if score < min_score:
                best_avg_ensemble = (p, [n for n, proba in comb_], score)
                min_score = score

# NOTE: this rebinds `models` to the list of winning model NAMES.
p, models, score = best_avg_ensemble
print('\np={}\n{}\n{}'.format(p, '&'.join(best_avg_ensemble[1]), score))

In [None]:
# Score every tuned model on the hold-out split, plus the geometric-mean blend of dnn1 and dnn3.
# Each clfs_tuned entry is (model name, estimator, hold-out log loss, predicted probabilities).
models = [lr, rf, xgb, lgbm, catb]
clfs_tuned = []
for x in tqdm(models):
    proba = x.predict_proba(X_test)
    clfs_tuned.append((type(x).__name__, x, log_loss(y_test, proba), proba))

# Predict once and reuse the blend for both the score and the stored probabilities.
dnn_blend_proba = gmean([dnn1.predict(X_test), dnn3.predict(X_test)], axis=0)
# The blend has no single estimator object, so the estimator slot is an explicit None
# (it used to hold IPython's `_`, i.e. whatever the previous cell happened to output).
clfs_tuned.append(('dnn', None, log_loss(y_test, dnn_blend_proba), dnn_blend_proba))

In [None]:
# Exhaustive search for the best averaging ensemble on the hold-out split:
# every subset of at least two models is blended with a power mean of exponent p
# (p == 0 means geometric mean) and scored by log loss.
proba_list = [(name, proba) for name, clf, score, proba in clfs_tuned]
min_score = 100
for p in tqdm([0, 1]):
    # +1 so that the subset containing every model is evaluated too (it used to be skipped).
    for i in range(2, len(proba_list) + 1):
        for comb_ in combinations(proba_list,i):
            comb = [proba for n, proba in comb_]
            if p == 0:
                preds_mean = gmean(comb, axis=0)
            else:
                preds_mean = (np.sum(np.array(comb)**p, axis=0)/len(comb))**(1/p)
            score = log_loss(y_test, preds_mean)
            if score < min_score:
                best_avg_ensemble = (p, [n for n, proba in comb_], score)
                min_score = score

# NOTE: this rebinds `models` to the list of winning model NAMES (it held estimators before).
p, models, score = best_avg_ensemble
print('\np={}\n{}\n{}'.format(p, '&'.join(best_avg_ensemble[1]), score))

In [None]:
# Submission from the geometric-mean blend of dnn1 and dnn3 on the test features.
test = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv', encoding='UTF-8')

# Predict on the test data.
dnn_test_probas = [dnn1.predict(test_f), dnn3.predict(test_f)]
pred = pd.DataFrame(gmean(dnn_test_probas, axis=0))

# Clean up the result and export it.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_endnn_yj_211121.csv',index=False)

In [None]:
# Load the five per-fold LightGBM and CatBoost models and predict class probabilities on the test features.
# The file name is formatted with the fold index i (it used a literal 0, so fold 0 was loaded five times).
# NOTE(review): assumes the fold files are named ..._oof0 ... _oof4 — confirm against the cell that saved them.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/lgbm_selectp_oof{i}.pickle','rb') as f:
        globals()[f'lgbm{i}'] = pickle.load(f)
lgbms = [globals()[f'lgbm{i}'] for i in range(5)]
lgbm_probas = [model.predict_proba(test_f) for model in lgbms]

for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/catb_selectp_oof{i}.pickle','rb') as f:
        globals()[f'catb{i}'] = pickle.load(f)
catbs = [globals()[f'catb{i}'] for i in range(5)]
catb_probas = [model.predict_proba(test_f) for model in catbs]

In [None]:
# Average the fold models' probabilities element-wise (arithmetic mean) for each family.
lgbm_pred = np.mean(lgbm_probas, axis=0)
catb_pred = np.mean(catb_probas, axis=0)

In [None]:
# Geometric-mean blend of the fold-averaged LightGBM and CatBoost predictions with dnn_pred.
# NOTE(review): dnn_pred is not defined in the nearby cells — confirm which DNN prediction it refers to.
clfs_predict = [lgbm_pred, catb_pred, dnn_pred]

# Predict on the test data.
blended = gmean(clfs_predict, axis=0)
pred = pd.DataFrame(blended)

# Clean up the result and export it.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_gmean_lgbmcatbendnn_yj_211121.csv',index=False)

## Stacking

In [None]:
# Build the level-1 (stacking) inputs: one DataFrame of six class-probability columns per base model,
# with columns named "<ClassName>_<class index>".
# NOTE(review): probabilities are predicted on train_f itself; if the base models were fit on these
# rows the stacked features are optimistic — confirm this is intended.
models = [lr, rf, xgb, lgbm, catb]
clfs_predict = []
for clf in models:
    model_name = type(clf).__name__
    proba_cols = [model_name + '_' + str(k) for k in range(6)]
    proba_frame = pd.DataFrame(clf.predict_proba(train_f), columns=proba_cols)
    clfs_predict.append((model_name, proba_frame))
predicts = [frame for _name, frame in clfs_predict]

In [None]:
# Cross-validate a logistic-regression meta-learner on every subset (size >= 2) of the base models'
# probability frames and record (member names, mean CV score) for each subset.
# Relies on `skf` and `myscore` being defined by an earlier cell.
stk_res = []
for i in tqdm(range(2,len(clfs_predict)+1)):
    for comb_ in combinations(clfs_predict,i):
        member_names, member_frames = zip(*comb_)
        df_comb = pd.concat(list(member_frames), axis=1)

        model = LogisticRegression(random_state=0)
        fold_scores = cross_val_score(model, df_comb, y_target.LABEL, cv=skf, scoring=myscore, n_jobs=-1)
        score = fold_scores.mean()
        stk_res.append((list(member_names), score))

In [None]:
# Show the best stacking subset.
# `myscore` is built with greater_is_better=False, so the recorded scores are NEGATED log losses:
# the best subset is the one with the HIGHEST score (argmin would select the worst one).
# NOTE(review): this assumes the `myscore` in effect when stk_res was filled matches the definition
# used elsewhere in this notebook — confirm.
stk_res[np.argmax([s for n, s in stk_res])]

In [None]:
# Rebuild the level-1 (stacking) inputs for the chosen four base models: one DataFrame of six
# class-probability columns per model, with columns named "<ClassName>_<class index>".
models = [lr, rf, xgb, lgbm]
clfs_predict = []
for clf in models:
    model_name = type(clf).__name__
    proba_cols = [model_name + '_' + str(k) for k in range(6)]
    proba_frame = pd.DataFrame(clf.predict_proba(train_f), columns=proba_cols)
    clfs_predict.append((model_name, proba_frame))
predicts = [frame for _name, frame in clfs_predict]

In [None]:
# Concatenate the per-model probability frames column-wise into the stacking feature matrix,
# then make a stratified 70/30 split of it (fixed seed).
train_f_stk = pd.concat(predicts, axis=1)
X_train_stk, X_test_stk, y_train_stk, y_test_stk = train_test_split(train_f_stk, y_target.LABEL, test_size=.3, stratify=y_target.LABEL, random_state=0)

#### Tuning

In [None]:
# Fit a default logistic regression on the stacked features.
# NOTE: this rebinds `lr`, which previously held the base logistic-regression model loaded from disk.
lr = LogisticRegression(random_state=0)
lr.fit(X_train_stk, y_train_stk)

In [None]:
# Pickle the stacking logistic regression to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/lr_selectp_stk.pickle','wb') as f:
    pickle.dump(lr, f)

In [None]:
# Bayesian optimisation of a LightGBM meta-learner on the stacked features.
# The scorer is negated log loss (greater_is_better=False), so maximising it minimises log loss.
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)

# Search range (lower, upper) for every hyper-parameter.
pbounds = dict(
    n_estimators=(50,800),
    learning_rate=(0.001,1.5),
    max_depth=(2, 32),
    num_leaves=(2, 64),
    subsample=(0.5, 0.95),
    colsample_bytree=(0.5, 0.95),
    max_bin=(10, 500),
    reg_lambda=(0.001, 50),
    reg_alpha=(0.001, 50),
)


def lgbm_opt(n_estimators, learning_rate, max_depth, num_leaves,
             subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective for the optimiser: mean 4-fold CV score of a LightGBM classifier.

    The continuous values proposed by the optimiser are converted before use:
    n_estimators, max_depth and num_leaves are rounded to the nearest integer,
    max_bin is truncated, and the two sampling fractions are clamped to [0, 1].

    Returns the mean of the cross-validated `myscore` values (negated log loss).
    """
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0, which is not
    # set here — confirm whether tuning `subsample` has any effect.
    lgbm = LGBMClassifier(
        random_state=0,
        n_estimators=int(round(n_estimators)),
        learning_rate=learning_rate,
        max_depth=int(round(max_depth)),
        num_leaves=int(round(num_leaves)),
        subsample=min(max(subsample, 0), 1),
        colsample_bytree=min(max(colsample_bytree, 0), 1),
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        max_bin=int(max_bin),
        n_jobs=-1,
    )
    fold_scores = cross_val_score(lgbm, train_f_stk, y_target.LABEL, scoring=myscore, cv=skf)
    return np.mean(fold_scores)


BO_lgbm = BayesianOptimization(lgbm_opt, pbounds, random_state=0)
BO_lgbm.maximize(init_points=10, n_iter=10)

In [None]:
# Take the best-scoring parameter set found by the optimiser and display it.
max_params_lgbm = BO_lgbm.max['params']
max_params_lgbm

In [None]:
# Convert the optimiser's float values to the integers LightGBM expects, using the SAME conversion
# the objective function applied while searching; otherwise the refit model can differ from the one
# that was evaluated (e.g. 99.7 was scored as 100 but plain int() would refit with 99).
for key in ('n_estimators', 'max_depth', 'num_leaves'):
    # these were rounded to the nearest integer inside lgbm_opt
    max_params_lgbm[key] = int(round(max_params_lgbm[key]))
# max_bin was truncated (not rounded) inside lgbm_opt
max_params_lgbm['max_bin'] = int(max_params_lgbm['max_bin'])

In [None]:
# Refit LightGBM on the stacking training split with the best parameters found.
# NOTE: this rebinds `lgbm`, which previously held the base LightGBM model loaded from disk.
lgbm = LGBMClassifier(random_state=0, **max_params_lgbm, n_jobs=-1)
lgbm.fit(X_train_stk,y_train_stk)

In [None]:
# Pickle the stacking LightGBM model to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/lgbm_selectp_stk.pickle','wb') as f:
    pickle.dump(lgbm, f)

In [None]:
# Bayesian optimisation of a random-forest meta-learner on the stacked features.
# The scorer is negated log loss (greater_is_better=False), so maximising it minimises log loss.
myscore = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
skf = StratifiedKFold(n_splits=4, random_state=50, shuffle=True)

# Search range (lower, upper) for every hyper-parameter.
pbounds = { 'n_estimators': (50,250),
            'max_depth': (5,15),
            'max_features': (0.8,0.95),
            'min_samples_leaf': (1, 5)}


def rf_opt(n_estimators, max_depth, max_features, min_samples_leaf):
    """Objective for the optimiser: mean 4-fold CV score of a random forest.

    n_estimators, max_depth and min_samples_leaf are rounded to the nearest integer;
    max_features is passed through as the fraction of features considered per split.

    Returns the mean of the cross-validated `myscore` values (negated log loss).
    """
    params = {
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        # max_features was searched but never given to the model, so the optimiser was tuning a
        # value with no effect while the final refit (which unpacks the best params) did use it.
        'max_features' : max_features,
        'min_samples_leaf' : int(round(min_samples_leaf))
    }

    rf = RandomForestClassifier(**params, n_jobs=2, random_state=50)

    score = cross_val_score(rf, train_f_stk, y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)

    return np.mean(score)


BO_rf = BayesianOptimization(f = rf_opt, pbounds = pbounds, random_state=0)
BO_rf.maximize(init_points=10, n_iter=10)

In [None]:
# Take the best-scoring parameter set found by the optimiser and display it.
max_params_rf = BO_rf.max['params']
max_params_rf

In [None]:
# Convert the optimiser's float values to integers with the same rounding rf_opt applied while
# searching, so the refit forest uses exactly the configuration that was evaluated
# (plain int() truncates, e.g. 99.7 -> 99 instead of the 100 that was scored).
for key in ('n_estimators', 'max_depth', 'min_samples_leaf'):
    max_params_rf[key] = int(round(max_params_rf[key]))

In [None]:
# Refit the random forest on the stacking training split with the best parameters found.
# NOTE: this rebinds `rf`, which previously held the base random-forest model loaded from disk.
rf = RandomForestClassifier(**max_params_rf, n_jobs=-1, random_state=50)
rf.fit(X_train_stk,y_train_stk)

In [None]:
# Pickle the stacking random forest to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/rf_selectp_stk.pickle','wb') as f:
    pickle.dump(rf, f)

In [None]:
# Bayesian optimisation of an XGBoost meta-learner on the stacked features.
# Relies on `myscore` (negated log loss) being defined by an earlier cell.
#
# The column-subsampling parameter is named `colsample_bytree` in both the bounds and the objective:
# the former key `colsample` is not an XGBoost parameter, so it was silently ignored. Because the best
# parameters are later unpacked straight into XGBClassifier, the corrected key also reaches the refit.
pbounds = { 'learning_rate': (0.01, 1.5),
            'n_estimators': (50, 100),
            'max_depth': (5,15),
            'subsample': (0.8,0.95),
            'colsample_bytree': (0.75,0.95),
            'gamma': (0, 5)}


def xgb_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, gamma):
    """Objective for the optimiser: mean 4-fold CV score of an XGBoost classifier.

    n_estimators and max_depth are rounded to the nearest integer; the remaining
    values are passed to the model unchanged.

    Returns the mean of the cross-validated `myscore` values (negated log loss).
    """
    params = {
        'learning_rate': learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth' : int(round(max_depth)),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'gamma': gamma
    }

    xgb = XGBClassifier(**params, n_jobs=-1, random_state=777)

    # Local splitter with a fixed seed so every candidate is scored on identical folds.
    skf = StratifiedKFold(n_splits=4 , shuffle=True, random_state=50)

    score = cross_val_score(xgb,train_f_stk, y_target.LABEL, scoring=myscore, cv=skf)

    return np.mean(score)


BO_xgb = BayesianOptimization(f = xgb_opt, pbounds = pbounds, random_state=0)
BO_xgb.maximize(init_points=10, n_iter=10)

In [None]:
# Take the best-scoring parameter set found by the optimiser and display it.
max_params_xgb = BO_xgb.max['params']
max_params_xgb

In [None]:
# Convert the optimiser's float values to integers with the same rounding xgb_opt applied while
# searching, so the refit model uses exactly the configuration that was evaluated
# (plain int() truncates, e.g. 99.7 -> 99 instead of the 100 that was scored).
for key in ('n_estimators', 'max_depth'):
    max_params_xgb[key] = int(round(max_params_xgb[key]))

In [None]:
# Refit XGBoost on the stacking training split with the best parameters found.
# NOTE: this rebinds `xgb`, which previously held the base XGBoost model loaded from disk.
xgb = XGBClassifier(**max_params_xgb, n_jobs=-1, random_state=777)
xgb.fit(X_train_stk,y_train_stk)

In [None]:
# Pickle the stacking XGBoost model to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/xgb_selectp_stk.pickle','wb') as f:
    pickle.dump(xgb, f)

In [None]:
# Fit a CatBoost classifier with default settings (fixed seed) on the stacking training split.
# NOTE: this rebinds `catb`, which previously held the base CatBoost model loaded from disk.
catb = CatBoostClassifier(random_state=0)
catb.fit(X_train_stk, y_train_stk)

In [None]:
# Pickle the stacking CatBoost model to Drive.
with open('/content/drive/MyDrive/D&A_ML_Competition/catb_selectp_stk.pickle','wb') as f:
    pickle.dump(catb, f)

#### Modeling

In [None]:
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Restore the meta-learners that were trained on the stacked features.
lgbm_stk = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/lgbm_selectp_stk.pickle')
lr_stk = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/lr_selectp_stk.pickle')
rf_stk = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/rf_selectp_stk.pickle')
xgb_stk = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/xgb_selectp_stk.pickle')
catb_stk = _read_pickle('/content/drive/MyDrive/D&A_ML_Competition/catb_selectp_stk.pickle')

In [None]:
# Recreate the stratified 70/30 split of the stacking features (same seed as when the meta-learners were fit).
X_train_stk, X_test_stk, y_train_stk, y_test_stk = train_test_split(train_f_stk, y_target.LABEL, test_size=.3, stratify=y_target.LABEL, random_state=0)

In [None]:
# Score every meta-learner on the stacking hold-out split.
# Each clfs_tuned entry is (model name, estimator, hold-out log loss, predicted probabilities).
models = [lr_stk, rf_stk, xgb_stk, lgbm_stk, catb_stk]
clfs_tuned = []
for est in tqdm(models):
    est_proba = est.predict_proba(X_test_stk)
    est_loss = log_loss(y_test_stk, est_proba)
    clfs_tuned.append((type(est).__name__, est, est_loss, est_proba))

In [None]:
# For every class, build the correlation matrix between the meta-learners' predicted probabilities.
# The class count is taken from the prediction arrays so no class is left out
# (the loop used to stop at 5 although the problem has 6 classes).
n_classes = clfs_tuned[0][3].shape[1]
ensemble_results = []
for i in tqdm(range(n_classes)):
    # One Series per model, labelled "<model name> \n(<log loss>)" for the plots.
    class_preds = [
        pd.Series(proba[:,i], name=f'{name} \n({clf_score:.4f})')
        for name, clf, clf_score, proba in clfs_tuned
    ]
    globals()[f'pred_results_{i}'] = class_preds
    globals()[f'ensemble_results_{i}'] = pd.concat(class_preds, axis=1).corr()
    ensemble_results.append(globals()[f'ensemble_results_{i}'])

In [None]:
# Average the per-class model-vs-model correlation matrices over ALL classes and plot the result.
# The class count comes from the prediction arrays (the loop used to cover only 5 of the 6 classes).
n_classes = clfs_tuned[0][3].shape[1]
class_corrs = []
for i in range(n_classes):
    # One Series per model, labelled "<model name> \n(<log loss>)".
    class_preds = [
        pd.Series(proba[:,i], name=f'{name} \n({clf_score:.4f})')
        for name, clf, clf_score, proba in clfs_tuned
    ]
    globals()[f'pred_results_{i}'] = class_preds
    globals()[f'ensemble_results_{i}'] = pd.concat(class_preds, axis=1).corr()
    class_corrs.append(globals()[f'ensemble_results_{i}'])
ensemble_results = sum(class_corrs) / n_classes

# Plot a heatmap to inspect the correlation between the models' predictions.
plt.figure(figsize = (8,6))
g = sns.heatmap(ensemble_results, annot=True, cmap='Blues')
g.set_title("Correlation between models")
plt.show()

In [None]:
# Scatter each meta-learner's mean agreement with the other models against its own hold-out log loss.
#
# ensemble_results is already the class-averaged correlation matrix (it is what the heatmap shows),
# so it is used directly instead of being correlated a second time. The diagonal (self-correlation = 1)
# is removed before averaging over the remaining models.
n_models = ensemble_results.shape[0]
corr = (ensemble_results.sum() - 1) / (n_models - 1)
names = corr.index

# Every label has the form "<model name> \n(<score>)"; recover the score from the trailing parentheses.
# The score is the log loss stored in clfs_tuned, not an AUC.
losses = np.array(names.str.extract(r'\(([^()]+)\)\s*$', expand=False)).astype(float)
df = pd.DataFrame({'model': names, 'logloss': losses, 'cor': corr})

plt.figure(figsize=(8,6))
g = sns.scatterplot(x="cor", y="logloss", data=df, s=40, color='red')
# Iterate rows positionally; df is indexed by model label, so integer label lookups are not reliable.
for row in df.itertuples(index=False):
    g.text(row.cor+0.003, row.logloss-0.003,
           row.model, horizontalalignment='left',
           size='medium', color='black', weight='semibold')

plt.xlim((df.cor.min()-0.01,df.cor.max()+0.01))
plt.ylim((df.logloss.min()-0.01,df.logloss.max()+0.01))
plt.xlabel('Mean Agreement')
plt.ylabel('Log loss')
plt.grid()
plt.show()