In [None]:
!pip install bayesian-optimization
!pip install keras-tuner
!pip install catboost
!pip install vecstack

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from os import path
import pickle
import random

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from bayes_opt import BayesianOptimization

from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss

from sklearn.feature_selection import SelectPercentile

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow import keras
import kerastuner as kt

from itertools import combinations
from scipy.stats.mstats import gmean

In [None]:
# Read the training features, the test features and the training labels from Drive.
train_f, test_f, y_target = map(pd.read_csv, [
    '/content/drive/MyDrive/D&A_ML_Competition/X_train_selectp.csv',
    '/content/drive/MyDrive/D&A_ML_Competition/X_test_selectp.csv',
    '/content/drive/MyDrive/D&A_ML_Competition/y_train.csv',
])

In [None]:
# Restore the five pickled estimators (LightGBM, logistic regression, random forest,
# XGBoost, CatBoost) in that order.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
pickle_paths = [
    '/content/drive/MyDrive/D&A_ML_Competition/lgbm_selectp.pickle',
    '/content/drive/MyDrive/D&A_ML_Competition/lr_selectp.pickle',
    '/content/drive/MyDrive/D&A_ML_Competition/rf_selectp.pickle',
    '/content/drive/MyDrive/D&A_ML_Competition/xgb_selectp.pickle',
    '/content/drive/MyDrive/D&A_ML_Competition/catb_selectp.pickle',
]
loaded = []
for pickle_path in pickle_paths:
    with open(pickle_path, 'rb') as f:
        loaded.append(pickle.load(f))
lgbm, lr, rf, xgb, catb = loaded

In [None]:
# Capture the hyper-parameters of the loaded LightGBM / random-forest / XGBoost
# estimators so new instances can be built with the same settings.
lgbm_params, rf_params, xgb_params = (
    estimator.get_params() for estimator in (lgbm, rf, xgb)
)

In [None]:
# 5-fold stratified cross-validation of LogisticRegression(random_state=0).
# After the loop `models_lr` holds one fitted model per fold and `scores`
# holds the matching validation log losses.
scores = []
models_lr = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_ix, test_ix in tqdm(skf.split(train_f, y_target.LABEL), total=skf.get_n_splits()):
    # split() yields positional indices, so select rows with .iloc for
    # both the features and the labels.
    train_X, test_X = train_f.iloc[train_ix], train_f.iloc[test_ix]
    train_y, test_y = y_target.LABEL.iloc[train_ix], y_target.LABEL.iloc[test_ix]
    # fit this fold's model
    model = LogisticRegression(random_state=0).fit(train_X, train_y)
    models_lr.append(model)
    # evaluate on the held-out part of the fold
    yhat = model.predict_proba(test_X)
    scores.append(log_loss(test_y, yhat))

In [None]:
# summarize model performance
mean_s, std_s = np.mean(scores), np.std(scores)
print('Mean: %.3f, Standard Deviation: %.3f' % (mean_s, std_s))

In [None]:
# Write every fold's logistic-regression model to its own pickle file,
# numbered by fold position.
for i, fold_model in enumerate(tqdm(models_lr)):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/lr_selectp_oof{i}.pickle', 'wb') as f:
        pickle.dump(fold_model, f)

In [None]:
# 5-fold stratified cross-validation of a RandomForestClassifier built from
# `rf_params`. After the loop `models_rf` holds one fitted model per fold and
# `scores` holds the matching validation log losses.
scores = []
models_rf = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_ix, test_ix in tqdm(skf.split(train_f, y_target.LABEL), total=skf.get_n_splits()):
    # split() yields positional indices, so select rows with .iloc for
    # both the features and the labels.
    train_X, test_X = train_f.iloc[train_ix], train_f.iloc[test_ix]
    train_y, test_y = y_target.LABEL.iloc[train_ix], y_target.LABEL.iloc[test_ix]
    # fit this fold's model
    model = RandomForestClassifier(**rf_params).fit(train_X, train_y)
    models_rf.append(model)
    # evaluate on the held-out part of the fold
    yhat = model.predict_proba(test_X)
    scores.append(log_loss(test_y, yhat))
# summarize model performance
mean_s, std_s = np.mean(scores), np.std(scores)
print('Mean: %.3f, Standard Deviation: %.3f' % (mean_s, std_s))

In [None]:
# Write every fold's random-forest model to its own pickle file,
# numbered by fold position.
for i, fold_model in enumerate(tqdm(models_rf)):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/rf_selectp_oof{i}.pickle', 'wb') as f:
        pickle.dump(fold_model, f)

In [None]:
# 5-fold stratified cross-validation of an XGBClassifier built from `xgb_params`.
# Each fold's model is pickled as soon as it is fitted, so the folds already
# finished are on disk even if a later fold does not complete.
scores = []
models_xgb = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for i, (train_ix, test_ix) in tqdm(enumerate(skf.split(train_f, y_target.LABEL)), total=skf.get_n_splits()):
    # split() yields positional indices, so select rows with .iloc for
    # both the features and the labels.
    train_X, test_X = train_f.iloc[train_ix], train_f.iloc[test_ix]
    train_y, test_y = y_target.LABEL.iloc[train_ix], y_target.LABEL.iloc[test_ix]
    # fit this fold's model
    model = XGBClassifier(**xgb_params).fit(train_X, train_y)
    models_xgb.append(model)
    # evaluate on the held-out part of the fold
    yhat = model.predict_proba(test_X)
    scores.append(log_loss(test_y, yhat))
    # persist the fold model immediately
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/xgb_selectp_oof{i}.pickle', 'wb') as f:
        pickle.dump(model, f)
# summarize model performance
mean_s, std_s = np.mean(scores), np.std(scores)
print('Mean: %.3f, Standard Deviation: %.3f' % (mean_s, std_s))

In [None]:
# Partial re-run of the XGBoost cross-validation: only folds with index >= 3 are
# fitted, scored and pickled; folds 0-2 are skipped.
# NOTE(review): presumably a resume after an interrupted full run - confirm.
# `scores` and `models_xgb` are reset here, so afterwards they (and the printed
# summary) cover only the folds processed in this cell.
scores = []
models_xgb = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for i, (train_ix, test_ix) in tqdm(enumerate(skf.split(train_f, y_target.LABEL)), total=skf.get_n_splits()):
    if i < 3:
        continue
    # split() yields positional indices, so select rows with .iloc for
    # both the features and the labels.
    train_X, test_X = train_f.iloc[train_ix], train_f.iloc[test_ix]
    train_y, test_y = y_target.LABEL.iloc[train_ix], y_target.LABEL.iloc[test_ix]
    # fit this fold's model
    model = XGBClassifier(**xgb_params).fit(train_X, train_y)
    models_xgb.append(model)
    # evaluate on the held-out part of the fold
    yhat = model.predict_proba(test_X)
    scores.append(log_loss(test_y, yhat))
    # persist the fold model immediately
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/xgb_selectp_oof{i}.pickle', 'wb') as f:
        pickle.dump(model, f)
# summarize model performance (only the folds fitted above)
mean_s, std_s = np.mean(scores), np.std(scores)
print('Mean: %.3f, Standard Deviation: %.3f' % (mean_s, std_s))

In [None]:
# 5-fold stratified cross-validation of an LGBMClassifier built from `lgbm_params`.
# After the loop `models_lgbm` holds one fitted model per fold and `scores`
# holds the matching validation log losses.
scores = []
models_lgbm = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_ix, test_ix in tqdm(skf.split(train_f, y_target.LABEL), total=skf.get_n_splits()):
    # split() yields positional indices, so select rows with .iloc for
    # both the features and the labels.
    train_X, test_X = train_f.iloc[train_ix], train_f.iloc[test_ix]
    train_y, test_y = y_target.LABEL.iloc[train_ix], y_target.LABEL.iloc[test_ix]
    # fit this fold's model
    model = LGBMClassifier(**lgbm_params).fit(train_X, train_y)
    models_lgbm.append(model)
    # evaluate on the held-out part of the fold
    yhat = model.predict_proba(test_X)
    scores.append(log_loss(test_y, yhat))
# summarize model performance
mean_s, std_s = np.mean(scores), np.std(scores)
print('Mean: %.3f, Standard Deviation: %.3f' % (mean_s, std_s))

In [None]:
# Write every fold's LightGBM model to its own pickle file,
# numbered by fold position.
for i, fold_model in enumerate(tqdm(models_lgbm)):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/lgbm_selectp_oof{i}.pickle', 'wb') as f:
        pickle.dump(fold_model, f)

In [None]:
# 5-fold stratified cross-validation of CatBoostClassifier(random_state=0, verbose=0).
# After the loop `models_catb` holds one fitted model per fold and `scores`
# holds the matching validation log losses.
scores = []
models_catb = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_ix, test_ix in tqdm(skf.split(train_f, y_target.LABEL), total=skf.get_n_splits()):
    # split() yields positional indices, so select rows with .iloc for
    # both the features and the labels.
    train_X, test_X = train_f.iloc[train_ix], train_f.iloc[test_ix]
    train_y, test_y = y_target.LABEL.iloc[train_ix], y_target.LABEL.iloc[test_ix]
    # fit this fold's model
    model = CatBoostClassifier(random_state=0, verbose=0).fit(train_X, train_y)
    models_catb.append(model)
    # evaluate on the held-out part of the fold
    yhat = model.predict_proba(test_X)
    scores.append(log_loss(test_y, yhat))
# summarize model performance
mean_s, std_s = np.mean(scores), np.std(scores)
print('Mean: %.3f, Standard Deviation: %.3f' % (mean_s, std_s))

In [None]:
# Write every fold's CatBoost model to its own pickle file,
# numbered by fold position.
for i, fold_model in enumerate(tqdm(models_catb)):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/catb_selectp_oof{i}.pickle', 'wb') as f:
        pickle.dump(fold_model, f)

## Predict

In [None]:
# Load the five per-fold logistic-regression models (fold 0..4) into `lrs`
# and predict class probabilities for the test features with each of them.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
lrs = []
for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/lr_selectp_oof{i}.pickle','rb') as f:
        lrs.append(pickle.load(f))
lr_probas = [model.predict_proba(test_f) for model in lrs]

In [None]:
# Load the five per-fold random-forest models (fold 0..4) into `rfs`
# and predict class probabilities for the test features with each of them.
# The file name must be indexed by the loop variable `i`; a literal 0 would
# load the fold-0 model five times.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
rfs = []
for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/rf_selectp_oof{i}.pickle','rb') as f:
        rfs.append(pickle.load(f))
rf_probas = [model.predict_proba(test_f) for model in rfs]

In [None]:
# Load the five per-fold XGBoost models (fold 0..4) into `xgbs`
# and predict class probabilities for the test features with each of them.
# The file name must be indexed by the loop variable `i`; a literal 0 would
# load the fold-0 model five times.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
xgbs = []
for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/xgb_selectp_oof{i}.pickle','rb') as f:
        xgbs.append(pickle.load(f))
xgb_probas = [model.predict_proba(test_f) for model in xgbs]

In [None]:
# Load the five per-fold LightGBM models (fold 0..4) into `lgbms`
# and predict class probabilities for the test features with each of them.
# The file name must be indexed by the loop variable `i`; a literal 0 would
# load the fold-0 model five times.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
lgbms = []
for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/lgbm_selectp_oof{i}.pickle','rb') as f:
        lgbms.append(pickle.load(f))
lgbm_probas = [model.predict_proba(test_f) for model in lgbms]

In [None]:
# Load the five per-fold CatBoost models (fold 0..4) into `catbs`
# and predict class probabilities for the test features with each of them.
# The file name must be indexed by the loop variable `i`; a literal 0 would
# load the fold-0 model five times.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
catbs = []
for i in range(5):
    with open(f'/content/drive/MyDrive/D&A_ML_Competition/oof_selectp/catb_selectp_oof{i}.pickle','rb') as f:
        catbs.append(pickle.load(f))
catb_probas = [model.predict_proba(test_f) for model in catbs]

In [None]:
# Collapse each model family's list of per-fold test probabilities into a
# single matrix by taking the element-wise mean over the folds.
lr_pred, rf_pred, xgb_pred, lgbm_pred, catb_pred = (
    np.mean(fold_probas, axis=0)
    for fold_probas in (lr_probas, rf_probas, xgb_probas, lgbm_probas, catb_probas)
)

In [None]:
# Load the saved Keras model from Drive and predict on the test features.
dnn_model_path = '/content/drive/MyDrive/D&A_ML_Competition/model2_good.keras'
dnn = keras.models.load_model(dnn_model_path)
dnn_pred = dnn.predict(test_f)

In [None]:
# Build a 5-fold stratified splitter (shuffle=True, random_state=0) and record
# the positional train / validation indices of every fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
tr_idxs = []  # tr_idxs[k]: training-row positions of fold k
te_idxs = []  # te_idxs[k]: held-out-row positions of fold k
for tr_idx, te_idx in skf.split(train_f, y_target.LABEL):
    tr_idxs.append(tr_idx)
    te_idxs.append(te_idx)
# Note: after the loop `tr_idx` / `te_idx` stay bound to the LAST fold's indices;
# use tr_idxs[k] / te_idxs[k] to address a specific fold.

In [None]:
# Out-of-fold predictions on the training data.
# For fold i, every model family's i-th model is scored on that fold's held-out
# rows (te_idxs[i]). Each list collects one tuple per fold:
#   (log loss on the fold, predicted probabilities, true labels of the fold)
lr_tr_preds = []
rf_tr_preds = []
xgb_tr_preds = []
lgbm_tr_preds = []
catb_tr_preds = []
family_outputs = [
    (lrs, lr_tr_preds),
    (rfs, rf_tr_preds),
    (xgbs, xgb_tr_preds),
    (lgbms, lgbm_tr_preds),
    (catbs, catb_tr_preds),
]
for i in tqdm(range(5)):
    # Index with the i-th fold explicitly; a leftover loop variable from the
    # split cell would always point at the last fold.
    fold_idx = te_idxs[i]
    fold_X = train_f.iloc[fold_idx]
    fold_y = y_target.LABEL.iloc[fold_idx]
    for fold_estimators, collected in family_outputs:
        # predict once and reuse the result for both the score and the stored matrix
        proba = fold_estimators[i].predict_proba(fold_X)
        collected.append((log_loss(fold_y, proba), proba, fold_y))

In [None]:
# Stack the per-fold entries (score, probabilities, labels) row-wise:
# element 1 of each tuple gives the probability matrix, element 2 the true labels.
lr_tr, rf_tr, xgb_tr, lgbm_tr, catb_tr = (
    np.concatenate([fold[1] for fold in fold_entries], axis=0)
    for fold_entries in (lr_tr_preds, rf_tr_preds, xgb_tr_preds, lgbm_tr_preds, catb_tr_preds)
)
# The labels are taken from the logistic-regression entries.
true_tr = np.concatenate([fold[2] for fold in lr_tr_preds], axis=0)

In [None]:
# Print the log loss of every model family's stacked out-of-fold probabilities.
proba_list = [('lr',lr_tr), ('rf',rf_tr), ('xgb',xgb_tr), ('lgbm',lgbm_tr), ('catb',catb_tr)]
for name, pred in proba_list:
    score = log_loss(true_tr, pred)
    print(name, score)

In [None]:
# Search for the best unweighted power-mean ensemble of the out-of-fold probabilities.
# For every exponent p and every combination of at least two models
# (up to and including all of them) the combined prediction is
#   p == 0 : geometric mean
#   p != 0 : (mean(x ** p)) ** (1 / p)   (p == 1 is the arithmetic mean)
# and the combination with the lowest log loss against `true_tr` is kept.
proba_list = [('lr',lr_tr), ('xgb',xgb_tr), ('lgbm',lgbm_tr), ('catb',catb_tr)]
min_score = float('inf')
for p in tqdm([0, 1, 2.56]):
    # +1 so that the combination containing every model is evaluated as well
    for i in range(2, len(proba_list) + 1):
        for comb_ in combinations(proba_list, i):
            comb = [proba for n, proba in comb_]
            if p == 0:
                preds_mean = gmean(list(comb), axis=0)
            else:
                preds_mean = (np.sum(np.array(comb)**p, axis=0)/len(comb))**(1/p)
            score = log_loss(true_tr, preds_mean)
            if score < min_score:
                best_avg_ensemble = (p, [n for n, proba in comb_], score)
                min_score = score

p, models, score = best_avg_ensemble
print('\np={}\n{}\n{}'.format(p, '&'.join(best_avg_ensemble[1]), score))

In [None]:
# Same ensemble search restricted to the geometric (p == 0) and arithmetic
# (p == 1) means: every combination of at least two models (up to and including
# all of them) is scored by log loss against `true_tr`, and the best one is kept.
proba_list = [('lr',lr_tr), ('xgb',xgb_tr), ('lgbm',lgbm_tr), ('catb',catb_tr)]
min_score = float('inf')
for p in tqdm([0, 1]):
    # +1 so that the combination containing every model is evaluated as well
    for i in range(2, len(proba_list) + 1):
        for comb_ in combinations(proba_list, i):
            comb = [proba for n, proba in comb_]
            if p == 0:
                preds_mean = gmean(list(comb), axis=0)
            else:
                preds_mean = (np.sum(np.array(comb)**p, axis=0)/len(comb))**(1/p)
            score = log_loss(true_tr, preds_mean)
            if score < min_score:
                best_avg_ensemble = (p, [n for n, proba in comb_], score)
                min_score = score

p, models, score = best_avg_ensemble
print('\np={}\n{}\n{}'.format(p, '&'.join(best_avg_ensemble[1]), score))

In [None]:
# Grid-search the blend weight between LightGBM and CatBoost in 1% steps:
# weight i/100 on lgbm_tr and (100 - i)/100 on catb_tr, scored by log loss.
# The cell displays the ((i, j), score) entry with the lowest loss.
scores = []
for i in tqdm(range(101)):
    # the two weights always sum to 100, so j follows directly from i
    j = 100 - i
    pred = (lgbm_tr*i + catb_tr*j)/100
    score = log_loss(true_tr, pred)
    scores.append(((i,j), score))
scores[np.argmin([score for w, score in scores])]

In [None]:
# For each of the six probability columns, correlate the out-of-fold
# probabilities of lr / lgbm / catb, then display the correlation matrices
# averaged over the columns.
corrs = []
for i in range(6):
    column_probas = np.column_stack([lr_tr[:, i], lgbm_tr[:, i], catb_tr[:, i]])
    frame = pd.DataFrame(column_probas, columns=['lr','lgbm','catb'])
    corrs.append(frame.corr())
np.mean(corrs, axis=0)

#### Submit

In [None]:
test = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv', encoding='UTF-8')

# Test-set predictions: the fold-averaged LightGBM probabilities
pred = pd.DataFrame(lgbm_pred)

# Assemble the submission (de-duplicated client ids next to the six
# probability columns) and export it
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oof_lgbm_yj_211119.csv',index=False)

In [None]:
# Test-set predictions: the fold-averaged CatBoost probabilities
pred = pd.DataFrame(catb_pred)

# Assemble the submission (de-duplicated client ids next to the six
# probability columns) and export it
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oof_catb_yj_211119.csv',index=False)

In [None]:
test = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv', encoding='UTF-8')

# Test-set predictions: the fold-averaged logistic-regression probabilities
pred = pd.DataFrame(lr_pred)

# Assemble the submission (de-duplicated client ids next to the six
# probability columns) and export it
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oof_lr_yj_211119.csv',index=False)

In [None]:
# Test-set predictions: geometric mean of the lr / lgbm / catb probabilities
blend_inputs = [lr_pred, lgbm_pred, catb_pred]
pred = pd.DataFrame(gmean(blend_inputs, axis=0))

# Assemble the submission (de-duplicated client ids next to the six
# probability columns) and export it
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oof_gmean_lrlgbmcatb_yj_211119.csv',index=False)

In [None]:
# Test-set predictions: weighted geometric mean with weights
# lr 0.25 / lgbm 0.45 / catb 0.30.
# The fractional exponents are applied to each factor directly; raising
# probabilities to the 25th/45th/30th power first and taking the 100th root
# afterwards can underflow float64 to 0 for small probabilities.
pred = pd.DataFrame((lr_pred**0.25)*(lgbm_pred**0.45)*(catb_pred**0.30))

# Assemble the submission (de-duplicated client ids next to the six
# probability columns) and export it
result = pd.concat([test.CLNT_ID.drop_duplicates().reset_index(drop=True), pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oof_wgmean_lr25lgbm45catb30_yj_211119.csv',index=False)

In [None]:
# Test-set predictions: power mean (exponent 2.56) of the lgbm / catb probabilities
preds = [lgbm_pred, catb_pred]
pred = pd.DataFrame((np.sum(np.array(preds)**2.56, axis=0)/len(preds))**(1/2.56))

# Assemble the submission (de-duplicated client ids next to the six
# probability columns) and export it
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oof_pmean_lgbmcatb_yj_211119.csv',index=False)

In [None]:
test = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv', encoding='UTF-8')

# Test-set predictions: geometric mean of the lgbm / catb / dnn probabilities
clfs_predict = [lgbm_pred, catb_pred, dnn_pred]
pred = pd.DataFrame(gmean(clfs_predict, axis=0))

# Assemble the submission (de-duplicated client ids next to the six
# probability columns) and export it
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)
result = pd.concat([client_ids, pred], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_oof_gmean_lgbmcatbdnn_yj_211120.csv',index=False)