# 1. Module Import & Data Load

In [None]:
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import pickle
import joblib
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from eli5.permutation_importance import get_score_importances
import eli5
from eli5.sklearn import PermutationImportance
from sklearn import cluster

import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances
from hyperopt.pyll.base import scope
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from kaggler.model import AutoLGB
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss
import random

# 2. Feature Engineering

In [None]:
def _cycle_position(days, unit, cycle):
    """Return floor(-days / unit) minus its multiple of `cycle` (quotient truncated toward zero)."""
    elapsed = np.floor(-days / unit)
    return elapsed - (elapsed / cycle).astype(int) * cycle


# Load the raw tables; 'index' and 'FLAG_MOBIL' are dropped and every missing
# value is replaced by the literal string 'NAN'.
train = pd.read_csv('data/train.csv').drop(columns=['index', 'FLAG_MOBIL']).fillna('NAN')
test = pd.read_csv('data/test.csv').drop(columns=['index', 'FLAG_MOBIL']).fillna('NAN')
sample_submission = pd.read_csv('data/sample_submission.csv')

# Stack train and test so that every derived column is computed on both at once.
merge_data = pd.concat([train, test], axis=0)

# Month-of-year (30-day months, 12 per cycle) and week-of-month (7-day weeks,
# 4 per cycle) style remainders for DAYS_BIRTH and DAYS_EMPLOYED.
for day_column in ('DAYS_BIRTH', 'DAYS_EMPLOYED'):
    merge_data[day_column + '_month'] = _cycle_position(merge_data[day_column], 30, 12)
    merge_data[day_column + '_week'] = _cycle_position(merge_data[day_column], 7, 4)

# before_EMPLOYED: DAYS_BIRTH minus DAYS_EMPLOYED, plus the same two remainders.
merge_data['before_EMPLOYED'] = merge_data['DAYS_BIRTH'] - merge_data['DAYS_EMPLOYED']
merge_data['before_EMPLOYED_month'] = _cycle_position(merge_data['before_EMPLOYED'], 30, 12)
merge_data['before_EMPLOYED_week'] = _cycle_position(merge_data['before_EMPLOYED'], 7, 4)

# Ratios of the derived day features to income (column order is significant
# downstream, so the list order mirrors the original creation order).
for numerator in ('DAYS_BIRTH_month', 'DAYS_BIRTH_week',
                  'DAYS_EMPLOYED_month', 'DAYS_EMPLOYED_week',
                  'before_EMPLOYED', 'before_EMPLOYED_month', 'before_EMPLOYED_week'):
    merge_data[numerator + '/income_total'] = merge_data[numerator] / merge_data['income_total']

# Income per family member.
merge_data['income_total/family_size'] = merge_data['income_total'] / merge_data['family_size']

# Remaining raw columns relative to income, then employment length relative to age.
for numerator in ('child_num', 'family_size', 'DAYS_BIRTH', 'DAYS_EMPLOYED'):
    merge_data[numerator + '/income_total'] = merge_data[numerator] / merge_data['income_total']
merge_data['DAYS_EMPLOYED/DAYS_BIRTH'] = merge_data['DAYS_EMPLOYED'] / merge_data['DAYS_BIRTH']

# income_total is log1p-transformed only AFTER the ratio features above were
# built from the raw value. Alternatives that were tried and left disabled:
# merge_data['log_income_total'] = np.log(merge_data['income_total'])
# merge_data['sqrt_income_total'] = np.sqrt(merge_data['income_total'])
# merge_data['boxcox_income_total'] = stats.boxcox(merge_data['income_total'])[0]
merge_data['income_total'] = np.log1p(merge_data['income_total'])

# Test rows have no 'credit' value after the concat; once NaN is filled with
# -999 that sentinel is what separates test rows from train rows.
merge_data = merge_data.fillna(-999)
_is_test_row = merge_data['credit'] == -999
train = merge_data[~_is_test_row]
test = merge_data[_is_test_row].drop('credit', axis=1)

# Move the target to the last column of train.
train_cols = [name for name in train.columns if name != 'credit'] + ['credit']
train = train[train_cols]

# 3. Modeling

In [None]:
# Out-of-fold predictions (pred_dict) and fold-averaged test predictions
# (pred_test_dict), keyed by model name + seed, e.g. 'lgb2283'.
pred_dict, pred_test_dict = {}, {}

In [None]:
train_lab = train.copy()
test_lab = test.copy()

# Integer-encode every string column. The encoder is fitted on the values of
# train AND test together so that a given category receives the same integer
# in both frames; fitting each frame separately yields different codes as soon
# as one frame is missing a category that the other one has.
enc = LabelEncoder()
for col in train_lab.columns:
    if train_lab[col].dtypes == 'object':
        if col in test_lab.columns:
            enc.fit(pd.concat([train_lab[col], test_lab[col]], ignore_index=True))
            train_lab[col] = enc.transform(train_lab[col])
            test_lab[col] = enc.transform(test_lab[col])
        else:
            # Column exists only in train (nothing to keep consistent with).
            train_lab[col] = enc.fit_transform(train_lab[col])

# Split into features / target.
train_x = train_lab.drop(['credit'], axis=1)
train_y = train_lab['credit']
test_x = test_lab.copy()

print('Label Encoding Completed')

In [None]:
def reduce_mem_usage(data, allow_float16=True):
    """Downcast numeric columns of ``data`` in place to the narrowest fitting dtype.

    Integer columns are converted to the first of int8/int16/int32/int64 whose
    range strictly contains the column's min and max. Float columns are
    converted to float16, then float32, by the same range test, and to float64
    otherwise. Columns of any other dtype are left untouched. A one-line memory
    report is printed.

    Args:
        data: DataFrame to shrink. It is modified in place.
        allow_float16: When True (default, the historical behaviour) float
            columns may become float16. The range test says nothing about
            precision, and float16 keeps only about 3 significant decimal
            digits, so pass False to use float32 as the smallest float type.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_memory = data.memory_usage().sum() / 1024**2
    for col in data.columns:
        col_type = data[col].dtypes
        if col_type not in numerics:
            continue
        c_min = data[col].min()
        c_max = data[col].max()
        if str(col_type)[:3] == 'int':
            # First integer type whose open range contains [c_min, c_max];
            # if none qualifies the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            # NaN min/max fail every comparison and therefore end up as float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            data[col] = data[col].astype(target)
    end_memory = data.memory_usage().sum() / 1024**2
    # Avoid a division by zero in the report if the baseline is zero.
    reduction = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
    print('Memory optimization from {:5.2f}MB to {:5.2f}MB ({:.1f}% reduction)'
          .format(start_memory, end_memory, reduction))
    return data

In [None]:
# Downcast both feature matrices (train first, then test).
train_x, test_x = [reduce_mem_usage(frame) for frame in (train_x, test_x)]

## (1) Lightgbm

In [None]:
# 0.6874110257206558  (presumably the CV score reached with these values — confirm)
lgb_best_hyperparams = dict(
    learning_rate=0.00584665661176,
    reg_alpha=0.9931264066149119,
    reg_lambda=0.9808397348116461,
    max_depth=11,
    num_leaves=1039,
    colsample_bytree=0.3684233026512157,
    subsample=0.7760609974958406,
    subsample_freq=12,
    min_child_samples=3,
    min_child_weight=1.0687770422304368,
    max_bin=378,
)
# Fixed settings. The tuned L1/L2 strengths are re-exposed under the names
# lambda_l1 / lambda_l2 and the reg_alpha / reg_lambda entries are nulled out.
lgb_base_hyperparams = dict(
    objective='multiclass',
    n_estimators=10000,
    lambda_l1=lgb_best_hyperparams['reg_alpha'],
    lambda_l2=lgb_best_hyperparams['reg_lambda'],
    reg_alpha=None,
    reg_lambda=None,
)
lgb_best_hyperparams = {**lgb_best_hyperparams, **lgb_base_hyperparams}

## Train & Predict

In [None]:
# LightGBM: for every seed run a stratified K-fold, collect out-of-fold class
# probabilities for train and the fold-averaged probabilities for test.
lucky_seeds = [2283, 8217, 91373]  # extend this list to average over more fold partitions
n_splits = 10                      # number of CV folds; the test average below follows it
for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    cv = np.zeros((train_x.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        lgbmodel = LGBMClassifier(**lgb_best_hyperparams)

        # Set verbose=100 instead of None to watch training progress.
        lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)

        cv[val_idx, :] = lgbmodel.predict_proba(x_val)
        # Uncomment to check the score of each fold:
        # print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}')
        # Each fold contributes 1/n_splits of the test prediction.
        pred_test += lgbmodel.predict_proba(test_x) / n_splits
    pred_dict['lgb'+str(seed)] = cv
    pred_test_dict['lgb'+str(seed)] = pred_test
    print(seed, 'multi_logloss :', log_loss(train_y, cv))

## (2) XGBoost

In [None]:
# 0.6821672016092287  (presumably the CV score reached with these values — confirm)
xgb_best_hyperparams = dict(
    learning_rate=0.004219566178881841,
    reg_alpha=0.017314214531008332,
    reg_lambda=0.7804799483256929,
    max_depth=16,
    colsample_bytree=0.464918668234781,
    colsample_bylevel=0.2112468031800087,
    subsample=0.9035127015017239,
    gamma=0.7793451203919987,
    min_child_weight=2.458581150016787,
    max_bin=309,
)
# Fixed (non-tuned) settings, merged into the tuned values.
xgb_base_hyperparams = dict(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',
    random_state=91373,
)
xgb_best_hyperparams = {**xgb_best_hyperparams, **xgb_base_hyperparams}

## Train & Predict

In [None]:
# XGBoost (native API): for every seed run a stratified K-fold, collect
# out-of-fold class probabilities and the fold-averaged test probabilities.
lucky_seeds = [2283, 8217, 91373]
n_splits = 10  # number of CV folds; the test average below follows it
xgtest = xgb.DMatrix(test_x)
for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    # Sized from train_x, the frame that is actually split and indexed below.
    cv = np.zeros((train_x.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        # Set verbose_eval=100 instead of None to watch training progress.
        xgbmodel = xgb.train(xgb_best_hyperparams, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

        cv[val_idx, :] = xgbmodel.predict(dvalid)
        # Uncomment to check the score of each fold:
        # print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}')
        # Each fold contributes 1/n_splits of the test prediction.
        pred_test += xgbmodel.predict(xgtest) / n_splits

    pred_dict['xgb'+str(seed)] = cv
    pred_test_dict['xgb'+str(seed)] = pred_test
    print(seed, 'multi_logloss :', log_loss(train_y, cv))

## (3) Random Forest

In [None]:
# 0.6888343594936606  (presumably the CV score reached with these values — confirm)
rf_best_hyperparams = dict(
    n_estimators=776,
    max_depth=95,
    max_features=0.18938529117093866,
    min_samples_split=12,
    max_samples=0.9089712797972337,
)
# Fixed (non-tuned) settings, merged into the tuned values.
rf_base_hyperparams = dict(random_state=91373, n_jobs=-1)
rf_best_hyperparams = {**rf_best_hyperparams, **rf_base_hyperparams}

### 3 seeds, 10 folds

In [None]:
# Random forest: for every seed run a stratified K-fold, collect out-of-fold
# class probabilities and the fold-averaged test probabilities.
lucky_seeds = [2283, 8217, 91373]
n_splits = 10  # number of CV folds; the test average below follows it
for i, seed in enumerate(lucky_seeds):

    kfold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    cv = np.zeros((train_x.shape[0], 3))
    pred_test = np.zeros((test_x.shape[0], 3), dtype=float)

    for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

        x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        rfmodel = RandomForestClassifier(**rf_best_hyperparams)
        rfmodel.fit(x_train, y_train)

        cv[val_idx, :] = rfmodel.predict_proba(x_val)
        # Uncomment to check the score of each fold:
        # print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}')
        # Each fold contributes 1/n_splits of the test prediction.
        pred_test += rfmodel.predict_proba(test_x) / n_splits

    pred_dict['rf'+str(seed)] = cv
    pred_test_dict['rf'+str(seed)] = pred_test
    print(seed, 'multi_logloss :', log_loss(train_y, cv))

## (4) Stacking (AutoLGB)

In [None]:
def sort_dict(model, pred_dict, pred_test_dict):
    """Pick the (at most) three best prediction sets of one model family.

    Entries whose key contains the substring ``model`` are kept, ranked by
    ascending log loss of their out-of-fold predictions against the global
    ``train_y``, and cut to the first three.

    Returns:
        (oof_dict, test_dict): the selected out-of-fold predictions and the
        test predictions stored under the same keys, in ranking order.
    """
    oof_candidates = {name: oof for name, oof in pred_dict.items() if model in name}
    test_candidates = {name: preds for name, preds in pred_test_dict.items() if model in name}

    def _oof_loss(entry):
        # entry is a (key, out-of-fold prediction array) pair
        return log_loss(train_y, list(entry[1]))

    ranked = sorted(oof_candidates.items(), key=_oof_loss)
    pred_dict_new_local = dict(ranked[:3])
    pred_test_dict_new_local = {name: test_candidates[name] for name in pred_dict_new_local}

    return pred_dict_new_local, pred_test_dict_new_local

In [None]:
# Select the best prediction sets per model family (LightGBM, XGBoost, random forest).
_selected = [sort_dict(family, pred_dict, pred_test_dict) for family in ('lgb', 'xgb', 'rf')]
pred_dict_lgb, pred_test_dict_lgb = _selected[0]
pred_dict_xgb, pred_test_dict_xgb = _selected[1]
pred_dict_rf, pred_test_dict_rf = _selected[2]

In [None]:
def save_dict(model, pred_dict, pred_test_dict):
    """Pickle the two prediction dictionaries of one model family under ./pkl.

    Writes ``./pkl/pred_dict_<model>.pickle`` and
    ``./pkl/pred_test_dict_<model>.pickle``. The ``./pkl`` directory is created
    when it does not exist yet, so the first save no longer fails with
    FileNotFoundError.

    Args:
        model: Name fragment used in the file names (e.g. 'lgb').
        pred_dict: Object written to the ``pred_dict_`` file.
        pred_test_dict: Object written to the ``pred_test_dict_`` file.
    """
    os.makedirs('./pkl', exist_ok=True)

    with open('./pkl/pred_dict_'+model+'.pickle', 'wb') as fw:
        pickle.dump(pred_dict, fw)

    with open('./pkl/pred_test_dict_'+model+'.pickle', 'wb') as fw:
        pickle.dump(pred_test_dict, fw)

In [None]:
# save_dict('lgb', pred_dict_lgb, pred_test_dict_lgb)
# save_dict('xgb', pred_dict_xgb, pred_test_dict_xgb)
# save_dict('rf', pred_dict_rf, pred_test_dict_rf)

In [None]:
def load_dict(model):
    """Load the two pickled prediction dictionaries of one model family.

    Reads ``./pkl/pred_dict_<model>.pickle`` and
    ``./pkl/pred_test_dict_<model>.pickle`` and returns their contents as a
    ``(pred_dict, pred_test_dict)`` tuple.

    NOTE(review): unpickling executes arbitrary code from the file; only load
    pickles produced by this notebook.
    """
    loaded = []
    for stem in ('pred_dict_', 'pred_test_dict_'):
        with open('./pkl/' + stem + model + '.pickle', 'rb') as handle:
            loaded.append(pickle.load(handle))

    pred_dict_new_local, pred_test_dict_new_local = loaded
    return pred_dict_new_local, pred_test_dict_new_local

In [None]:
# pred_dict_lgb, pred_test_dict_lgb = load_dict('lgb')
# pred_dict_xgb, pred_test_dict_xgb = load_dict('xgb')
# pred_dict_rf, pred_test_dict_rf = load_dict('rf')

In [None]:
# Combine the selected prediction sets of all three model families
# (order: LightGBM, XGBoost, random forest).
pred_dict_total = {}
pred_test_dict_total = {}
for _oof_part, _test_part in ((pred_dict_lgb, pred_test_dict_lgb),
                              (pred_dict_xgb, pred_test_dict_xgb),
                              (pred_dict_rf, pred_test_dict_rf)):
    pred_dict_total.update(_oof_part)
    pred_test_dict_total.update(_test_part)

In [None]:
# Level-2 features: the base models' class probabilities, side by side.
X_train = pd.DataFrame(np.hstack([x for _, x in pred_dict_total.items()]))
X_test = pd.DataFrame(np.hstack([x for _, x in pred_test_dict_total.items()]))

pred = np.zeros((X_train.shape[0], 3), dtype=float)
pred_test = np.zeros((X_test.shape[0], 3), dtype=float)
n_stack_folds = 12  # number of CV folds; the test average below follows it
cv = StratifiedKFold(n_splits=n_stack_folds, shuffle=True, random_state=42)

for i_cv, (i_trn, i_val) in enumerate(cv.split(X_train, train_y)):
    # The fold indices are positions, so the target is indexed with .iloc
    # (label-based train_y[i_trn] only works while the index is 0..n-1).
    y_trn = train_y.iloc[i_trn]
    if i_cv == 0:
        # Tune once on the first fold; the resulting parameters, feature list
        # and iteration count are reused by plain LightGBM on the other folds.
        clf = AutoLGB(objective='multiclass', metric='multi_logloss', params={'num_class': 3},
                      feature_selection=False, n_est=10000)
        clf.tune(X_train.iloc[i_trn], y_trn)
        n_best = clf.n_best
        features = clf.features
        params = clf.params
        print(f'best iteration: {n_best}')
        print(f'selected features ({len(features)}): {features}')
        print(params)
        clf.fit(X_train.iloc[i_trn], y_trn)
    else:
        train_data = lgb.Dataset(X_train[features].iloc[i_trn], label=y_trn)
        clf = lgb.train(params, train_data, n_best, verbose_eval=100)

    pred[i_val] = clf.predict(X_train[features].iloc[i_val])
    # Each fold contributes 1/n_stack_folds of the test prediction.
    pred_test += clf.predict(X_test[features]) / n_stack_folds

In [None]:
# Out-of-fold log loss of the stacked predictions currently held in `pred`.
cv_log_loss = log_loss(train_y, pred)
print(f'CV Log Loss: {cv_log_loss:.6f}')

In [None]:
# Snapshot the first stacker's predictions before `pred` / `pred_test` are reused.
stack1_train, stack1_test = pred.copy(), pred_test.copy()

# Stacking (XGB)

In [None]:
def stack_objective(trial: Trial) -> float:
    """Optuna objective for the XGBoost stacker.

    Samples a parameter set from ``trial``, trains XGBoost with early stopping
    in a stratified 5-fold CV on the global level-2 features ``X_train`` and
    target ``train_y``, and returns the multi-class log loss of the resulting
    out-of-fold predictions (lower is better).
    """
    params_xgb = {
        "random_state": 91373,
        "verbose": None,
        "num_class": 3,
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
        #"tree_method": "gpu_hist",
        "learning_rate": trial.suggest_uniform("learning_rate", 0.005, 0.01),
        #"reg_alpha": trial.suggest_uniform("reg_alpha", 0.1, 1.0),
        #"reg_lambda": trial.suggest_uniform("reg_lambda", 0.1, 1.0),
        "max_depth": trial.suggest_int("max_depth", 5, 10),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.3, 1.0),
        "colsample_bylevel": trial.suggest_uniform("colsample_bylevel", 0.3, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.3, 1.0),
        "gamma": trial.suggest_uniform("gamma", 0.3, 1.0),
        #"min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }

    # Tuning uses 5 folds with one fixed seed.
    seed = 91373
    kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    # Split and size on X_train, the frame that is actually indexed below.
    cv = np.zeros((X_train.shape[0], 3))

    for n, (train_idx, val_idx) in enumerate(kfold.split(X_train, train_y)):

        x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        # Set verbose_eval=100 instead of None to watch training progress.
        stack_xgbmodel = xgb.train(params_xgb, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

        cv[val_idx, :] = stack_xgbmodel.predict(dvalid)
        # Uncomment to check the score of each fold:
        # print(f'fold{n+1} multi_logloss: {log_loss(y_val, cv[val_idx, :])}')

    score = log_loss(train_y, cv)
    print('multi_logloss:', score)

    return score

In [None]:
# Rebuild the level-2 feature frames from the selected base-model predictions.
X_train = pd.DataFrame(np.hstack(list(pred_dict_total.values())))
X_test = pd.DataFrame(np.hstack(list(pred_test_dict_total.values())))

# 30 TPE trials minimising the objective's out-of-fold log loss.
sampler = TPESampler(seed=42)
stack_study = optuna.create_study(
    study_name="stack_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
stack_study.optimize(stack_objective, n_trials=30)

# Best sampled values plus the fixed settings.
stack_best_hyperparams = stack_study.best_trial.params
stack_base_hyperparams = {
    'objective': 'multi:softprob',
    "num_class": 3,
    "eval_metric": "mlogloss",
    #"tree_method": "gpu_hist",
    "random_state": 91373,
}
stack_best_hyperparams.update(stack_base_hyperparams)
print("The best hyperparameters are:\n", stack_best_hyperparams)

In [None]:
# Hard-coded stacker parameters (presumably the result of an earlier study run
# — confirm), so the search cell does not have to be re-executed.
stack_best_hyperparams = dict(
    learning_rate=0.005171942605576092,
    max_depth=10,
    colsample_bytree=0.48114598712001183,
    colsample_bylevel=0.7637655990477874,
    subsample=0.5181977532625877,
    gamma=0.6640476148244676,
    max_bin=364,
)
stack_base_hyperparams = dict(
    objective='multi:softprob',
    num_class=3,
    eval_metric="mlogloss",
    #tree_method="gpu_hist",
    random_state=91373,
)
stack_best_hyperparams.update(stack_base_hyperparams)
print("The best hyperparameters are:\n", stack_best_hyperparams)

In [None]:
# Second stacker: XGBoost on the level-2 features, stratified K-fold with
# early stopping; collects out-of-fold and fold-averaged test probabilities.
pred = np.zeros((X_train.shape[0], 3), dtype=float)
pred_test = np.zeros((X_test.shape[0], 3), dtype=float)
n_stack_folds = 12  # number of CV folds; the test average below follows it
kfold = StratifiedKFold(n_splits=n_stack_folds, random_state=91373, shuffle=True)
dtest = xgb.DMatrix(X_test)  # identical for every fold, so built once

for n, (train_idx, val_idx) in enumerate(kfold.split(X_train, train_y)):
    x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_val, label=y_val)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    # Set verbose_eval=100 instead of None to watch training progress.
    stack_xgbmodel = xgb.train(stack_best_hyperparams, dtrain, 100000, watchlist, early_stopping_rounds=30, verbose_eval=None)

    pred[val_idx] = stack_xgbmodel.predict(dvalid)
    # Each fold contributes 1/n_stack_folds of the test prediction.
    pred_test += stack_xgbmodel.predict(dtest) / n_stack_folds

In [None]:
# Out-of-fold log loss of the stacked predictions currently held in `pred`.
cv_log_loss = log_loss(train_y, pred)
print(f'CV Log Loss: {cv_log_loss:.6f}')

In [None]:
# Snapshot the second stacker's predictions.
stack2_train, stack2_test = pred.copy(), pred_test.copy()

# Blending

In [None]:
# Weighted blend of the out-of-fold predictions:
# 0.1 LightGBM + 0.2 XGBoost + 0.1 random forest + 0.3 per stacker.
# Each family is averaged over however many prediction sets it really holds
# (a fixed "/3" silently mis-scales the blend when that number changes).
pred_final = (sum(pred_dict_lgb.values()) / len(pred_dict_lgb) * 0.1 +
              sum(pred_dict_xgb.values()) / len(pred_dict_xgb) * 0.2 +
              sum(pred_dict_rf.values()) / len(pred_dict_rf) * 0.1 +
              stack1_train * 0.3 +
              stack2_train * 0.3)
log_loss(train_y, pred_final)

In [None]:
# Weighted blend of the test predictions:
# 0.1 LightGBM + 0.2 XGBoost + 0.1 random forest + 0.3 per stacker.
# Each family is averaged over however many prediction sets it really holds
# (a fixed "/3" silently mis-scales the blend when that number changes).
pred_test_final = (sum(pred_test_dict_lgb.values()) / len(pred_test_dict_lgb) * 0.1 +
                   sum(pred_test_dict_xgb.values()) / len(pred_test_dict_xgb) * 0.2 +
                   sum(pred_test_dict_rf.values()) / len(pred_test_dict_rf) * 0.1 +
                   stack1_test * 0.3 +
                   stack2_test * 0.3)

# 결과 제출

In [None]:
# Start from a copy of the sample submission and overwrite every column after
# the first one with the blended test predictions.
submission = sample_submission.copy()
submission.iloc[:, 1:] = pred_test_final

In [None]:
# Create the output directory if needed so the write cannot fail on a fresh checkout.
os.makedirs('submission', exist_ok=True)
submission.to_csv('submission/submission.csv', index=False)

# ============================================================

# ============================================================

# ============================================================

# ============================================================

# ============================================================

# ============================================================

# ============================================================

# ============================================================

# ============================================================

# ============================================================

# ============================================================

# Removed Code

## 기본 모델로 성능 측정하는 함수

In [None]:
def base_lgbmodel(train, verbose=True):
    """Score a feature set with a default LightGBM multiclass model.

    Runs a stratified 5-fold CV with early stopping for each of 5 seeds (drawn
    after ``np.random.seed(0)``, which resets NumPy's global random state) and
    measures the log loss of the out-of-fold predictions.

    Args:
        train: DataFrame containing the features and the 'credit' target.
        verbose: When truthy, print the log loss of every seed.

    Returns:
        The mean out-of-fold log loss over all seeds.
    """
    train_x = train.drop(['credit'], axis=1)
    train_y = train['credit']

    np.random.seed(0)
    lucky_seeds = np.random.randint(1, 10000, 5)
    score_list = []

    for seed in lucky_seeds:

        kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
        cv = np.zeros((train_x.shape[0], 3))

        for train_idx, val_idx in kfold.split(train_x, train_y):

            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

            lgbmodel = LGBMClassifier(objective='multiclass', n_estimators=10000, random_state=seed)
            lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)

            cv[val_idx, :] = lgbmodel.predict_proba(x_val)

        # Compute the seed's score once and reuse it for printing and averaging.
        score = log_loss(train_y, cv)
        if verbose:
            print(f'multi_logloss: {score:.4f}')
        score_list.append(score)

    average = np.mean(score_list)
    print(f'Average Logloss: {average:.4f}')
    return average

## 향상된 모델로 성능 측정하는 함수

In [None]:
def advanced_lgbmodel(train, verbose=True):
    """Score a feature set with a larger LightGBM multiclass model.

    Same protocol as ``base_lgbmodel`` but with 3 seeds and a model using
    learning_rate=0.01, num_leaves=1000 and max_depth=-1. Seeds are drawn after
    ``np.random.seed(0)``, which resets NumPy's global random state.

    Args:
        train: DataFrame containing the features and the 'credit' target.
        verbose: When truthy, print the log loss of every seed.

    Returns:
        The mean out-of-fold log loss over all seeds.
    """
    train_x = train.drop(['credit'], axis=1)
    train_y = train['credit']

    np.random.seed(0)
    lucky_seeds = np.random.randint(1, 10000, 3)
    score_list = []

    for seed in lucky_seeds:

        kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
        cv = np.zeros((train_x.shape[0], 3))

        for train_idx, val_idx in kfold.split(train_x, train_y):

            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

            lgbmodel = LGBMClassifier(learning_rate=0.01, objective='multiclass', num_leaves=1000, max_depth=-1,
                                      n_estimators=10000, random_state=seed)
            lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)

            cv[val_idx, :] = lgbmodel.predict_proba(x_val)

        # Compute the seed's score once and reuse it for printing and averaging.
        score = log_loss(train_y, cv)
        if verbose:
            print(f'multi_logloss: {score:.4f}')
        score_list.append(score)

    average = np.mean(score_list)
    print(f'Average Logloss: {average:.4f}')
    return average

## 원 핫 인코딩

In [None]:
train_oh = train.copy()
test_oh = test.copy()

# String columns are the ones that get one-hot encoded.
object_col = [col for col in train_oh.columns if train_oh[col].dtype == 'object']
enc = OneHotEncoder()
enc.fit(train_oh.loc[:, object_col])

# The encoded frames carry the index of the frame they were derived from:
# pd.concat(axis=1) aligns on the index, so a fresh 0..n-1 index would scatter
# rows (and create NaN rows) whenever the source index is anything else.
train_onehot_df = pd.DataFrame(enc.transform(train_oh.loc[:, object_col]).toarray(),
                               columns=enc.get_feature_names(object_col),
                               index=train_oh.index)
train_oh.drop(object_col, axis=1, inplace=True)
train_oh = pd.concat([train_oh, train_onehot_df], axis=1)

test_onehot_df = pd.DataFrame(enc.transform(test_oh.loc[:, object_col]).toarray(),
                              columns=enc.get_feature_names(object_col),
                              index=test_oh.index)
test_oh.drop(object_col, axis=1, inplace=True)
test_oh = pd.concat([test_oh, test_onehot_df], axis=1)

print('One Hot Encoding Completed')

In [None]:
# base_lgbmodel(train_oh, verbose=False)

## 라벨 인코딩

In [None]:
train_lab = train.copy()
test_lab = test.copy()

# Integer-encode every string column. The encoder is fitted on the values of
# train AND test together so that a given category receives the same integer
# in both frames; fitting each frame separately yields different codes as soon
# as one frame is missing a category that the other one has.
enc = LabelEncoder()
for col in train_lab.columns:
    if train_lab[col].dtypes == 'object':
        if col in test_lab.columns:
            enc.fit(pd.concat([train_lab[col], test_lab[col]], ignore_index=True))
            train_lab[col] = enc.transform(train_lab[col])
            test_lab[col] = enc.transform(test_lab[col])
        else:
            # Column exists only in train (nothing to keep consistent with).
            train_lab[col] = enc.fit_transform(train_lab[col])


print('Label Encoding Completed')

In [None]:
# base_lgbmodel(train_lab, verbose=False)

## 카테고리 인코딩

In [None]:
train_cat = train.copy()
test_cat = test.copy()

# Convert string columns to pandas categoricals. Train and test share one
# categorical dtype per column (built from the values of both frames) so the
# underlying category codes mean the same thing in each frame; inferring the
# categories separately gives different codes when the value sets differ.
for col in train_cat.columns:
    if train_cat[col].dtypes == 'object':
        if col in test_cat.columns:
            combined = pd.concat([train_cat[col], test_cat[col]], ignore_index=True)
            shared_dtype = pd.api.types.CategoricalDtype(
                categories=combined.astype('category').cat.categories)
            train_cat[col] = train_cat[col].astype(shared_dtype)
            test_cat[col] = test_cat[col].astype(shared_dtype)
        else:
            # Column exists only in train (nothing to keep consistent with).
            train_cat[col] = train_cat[col].astype('category')

print('Category Encoding Completed')

In [None]:
# base_lgbmodel(train_cat, verbose=False)

### income_total 범주화

In [None]:
# print('Initial Logloss', end=' ')
# base_lgbmodel(train_cat, verbose=False)
# raw_income = train_cat.income_total.copy()
# for cut in np.arange(1000, 5000, 500):
#     print(f'cut space:{cut}', end=' ')
#     cutted_income = pd.cut(raw_income, bins=np.arange(27000, 1575000, cut), labels=False)
#     train_cat['income_total'] = cutted_income
#     base_lgbmodel(train_cat, verbose=False)

## K-Means clustering (income_total, income_type, occyp_type)

In [None]:
# # income_type, occyp_type, income_total을 이용하여 클러스터링하기 위해 따로 원 핫 인코딩
# kmeans_train = train[['occyp_type', 'income_type', 'income_total']]
# kmeans_test = test[['occyp_type', 'income_type', 'income_total']]
# object_col = []
# for col in kmeans_train.columns:
#     if kmeans_train[col].dtype == 'object':
#         object_col.append(col)
        
# enc = OneHotEncoder()
# enc.fit(kmeans_train.loc[:,object_col])

# train_onehot_df = pd.DataFrame(enc.transform(kmeans_train.loc[:,object_col]).toarray(), 
#              columns=enc.get_feature_names(object_col))
# kmeans_train.drop(object_col, axis=1, inplace=True)
# kmeans_train = pd.concat([kmeans_train, train_onehot_df], axis=1)

# test_onehot_df = pd.DataFrame(enc.transform(kmeans_test.loc[:,object_col]).toarray(),
#              columns=enc.get_feature_names(object_col))
# kmeans_test.drop(object_col, axis=1, inplace=True)
# kmeans_test = pd.concat([kmeans_test, test_onehot_df], axis=1)

In [None]:
# base_lgbmodel(train_cat, verbose=False)
# # n_clusters를 3부터 10까지 진행하여 하나씩 성능 체크
# score_list = {}
# k_means_train_total_df = pd.DataFrame()
# k_means_test_total_df = pd.DataFrame()
# for i in tqdm(range(3, 12)):
#     train_cat = train.copy()
#     test_cat = test.copy()

#     for col in train_cat.columns:
#         if train_cat[col].dtypes=='object':
#             train_cat[col] =  train_cat[col].astype('category')
#             test_cat[col] =  test_cat[col].astype('category')

#     # n_cluster를 늘려가며 클러스터링 진행
#     k_means_train_df = pd.DataFrame()
#     k_means_test_df = pd.DataFrame()
#     k_means = cluster.KMeans(n_clusters=i)
#     k_means.fit(kmeans_train)
#     k_means_train_df = pd.concat([k_means_train_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
#     k_means_train_total_df = pd.concat([k_means_train_total_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
#     k_means.fit(kmeans_test)
#     k_means_test_df = pd.concat([k_means_test_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
#     k_means_test_total_df = pd.concat([k_means_test_total_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)

#     train_cat = pd.concat([train_cat, k_means_train_df], axis=1)
#     test_cat = pd.concat([test_cat, k_means_test_df], axis=1)

#     # 클러스터링 결과를 category 타입으로 변경
#     for col in train_cat.columns:
#         if train_cat[col].dtypes=='int32':
#             train_cat[col] =  train_cat[col].astype('category')
#             test_cat[col] =  test_cat[col].astype('category')
#     print(f'cluster: {i}', end=' ')
#     score_list[f'cluster_{i}'] = base_lgbmodel(train_cat, verbose=False)

In [None]:
# base_lgbmodel(train_cat, verbose=False)
# # n_clusters를 3부터 10까지 진행하여 하나씩 성능 체크
# score_list = {}
# k_means_train_total_df = pd.DataFrame()
# k_means_test_total_df = pd.DataFrame()
# for i in tqdm(range(3, 12)):
#     train_cat = train.copy()
#     test_cat = test.copy()

#     for col in train_cat.columns:
#         if train_cat[col].dtypes=='object':
#             train_cat[col] =  train_cat[col].astype('category')
#             test_cat[col] =  test_cat[col].astype('category')

#     # n_cluster를 늘려가며 클러스터링 진행
#     k_means_train_df = pd.DataFrame()
#     k_means_test_df = pd.DataFrame()
#     k_means = cluster.KMeans(n_clusters=i)
#     k_means.fit(kmeans_train)
#     k_means_train_df = pd.concat([k_means_train_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
#     k_means_train_total_df = pd.concat([k_means_train_total_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
#     k_means.fit(kmeans_test)
#     k_means_test_df = pd.concat([k_means_test_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)
#     k_means_test_total_df = pd.concat([k_means_test_total_df, pd.DataFrame(k_means.labels_, columns=[f'cluster_{i}'])], axis=1)

#     train_cat = pd.concat([train_cat, k_means_train_df], axis=1)
#     test_cat = pd.concat([test_cat, k_means_test_df], axis=1)

#     # 클러스터링 결과를 category 타입으로 변경
#     for col in train_cat.columns:
#         if train_cat[col].dtypes=='int32':
#             train_cat[col] =  train_cat[col].astype('category')
#             test_cat[col] =  test_cat[col].astype('category')
#     print(f'cluster: {i}', end=' ')
#     score_list[f'cluster_{i}'] = base_lgbmodel(train_cat, verbose=False)

In [None]:
# # 성능이 좋아지는 클러스터링 개수로만 피처 추출

# train_cat = train.copy()
# test_cat = test.copy()

# for col in train_cat.columns:
#     if train_cat[col].dtypes=='object':
#         train_cat[col] =  train_cat[col].astype('category')
#         test_cat[col] =  test_cat[col].astype('category')
        
# n = 1 # 성능이 좋아지는 클러스터 개수별로 정렬한 후 앞의 n개만 추출
# train_cat = pd.concat([train_cat, k_means_train_total_df.loc[:, sorted(score_list, key=lambda x: score_list[x])[:n]]], axis=1)
# test_cat = pd.concat([test_cat, k_means_test_total_df.loc[:, sorted(score_list, key=lambda x: score_list[x])[:n]]], axis=1)

# for col in train_cat.columns:
#     if train_cat[col].dtypes=='int32':
#         train_cat[col] =  train_cat[col].astype('category')
#         test_cat[col] =  test_cat[col].astype('category')
        
# print(k_means_train_total_df.loc[:, sorted(score_list, key=lambda x: score_list[x])[:n]].columns.tolist())
# print(base_lgbmodel(train_cat, verbose=False))

In [None]:
# k_means_train_total_df.loc[:, sorted(score_list, key=lambda x: score_list[x])[:n]].to_csv(
#     './save/k_means_train.csv', index=False)
# k_means_test_total_df.loc[:, sorted(score_list, key=lambda x: score_list[x])[:n]].to_csv(
#     './save/k_means_test.csv', index=False)

In [None]:
# train_cat = pd.concat([train_cat, pd.read_csv('./save/k_means_train.csv')], axis=1)
# test_cat = pd.concat([test_cat, pd.read_csv('./save/k_means_test.csv')], axis=1)

### Permutation Feature Importance

In [None]:
# enc = LabelEncoder()
# for col in train_cat.columns:
#     if train_cat[col].dtype.name=='category':
#         train_cat[col] = enc.fit_transform(train_cat[col])
#         test_cat[col] = enc.transform(test_cat[col])  # reuse the mapping fitted on train so codes agree across train/test
        
# print('Label Encoding Completed')

# train_x = train_cat.drop(['credit'], axis=1)
# train_y = train_cat['credit']
# test_x = test_cat.copy()

# seeds = np.random.randint(0, 1000, 3)
# perm_dicts = {}
# cv = np.zeros((train_x.shape[0], 3))
# for n, seed in enumerate(seeds):
#     kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
#     for i, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):

#         x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
#         y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]

#         lgbm = LGBMClassifier(n_estimators=10000, objective='multiclass', seed=0)
#         lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
#         cv[val_idx, :] = lgbm.predict_proba(x_val)
        
#         perm = PermutationImportance(lgbm, scoring = "neg_log_loss", random_state = seed).fit(x_val, y_val)
#         perm_dicts[str(seed)+'_seed_'+str(i+1)+'_fold'] = pd.DataFrame({'feature':x_val.columns.tolist(), 
#                                                                         'importance':perm.feature_importances_}
#                                                                       ).sort_values('importance')
#     print('multi_logloss:', log_loss(train_y, cv))
        
# for i, df in enumerate(perm_dicts.values()):
#     if i==0:
#         perm_df = df
#     else:
#         perm_df = pd.merge(perm_df, df, on='feature')
# perm_remove_df = perm_df.set_index('feature').mean(axis=1)>=0
# remove_features = perm_remove_df[perm_remove_df==False].index
# train_x = train_x.drop(remove_features, axis=1)

### 변수 하나씩 지우며 성능 체크하는 코드

In [None]:
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# cv = np.zeros((train_x.shape[0], 3))
# for n, (train_idx, val_idx) in enumerate(kfold.split(train_x, train_y)):
#     x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
#     y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
#     lgbm = LGBMClassifier(**lgb_best_hyperparams, seed=0)
#     lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
#     cv[val_idx] = lgbm.predict_proba(x_val)
# Initial_log_loss = log_loss(train_y, cv)
# print(f'Initial_multi_logloss: {Initial_log_loss}')

# remove_features = {}
# for i in range(1, 2):
#     for j in tqdm(combinations(list(range(0, train_x.shape[1])), i)):
#         train_new_x = train_x.drop(train_x.columns[list(j)], axis=1)
        
#         kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
#         cv = np.zeros((train_new_x.shape[0], 3))
#         for n, (train_idx, val_idx) in enumerate(kfold.split(train_new_x, train_y)):
#             x_train, x_val = train_new_x.iloc[train_idx], train_new_x.iloc[val_idx]
#             y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
#             lgbm = LGBMClassifier(**lgb_best_hyperparams, seed=0)
#             lgbm.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=None)
#             cv[val_idx] = lgbm.predict_proba(x_val)
#         remove_features[list(j)[0]] = log_loss(train_y, cv)
#         if Initial_log_loss > log_loss(train_y, cv):
#             print(f'{list(j)[0]}_multi_logloss: {log_loss(train_y, cv)}')

In [None]:
# remove_feature = sorted(remove_features, key=lambda x: remove_features[x])[:3]
# train_x = train_x.drop(train_x.columns[remove_feature], axis=1)
# test_x = test_x.drop(test_x.columns[remove_feature], axis=1)

### RandomForest GridSearchCV

In [None]:
# params = {'max_depth': [55, 60, 65] # 튜닝할 파라미터 삽입
#             }

# rf_clf = RandomForestClassifier(random_state = 0, n_estimators = 1000, 
#                                 min_samples_leaf=2, min_samples_split=2,
#                                 criterion='entropy', n_jobs = -1)
# grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 5, n_jobs = -1)
# grid_cv.fit(df_train, y)

# print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
# print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))