### 사용할 수 있는 모델들
로지스틱 회귀, 디시전 트리, MLP 분류 모델, KNN, SVC, LightGBM, XGBoost, CatBoost

In [None]:
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier  # 뉴럴넷. 대표적인 비선형 모델
from sklearn.svm import SVC # 선형 모델. 서포트 벡터머신. 직선을 그어서 두 개의 클래스를 나눔. 
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from ngboost import NGBClassifier
from ngboost.distns import k_categorical, Bernoulli

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
import optuna

In [None]:
# Load the classification data sets from the working directory.
# The training features come from the previous competition's data.
X_train, y_train, X_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        'X_train_preprocessed.csv',
        'y_train.csv',
        'X_test_preprocessed.csv',
    )
)

In [None]:
# Keep the test-set customer ids; they are written to the submission file later.
id_test = X_test['custid']

In [None]:
# Remove the identifier column in place from every frame so it is not used
# as a feature or as part of the target.
for frame in (X_train, X_test, y_train):
    frame.drop(columns='custid', inplace=True)

In [None]:
# Bare expression: shows the training feature frame as the notebook cell output.
X_train

## 모델링

#### LogisticRegression

In [None]:
def lr_objective(trial):
    """Optuna objective for logistic regression.

    Samples ``C`` (integer in [50000, 100000]) and ``penalty`` (only ``'l2'``)
    from the trial, then evaluates an L-BFGS logistic regression with
    4-fold cross-validation on the global ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC-AUC over the 4 cross-validation folds.
    """
    # Dict literals evaluate in order, so 'C' is sampled before 'penalty'.
    sampled = {
        'C': trial.suggest_int('C', 50000, 100000, step=1),
        'penalty': trial.suggest_categorical('penalty', ['l2']),
    }

    model = LogisticRegression(
        solver='lbfgs',
        warm_start=True,
        n_jobs=-1,
        random_state=0,
        **sampled,
    )

    fold_scores = cross_val_score(
        model, X_train, y_train, scoring='roc_auc', cv=4, n_jobs=-1
    )
    return fold_scores.mean()

# Run 12 Optuna trials, keeping the parameters with the highest objective value.
lr_study = optuna.create_study(direction="maximize")
lr_study.optimize(lr_objective, n_trials=12)

# Report the best cross-validated score and the parameters that produced it.
print("Best score:", lr_study.best_value)
print("Best parameters:", lr_study.best_params)

# Unfitted logistic regression configured with the best sampled parameters.
lr_model = LogisticRegression(random_state=0, **lr_study.best_params)

#### CatBoostClassifier

In [None]:
# CatBoost uses a fixed configuration (no Optuna search) and is fitted on the
# full training set right away.
cat_model = CatBoostClassifier(
    iterations=7000,
    learning_rate=0.01,
    random_state=0,
)
cat_model.fit(X_train, y_train)

#### XGBClassifier

In [None]:
def xgb_objective(trial):
    """Optuna objective for XGBoost.

    Samples learning rate, tree depth and the row/column subsampling ratios,
    then evaluates the resulting ``XGBClassifier`` with 4-fold
    cross-validation on the global ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC-AUC over the 4 cross-validation folds.
    """
    # `suggest_loguniform` / `suggest_discrete_uniform` are deprecated since
    # Optuna 3.0; `suggest_float` with `log=True` / `step=` samples the same
    # distributions under the same parameter names.
    xgb_learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
    xgb_max_depth = trial.suggest_int('max_depth', 3, 10)
    xgb_subsample = trial.suggest_float('subsample', 0.6, 1.0, step=0.1)
    xgb_colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.1)

    classifier_obj = XGBClassifier(
        learning_rate=xgb_learning_rate,
        max_depth=xgb_max_depth,
        subsample=xgb_subsample,
        colsample_bytree=xgb_colsample_bytree,
        random_state=0,
    )

    score = cross_val_score(
        classifier_obj, X_train, y_train, cv=4, scoring='roc_auc', n_jobs=-1
    )
    return score.mean()

# Run 12 Optuna trials, keeping the parameters with the highest objective value.
xgb_study = optuna.create_study(direction="maximize")
xgb_study.optimize(xgb_objective, n_trials=12)

# Report the best cross-validated score and the parameters that produced it.
print("Best score:", xgb_study.best_value)
print("Best parameters:", xgb_study.best_params)

# Unfitted XGBoost classifier configured with the best sampled parameters.
xgb_model = XGBClassifier(random_state=0, **xgb_study.best_params)

#### RandomForestClassifier

In [None]:
def rf_objective(trial):
    """Optuna objective for a random forest.

    Every hyperparameter is drawn from a small categorical grid; the
    resulting ``RandomForestClassifier`` is evaluated with 4-fold
    cross-validation on the global ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC-AUC over the 4 cross-validation folds.
    """
    # Insertion order matters: parameters are sampled in this order.
    search_space = {
        'max_depth': [None],
        'max_features': [1, 3, 10],
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [False],
        'n_estimators': [100, 300],
        'criterion': ['gini'],
    }
    sampled = {
        param: trial.suggest_categorical(param, choices)
        for param, choices in search_space.items()
    }

    forest = RandomForestClassifier(random_state=0, **sampled)

    fold_scores = cross_val_score(
        forest, X_train, y_train, scoring='roc_auc', cv=4, n_jobs=4
    )
    return fold_scores.mean()

# Run 12 Optuna trials, keeping the parameters with the highest objective value.
rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(rf_objective, n_trials=12)

# Report the best cross-validated score and the parameters that produced it.
print("Best score:", rf_study.best_value)
print("Best parameters:", rf_study.best_params)

# Unfitted random forest configured with the best sampled parameters.
rf_model = RandomForestClassifier(random_state=0, **rf_study.best_params)

#### GradientBoostingClassifier

In [None]:
# Gradient Boosting

def gb_objective(trial):
    """Optuna objective for scikit-learn gradient boosting.

    Samples the number of estimators, learning rate, tree depth, minimum
    leaf size and feature fraction from small categorical grids, then
    evaluates the resulting ``GradientBoostingClassifier`` with 4-fold
    cross-validation on the global ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC-AUC over the 4 cross-validation folds.
    """
    # `loss` is intentionally not sampled: its only candidate was "deviance",
    # which scikit-learn deprecated in 1.1 and rejects from 1.3 on. Leaving it
    # unset selects the same default loss on every scikit-learn version, and
    # keeps `best_params` safe to splat into the constructor.
    gb_n_estimators = trial.suggest_categorical('n_estimators', [100, 200, 300])
    gb_learning_rate = trial.suggest_categorical('learning_rate', [0.1, 0.05, 0.01])
    gb_max_depth = trial.suggest_categorical('max_depth', [4, 8])
    gb_min_samples_leaf = trial.suggest_categorical('min_samples_leaf', [100, 150])
    gb_max_features = trial.suggest_categorical('max_features', [0.3, 0.1])

    classifier_obj = GradientBoostingClassifier(
        n_estimators=gb_n_estimators,
        learning_rate=gb_learning_rate,
        max_depth=gb_max_depth,
        min_samples_leaf=gb_min_samples_leaf,
        max_features=gb_max_features,
        random_state=0,
    )

    # Scored with ROC-AUC like the other objectives in this notebook, so the
    # tuned models are selected and later compared on the same metric.
    score = cross_val_score(
        classifier_obj, X_train, y_train, cv=4, scoring='roc_auc', n_jobs=4
    )
    return score.mean()

# Run 12 Optuna trials, keeping the parameters with the highest objective value.
gb_study = optuna.create_study(direction="maximize")
gb_study.optimize(gb_objective, n_trials=12)

# Report the best cross-validated score and the parameters that produced it.
print("Best score:", gb_study.best_value)
print("Best parameters:", gb_study.best_params)

# Unfitted gradient-boosting classifier configured with the best sampled parameters.
gb_model = GradientBoostingClassifier(random_state=0, **gb_study.best_params)

### 튜닝한 모델 파라미터들 아래에 입력

In [None]:
# Tuned (still unfitted) classifiers whose predictions are compared below.
clfs = [lr_model, xgb_model, rf_model, gb_model]

# Recorded score for the CatBoost model (presumably its CV ROC-AUC - TODO confirm).
# It is stored under its own name on purpose: rebinding `cat_model` to this
# float would discard the fitted CatBoost estimator, which is later passed to
# VotingClassifier as the 'cat' estimator.
cat_score = 0.7265681369577974

### 유사도 측정

In [None]:
# For every tuned classifier, collect its positive-class probabilities on the
# test set, labelled with the class name and its 4-fold CV ROC-AUC.
pred_results = []
for clf in clfs:
    name = type(clf).__name__  # class name used as the model label
    fitted = clf.fit(X_train, y_train)
    pred = fitted.predict_proba(X_test)[:, 1]
    fold_scores = cross_val_score(
        clf, X_train, y_train, scoring='roc_auc', cv=4, n_jobs=-1
    )
    score = fold_scores.mean()
    # The label layout '<name> \n(<score>)' is parsed again when plotting.
    labelled = pd.Series(pred, name=f'{name} \n({score:.4f})')
    pred_results.append(labelled)
    print("{:30s} {}".format(name, score))

# One column of test-set probabilities per model.
ensemble_results = pd.concat(pred_results, axis=1)

In [None]:
# Mean correlation of each model's predictions with every *other* model:
# the self-correlation of 1 is removed from each column sum before averaging.
corr_matrix = ensemble_results.corr()
corr = (corr_matrix.sum() - 1) / (corr_matrix.shape[0] - 1)

# Column labels have the form '<ModelName> \n(<score:.4f>)': the last 10
# characters are the score suffix, of which characters [-7:-1] are the score.
names = corr.index.str[:-10]
aucs = np.array(corr.index.str[-7:-1]).astype(float)
df = pd.DataFrame({'model': names, 'auc': aucs, 'cor': corr})

plt.figure(figsize=(8, 6))
g = sns.scatterplot(x="cor", y="auc", data=df, s=40, color='red')
# `df` is indexed by the label strings, so rows are iterated directly instead
# of using integer keys (`df.cor[0]`), whose positional fallback on a
# non-integer index is deprecated in pandas.
for row in df.itertuples(index=False):
    g.text(row.cor + 0.003, row.auc - 0.001,
           row.model, horizontalalignment='left',
           size='medium', color='black', weight='semibold')

plt.xlim((df.cor.min() - 0.01, df.cor.max() + 0.01))
plt.ylim((df.auc.min() - 0.01, df.auc.max() + 0.01))
plt.xlabel('Mean Agreement')
plt.ylabel('ROC-AUC')
plt.grid()
plt.show()

## 앙상블 시작
voting: 평가지표가 accuracy, recall, precision 등일 경우 사용  
averaging: 평가지표가 roc-auc, logloss 등일 경우 사용

### voting : soft  - 개별 모델 가중치 조정 가능

In [None]:
# Tune these! One soft-voting weight per estimator, in the same order as the
# estimator list built below.
w_weights = [0.2, 0.1, 0.3, 0.4]
e_estimators = list(zip(
    ('lr', 'rf', 'xgb', 'cat'),
    (lr_model, rf_model, xgb_model, cat_model),
))

# Soft voting: the weighted average of the members' predicted probabilities.
averaging = VotingClassifier(
    voting='soft',
    weights=w_weights,
    estimators=e_estimators,
)

averaging.fit(X_train, y_train)


# Optional check: 4-fold CV ROC-AUC of the whole ensemble.
# print('AUC =', (cross_val_score(averaging, X_train, y_train, cv=4, scoring='roc_auc', n_jobs=-1)).mean())

### Make submissions & Save features

In [None]:
# Write the submission CSV (to be uploaded to Kaggle); the file name carries
# the current month/day/hour/minute.
t = pd.Timestamp.now()
fname = f"submission_{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}.csv"
submission = pd.DataFrame({
    'custid': id_test,
    'gender': averaging.predict_proba(X_test)[:, 1],
})
submission.to_csv(fname, index=False)
print(f"'{fname}' is ready to submit.")

# Also save the features and fitted ensemble as a pickle; after the
# competition ends, the best-performing file is e-mailed to the professor.
fname = f"features_{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}.pkl"
with open(fname, 'wb') as f:
    payload = (X_train, X_test, averaging)
    pickle.dump(payload, f)