In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import precision_score, recall_score, f1_score,confusion_matrix, classification_report,roc_auc_score
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 01. Data Load

In [2]:
# train 데이터 로드
X_train_border = pd.read_csv('Data/X_train_border.csv')
X_train_smote = pd.read_csv('Data/X_train_smote.csv')
X_train_adasyn = pd.read_csv('Data/X_train_adasyn.csv')
X_train_tomek = pd.read_csv('Data/X_train_tomek.csv')
X_train_enn = pd.read_csv('Data/X_train_enn.csv')
y_train_border = pd.read_csv('Data/y_train_border.csv')
y_train_smote = pd.read_csv('Data/y_train_smote.csv')
y_train_adasyn = pd.read_csv('Data/y_train_adasyn.csv')
y_train_tomek = pd.read_csv('Data/y_train_tomek.csv')
y_train_enn = pd.read_csv('Data/y_train_enn.csv')

# test 데이터 로드
X_test = pd.read_csv('Data/X_test.csv')
y_test = pd.read_csv('Data/y_test.csv')

# 02. AUC & F1 score

## 02-1. LogisticRegression
- GridSearchCV를 통해 확인한 파라미터를 통한 AUC 및 F1 score 확인

In [8]:
model = LogisticRegression(C=1,penalty='l2')
model.fit(X_train_smote,y_train_smote)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.21012055723945736
test roc_auc :  0.7855704395647639


In [9]:
model = LogisticRegression(C=1,penalty='l2')
model.fit(X_train_border,y_train_border)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.21629840102317718
test roc_auc :  0.7853252942927272


In [10]:
model = LogisticRegression(C=1,penalty='l2')
model.fit(X_train_adasyn,y_train_adasyn)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.20610447558537506
test roc_auc :  0.7853258314575098


In [11]:
model = LogisticRegression(C=1,penalty='l2')
model.fit(X_train_tomek,y_train_tomek)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.2097679907663178
test roc_auc :  0.78558036269948


In [12]:
model = LogisticRegression(C=1,penalty='l2')
model.fit(X_train_enn,y_train_enn)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.20124645656161413
test roc_auc :  0.7855238451328883


## 02-2. CatBoost

In [3]:
model = CatBoostClassifier(task_type='GPU',depth=6,iterations=1000,l2_leaf_reg=1e-19,
                           leaf_estimation_iterations=10,random_seed=42)
model.fit(X_train_smote, y_train_smote)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

0:	learn: 0.6817169	total: 12.3s	remaining: 3h 24m 58s
1:	learn: 0.6712096	total: 22s	remaining: 3h 2m 56s
2:	learn: 0.6612772	total: 22.1s	remaining: 2h 2m 10s
3:	learn: 0.6518852	total: 22.1s	remaining: 1h 31m 47s
4:	learn: 0.6430878	total: 22.2s	remaining: 1h 13m 33s
5:	learn: 0.6354085	total: 22.2s	remaining: 1h 1m 24s
6:	learn: 0.6278219	total: 22.3s	remaining: 52m 45s
7:	learn: 0.6208715	total: 22.4s	remaining: 46m 14s
8:	learn: 0.6141349	total: 24.6s	remaining: 45m 10s
9:	learn: 0.6082848	total: 25.8s	remaining: 42m 30s
10:	learn: 0.6026729	total: 25.9s	remaining: 38m 44s
11:	learn: 0.5973778	total: 1m 15s	remaining: 1h 44m 6s
12:	learn: 0.5920210	total: 1m 15s	remaining: 1h 36m 5s
13:	learn: 0.5877023	total: 1m 16s	remaining: 1h 30m 17s
14:	learn: 0.5828422	total: 1m 17s	remaining: 1h 25m 6s
15:	learn: 0.5788691	total: 1m 20s	remaining: 1h 22m 42s
16:	learn: 0.5749936	total: 1m 27s	remaining: 1h 24m 25s
17:	learn: 0.5710474	total: 1m 27s	remaining: 1h 19m 42s
18:	learn: 0.56746

In [4]:
model = CatBoostClassifier(task_type='GPU',depth=6,iterations=1000,l2_leaf_reg=1e-19,
                           leaf_estimation_iterations=10,random_seed=42)
model.fit(X_train_border, y_train_border)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

0:	learn: 0.6787666	total: 61.8ms	remaining: 1m 1s
1:	learn: 0.6654123	total: 122ms	remaining: 1m 1s
2:	learn: 0.6534846	total: 184ms	remaining: 1m
3:	learn: 0.6419870	total: 245ms	remaining: 1m 1s
4:	learn: 0.6316953	total: 307ms	remaining: 1m 1s
5:	learn: 0.6223174	total: 368ms	remaining: 1m
6:	learn: 0.6136814	total: 428ms	remaining: 1m
7:	learn: 0.6054324	total: 489ms	remaining: 1m
8:	learn: 0.5970807	total: 550ms	remaining: 1m
9:	learn: 0.5896754	total: 611ms	remaining: 1m
10:	learn: 0.5824340	total: 672ms	remaining: 1m
11:	learn: 0.5763233	total: 732ms	remaining: 1m
12:	learn: 0.5704664	total: 794ms	remaining: 1m
13:	learn: 0.5646638	total: 856ms	remaining: 1m
14:	learn: 0.5594142	total: 915ms	remaining: 1m
15:	learn: 0.5541410	total: 978ms	remaining: 1m
16:	learn: 0.5495864	total: 1.04s	remaining: 1m
17:	learn: 0.5455632	total: 1.1s	remaining: 59.9s
18:	learn: 0.5395106	total: 1.16s	remaining: 59.8s
19:	learn: 0.5350382	total: 1.22s	remaining: 59.8s
20:	learn: 0.5309693	total: 1

In [5]:
model = CatBoostClassifier(task_type='GPU',depth=6,iterations=1000,l2_leaf_reg=1e-19,
                           leaf_estimation_iterations=10,random_seed=42)
model.fit(X_train_adasyn, y_train_adasyn)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

0:	learn: 0.6824988	total: 59ms	remaining: 59s
1:	learn: 0.6729686	total: 117ms	remaining: 58.4s
2:	learn: 0.6637332	total: 174ms	remaining: 57.8s
3:	learn: 0.6552036	total: 230ms	remaining: 57.2s
4:	learn: 0.6473714	total: 286ms	remaining: 57s
5:	learn: 0.6399264	total: 344ms	remaining: 57s
6:	learn: 0.6331949	total: 400ms	remaining: 56.8s
7:	learn: 0.6263752	total: 458ms	remaining: 56.8s
8:	learn: 0.6204775	total: 515ms	remaining: 56.7s
9:	learn: 0.6144333	total: 573ms	remaining: 56.7s
10:	learn: 0.6088797	total: 631ms	remaining: 56.8s
11:	learn: 0.6038401	total: 688ms	remaining: 56.7s
12:	learn: 0.5991375	total: 746ms	remaining: 56.6s
13:	learn: 0.5945720	total: 801ms	remaining: 56.4s
14:	learn: 0.5905595	total: 857ms	remaining: 56.3s
15:	learn: 0.5862223	total: 914ms	remaining: 56.2s
16:	learn: 0.5822922	total: 971ms	remaining: 56.1s
17:	learn: 0.5785106	total: 1.03s	remaining: 56s
18:	learn: 0.5751419	total: 1.08s	remaining: 56s
19:	learn: 0.5695782	total: 1.14s	remaining: 55.8s
2

In [6]:
model = CatBoostClassifier(task_type='GPU',depth=6,iterations=1000,l2_leaf_reg=1e-19,
                           leaf_estimation_iterations=10,random_seed=42)
model.fit(X_train_tomek, y_train_tomek)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

0:	learn: 0.6815928	total: 60.8ms	remaining: 1m
1:	learn: 0.6709612	total: 121ms	remaining: 1m
2:	learn: 0.6609781	total: 182ms	remaining: 1m
3:	learn: 0.6515709	total: 242ms	remaining: 1m
4:	learn: 0.6427261	total: 302ms	remaining: 1m
5:	learn: 0.6351271	total: 362ms	remaining: 1m
6:	learn: 0.6276028	total: 421ms	remaining: 59.7s
7:	learn: 0.6203572	total: 480ms	remaining: 59.5s
8:	learn: 0.6139944	total: 540ms	remaining: 59.5s
9:	learn: 0.6075602	total: 598ms	remaining: 59.2s
10:	learn: 0.6017817	total: 655ms	remaining: 58.9s
11:	learn: 0.5966874	total: 714ms	remaining: 58.8s
12:	learn: 0.5913850	total: 774ms	remaining: 58.7s
13:	learn: 0.5865960	total: 834ms	remaining: 58.7s
14:	learn: 0.5817290	total: 894ms	remaining: 58.7s
15:	learn: 0.5772127	total: 952ms	remaining: 58.6s
16:	learn: 0.5733250	total: 1.01s	remaining: 58.4s
17:	learn: 0.5691222	total: 1.07s	remaining: 58.4s
18:	learn: 0.5656495	total: 1.13s	remaining: 58.3s
19:	learn: 0.5598241	total: 1.19s	remaining: 58.3s
20:	lea

In [7]:
model = CatBoostClassifier(task_type='GPU',depth=6,iterations=1000,l2_leaf_reg=1e-19,
                           leaf_estimation_iterations=10,random_seed=42)
model.fit(X_train_enn, y_train_enn)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

0:	learn: 0.6786149	total: 55.2ms	remaining: 55.2s
1:	learn: 0.6646506	total: 108ms	remaining: 53.9s
2:	learn: 0.6518506	total: 161ms	remaining: 53.6s
3:	learn: 0.6398632	total: 216ms	remaining: 53.8s
4:	learn: 0.6291309	total: 271ms	remaining: 54s
5:	learn: 0.6180985	total: 324ms	remaining: 53.7s
6:	learn: 0.6087147	total: 378ms	remaining: 53.6s
7:	learn: 0.5993247	total: 431ms	remaining: 53.5s
8:	learn: 0.5907961	total: 486ms	remaining: 53.5s
9:	learn: 0.5830910	total: 539ms	remaining: 53.3s
10:	learn: 0.5755110	total: 592ms	remaining: 53.2s
11:	learn: 0.5689301	total: 647ms	remaining: 53.2s
12:	learn: 0.5620560	total: 700ms	remaining: 53.2s
13:	learn: 0.5557716	total: 752ms	remaining: 53s
14:	learn: 0.5504809	total: 804ms	remaining: 52.8s
15:	learn: 0.5450731	total: 855ms	remaining: 52.6s
16:	learn: 0.5401198	total: 909ms	remaining: 52.6s
17:	learn: 0.5352866	total: 963ms	remaining: 52.5s
18:	learn: 0.5307549	total: 1.02s	remaining: 52.5s
19:	learn: 0.5261643	total: 1.07s	remaining:

## 02-3. XGBoost

In [8]:
model = XGBClassifier(tree_method='gpu_hist', 
                      predictor = 'gpu_predictor',
                     learning_rate=1,max_depth=10,min_child_weight=5) 
model.fit(X_train_smote, y_train_smote)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.23324058424544805
test roc_auc :  0.8356116375028685


In [9]:
model = XGBClassifier(tree_method='gpu_hist', 
                      predictor = 'gpu_predictor',
                     learning_rate=1,max_depth=10,min_child_weight=5) 
model.fit(X_train_border, y_train_border)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.23881719972176088
test roc_auc :  0.8350335813880593


In [10]:
model = XGBClassifier(tree_method='gpu_hist', 
                      predictor = 'gpu_predictor',
                     learning_rate=1,max_depth=10,min_child_weight=5) 
model.fit(X_train_adasyn, y_train_adasyn)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.21773321613864235
test roc_auc :  0.8366700218621003


In [11]:
model = XGBClassifier(tree_method='gpu_hist', 
                      predictor = 'gpu_predictor',
                     learning_rate=1,max_depth=10,min_child_weight=5) 
model.fit(X_train_tomek, y_train_tomek)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.23680127078445937
test roc_auc :  0.8362411568485397


In [12]:
model = XGBClassifier(tree_method='gpu_hist', 
                      predictor = 'gpu_predictor',
                     learning_rate=1,max_depth=10,min_child_weight=5) 
model.fit(X_train_enn, y_train_enn)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.29896446181634445
test roc_auc :  0.8398984577644051


## 02-4. LightGBM

In [13]:
model = LGBMClassifier(objective='binary',
                      device='gpu',
                      metric='binary_logloss',
                      learning_rate = 0.2,
                      max_depth = -1,
                      min_child_samples=15,
                      num_leaves = 80) 
model.fit(X_train_smote, y_train_smote)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.21918828159426235
test roc_auc :  0.833660220041985


In [14]:
model = LGBMClassifier(objective='binary',
                      device='gpu',
                      metric='binary_logloss',
                      learning_rate = 0.2,
                      max_depth = -1,
                      min_child_samples=15,
                      num_leaves = 80) 
model.fit(X_train_border, y_train_border)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.23318308526876247
test roc_auc :  0.8351239535698192


In [15]:
model = LGBMClassifier(objective='binary',
                      device='gpu',
                      metric='binary_logloss',
                      learning_rate = 0.2,
                      max_depth = -1,
                      min_child_samples=15,
                      num_leaves = 80) 
model.fit(X_train_adasyn, y_train_adasyn)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.1707580137193149
test roc_auc :  0.8320492370878243


In [16]:
model = LGBMClassifier(objective='binary',
                      device='gpu',
                      metric='binary_logloss',
                      learning_rate = 0.2,
                      max_depth = -1,
                      min_child_samples=15,
                      num_leaves = 80) 
model.fit(X_train_tomek, y_train_tomek)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.22388572110935306
test roc_auc :  0.8334964516719838


In [17]:
model = LGBMClassifier(objective='binary',
                      device='gpu',
                      metric='binary_logloss',
                      learning_rate = 0.2,
                      max_depth = -1,
                      min_child_samples=15,
                      num_leaves = 80) 
model.fit(X_train_enn, y_train_enn)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print('test f1_score : ',f1_score(y_pred,y_test))
print('test roc_auc : ',roc_auc_score(y_test,y_prob))

test f1_score :  0.2872784885064924
test roc_auc :  0.8354471726821245
