In [None]:
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')
data = pd.read_csv('/content/drive/MyDrive/Kdigital/final_data.csv')  #기업 데이터(공개 불가)

Mounted at /content/drive


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import graphviz
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [None]:
X = data.iloc[:, 1:]
y = data['TARGET']

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

pca = PCA(n_components=2)
X = pca.fit_transform(X)

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X_train_over, y_train_over = smote.fit_resample(X, y)
print("SMOTE 적용 전 학습용 피처/레이블 데이터 세트 : ", X.shape, y.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트 :', X_train_over.shape, y_train_over.shape)
print('SMOTE 적용 후 전의 분포 :\n',pd.Series(y).value_counts() )
print('SMOTE 적용 후 값의 분포 :\n',pd.Series(y_train_over).value_counts() )

y = y.replace(1,-1)
y = y.replace(0,1)

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, stratify=y, random_state=49)
Kfold = StratifiedKFold(n_splits=3)

SMOTE 적용 전 학습용 피처/레이블 데이터 세트 :  (10642, 2) (10642,)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트 : (20000, 2) (20000,)
SMOTE 적용 후 전의 분포 :
 0    10000
1      642
Name: TARGET, dtype: int64
SMOTE 적용 후 값의 분포 :
 0    10000
1    10000
Name: TARGET, dtype: int64


In [None]:
def make_model(model_name, train_x, train_y, KFold):
  if model_name == 'IForest':
    model = IsolationForest()
    param = {'contamination':[0.1, 0.3, 0.5],
             'n_estimators' : [50, 75, 100],
             'max_features' : [0.4, 0.7, 1]}

  elif model_name == 'oc_svm':
    model = OneClassSVM(max_iter=100)
    param = {'kernel' : ['rbf', 'sigmoid', 'poly', 'linear'],
             'gamma' : [0.3, 0.6, 0.9]}

  elif model_name == 'LogisticRegression':
    model = LogisticRegression()
    param = {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
             'l1_ratio' : [0.3, 0.5, 0.7, 0.9],
             'solver' : ['newton_cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

  elif model_name == 'DecisionTree':
    model = DecisionTreeClassifier()
    param = {'criterion' : ['gini', 'entropy', 'log_loss'],
             'max_depth' : [3, 6, 9, 10],
             'min_samples_split' : [1, 2, 3],
             'min_samples_leaf' : [1, 2]}

  elif model_name == 'LGBM':
    model = LGBMClassifier()
    param = {'boosting_type':['gbdt', 'dart', 'goss', 'rf'],
             'n_estimators' : [50, 75, 100]}


  grid_search = GridSearchCV(model,
                              param_grid = param,
                              cv = Kfold,
                              scoring=['f1', 'precision', 'recall', 'accuracy'],
                              refit = 'recall',
                              n_jobs = -1)
  grid_search.fit(train_x, train_y)

  grid_search_results = pd.DataFrame(grid_search.cv_results_)
  grid_search_results = grid_search_results.sort_values(by = 'mean_test_f1',
                                                     ascending = False)

  print(grid_search_results[['params', 'mean_test_accuracy', 'mean_test_precision', 'mean_test_recall', 'mean_test_f1']])

  best_model = grid_search.best_estimator_

  return best_model

In [None]:
def model_evaluation(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)


    print('Confusion Matrix')
    print(confusion)
    print('accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f}, f1: {3:.4f}'.format( accuracy, precision, recall, f1))

In [None]:
svm_model = make_model('oc_svm', train_x, train_y, Kfold)
Iforest_model = make_model('IForest', train_x, train_y, Kfold)
logistic_model = make_model('LogisticRegression', train_x, train_y, Kfold)
tree_model = make_model('DecisionTree', train_x, train_y, Kfold)
lbgm_model = make_model('LGBM', train_x, train_y, Kfold)

                                 params  mean_test_accuracy  \
3    {'gamma': 0.3, 'kernel': 'linear'}            0.781447   
7    {'gamma': 0.6, 'kernel': 'linear'}            0.781447   
11   {'gamma': 0.9, 'kernel': 'linear'}            0.781447   
0       {'gamma': 0.3, 'kernel': 'rbf'}            0.641831   
4       {'gamma': 0.6, 'kernel': 'rbf'}            0.623976   
8       {'gamma': 0.9, 'kernel': 'rbf'}            0.622500   
1   {'gamma': 0.3, 'kernel': 'sigmoid'}            0.406229   
9   {'gamma': 0.9, 'kernel': 'sigmoid'}            0.400322   
5   {'gamma': 0.6, 'kernel': 'sigmoid'}            0.380991   
2      {'gamma': 0.3, 'kernel': 'poly'}            0.365821   
6      {'gamma': 0.6, 'kernel': 'poly'}            0.362062   
10     {'gamma': 0.9, 'kernel': 'poly'}            0.351994   

    mean_test_precision  mean_test_recall  mean_test_f1  
3              0.938180          0.821712      0.875738  
7              0.938180          0.821712      0.875738  
11    

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 436, in _check_solver
    % (all_solvers, solver)
ValueError: Logistic Regression supports only solvers in ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'], got n

                                               params  mean_test_accuracy  \
2   {'l1_ratio': 0.3, 'penalty': 'l1', 'solver': '...            0.939723   
4   {'l1_ratio': 0.3, 'penalty': 'l1', 'solver': '...            0.939723   
46  {'l1_ratio': 0.7, 'penalty': 'l2', 'solver': '...            0.939723   
47  {'l1_ratio': 0.7, 'penalty': 'l2', 'solver': '...            0.939723   
48  {'l1_ratio': 0.7, 'penalty': 'l2', 'solver': '...            0.939723   
..                                                ...                 ...   
71  {'l1_ratio': 0.9, 'penalty': 'elasticnet', 'so...                 NaN   
72  {'l1_ratio': 0.9, 'penalty': 'elasticnet', 'so...                 NaN   
73  {'l1_ratio': 0.9, 'penalty': 'elasticnet', 'so...                 NaN   
75  {'l1_ratio': 0.9, 'penalty': 'none', 'solver':...                 NaN   
77  {'l1_ratio': 0.9, 'penalty': 'none', 'solver':...                 NaN   

    mean_test_precision  mean_test_recall  mean_test_f1  
2              0.

120 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 942, in fit
    X_idx_sorted=X_idx_sorted,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/tree/_classes.py", line 254, in fit
    % self.min_samples_split
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

-------------------------------------------------------------------

                                               params  mean_test_accuracy  \
25  {'criterion': 'entropy', 'max_depth': 3, 'min_...            0.939052   
26  {'criterion': 'entropy', 'max_depth': 3, 'min_...            0.939052   
29  {'criterion': 'entropy', 'max_depth': 3, 'min_...            0.938918   
28  {'criterion': 'entropy', 'max_depth': 3, 'min_...            0.938918   
2   {'criterion': 'gini', 'max_depth': 3, 'min_sam...            0.938918   
..                                                ...                 ...   
67  {'criterion': 'log_loss', 'max_depth': 10, 'mi...                 NaN   
68  {'criterion': 'log_loss', 'max_depth': 10, 'mi...                 NaN   
69  {'criterion': 'log_loss', 'max_depth': 10, 'mi...                 NaN   
70  {'criterion': 'log_loss', 'max_depth': 10, 'mi...                 NaN   
71  {'criterion': 'log_loss', 'max_depth': 10, 'mi...                 NaN   

    mean_test_precision  mean_test_recall  mean_test_f1  
25             0.

9 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/lightgbm/sklearn.py", line 744, in fit
    callbacks=callbacks)
  File "/usr/local/lib/python3.7/dist-packages/lightgbm/sklearn.py", line 544, in fit
    callbacks=callbacks)
  File "/usr/local/lib/python3.7/dist-packages/lightgbm/engine.py", line 197, in train
    booster = Booster(params=params, train_set=train_set)
  File "/usr/local/lib/python3.7/dist-packages/lightgbm

In [None]:
pred_train_svm = svm_model.predict(train_x)
pred_test_svm = svm_model.predict(test_x)

model_evaluation(train_y, pred_train_svm)
model_evaluation(test_y, pred_test_svm)

Confusion Matrix
[[ 374   75]
 [6144  856]]
accuracy: 0.1651, precision: 0.9194, recall: 0.1223, f1: 0.2159
Confusion Matrix
[[ 162   31]
 [2578  422]]
accuracy: 0.1829, precision: 0.9316, recall: 0.1407, f1: 0.2444


In [None]:
pred_train_Iforest = Iforest_model.predict(train_x)
pred_test_Iforest = Iforest_model.predict(test_x)

model_evaluation(train_y, pred_train_Iforest)
model_evaluation(test_y, pred_test_Iforest)

Confusion Matrix
[[  74  375]
 [ 671 6329]]
accuracy: 0.8596, precision: 0.9441, recall: 0.9041, f1: 0.9237
Confusion Matrix
[[  22  171]
 [ 275 2725]]
accuracy: 0.8603, precision: 0.9410, recall: 0.9083, f1: 0.9244


In [None]:
pred_train_logistic = logistic_model.predict(train_x)
pred_test_logistic = logistic_model.predict(test_x)

model_evaluation(train_y, pred_train_logistic)
model_evaluation(test_y, pred_test_logistic)

Confusion Matrix
[[   0  449]
 [   0 7000]]
accuracy: 0.9397, precision: 0.9397, recall: 1.0000, f1: 0.9689
Confusion Matrix
[[   0  193]
 [   0 3000]]
accuracy: 0.9396, precision: 0.9396, recall: 1.0000, f1: 0.9688


In [None]:
pred_train_tree = tree_model.predict(train_x)
pred_test_tree = tree_model.predict(test_x)

model_evaluation(train_y, pred_train_tree)
model_evaluation(test_y, pred_test_tree)

Confusion Matrix
[[   5  444]
 [   1 6999]]
accuracy: 0.9403, precision: 0.9403, recall: 0.9999, f1: 0.9692
Confusion Matrix
[[   1  192]
 [   3 2997]]
accuracy: 0.9389, precision: 0.9398, recall: 0.9990, f1: 0.9685


In [None]:
pred_train_lbgm = lbgm_model.predict(train_x)
pred_test_lbgm = lbgm_model.predict(test_x)

model_evaluation(train_y, pred_train_lbgm)
model_evaluation(test_y, pred_test_lbgm)

Confusion Matrix
[[   2  447]
 [   0 7000]]
accuracy: 0.9400, precision: 0.9400, recall: 1.0000, f1: 0.9691
Confusion Matrix
[[   0  193]
 [   0 3000]]
accuracy: 0.9396, precision: 0.9396, recall: 1.0000, f1: 0.9688
