In [52]:
import pandas as pd

In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
train=pd.read_csv('/content/drive/MyDrive/Undersampling_0.33_train.csv',index_col=False, encoding='euc-kr')
test=pd.read_csv('/content/drive/MyDrive/Undersampling_0.33_test.csv',index_col=False,encoding='euc-kr')

In [55]:
X_train_int=train[['부채비율', '자기자본순이익률', '부가가치율', '당좌자산회전률', '총자본회전률', '총자본증가율',
       '연구개발비대비매출액', '매출액대비잉여현금흐름', '차입금의존도']]

X_test_int=test[['부채비율', '자기자본순이익률', '부가가치율', '당좌자산회전률', '총자본회전률', '총자본증가율',
       '연구개발비대비매출액', '매출액대비잉여현금흐름', '차입금의존도']]

In [56]:
X_train = train.drop('t-1감사의견코드',axis=1)
y_train = train[['t-1감사의견코드']]

X_test = test.drop('t-1감사의견코드',axis=1)
y_test = test[['t-1감사의견코드']]

In [57]:
X_train_sc = X_train
X_test_sc = X_test

In [58]:
X_train_sum =X_train_sc
X_test_sum =X_test_sc

In [59]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import numpy as np

def perform_lgbm_grid_search(X_train, y_train, k_fold=5):
    # Stratified k-fold 교차검증 설정
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    # LGBM 하이퍼파라미터 후보 리스트 설정
    param_grid = {
        'max_depth': [1],
        'n_estimators': [200],
        'learning_rate': [250],
        'objective': ['binary'],
        'random_state': [0]
    }

    # LGBM 모델 초기화
    model = lgb.LGBMClassifier()

    # 그리드 서치 설정
    grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=cv, verbose=1, n_jobs=-1)

    # 모델 학습 및 튜닝
    grid_search.fit(X_train, y_train)

    # 최적 하이퍼파라미터 출력
    print("Best Hyperparameters:", grid_search.best_params_)

    # 평균 평가 지표 계산
    mean_f1_score = np.mean(grid_search.cv_results_['mean_test_score'])
    print("Mean F1 Score:", mean_f1_score)

    return grid_search.best_params_, mean_f1_score

In [60]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_lgbm_with_best_params(X_train, y_train, X_test, y_test, best_params, k_fold=5):
    # Stratified k-fold 교차검증 설정
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    # LGBM 모델 초기화
    model = lgb.LGBMClassifier(**best_params)

    # 각 fold 별 평가 지표를 저장할 리스트 초기화
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []
    confusion_matrix_list = []

    best_f1_score = 0
    best_model = None

    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_test_fold, y_test_fold = X_train.iloc[test_idx], y_train.iloc[test_idx]

        # 모델 학습
        model.fit(X_train_fold, y_train_fold)

        # 테스트 데이터에 대한 예측 확률 얻기
        probabilities = model.predict_proba(X_test_fold)

        # threshold를 0.4로 설정하여 예측 클래스를 조정
        threshold = 0.5
        predicted_classes = (probabilities[:, 1] > threshold).astype(np.int)

        # 평가 지표 계산
        accuracy = accuracy_score(y_test_fold, predicted_classes)
        precision = precision_score(y_test_fold, predicted_classes)
        recall = recall_score(y_test_fold, predicted_classes)
        f1 = f1_score(y_test_fold, predicted_classes)
        conf_matrix = confusion_matrix(y_test_fold, predicted_classes)

        # 각 fold 별 평가 지표를 리스트에 추가
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)
        confusion_matrix_list.append(conf_matrix)

        print(f"Fold {fold_idx}")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 score: {f1}")
        print("Confusion Matrix:")
        print(conf_matrix)
        print("------------------------------")

        # 가장 좋은 f1-score 값을 가진 모델을 저장
        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    # 가장 좋은 f1-score 값을 가진 모델로 최종 예측 수행
    probabilities_final = best_model.predict_proba(X_test)
    y_pred_final = (probabilities_final[:, 1] > threshold).astype(np.int)

    # 평가 지표 계산
    accuracy_final = accuracy_score(y_test, y_pred_final)
    precision_final = precision_score(y_test, y_pred_final)
    recall_final = recall_score(y_test, y_pred_final)
    f1_final = f1_score(y_test, y_pred_final)
    conf_matrix_final = confusion_matrix(y_test, y_pred_final)

    print("Final Test Results")
    print(f"Accuracy: {accuracy_final}")
    print(f"Precision: {precision_final}")
    print(f"Recall: {recall_final}")
    print(f"F1 score: {f1_final}")
    print("Confusion Matrix:")
    print(conf_matrix_final)

    return accuracy_list, precision_list, recall_list, f1_score_list

In [61]:
best_params, mean_f1_score = perform_lgbm_grid_search(X_train, y_train, k_fold=5)
evaluate_lgbm_with_best_params(X_train_int, y_train, X_test_int, y_test, best_params, k_fold=5)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/sklearn.py", line 967, in fit
    super().fit(X, _y, sample_weight=sample_weight, init_score=init_score, eval_set=valid_sets,
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/sklearn.py", line 748, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/engine.py", line 271, in train
    booster = Booster(params=param

Best Hyperparameters: {'learning_rate': 250, 'max_depth': 1, 'n_estimators': 150, 'objective': 'binary', 'random_state': 0}
Mean F1 Score: nan
Fold 1
Accuracy: 0.821917808219178
Precision: 0.7272727272727273
Recall: 0.4444444444444444
F1 score: 0.5517241379310345
Confusion Matrix:
[[52  3]
 [10  8]]
------------------------------
Fold 2
Accuracy: 0.821917808219178
Precision: 0.7272727272727273
Recall: 0.4444444444444444
F1 score: 0.5517241379310345
Confusion Matrix:
[[52  3]
 [10  8]]
------------------------------
Fold 3
Accuracy: 0.875
Precision: 0.8461538461538461
Recall: 0.6111111111111112
F1 score: 0.7096774193548387
Confusion Matrix:
[[52  2]
 [ 7 11]]
------------------------------
Fold 4
Accuracy: 0.8472222222222222
Precision: 0.8181818181818182
Recall: 0.5
F1 score: 0.6206896551724137
Confusion Matrix:
[[52  2]
 [ 9  9]]
------------------------------
Fold 5
Accuracy: 0.8055555555555556
Precision: 0.75
Recall: 0.3333333333333333
F1 score: 0.46153846153846156
Confusion Matrix:


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  predicted_classes = (probabilities[:, 1] > threshold).astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_pred_final = (probabilities_final[:, 1] > threshold).astype(np.int)


([0.821917808219178,
  0.821917808219178,
  0.875,
  0.8472222222222222,
  0.8055555555555556],
 [0.7272727272727273,
  0.7272727272727273,
  0.8461538461538461,
  0.8181818181818182,
  0.75],
 [0.4444444444444444,
  0.4444444444444444,
  0.6111111111111112,
  0.5,
  0.3333333333333333],
 [0.5517241379310345,
  0.5517241379310345,
  0.7096774193548387,
  0.6206896551724137,
  0.46153846153846156])

Final Test Results
Accuracy: 0.8055555555555556
Precision: 0.6875
Recall: 0.4074074074074074
F1 score: 0.5116279069767441
Confusion Matrix:
[[76  5]
 [16 11]]