### 중요!! 실행 절대 시키지 말고 결과 지우지 말것

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_sample_weight

리샘플링 언더, 스케일링 스탠다드 최종피처

In [48]:
# 데이터 로드
data = pd.read_csv('./코스피이상치99%처리.csv')

# 타겟 변수 및 피처 선택
X = data[['당기순이익률(%)', '총자본증가율(전년동기)(%)', '무형자산구성비율(%)', '타인자본비율(%)', '당좌비율(%)', 
          '매출채권회전율(회)', '영업현금흐름/투자현금흐름(%)', 'SGAI']]
y = data['분식기업']

# 수치형 피처만 선택
X_numeric = X.select_dtypes(include=[np.number])

# 트레인/테스트 셋 분리 (벤포드 항목 포함)
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42, stratify=y)

# 스케일링 적용 전에 벤포드 항목 제거
#X_train_to_scale = X_train.drop(columns=['벤포드'])

# 스케일링 적용
standard = StandardScaler()
X_train_scaled = standard.fit_transform(X_train)

# 트레인 셋에 벤포드 항목 다시 포함
#X_train_scaled = np.column_stack((X_train_scaled, X_train['벤포드']))

In [51]:
# 리샘플링 비율 설정
ratios = {
    'under': [0.1, 0.3, 0.5, 0.7, 0.9],
    #'over': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# 결과를 저장할 딕셔너리를 초기화합니다.
results = {}

# 각 리샘플링 비율에 대해 모델 훈련 및 성능 평가
for method, ratios_list in ratios.items():
    for ratio in ratios_list:
        resampled_data = {}
        if method == 'under':
            sampler = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        else:
            sampler = SMOTE(sampling_strategy=ratio, random_state=42)

        final_X_train, final_y_train = sampler.fit_resample(X_train_scaled, y_train)

        models = {
            'LogisticRegression': (LogisticRegression(), {'C': [0.1, 1, 10, 43], 'penalty': ['l1', 'l2'], 'random_state': [42]}),
            'DecisionTree': (DecisionTreeClassifier(), {'max_depth': [4, 10, 20], 'min_samples_leaf': [6, 10, 3], 'random_state': [42]}),
            'RandomForest': (RandomForestClassifier(), {'n_estimators': [100, 200, 300, 10, 20], 'max_depth': [None, 10, 20, 4, 5], 'random_state': [42]}),
            'GradientBoosting': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'random_state': [42]}),
            'AdaBoost': (AdaBoostClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'random_state': [42]}),
            'XGBoost': (XGBClassifier(), {'n_estimators': [100, 200, 300, 43], 'learning_rate': [0.01, 0.1, 0.2, 0.03], 'random_state': [42]}),
            'LightGBM': (LGBMClassifier(), {'max_depth': [None, 10, 20, 4, 5], 'learning_rate': [0.01, 0.1, 0.2, 0.03], 'random_state': [42]}),
            'CatBoost': (CatBoostClassifier(), {'iterations': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'random_state': [42]}),
            'RUSBoost': (RUSBoostClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'random_state': [42]}),
            'SVC': (SVC(), {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'random_state' : [42]})
}

        for model_name, (model, param_grid) in models.items():
            kfold = KFold(n_splits=5, shuffle=True, random_state=42)
            grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring='recall', n_jobs=-1)
            grid_search.fit(final_X_train, final_y_train)
            best_model = grid_search.best_estimator_

            if model_name not in results:
                results[model_name] = {}

            if method not in results[model_name]:
                results[model_name][method] = {}

            results[model_name][method][ratio] = {
                'best_params': grid_search.best_params_,
                'mean_cv_score': grid_search.best_score_
            }

            # 테스트 세트를 사용하여 모델의 성능을 평가합니다.
            y_pred = best_model.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_pred)
            test_precision = precision_score(y_test, y_pred)
            test_recall = recall_score(y_test, y_pred)
            test_f1_score = f1_score(y_test, y_pred)
            test_confusion_matrix = confusion_matrix(y_test, y_pred)

            results[model_name][method][ratio]['test_score'] = {
                'accuracy': test_accuracy,
                'precision': test_precision,
                'recall': test_recall,
                'f1_score': test_f1_score,
                'confusion_matrix': test_confusion_matrix
            }

# 결과 출력
for model_name, res in results.items():
    print(f"{model_name}:")
    for method, ratios in res.items():
        print(f"  Resampling Method: {method}")
        for ratio, scores in ratios.items():
            print(f"    Ratio: {ratio}")
            print(f"      Best Parameters: {scores['best_params']}")
            print(f"      Mean CV Score: {scores['mean_cv_score']:.4f}")
            test_scores = scores['test_score']
            print(f"      Test Accuracy: {test_scores['accuracy']:.4f}")
            print(f"      Test Precision: {test_scores['precision']:.4f}")
            print(f"      Test Recall: {test_scores['recall']:.4f}")
            print(f"      Test F1 Score: {test_scores['f1_score']:.4f}")
            print(f"      Test Confusion Matrix:")
            print(test_scores['confusion_matrix'])
        print("------------------------------------")

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.03120915]


[LightGBM] [Info] Number of positive: 111, number of negative: 1110
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1881
[LightGBM] [Info] Number of data points in the train set: 1221, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090909 -> initscore=-2.302585
[LightGBM] [Info] Start training from score -2.302585
0:	learn: 0.5080534	total: 141ms	remaining: 28s
1:	learn: 0.4039015	total: 150ms	remaining: 14.8s
2:	learn: 0.3486610	total: 157ms	remaining: 10.3s
3:	learn: 0.3102739	total: 162ms	remaining: 7.93s
4:	learn: 0.2871936	total: 167ms	remaining: 6.52s
5:	learn: 0.2701228	total: 172ms	remaining: 5.57s
6:	learn: 0.2582080	total: 179ms	remaining: 4.94s
7:	learn: 0.2454437	total: 185ms	remaining: 4.44s
8:	learn: 0.2365280	total: 190ms	remaining: 4.03s
9:	learn: 0.2301648	total: 197ms	remaining: 3.74s
10:	learn: 0.

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.05019367]


[LightGBM] [Info] Number of positive: 111, number of negative: 370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1162
[LightGBM] [Info] Number of data points in the train set: 481, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.230769 -> initscore=-1.203973
[LightGBM] [Info] Start training from score -1.203973
0:	learn: 0.5970791	total: 6.58ms	remaining: 1.31s
1:	learn: 0.5388891	total: 12.2ms	remaining: 1.21s
2:	learn: 0.5000063	total: 16.9ms	remaining: 1.11s
3:	learn: 0.4760560	total: 24ms	remaining: 1.17s
4:	learn: 0.4498711	total: 32.3ms	remaining: 1.26s
5:	learn: 0.4296973	total: 39.1ms	remaining: 1.26s
6:	learn: 0.4138836	total: 43.7ms	remaining: 1.21s
7:	learn: 0.4011148	total: 47.8ms	remaining: 1.15s
8:	learn: 0.3822783	total: 52.3ms	remaining: 1.11s
9:	learn: 0.3658577	total: 56ms	remaining: 1.06s
10:	lea

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.12516989]


[LightGBM] [Info] Number of positive: 111, number of negative: 222
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 828
[LightGBM] [Info] Number of data points in the train set: 333, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
0:	learn: 0.6662640	total: 5.11ms	remaining: 1.53s
1:	learn: 0.6412537	total: 10.4ms	remaining: 1.55s
2:	learn: 0.6135757	total: 16.8ms	remaining: 1.66s
3:	learn: 0.5969950	total: 20.8ms	remaining: 1.53s
4:	learn: 0.5811590	total: 25.9ms	remaining: 1.52s
5:	learn: 0.5636726	total: 31.1ms	remaining: 1.52s
6:	learn: 0.5516947	total: 35.2ms	remaining: 1.47s
7:	learn: 0.5428411	total: 40.3ms	remaining: 1.47s
8:	learn: 0.5325058	total: 44.8ms	remaining: 1.45s
9:	learn: 0.5134131	total: 48.8ms	remaining: 1.42s
10:	

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.21039054]


[LightGBM] [Info] Number of positive: 111, number of negative: 158
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 681
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.412639 -> initscore=-0.353065
[LightGBM] [Info] Start training from score -0.353065
0:	learn: 0.6320389	total: 4.85ms	remaining: 965ms
1:	learn: 0.5982626	total: 9.28ms	remaining: 919ms
2:	learn: 0.5567184	total: 13ms	remaining: 851ms
3:	learn: 0.5346926	total: 16.5ms	remaining: 809ms
4:	learn: 0.5135683	total: 20.7ms	remaining: 807ms
5:	learn: 0.4830962	total: 25.3ms	remaining: 819ms
6:	learn: 0.4608175	total: 28.9ms	remaining: 797ms
7:	learn: 0.4490658	total: 32.6ms	remaining: 781ms
8:	learn: 0.4349785	total: 36.3ms	remaining: 771ms
9:	learn: 0.4198564	total: 41ms	remaining: 780ms
10:	lear

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.3223381 ]


[LightGBM] [Info] Number of positive: 111, number of negative: 123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 234, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.474359 -> initscore=-0.102654
[LightGBM] [Info] Start training from score -0.102654
0:	learn: 0.6570943	total: 46.5ms	remaining: 9.26s
1:	learn: 0.6339444	total: 51.5ms	remaining: 5.1s
2:	learn: 0.6126147	total: 55ms	remaining: 3.61s
3:	learn: 0.5947977	total: 58.4ms	remaining: 2.86s
4:	learn: 0.5803562	total: 63.8ms	remaining: 2.49s
5:	learn: 0.5597013	total: 70.1ms	remaining: 2.27s
6:	learn: 0.5410663	total: 74.6ms	remaining: 2.06s
7:	learn: 0.5338188	total: 82.5ms	remaining: 1.98s
8:	learn: 0.5177295	total: 87.5ms	remaining: 1.86s
9:	learn: 0.5055344	total: 90.7ms	remaining: 1.72s
10:	lea



LogisticRegression:
  Resampling Method: under
    Ratio: 0.1
      Best Parameters: {'C': 0.1, 'penalty': 'l2', 'random_state': 42}
      Mean CV Score: 0.0312
      Test Accuracy: 0.7005
      Test Precision: 0.0287
      Test Recall: 0.4286
      Test F1 Score: 0.0538
      Test Confusion Matrix:
[[975 406]
 [ 16  12]]
    Ratio: 0.3
      Best Parameters: {'C': 1, 'penalty': 'l2', 'random_state': 42}
      Mean CV Score: 0.0502
      Test Accuracy: 0.7729
      Test Precision: 0.0290
      Test Recall: 0.3214
      Test F1 Score: 0.0533
      Test Confusion Matrix:
[[1080  301]
 [  19    9]]
    Ratio: 0.5
      Best Parameters: {'C': 1, 'penalty': 'l2', 'random_state': 42}
      Mean CV Score: 0.1252
      Test Accuracy: 0.5877
      Test Precision: 0.0257
      Test Recall: 0.5357
      Test F1 Score: 0.0491
      Test Confusion Matrix:
[[813 568]
 [ 13  15]]
    Ratio: 0.7
      Best Parameters: {'C': 10, 'penalty': 'l2', 'random_state': 42}
      Mean CV Score: 0.2104
      Tes



리샘플링 오버, 스케일링 스탠다드 최종피처

In [2]:
# 데이터 로드
data = pd.read_csv('./코스피이상치99%처리.csv')

# 타겟 변수 및 피처 선택
X = data[['당기순이익률(%)', '총자본증가율(전년동기)(%)', '무형자산구성비율(%)', '타인자본비율(%)', '당좌비율(%)', 
          '매출채권회전율(회)', '영업현금흐름/투자현금흐름(%)', 'SGAI']]
y = data['분식기업']

# 수치형 피처만 선택
X_numeric = X.select_dtypes(include=[np.number])

# 트레인/테스트 셋 분리 (벤포드 항목 포함)
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42, stratify=y)

# 스케일링 적용 전에 벤포드 항목 제거
#X_train_to_scale = X_train.drop(columns=['벤포드'])

# 스케일링 적용
standard = StandardScaler()
X_train_scaled = standard.fit_transform(X_train)

# 트레인 셋에 벤포드 항목 다시 포함
#X_train_scaled = np.column_stack((X_train_scaled, X_train['벤포드']))

In [3]:
# 리샘플링 비율 설정
ratios = {
    #'under': [0.1, 0.3, 0.5, 0.7, 0.9],
    'over': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# 결과를 저장할 딕셔너리를 초기화합니다.
results = {}

# 각 리샘플링 비율에 대해 모델 훈련 및 성능 평가
for method, ratios_list in ratios.items():
    for ratio in ratios_list:
        resampled_data = {}
        if method == 'under':
            sampler = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        else:
            sampler = SMOTE(sampling_strategy=ratio, random_state=42)

        final_X_train, final_y_train = sampler.fit_resample(X_train_scaled, y_train)

        models = {
            'LogisticRegression': (LogisticRegression(), {'C': [0.1, 1, 10, 43], 'penalty': ['l1', 'l2'], 'random_state': [42]}),
            'DecisionTree': (DecisionTreeClassifier(), {'max_depth': [4, 10, 20], 'min_samples_leaf': [6, 10, 3], 'random_state': [42]}),
            'RandomForest': (RandomForestClassifier(), {'n_estimators': [100, 200, 300, 10, 20], 'max_depth': [None, 10, 20, 4, 5], 'random_state': [42]}),
            'GradientBoosting': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'random_state': [42]}),
            'AdaBoost': (AdaBoostClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'random_state': [42]}),
            'XGBoost': (XGBClassifier(), {'n_estimators': [100, 200, 300, 43], 'learning_rate': [0.01, 0.1, 0.2, 0.03], 'random_state': [42]}),
            'LightGBM': (LGBMClassifier(), {'max_depth': [None, 10, 20, 4, 5], 'learning_rate': [0.01, 0.1, 0.2, 0.03], 'random_state': [42]}),
            'CatBoost': (CatBoostClassifier(), {'iterations': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'random_state': [42]}),
            'RUSBoost': (RUSBoostClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'random_state': [42]}),
            'SVC': (SVC(), {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'random_state' : [42]})
}

        for model_name, (model, param_grid) in models.items():
            kfold = KFold(n_splits=5, shuffle=True, random_state=42)
            grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring='recall', n_jobs=-1)
            grid_search.fit(final_X_train, final_y_train)
            best_model = grid_search.best_estimator_

            if model_name not in results:
                results[model_name] = {}

            if method not in results[model_name]:
                results[model_name][method] = {}

            results[model_name][method][ratio] = {
                'best_params': grid_search.best_params_,
                'mean_cv_score': grid_search.best_score_
            }

            # 테스트 세트를 사용하여 모델의 성능을 평가합니다.
            y_pred = best_model.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_pred)
            test_precision = precision_score(y_test, y_pred)
            test_recall = recall_score(y_test, y_pred)
            test_f1_score = f1_score(y_test, y_pred)
            test_confusion_matrix = confusion_matrix(y_test, y_pred)

            results[model_name][method][ratio]['test_score'] = {
                'accuracy': test_accuracy,
                'precision': test_precision,
                'recall': test_recall,
                'f1_score': test_f1_score,
                'confusion_matrix': test_confusion_matrix
            }

# 결과 출력
for model_name, res in results.items():
    print(f"{model_name}:")
    for method, ratios in res.items():
        print(f"  Resampling Method: {method}")
        for ratio, scores in ratios.items():
            print(f"    Ratio: {ratio}")
            print(f"      Best Parameters: {scores['best_params']}")
            print(f"      Mean CV Score: {scores['mean_cv_score']:.4f}")
            test_scores = scores['test_score']
            print(f"      Test Accuracy: {test_scores['accuracy']:.4f}")
            print(f"      Test Precision: {test_scores['precision']:.4f}")
            print(f"      Test Recall: {test_scores['recall']:.4f}")
            print(f"      Test F1 Score: {test_scores['f1_score']:.4f}")
            print(f"      Test Confusion Matrix:")
            print(test_scores['confusion_matrix'])
        print("------------------------------------")

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.02351191]
  

[LightGBM] [Info] Number of positive: 552, number of negative: 5525
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000566 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2037
[LightGBM] [Info] Number of data points in the train set: 6077, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090834 -> initscore=-2.303490
[LightGBM] [Info] Start training from score -2.303490
0:	learn: 0.5061401	total: 151ms	remaining: 45s
1:	learn: 0.4020930	total: 160ms	remaining: 23.8s
2:	learn: 0.3418427	total: 167ms	remaining: 16.6s
3:	learn: 0.3081457	total: 177ms	remaining: 13.1s
4:	learn: 0.2777562	total: 183ms	remaining: 10.8s
5:	learn: 0.2608001	total: 192ms	remaining: 9.38s
6:	learn: 0.2515620	total: 198ms	remaining: 8.27s
7:	learn: 0.2454259	total: 208ms	remaining: 7.59s
8:	learn: 0.2393583	total: 214ms	remaining: 6.93s

  _warn_prf(average, modifier, msg_start, len(result))
20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none'

[LightGBM] [Info] Number of positive: 1657, number of negative: 5525
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000699 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2039
[LightGBM] [Info] Number of data points in the train set: 7182, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.230716 -> initscore=-1.204275
[LightGBM] [Info] Start training from score -1.204275
0:	learn: 0.5903892	total: 5.74ms	remaining: 1.72s
1:	learn: 0.5328545	total: 10.9ms	remaining: 1.62s
2:	learn: 0.4959028	total: 16ms	remaining: 1.58s
3:	learn: 0.4659351	total: 20.9ms	remaining: 1.55s
4:	learn: 0.4408621	total: 26.1ms	remaining: 1.54s
5:	learn: 0.4234918	total: 31.1ms	remaining: 1.53s
6:	learn: 0.4095746	total: 35.7ms	remaining: 1.5s
7:	learn: 0.3946779	total: 41.1ms	remaining: 1.5s
8:	learn: 0.3793346	total: 46.1ms	remaining: 1.49s
9:	learn: 0.3725770	total: 50.8ms	remaining: 1.47s
10:	

  _warn_prf(average, modifier, msg_start, len(result))
20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none'

[LightGBM] [Info] Number of positive: 2762, number of negative: 5525
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000647 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2039
[LightGBM] [Info] Number of data points in the train set: 8287, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333293 -> initscore=-0.693328
[LightGBM] [Info] Start training from score -0.693328
0:	learn: 0.6322431	total: 8.65ms	remaining: 2.59s
1:	learn: 0.5740791	total: 18.4ms	remaining: 2.74s
2:	learn: 0.5483494	total: 25.6ms	remaining: 2.54s
3:	learn: 0.5206002	total: 37.6ms	remaining: 2.78s
4:	learn: 0.4992858	total: 45.2ms	remaining: 2.67s
5:	learn: 0.4754569	total: 52.8ms	remaining: 2.59s
6:	learn: 0.4619452	total: 60.1ms	remaining: 2.52s
7:	learn: 0.4486685	total: 67.6ms	remaining: 2.47s
8:	learn: 0.4344610	total: 75.5ms	remaining: 2.44s
9:	learn: 0.4246196	total: 83.2ms	remaining: 2.41s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError

[LightGBM] [Info] Number of positive: 3867, number of negative: 5525
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000683 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 9392, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411733 -> initscore=-0.356804
[LightGBM] [Info] Start training from score -0.356804
0:	learn: 0.6391425	total: 8.01ms	remaining: 2.39s
1:	learn: 0.5936831	total: 15.3ms	remaining: 2.28s
2:	learn: 0.5720621	total: 22.6ms	remaining: 2.24s
3:	learn: 0.5423019	total: 30.1ms	remaining: 2.23s
4:	learn: 0.5194471	total: 37.4ms	remaining: 2.21s
5:	learn: 0.5026118	total: 45.6ms	remaining: 2.24s
6:	learn: 0.4876593	total: 53.8ms	remaining: 2.25s
7:	learn: 0.4747860	total: 61.4ms	remaining: 2.24s
8:	learn: 0.4620716	total: 69ms	remaining: 2.23s
9:	learn: 0.4515400	total: 76.4ms	remaining: 2.22s
10

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError

[LightGBM] [Info] Number of positive: 4972, number of negative: 5525
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000802 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 10497, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473659 -> initscore=-0.105461
[LightGBM] [Info] Start training from score -0.105461
0:	learn: 0.6332047	total: 12.4ms	remaining: 3.7s
1:	learn: 0.5973970	total: 25.5ms	remaining: 3.8s
2:	learn: 0.5687388	total: 36.4ms	remaining: 3.61s
3:	learn: 0.5462816	total: 46.7ms	remaining: 3.45s
4:	learn: 0.5248427	total: 57.1ms	remaining: 3.37s
5:	learn: 0.5019689	total: 66.5ms	remaining: 3.26s
6:	learn: 0.4889978	total: 75.6ms	remaining: 3.17s
7:	learn: 0.4800158	total: 85.4ms	remaining: 3.12s
8:	learn: 0.4654941	total: 95.4ms	remaining: 3.08s
9:	learn: 0.4558813	total: 106ms	remaining: 3.06s
10

  _warn_prf(average, modifier, msg_start, len(result))


LogisticRegression:
  Resampling Method: over
    Ratio: 0.1
      Best Parameters: {'C': 0.1, 'penalty': 'l2', 'random_state': 42}
      Mean CV Score: 0.0235
      Test Accuracy: 0.6565
      Test Precision: 0.0270
      Test Recall: 0.4643
      Test F1 Score: 0.0510
      Test Confusion Matrix:
[[912 469]
 [ 15  13]]
    Ratio: 0.3
      Best Parameters: {'C': 1, 'penalty': 'l2', 'random_state': 42}
      Mean CV Score: 0.0405
      Test Accuracy: 0.6196
      Test Precision: 0.0261
      Test Recall: 0.5000
      Test F1 Score: 0.0496
      Test Confusion Matrix:
[[859 522]
 [ 14  14]]
    Ratio: 0.5
      Best Parameters: {'C': 10, 'penalty': 'l2', 'random_state': 42}
      Mean CV Score: 0.1024
      Test Accuracy: 0.6494
      Test Precision: 0.0283
      Test Recall: 0.5000
      Test F1 Score: 0.0536
      Test Confusion Matrix:
[[901 480]
 [ 14  14]]
    Ratio: 0.7
      Best Parameters: {'C': 10, 'penalty': 'l2', 'random_state': 42}
      Mean CV Score: 0.2082
      Test Ac

  _warn_prf(average, modifier, msg_start, len(result))


## standard kosdaq over

In [7]:
# 리샘플링 비율 설정
ratios = {
    #'under': [0.1, 0.3, 0.5, 0.7, 0.9],
    'over': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# 결과를 저장할 딕셔너리를 초기화합니다.
results = {}

# 각 리샘플링 비율에 대해 모델 훈련 및 성능 평가
for method, ratios_list in ratios.items():
    for ratio in ratios_list:
        resampled_data = {}
        if method == 'over':
            sampler = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        else:
            sampler = SMOTE(sampling_strategy=ratio, random_state=42)

        final_X_train, final_y_train = sampler.fit_resample(X_train_scaled, y_train)

        models = {
            'LogisticRegression': (LogisticRegression(), {'C': [0.1, 1, 10, 43], 'penalty': ['l1', 'l2']}),
            'DecisionTree': (DecisionTreeClassifier(), {'max_depth': [4, 10, 20], 'min_samples_leaf': [6, 10, 3]}),
            'RandomForest': (RandomForestClassifier(), {'n_estimators': [100, 200, 300, 10, 20], 'max_depth': [None, 10, 20, 4, 5]}),
            'GradientBoosting': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}),
            'AdaBoost': (AdaBoostClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}),
            'XGBoost': (XGBClassifier(), {'n_estimators': [100, 200, 300, 43], 'learning_rate': [0.01, 0.1, 0.2, 0.03]}),
            "LightGBM": (LGBMClassifier(), {'max_depth': [None, 10, 20, 4, 5], 'learning_rate': [0.01, 0.1, 0.2, 0.03]}),
            'CatBoost': (CatBoostClassifier(), {'iterations': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2]}),
            'RUSBoost': (RUSBoostClassifier(), {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2]}),
            'SVC': (SVC(), {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})
        }

        for model_name, (model, param_grid) in models.items():
            kfold = KFold(n_splits=5, shuffle=True, random_state=42)
            grid_search = GridSearchCV(model, param_grid, cv=kfold, scoring='recall', n_jobs=-1)
            grid_search.fit(final_X_train, final_y_train)
            best_model = grid_search.best_estimator_

            if model_name not in results:
                results[model_name] = {}

            if method not in results[model_name]:
                results[model_name][method] = {}

            results[model_name][method][ratio] = {
                'best_params': grid_search.best_params_,
                'mean_cv_score': grid_search.best_score_
            }

            # 테스트 세트를 사용하여 모델의 성능을 평가합니다.
            y_pred = best_model.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_pred)
            test_precision = precision_score(y_test, y_pred)
            test_recall = recall_score(y_test, y_pred)
            test_f1_score = f1_score(y_test, y_pred)
            test_confusion_matrix = confusion_matrix(y_test, y_pred)

            results[model_name][method][ratio]['test_score'] = {
                'accuracy': test_accuracy,
                'precision': test_precision,
                'recall': test_recall,
                'f1_score': test_f1_score,
                'confusion_matrix': test_confusion_matrix
            }

# 결과 출력
for model_name, res in results.items():
    print(f"{model_name}:")
    for method, ratios in res.items():
        print(f"  Resampling Method: {method}")
        for ratio, scores in ratios.items():
            print(f"    Ratio: {ratio}")
            print(f"      Best Parameters: {scores['best_params']}")
            print(f"      Mean CV Score: {scores['mean_cv_score']:.4f}")
            test_scores = scores['test_score']
            print(f"      Test Accuracy: {test_scores['accuracy']:.4f}")
            print(f"      Test Precision: {test_scores['precision']:.4f}")
            print(f"      Test Recall: {test_scores['recall']:.4f}")
            print(f"      Test F1 Score: {test_scores['f1_score']:.4f}")
            print(f"      Test Confusion Matrix:")
            print(test_scores['confusion_matrix'])
        print("------------------------------------")

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.0486674 ]
ST

[LightGBM] [Info] Number of positive: 202, number of negative: 2020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001404 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9528
[LightGBM] [Info] Number of data points in the train set: 2222, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090909 -> initscore=-2.302585
[LightGBM] [Info] Start training from score -2.302585
0:	learn: 0.5109856	total: 168ms	remaining: 33.4s
1:	learn: 0.4089363	total: 183ms	remaining: 18.1s
2:	learn: 0.3452623	total: 229ms	remaining: 15s
3:	learn: 0.3100519	total: 257ms	remaining: 12.6s
4:	learn: 0.2868850	total: 282ms	remaining: 11s
5:	learn: 0.2666116	total: 309ms	remaining: 9.98s
6:	learn: 0.2518253	total: 343ms	remaining: 9.46s
7:	learn: 0.2439812	total: 360ms	remaining: 8.64s
8:	learn: 0.2349160	total: 384ms	remaining: 8.15s
9:	learn: 0.2258466	total: 408ms	remaining: 7.75s
10:	learn: 0.2

  _warn_prf(average, modifier, msg_start, len(result))
20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none'

[LightGBM] [Info] Number of positive: 202, number of negative: 673
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8889
[LightGBM] [Info] Number of data points in the train set: 875, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.230857 -> initscore=-1.203478
[LightGBM] [Info] Start training from score -1.203478
0:	learn: 0.6012966	total: 14.5ms	remaining: 4.33s
1:	learn: 0.5462672	total: 25.3ms	remaining: 3.77s
2:	learn: 0.5045366	total: 38.9ms	remaining: 3.85s
3:	learn: 0.4689964	total: 51.2ms	remaining: 3.79s
4:	learn: 0.4440735	total: 63.3ms	remaining: 3.73s
5:	learn: 0.4231165	total: 77.5ms	remaining: 3.8s
6:	learn: 0.3972611	total: 93.2ms	remaining: 3.9s
7:	learn: 0.3862562	total: 105ms	remaining: 3.84s
8:	learn: 0.3737627	total: 118ms	remaining: 3.8s
9:	learn: 0.3601513	total: 128ms	remaining: 3.7s
10:	learn

  _warn_prf(average, modifier, msg_start, len(result))
20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none'

[LightGBM] [Info] Number of positive: 202, number of negative: 404
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7092
[LightGBM] [Info] Number of data points in the train set: 606, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
0:	learn: 0.6551686	total: 13.6ms	remaining: 4.05s
1:	learn: 0.6334215	total: 23.9ms	remaining: 3.56s
2:	learn: 0.6117132	total: 36.9ms	remaining: 3.65s
3:	learn: 0.5904019	total: 54.7ms	remaining: 4.05s
4:	learn: 0.5592493	total: 63.7ms	remaining: 3.76s
5:	learn: 0.5390470	total: 75.7ms	remaining: 3.71s
6:	learn: 0.5140940	total: 85.5ms	remaining: 3.58s
7:	learn: 0.5041846	total: 95.6ms	remaining: 3.49s
8:	learn: 0.4940993	total: 106ms	remaining: 3.41s
9:	learn: 0.4822645	total: 114ms	remaining: 3.3s
10:	l

  _warn_prf(average, modifier, msg_start, len(result))
20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none'

[LightGBM] [Info] Number of positive: 202, number of negative: 288
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5802
[LightGBM] [Info] Number of data points in the train set: 490, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.412245 -> initscore=-0.354693
[LightGBM] [Info] Start training from score -0.354693
0:	learn: 0.6303238	total: 11.4ms	remaining: 2.28s
1:	learn: 0.5855660	total: 32.3ms	remaining: 3.19s
2:	learn: 0.5589915	total: 41ms	remaining: 2.69s
3:	learn: 0.5330998	total: 54.5ms	remaining: 2.67s
4:	learn: 0.5110401	total: 67.3ms	remaining: 2.63s
5:	learn: 0.4895973	total: 81.9ms	remaining: 2.65s
6:	learn: 0.4603010	total: 92ms	remaining: 2.54s
7:	learn: 0.4471355	total: 107ms	remaining: 2.57s
8:	learn: 0.4280574	total: 120ms	remaining: 2.54s
9:	learn: 0.4169001	total: 132ms	remaining: 2.5s
10:	learn:

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Master\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.66800366]
ST

[LightGBM] [Info] Number of positive: 202, number of negative: 224
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5062
[LightGBM] [Info] Number of data points in the train set: 426, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.474178 -> initscore=-0.103378
[LightGBM] [Info] Start training from score -0.103378
0:	learn: 0.6646207	total: 13.1ms	remaining: 3.93s
1:	learn: 0.6388983	total: 26.4ms	remaining: 3.93s
2:	learn: 0.6135900	total: 37.5ms	remaining: 3.71s
3:	learn: 0.5937180	total: 49.8ms	remaining: 3.68s
4:	learn: 0.5712410	total: 63ms	remaining: 3.72s
5:	learn: 0.5488916	total: 75ms	remaining: 3.68s
6:	learn: 0.5231345	total: 87.9ms	remaining: 3.68s
7:	learn: 0.5142278	total: 101ms	remaining: 3.68s
8:	learn: 0.5071984	total: 113ms	remaining: 3.66s
9:	learn: 0.4947998	total: 129ms	remaining: 3.73s
10:	learn

  _warn_prf(average, modifier, msg_start, len(result))


LogisticRegression:
  Resampling Method: over
    Ratio: 0.1
      Best Parameters: {'C': 10, 'penalty': 'l2'}
      Mean CV Score: 0.0487
      Test Accuracy: 0.0209
      Test Precision: 0.0209
      Test Recall: 1.0000
      Test F1 Score: 0.0410
      Test Confusion Matrix:
[[   0 2337]
 [   0   50]]
    Ratio: 0.3
      Best Parameters: {'C': 10, 'penalty': 'l2'}
      Mean CV Score: 0.2582
      Test Accuracy: 0.0209
      Test Precision: 0.0209
      Test Recall: 1.0000
      Test F1 Score: 0.0410
      Test Confusion Matrix:
[[   0 2337]
 [   0   50]]
    Ratio: 0.5
      Best Parameters: {'C': 43, 'penalty': 'l2'}
      Mean CV Score: 0.4069
      Test Accuracy: 0.0209
      Test Precision: 0.0209
      Test Recall: 1.0000
      Test F1 Score: 0.0410
      Test Confusion Matrix:
[[   0 2337]
 [   0   50]]
    Ratio: 0.7
      Best Parameters: {'C': 10, 'penalty': 'l2'}
      Mean CV Score: 0.5612
      Test Accuracy: 0.0209
      Test Precision: 0.0209
      Test Recall: 1.000



In [2]:
import pandas as pd
# 데이터 로드
data = pd.read_csv('./코스닥이상치99%처리.csv')

# 타겟 변수 및 피처 선택
X = data.drop(columns=['Symbol', '상장연도', 'Name', '회계년', '분식기업'])
y = data['분식기업']

In [3]:
import pandas as pd
a = [3.562996254445118, 2.19753413943169, 0.6407992911967733, 1.5436893422328266, 1.4369411954980196, 1.7328230322756235, 2.065434566960113, 0.24761347315081908, 0.7942713768568198, 2.127419542502854, 2.8669047453219516, 0.9941801514646917, 1.9529158344542925, 1.5115701386945513, 4.328603562827706, 2.8977000120092704, 3.0486641119350564, 5.514241433394602, 6.105198933894736, 2.77822833526965, 3.378390619928878, 1.7448324938676512, 6.033884474725172, 3.46612312286889, 3.566206541095964, 1.9537694050362902, 5.901260766901478, 3.5704013944680253, 0.21691795018128474, 1.0426906757758192, 1.3821216082750654, 3.5311668051267873, 1.212721393461332, 1.7765462791069702, 1.881953080652804, 0.37129427776444646, 3.9744468049902557, 2.2384588435954433, 2.0710578082823514, 2.169244009828471, 0.16878217024946005]
col = X.columns

df1 = pd.DataFrame({'지표': col, '값':a})
df1.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
18,무형자산구성비율(%),6.105199
22,매출채권회전율(회),6.033884
26,자본금회전율(회),5.901261
17,유형자산구성비율(%),5.514241
14,유동비율(%),4.328604
36,DEPI,3.974447
27,고정자산회전율(회),3.570401
24,총부채회전율(회),3.566207
0,매출총이익률(%),3.562996
31,영업현금흐름/투자현금흐름(%),3.531167


In [4]:
import pandas as pd
a = [2.839083099629965, 2.9277715428747975, 0.9295024610058279, 1.124909073692536, 1.7506186892825593, 1.3834138012189447, 0.6034231119437697, 1.2221757276377507, 0.8011347620087389, 2.9010474048186836, 2.2423827257134357, 1.712683629698657, 1.5472437690162315, 2.1302242667546007, 5.725122855421415, 2.2231068361869704, 3.6842323175566776, 3.796133194236553, 6.106545147494278, 3.0663335013269313, 4.195962926470378, 1.6720294255230082, 6.654239560378571, 4.3958795530277905, 1.6774467103653545, 1.9003191529080978, 3.883962986097, 4.002219467205511, 0.6105879181819545, 0.9449237764271504, 2.3694077236767175, 3.2817581116594794, 1.6197336318160935, 1.4318560469269672, 2.7081310504166276, 1.0001120759705402, 3.6692859613275055, 1.1000039544912636, 1.8358115720751158, 2.329240477535567, 0.0]
col = X.columns

df3 = pd.DataFrame({'지표': col, '값':a})
df3.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
22,매출채권회전율(회),6.65424
18,무형자산구성비율(%),6.106545
14,유동비율(%),5.725123
23,재고자산회전율(회),4.39588
20,타인자본비율(%),4.195963
27,고정자산회전율(회),4.002219
26,자본금회전율(회),3.883963
17,유형자산구성비율(%),3.796133
16,현금비율(%),3.684232
36,DEPI,3.669286


In [5]:
import pandas as pd
a = [3.99080452714271, 0.9458237782834438, 1.428890687737034, 2.2111382809293465, 1.5229337713092719, 1.2187351852560258, 0.9008715286567426, 1.336554146650472, 1.178995921079604, 1.8466583879610565, 1.9141506948710671, 1.345329182602975, 1.5628650158808355, 1.8879883125632042, 4.715492209063913, 2.694810387673948, 3.8961353367831184, 2.390430498250563, 5.678390092473984, 3.499819388566306, 3.66389662588404, 1.6194769469024077, 7.462375359791452, 4.9043782183459905, 2.860323741180853, 1.977111178111248, 4.407910035325577, 4.103285494957717, 0.8718024199028971, 1.7901285583458384, 2.4645894445204917, 2.6495209202551737, 1.2140798388610836, 2.164351520367515, 2.0027008748432316, 1.138042689663945, 2.9258888281934654, 1.644705641892359, 1.5160843452407637, 2.2757815973538214, 0.17674838632451287]
col = X.columns

df5 = pd.DataFrame({'지표': col, '값':a})
df5.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
22,매출채권회전율(회),7.462375
18,무형자산구성비율(%),5.67839
23,재고자산회전율(회),4.904378
14,유동비율(%),4.715492
26,자본금회전율(회),4.40791
27,고정자산회전율(회),4.103285
0,매출총이익률(%),3.990805
16,현금비율(%),3.896135
20,타인자본비율(%),3.663897
19,판관비율(%),3.499819


In [6]:
import pandas as pd
a = [3.934834904515905, 1.6276587315022049, 0.7363432173764785, 1.9900050375979539, 0.23426728524653442, 0.45427399249811695, 1.2401508648919937, 1.0546286471109683, 2.2431096647061852, 1.1263835359945478, 1.8494592134661494, 1.2308635710216287, 1.366020091725761, 1.996310151588593, 4.543779407790188, 2.0354779847658175, 3.728639022746695, 3.1743493467902515, 6.8645955361926845, 2.300297498004396, 3.2645416343136, 1.5737297262639227, 8.267140596025085, 5.225433889192528, 3.6099188199710244, 1.7006101747672506, 3.7131727288767054, 2.65649837975358, 1.432682699791312, 0.9068193904912466, 2.9928980798549265, 4.0298291989657535, 1.3208912278067717, 2.930856141135287, 2.4078255309728007, 1.2358624877922213, 2.8328084102000664, 1.298263991168761, 2.1811298187118884, 2.671140852016559, 0.016498516395657106]
col = X.columns

df7 = pd.DataFrame({'지표': col, '값':a})
df7.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
22,매출채권회전율(회),8.267141
18,무형자산구성비율(%),6.864596
23,재고자산회전율(회),5.225434
14,유동비율(%),4.543779
31,영업현금흐름/투자현금흐름(%),4.029829
0,매출총이익률(%),3.934835
16,현금비율(%),3.728639
26,자본금회전율(회),3.713173
24,총부채회전율(회),3.609919
20,타인자본비율(%),3.264542


In [7]:
import pandas as pd
a = [4.584942154815138, 1.2943799016250341, 1.217229987232772, 1.3544867756324006, 1.1616940733802108, 1.2565335705980292, 0.7590262977857026, 1.1754042965518512, 1.3479557658361576, 1.458416032710079, 1.8414445967493105, 1.2814832920988604, 1.7340235349147048, 2.3268774652286535, 5.435071820552461, 2.3975658373565256, 3.1584286652811295, 3.2179656295070256, 6.7283256550330455, 2.031868003957247, 3.1416555741130643, 1.9415410230935444, 7.155937602795014, 3.977406150713342, 4.2698019643749285, 2.9809065419451093, 3.964817497805218, 2.2262290476420357, 2.450640422609818, 1.871094911585217, 2.1493508152308016, 3.2293367436688656, 1.9315165478116951, 1.5202997211531966, 2.173104463345194, 0.9860543414809312, 3.2604174785178577, 1.008940454461765, 1.6315699692098244, 2.3111228823238466, 0.05513248927240864]
col = X.columns

df9 = pd.DataFrame({'지표': col, '값':a})
df9.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
22,매출채권회전율(회),7.155938
18,무형자산구성비율(%),6.728326
14,유동비율(%),5.435072
0,매출총이익률(%),4.584942
24,총부채회전율(회),4.269802
23,재고자산회전율(회),3.977406
26,자본금회전율(회),3.964817
36,DEPI,3.260417
31,영업현금흐름/투자현금흐름(%),3.229337
17,유형자산구성비율(%),3.217966


In [None]:
[3.1810571584878016, 2.5025042224641005, 1.4952880758194478, 2.4943980000015813, 1.7308456498508884, 1.6797471429604482, 1.4910638003706471, 0.8627715388731098, 1.1632966114001215, 1.9108932069701048, 1.678559071660711, 0.9507247629360872, 2.9681538228229782, 1.4423640773942878, 4.037019834291557, 3.0340421664355426, 3.594367295242185, 4.853002173088712, 6.112335294339199, 3.580829132858372, 2.370924694503875, 3.0402542069241734, 5.570086893556416, 4.342941617718934, 2.8095733006022954, 2.508555789467297, 3.861929976050338, 3.0885987235581727, 1.3813469397559965, 1.1172637062308277, 2.523524549021285, 2.7644586619187446, 1.9415593163398162, 2.449064231864625, 2.035795172120602, 0.9514806263518123, 1.8848403790243151, 1.436091217566498, 1.3771020260107119, 1.772077508167371, 0.009267424977993603]

In [8]:
import pandas as pd
a = [3.1810571584878016, 2.5025042224641005, 1.4952880758194478, 2.4943980000015813, 1.7308456498508884, 1.6797471429604482, 1.4910638003706471, 0.8627715388731098, 1.1632966114001215, 1.9108932069701048, 1.678559071660711, 0.9507247629360872, 2.9681538228229782, 1.4423640773942878, 4.037019834291557, 3.0340421664355426, 3.594367295242185, 4.853002173088712, 6.112335294339199, 3.580829132858372, 2.370924694503875, 3.0402542069241734, 5.570086893556416, 4.342941617718934, 2.8095733006022954, 2.508555789467297, 3.861929976050338, 3.0885987235581727, 1.3813469397559965, 1.1172637062308277, 2.523524549021285, 2.7644586619187446, 1.9415593163398162, 2.449064231864625, 2.035795172120602, 0.9514806263518123, 1.8848403790243151, 1.436091217566498, 1.3771020260107119, 1.772077508167371, 0.009267424977993603]
col = X.columns

no_df = pd.DataFrame({'지표': col, '값':a})
no_df.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
18,무형자산구성비율(%),6.112335
22,매출채권회전율(회),5.570087
17,유형자산구성비율(%),4.853002
23,재고자산회전율(회),4.342942
14,유동비율(%),4.03702
26,자본금회전율(회),3.86193
16,현금비율(%),3.594367
19,판관비율(%),3.580829
0,매출총이익률(%),3.181057
27,고정자산회전율(회),3.088599


이제부터 under

In [29]:
import pandas as pd
a = [0.0255546915789896, 0.02802654609119308, 0.012067047532854425, 0.002074023794709355, 0.0, 0.0, 0.0, 0.027694171794665694, 0.002074023794709355, 0.00990281970342006, 0.013895959424552668, 0.03725440101564262, 0.012344950325900451, 0.003016761883213607, 0.013817346492572677, 0.010248117573857992, 0.036124604792020565, 0.10638725330155974, 0.08264360845430654, 0.027150856948922452, 0.0, 0.0, 0.03984294727997998, 0.02437112635653276, 0.018813451455042363, 0.012962648716933464, 0.06450564593473053, 0.08701430298999688, 0.00831148938922589, 0.0, 0.020999490921432207, 0.038277434685202574, 0.02501591777003279, 0.030198365014306904, 0.014859132449358686, 0.0, 0.02799932122857628, 0.10929830555969364, 0.0, 0.027253235745864093, 0.0]
col = X.columns

under_df1 = pd.DataFrame({'지표': col, '값':a})
under_df1.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
37,SGAI,0.109298
17,유형자산구성비율(%),0.106387
27,고정자산회전율(회),0.087014
18,무형자산구성비율(%),0.082644
26,자본금회전율(회),0.064506
22,매출채권회전율(회),0.039843
31,영업현금흐름/투자현금흐름(%),0.038277
11,총자본증가율(전년동기)(%),0.037254
16,현금비율(%),0.036125
33,GMI,0.030198


In [30]:
import pandas as pd
a = [0.0, 0.0062303011643075395, 0.0, 0.0, 0.0, 0.0, 0.0, 0.036497030489138785, 0.0, 0.0, 0.0, 0.1271919036954325, 0.04962482947335844, 0.0, 0.0, 0.1862693008532963, 0.1581940203595973, 0.04853480008912252, 0.0, 0.0, 0.0, 0.0, 0.08258613540881088, 0.0, 0.006888738127657402, 0.0, 0.06377131930546624, 0.009982093808995203, 0.07915925890745512, 0.0, 0.0, 0.04726029876061062, 0.0, 0.0, 0.049012203232311646, 0.0, 0.012516466579118138, 0.03628129974532134, 0.0, 0.0, 0.0]
col = X.columns

under_df3 = pd.DataFrame({'지표': col, '값':a})
under_df3.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
15,당좌비율(%),0.186269
16,현금비율(%),0.158194
11,총자본증가율(전년동기)(%),0.127192
22,매출채권회전율(회),0.082586
28,현금흐름/총부채(%),0.079159
26,자본금회전율(회),0.063771
12,유형자산증가율(전년동기)(%),0.049625
34,AQI,0.049012
17,유형자산구성비율(%),0.048535
31,영업현금흐름/투자현금흐름(%),0.04726


In [31]:
import pandas as pd
a = [0.0019422802807211124, 0.0, 0.0, 0.0, 0.034762402751543166, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06198640210128277, 0.07218279629092665, 0.13147160620944429, 0.0018539948134156466, 0.0, 0.0, 0.16131482845446532, 0.060393808284266506, 0.0, 0.05047851424761899, 0.0, 0.0, 0.13643336090643687, 0.029704664153406554, 0.02005073080646796, 0.0, 0.11303752735592214, 0.04492372047891728, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.029417162025550735, 0.022247937760987622, 0.0, 0.02779826307862631, 0.0]
col = X.columns

under_df5 = pd.DataFrame({'지표': col, '값':a})
under_df5.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
16,현금비율(%),0.161315
22,매출채권회전율(회),0.136433
12,유형자산증가율(전년동기)(%),0.131472
26,자본금회전율(회),0.113038
11,총자본증가율(전년동기)(%),0.072183
10,당기순이익증가율(전년동기)(%),0.061986
17,유형자산구성비율(%),0.060394
19,판관비율(%),0.050479
27,고정자산회전율(회),0.044924
4,영업수익/영업비용(%),0.034762


In [32]:
import pandas as pd
a = [0.0, 0.0, 0.0, 0.05573638051792532, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.036256876007537554, 0.15515327813702903, 0.0, 0.07424737493215108, 0.0, 0.2047316791169237, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1551818952414099, 0.0, 0.0, 0.005939789994572095, 0.1792353413897736, 0.05644600431205472, 0.0, 0.0, 0.0, 0.07707138035062308, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
col = X.columns

under_df7 = pd.DataFrame({'지표': col, '값':a})
under_df7.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
16,현금비율(%),0.204732
26,자본금회전율(회),0.179235
22,매출채권회전율(회),0.155182
12,유형자산증가율(전년동기)(%),0.155153
31,영업현금흐름/투자현금흐름(%),0.077071
14,유동비율(%),0.074247
27,고정자산회전율(회),0.056446
3,자본금영업이익률(%),0.055736
11,총자본증가율(전년동기)(%),0.036257
25,총자본회전율(회),0.00594


In [33]:
import pandas as pd
a = [0.0, 0.0, 0.0, 0.3960981538916488, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006532842384096819, 0.0, 0.0, 0.08166444328316703, 0.0, 0.0, 0.0, 0.30923933057614217, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0813346439925464, 0.024121264187434426, 0.0, 0.0, 0.08369149098629354, 0.0, 0.0, 0.0, 0.017317830698670864, 0.0, 0.0]
col = X.columns

under_df9 = pd.DataFrame({'지표': col, '값':a})
under_df9.sort_values(by='값', ascending= False).head(10)

Unnamed: 0,지표,값
3,자본금영업이익률(%),0.396098
16,현금비율(%),0.309239
34,AQI,0.083691
12,유형자산증가율(전년동기)(%),0.081664
30,현금흐름/총자본(%),0.081335
31,영업현금흐름/투자현금흐름(%),0.024121
38,LVGI,0.017318
9,영업이익증가율(전년동기)(%),0.006533
0,매출총이익률(%),0.0
25,총자본회전율(회),0.0


In [34]:
# 데이터 로드
data = pd.read_csv('./코스피이상치99%처리.csv')

# 타겟 변수 및 피처 선택
X = data.drop(columns=['Symbol', '상장연도', 'Name', '회계년', '분식기업'])
y = data['분식기업']

# 수치형 피처만 선택
X_numeric = X.select_dtypes(include=[np.number])

# 트레인/테스트 셋 분리 (벤포드 항목 포함)
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42, stratify=y)
dt = DecisionTreeClassifier()
param_grid = {'max_depth': [4, 10, 20], 'min_samples_leaf': [6, 10, 3], 'random_state' : [42]}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=kfold, scoring='recall', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.cv_results_

In [39]:
grid_search.best_estimator_

In [40]:
grid_search.best_params_

{'max_depth': 20, 'min_samples_leaf': 3, 'random_state': 42}

In [41]:
dt = DecisionTreeClassifier(max_depth=20, min_samples_leaf=3, random_state=42)
dt.fit(X_train, y_train)

In [46]:
dt_import = dt.feature_importances_
dt_import_df = pd.DataFrame(dt_import.reshape(1, -1), columns=X_train.columns)
dt_import_df.T.sort_values(by=0, ascending=False).head(10)

Unnamed: 0,0
무형자산구성비율(%),0.138756
매출채권회전율(회),0.120304
재고자산회전율(회),0.112378
판관비율(%),0.057919
총부채회전율(회),0.056515
영업현금흐름/투자현금흐름(%),0.055064
현금흐름/총부채(%),0.049718
고정자산회전율(회),0.046574
자본금회전율(회),0.042177
당좌비율(%),0.037895


In [None]:

            # 모델의 feature importance 추가
            if hasattr(best_model, 'feature_importances_'):
                feature_importances = best_model.feature_importances_
                results[model_name][method][ratio]['feature_importances'] = feature_importances

            # 테스트 세트를 사용하여 모델의 성능을 평가합니다.
            y_pred = best_model.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_pred)
            test_precision = precision_score(y_test, y_pred)
            test_recall = recall_score(y_test, y_pred)
            test_f1_score = f1_score(y_test, y_pred)
            test_confusion_matrix = confusion_matrix(y_test, y_pred)

            results[model_name][method][ratio]['test_score'] = {
                'accuracy': test_accuracy,
                'precision': test_precision,
                'recall': test_recall,
                'f1_score': test_f1_score,
                'confusion_matrix': test_confusion_matrix
            }

# 결과 출력
with open("model_results.txt", "w") as file:
    for model_name, res in results.items():
        file.write(f"{model_name}:\n")
        for method, ratios in res.items():
            file.write(f"  Resampling Method: {method}\n")
            for ratio, scores in ratios.items():
                file.write(f"    Ratio: {ratio}\n")
                file.write(f"      Best Parameters: {scores['best_params']}\n")
                file.write(f"      Mean CV Score: {scores['mean_cv_score']:.4f}\n")
                if 'feature_importances' in scores:
                    file.write(f"      Feature Importances: {scores['feature_importances'].tolist()}\n")
                test_scores = scores['test_score']
                file.write(f"      Test Accuracy: {test_scores['accuracy']:.4f}\n")
                file.write(f"      Test Precision: {test_scores['precision']:.4f}\n")
                file.write(f"      Test Recall: {test_scores['recall']:.4f}\n")
                file.write(f"      Test F1 Score: {test_scores['f1_score']:.4f}\n")
                file.write(f"      Test Confusion Matrix:\n")
                file.write(f"{test_scores['confusion_matrix']}\n")
            file.write("--------------------------------")