In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from joblib import dump

# 결과 저장 디렉토리
output_dir = "../results/models"
os.makedirs(output_dir, exist_ok=True)

# 2.2s

In [2]:
# 데이터 로드
df = pd.read_csv("../data/processed/data_cleaned.csv")

# X와 y로 분리 (y는 'Label' 컬럼)
X = df.drop(columns=['Label'])
y = df['Label']

# 훈련/테스트 데이터 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 10.7s

In [5]:
# 모델, 스케일러, PCA 종류
scalers = [None, MinMaxScaler(), StandardScaler()]
pca_components = [None, 0.99, 10]
models = [
    ('rf', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, tree_method="gpu_hist", gpu_id=0, random_state=42)),
    ('catboost', CatBoostClassifier(iterations=100, task_type="GPU", devices="0", random_state=42, verbose=0)),
    ('lightgbm', lgb.LGBMClassifier(n_estimators=100, random_state=42))
]

# 각 조합에 대해 모델 훈련 및 저장
for scaler in scalers:
    for pca in pca_components:
        for model_name, model in models:
            # 파이프라인 구성
            steps = []
            
            # 스케일러 추가
            if scaler:
                steps.append(('scaler', scaler))
            
            # PCA 추가
            if pca:
                pca_instance = PCA(n_components=pca, random_state=42)  # PCA의 상태 고정
                steps.append(('pca', pca_instance))
            
            # 모델 추가
            steps.append(('model', model))

            # 파이프라인 생성
            pipeline = Pipeline(steps)

            # 모델 훈련
            print(f"훈련 중: {model_name}, Scaler: {scaler}, PCA: {pca}")
            pipeline.fit(X_train, y_train)

            # 모델 저장
            model_filename = f"{model_name}_scaler_{str(scaler)}_pca_{str(pca)}.joblib"
            model_path = os.path.join(output_dir, model_filename)
            dump(pipeline, model_path)
            print(f"모델 저장 완료: {model_path}")

# 모델 훈련 및 저장이 끝난 후
print("모든 모델 훈련 및 저장이 완료되었습니다.")

# 285m 40.7s

훈련 중: rf, Scaler: None, PCA: None
모델 저장 완료: ../results/models\rf_scaler_None_pca_None.joblib
훈련 중: xgb, Scaler: None, PCA: None



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_None_pca_None.joblib
훈련 중: catboost, Scaler: None, PCA: None
모델 저장 완료: ../results/models\catboost_scaler_None_pca_None.joblib
훈련 중: lightgbm, Scaler: None, PCA: None
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.158092 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13310
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 63
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Start training from score -12.435628
[Light


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_None_pca_0.99.joblib
훈련 중: catboost, Scaler: None, PCA: 0.99
모델 저장 완료: ../results/models\catboost_scaler_None_pca_0.99.joblib
훈련 중: lightgbm, Scaler: None, PCA: 0.99
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030862 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 6
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Start training from score -12.435628
[LightGBM] [Info] Start training from score -7.562979
[LightGBM] [Info] 


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_None_pca_10.joblib
훈련 중: catboost, Scaler: None, PCA: 10
모델 저장 완료: ../results/models\catboost_scaler_None_pca_10.joblib
훈련 중: lightgbm, Scaler: None, PCA: 10
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 10
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Start training from score -12.435628
[LightGBM] [Info] Start training from score -7.562979
[LightGBM] [Info] Start t


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_MinMaxScaler()_pca_None.joblib
훈련 중: catboost, Scaler: MinMaxScaler(), PCA: None
모델 저장 완료: ../results/models\catboost_scaler_MinMaxScaler()_pca_None.joblib
훈련 중: lightgbm, Scaler: MinMaxScaler(), PCA: None
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.153877 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13310
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 63
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Sta


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_MinMaxScaler()_pca_0.99.joblib
훈련 중: catboost, Scaler: MinMaxScaler(), PCA: 0.99
모델 저장 완료: ../results/models\catboost_scaler_MinMaxScaler()_pca_0.99.joblib
훈련 중: lightgbm, Scaler: MinMaxScaler(), PCA: 0.99
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 16
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Start training from score -12.435628
[LightGBM] [Info] Start trainin


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_MinMaxScaler()_pca_10.joblib
훈련 중: catboost, Scaler: MinMaxScaler(), PCA: 10
모델 저장 완료: ../results/models\catboost_scaler_MinMaxScaler()_pca_10.joblib
훈련 중: lightgbm, Scaler: MinMaxScaler(), PCA: 10
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 10
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Start training from score -12.435628
[LightGBM] [Info] Start training from s


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_StandardScaler()_pca_None.joblib
훈련 중: catboost, Scaler: StandardScaler(), PCA: None
모델 저장 완료: ../results/models\catboost_scaler_StandardScaler()_pca_None.joblib
훈련 중: lightgbm, Scaler: StandardScaler(), PCA: None
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.449237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13326
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 63
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Start training from score -12.435628
[LightGBM] [Info] Star


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_StandardScaler()_pca_0.99.joblib
훈련 중: catboost, Scaler: StandardScaler(), PCA: 0.99
모델 저장 완료: ../results/models\catboost_scaler_StandardScaler()_pca_0.99.joblib
훈련 중: lightgbm, Scaler: StandardScaler(), PCA: 0.99
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.183158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7905
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 31
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Start training from score -12.435628
[LightGBM] [Info] Start


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



모델 저장 완료: ../results/models\xgb_scaler_StandardScaler()_pca_10.joblib
훈련 중: catboost, Scaler: StandardScaler(), PCA: 10
모델 저장 완료: ../results/models\catboost_scaler_StandardScaler()_pca_10.joblib
훈련 중: lightgbm, Scaler: StandardScaler(), PCA: 10
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051810 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 2264474, number of used features: 10
[LightGBM] [Info] Start training from score -0.219264
[LightGBM] [Info] Start training from score -5.872714
[LightGBM] [Info] Start training from score -6.163381
[LightGBM] [Info] Start training from score -6.187371
[LightGBM] [Info] Start training from score -6.243039
[LightGBM] [Info] Start training from score -2.507606
[LightGBM] [Info] Start training from score -5.614279
[LightGBM] [Info] Start training from score -12.435628
[LightGBM] [Info] Start trainin