In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model
import joblib
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier

# Load the cleaned dataset.
df_cleaned = pd.read_csv("../data/processed/data_cleaned.csv")

# Separate the feature matrix from the 'Label' target column.
X = df_cleaned.drop(columns=['Label'])
y = df_cleaned['Label']

# Stratified 80/20 split so both splits keep the label proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the output directory BEFORE anything is written into it; on a fresh
# checkout the directory does not exist yet and joblib.dump would fail.
output_dir = "../results/models2/"
os.makedirs(output_dir, exist_ok=True)

# Persist the fitted scaler (the scaler object, not the scaled data).
joblib.dump(scaler, os.path.join(output_dir, "scaler.pkl"))

# Load the previously trained autoencoder-classifier model.
autoencoder_classifier = load_model("../results/models1/autoencoder_classifier.h5")

# Build an encoder sub-model that maps the network input to the output of the
# layer named 'dense_2'.
# NOTE(review): assumes 'dense_2' is the bottleneck layer of the saved model;
# auto-generated Keras layer names depend on how the model was built - confirm.
encoder = Model(inputs=autoencoder_classifier.input, outputs=autoencoder_classifier.get_layer('dense_2').output)

# Encode both splits with the encoder sub-model.
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Report the feature dimensionality before and after encoding.
print("Original shape:", X_train_scaled.shape)  # (n_samples, n_original_features)
print("Reduced shape:", X_train_encoded.shape)  # (n_samples, n_encoded_features)




[1m70765/70765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 613us/step
[1m17692/17692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 685us/step
Original shape: (2264474, 63)
Reduced shape: (2264474, 56)


In [3]:
# Base learner for the bagging ensemble: a depth-limited random forest.
rf_params = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
base_estimator = RandomForestClassifier(**rf_params)

# Balanced-bagging (UDBB) wrapper around the random forest.
udbb_model = BalancedBaggingClassifier(
    estimator=base_estimator,   # keyword is `estimator` (formerly `base_estimator`)
    n_estimators=10,            # number of bagged models in the ensemble
    sampling_strategy='auto',   # let the sampler pick the resampling targets
    replacement=False,          # sample without replacement
    random_state=42,
    n_jobs=-1,                  # run in parallel
)

# Train on the encoder-reduced training features.
udbb_model.fit(X_train_encoded, y_train)

# Persist the trained ensemble next to the other artifacts.
model_path = os.path.join(output_dir, "udbb_rf_model.pkl")
joblib.dump(udbb_model, model_path)



Classification Report for UDBB + Random Forest:
              precision    recall  f1-score   support

           0     0.9999    0.7050    0.8270    454590
           1     0.2595    0.9912    0.4113      1588
           2     0.2777    0.8787    0.4221      1179
           3     0.1854    0.8723    0.3059      1159
           4     0.3317    0.9236    0.4881      1100
           5     0.8490    0.9205    0.8833     46215
           6     0.2606    0.9174    0.4058      2059
           7     0.0025    1.0000    0.0050         2
           8     0.0456    0.3189    0.0799       301
           9     0.0410    0.7538    0.0778       130
          10     0.0001    0.7500    0.0002         4
          11     0.0003    0.8571    0.0007         7
          12     0.0109    0.9771    0.0217       393
          13     0.7442    0.9960    0.8519     31786
          14     0.8137    0.9511    0.8771     25606

    accuracy                         0.7528    566119
   macro avg     0.3215    0.854

In [4]:
# Evaluate the trained ensemble on the encoded test split.
y_pred = udbb_model.predict(X_test_encoded)
print("Classification Report for AE + UDBB + Random Forest:")
print(classification_report(y_test, y_pred, digits=4))

# Save the same report as a CSV table (one row per class / aggregate).
evaluation_results = classification_report(y_test, y_pred, digits=4, output_dict=True)
results_df = pd.DataFrame(evaluation_results).transpose()
evaluation_path = os.path.join(output_dir, "udbb_rf_evaluation.csv")
results_df.to_csv(evaluation_path, index=True)

# Class distribution of the training labels.
# `y_train` is never modified by fitting: the balanced-bagging classifier
# under-samples internally for each bagged estimator, so these counts are the
# ORIGINAL training distribution, not a post-resampling one. The printed label
# was corrected accordingly.
print("Class distribution of training labels (before UDBB under-sampling):")
print(y_train.value_counts())

Classification Report for AE + UDBB + Random Forest:
              precision    recall  f1-score   support

           0     0.9999    0.7050    0.8270    454590
           1     0.2595    0.9912    0.4113      1588
           2     0.2777    0.8787    0.4221      1179
           3     0.1854    0.8723    0.3059      1159
           4     0.3317    0.9236    0.4881      1100
           5     0.8490    0.9205    0.8833     46215
           6     0.2606    0.9174    0.4058      2059
           7     0.0025    1.0000    0.0050         2
           8     0.0456    0.3189    0.0799       301
           9     0.0410    0.7538    0.0778       130
          10     0.0001    0.7500    0.0002         4
          11     0.0003    0.8571    0.0007         7
          12     0.0109    0.9771    0.0217       393
          13     0.7442    0.9960    0.8519     31786
          14     0.8137    0.9511    0.8771     25606

    accuracy                         0.7528    566119
   macro avg     0.3215    