In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import joblib
from sklearn.ensemble import RandomForestClassifier

# 데이터 로드
df_cleaned = pd.read_csv("../data/processed/data_cleaned.csv")

# 데이터 분할
X = df_cleaned.drop(columns=['Label'])
y = df_cleaned['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 정규화된 데이터를 저장
joblib.dump(scaler, "../results/models1/scaler.pkl")

# 결과 저장 경로 생성
output_dir = "../results/models1/"
os.makedirs(output_dir, exist_ok=True)


In [2]:
# 1. Autoencoder + Classifier 모델 정의
input_dim = X_train_scaled.shape[1]  # 입력 차원 (63)
hidden_dim1 = 56  # 중간 은닉층
encoding_dim = 49  # 차원 축소 목표

# 입력층
input_layer = Input(shape=(input_dim,))

# 인코더
encoded = Dense(hidden_dim1, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)

# 디코더 (Autoencoder의 복원 부분)
decoded = Dense(hidden_dim1, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

# 분류기 (분류 태스크를 위한 출력 레이어 추가)
classification_output = Dense(1, activation='sigmoid')(encoded)

# Autoencoder + Classifier 모델 정의
autoencoder_classifier = Model(inputs=input_layer, outputs=[decoded, classification_output])
autoencoder_classifier.compile(
    optimizer=Adam(),
    loss=['mean_squared_error', 'binary_crossentropy'],  # Autoencoder 복원 + 분류 손실
    loss_weights=[0.5, 0.5],  # 손실 가중치
    metrics={'dense_4': 'accuracy'}  # 분류기의 출력에 해당하는 정확도 추적 (dense_4는 classification_output 레이어)
)

# EarlyStopping 콜백
early_stopping = EarlyStopping(
    monitor='val_dense_4_accuracy',  # classification_output의 정확도를 모니터링
    patience=5,
    restore_best_weights=True,
    mode='max'  # 정확도는 최대화해야 하므로 'max'로 설정
)
# 2. Autoencoder + Classifier 학습
y_binary_train = (y_train != 0).astype(int)
y_binary_test = (y_test != 0).astype(int)

autoencoder_classifier.fit(
    X_train_scaled, [X_train_scaled, y_binary_train],  # 입력 데이터와 분류 레이블
    epochs=50,
    batch_size=128,
    validation_data=(X_test_scaled, [X_test_scaled, y_binary_test]),
    callbacks=[early_stopping]
)

# 3. 평가: Autoencoder + Classifier
_, y_pred_binary = autoencoder_classifier.predict(X_test_scaled)

y_pred_binary = (y_pred_binary > 0.5).astype(int)
print("Classification Report for Autoencoder + Classifier:")
print(classification_report(y_binary_test, y_pred_binary))

# 모델 저장
autoencoder_classifier.save(os.path.join(output_dir, "autoencoder_classifier.h5"))


Epoch 1/50
[1m17692/17692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 2ms/step - dense_3_loss: 0.7035 - dense_4_accuracy: 0.9669 - dense_4_loss: 0.0767 - loss: 0.3901 - val_dense_3_loss: 0.6911 - val_dense_4_accuracy: 0.9819 - val_dense_4_loss: 0.0458 - val_loss: 0.3685
Epoch 2/50
[1m17692/17692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - dense_3_loss: 0.7688 - dense_4_accuracy: 0.9795 - dense_4_loss: 0.0475 - loss: 0.4081 - val_dense_3_loss: 0.6907 - val_dense_4_accuracy: 0.9797 - val_dense_4_loss: 0.0446 - val_loss: 0.3677
Epoch 3/50
[1m17692/17692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step - dense_3_loss: 0.7015 - dense_4_accuracy: 0.9820 - dense_4_loss: 0.0414 - loss: 0.3714 - val_dense_3_loss: 0.6905 - val_dense_4_accuracy: 0.9850 - val_dense_4_loss: 0.0357 - val_loss: 0.3631
Epoch 4/50
[1m17692/17692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - dense_3_loss: 0.6816 - dense_4_accuracy: 0.9851 - dense_4_



              precision    recall  f1-score   support

           0       1.00      1.00      1.00    454590
           1       0.99      0.99      0.99    111529

    accuracy                           1.00    566119
   macro avg       0.99      0.99      0.99    566119
weighted avg       1.00      1.00      1.00    566119



In [3]:
# 4. Autoencoder 차원 축소 (인코더만 추출하여 사용)
encoder = Model(inputs=input_layer, outputs=encoded)

# 차원 축소된 데이터
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# 축소된 차원 확인
print("Original shape:", X_train_scaled.shape)  # 원래 차원: (샘플 수, 63)
print("Reduced shape:", X_train_encoded.shape)  # 축소 차원: (샘플 수, 49)

# 5. Random Forest 모델 학습
rf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
rf.fit(X_train_encoded, y_binary_train)

# Random Forest 모델 저장
joblib.dump(rf, os.path.join(output_dir, "random_forest.pkl"))

# 6. 평가: Random Forest
y_pred_rf = rf.predict(X_test_encoded)
print("Classification Report for Random Forest:")
print(classification_report(y_binary_test, y_pred_rf))

# 7. 인코더 모델 저장
encoder.save(os.path.join(output_dir, "encoder.h5"))

[1m70765/70765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 611us/step
[1m17692/17692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 600us/step
Original shape: (2264474, 63)
Reduced shape: (2264474, 49)
Classification Report for Random Forest:




              precision    recall  f1-score   support

           0       1.00      1.00      1.00    454590
           1       1.00      0.99      0.99    111529

    accuracy                           1.00    566119
   macro avg       1.00      0.99      1.00    566119
weighted avg       1.00      1.00      1.00    566119

