## 모델 학습을 위한 데이터 전처리

In [6]:
import numpy as np
import os
import time
from google.colab import drive
from tqdm.notebook import tqdm
from skimage.feature import hog

# 3개의 핵심 모델 임포트
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import classification_report, roc_auc_score

# --- 1. 설정 및 데이터 로딩 ---
print("Google Drive 마운트 중...")
drive.mount('/content/drive', force_remount=True)

BASE_DIR = '/content/drive/MyDrive/ML_Project'
SAVE_PATH_TRAIN = os.path.join(BASE_DIR, 'colored_mnist_train.npz')
SAVE_PATH_TEST = os.path.join(BASE_DIR, 'colored_mnist_test.npz')

NUM_CLASSES_DIGIT = 10  # 숫자 (0~9)
NUM_CLASSES_COLOR = 7   # 색상 (무지개 7색)

# 학습 데이터 로드
print("학습 데이터 로딩 중...")
train_data = np.load(SAVE_PATH_TRAIN)
X_train_color = train_data['images']
y_train_digit = train_data['labels_digit']
y_train_fg = train_data['labels_fg']
y_train_bg = train_data['labels_bg']

# 테스트 데이터 로드
print("테스트 데이터 로딩 중...")
test_data = np.load(SAVE_PATH_TEST)
X_test_color = test_data['images']
y_test_digit = test_data['labels_digit']
y_test_fg = test_data['labels_fg']
y_test_bg = test_data['labels_bg']

print(f"\n원본 학습 데이터 Shape: {X_train_color.shape}")


# --- 2. 데이터 전처리 ---
# 2-1. 색상 분류용 (Color 3D -> 1D Flatten)
X_train_flat_color = X_train_color.reshape(X_train_color.shape[0], -1)
X_test_flat_color = X_test_color.reshape(X_test_color.shape[0], -1)

# 2-2. 숫자 분류용 (Color 3D -> Grayscale 2D -> 1D Flatten)
X_train_gray = np.dot(X_train_color[...,:3], [0.299, 0.587, 0.114])
X_test_gray = np.dot(X_test_color[...,:3], [0.299, 0.587, 0.114])
X_train_flat_gray = X_train_gray.reshape(X_train_gray.shape[0], -1)
X_test_flat_gray = X_test_gray.reshape(X_test_gray.shape[0], -1)

print(f"숫자 분류용 데이터 (Gray): {X_train_flat_gray.shape}")
print(f"색상 분류용 데이터 (Color): {X_train_flat_color.shape}")

Google Drive 마운트 중...


Exception ignored in: <function NpzFile.__del__ at 0x7a156d30fa60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/numpy/lib/_npyio_impl.py", line 226, in __del__
    self.close()
  File "/usr/local/lib/python3.12/dist-packages/numpy/lib/_npyio_impl.py", line 221, in close
    self.fid.close()
OSError: [Errno 107] Transport endpoint is not connected


Mounted at /content/drive
학습 데이터 로딩 중...


Exception ignored in: <function NpzFile.__del__ at 0x7a156d30fa60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/numpy/lib/_npyio_impl.py", line 226, in __del__
    self.close()
  File "/usr/local/lib/python3.12/dist-packages/numpy/lib/_npyio_impl.py", line 221, in close
    self.fid.close()
OSError: [Errno 107] Transport endpoint is not connected


테스트 데이터 로딩 중...

원본 학습 데이터 Shape: (60000, 28, 28, 3)
숫자 분류용 데이터 (Gray): (60000, 784)
색상 분류용 데이터 (Color): (60000, 2352)


## 모델 학습

In [9]:
# HOG 파라미터 (이 기본값이 MNIST에 최적화되어 있어)
ppc = 8
cpb = 2

# 학습 데이터에서 HOG 피처 추출
X_train_hog = np.array([hog(image, pixels_per_cell=(ppc, ppc), cells_per_block=(cpb, cpb), visualize=False) for image in tqdm(X_train_gray, desc="학습 데이터 HOG 변환")])

# 테스트 데이터에서 HOG 피처 추출
X_test_hog = np.array([hog(image, pixels_per_cell=(ppc, ppc), cells_per_block=(cpb, cpb), visualize=False) for image in tqdm(X_test_gray, desc="테스트 데이터 HOG 변환")])

print(f"HOG 피처 Shape: {X_train_hog.shape}")

# --- 4. HOG 피처로 모델 학습 및 비교 ---
print("\n--- HOG 피처로 최종 모델 학습 시작 ---")

# --- 4-1. RandomForest with HOG ---
print("[1/2] RandomForest 모델 학습 중 (HOG)...")
start_time = time.time()
digit_model_rf_hog = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
digit_model_rf_hog.fit(X_train_hog, y_train_digit)
print(f" -> 학습 완료 (소요 시간: {time.time() - start_time:.2f}초)")

# --- 4-2. XGBoost with HOG ---
print("[2/2] XGBoost 모델 학습 중 (HOG)...")
start_time = time.time()
digit_model_xgb_hog = xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, objective='multi:softmax', eval_metric='mlogloss', num_class=NUM_CLASSES_DIGIT)
digit_model_xgb_hog.fit(X_train_hog, y_train_digit)
print(f" -> 학습 완료 (소요 시간: {time.time() - start_time:.2f}초)")

# --- 4-3. LightGBM with HOG ---
print("[3/3] LightGBM 모델 학습 중 (HOG)...")
start_time = time.time()
digit_model_lgb_hog = lgb.LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1, objective='multiclass', num_class=NUM_CLASSES_DIGIT)
digit_model_lgb_hog.fit(X_train_hog, y_train_digit)
print(f" -> 학습 완료 (소요 시간: {time.time() - start_time:.2f}초)")

학습 데이터 HOG 변환:   0%|          | 0/60000 [00:00<?, ?it/s]

테스트 데이터 HOG 변환:   0%|          | 0/10000 [00:00<?, ?it/s]

HOG 피처 Shape: (60000, 144)
-> 784개의 픽셀이 훨씬 더 스마트한 피처로 압축됐어.

--- HOG 피처로 최종 모델 학습 시작 ---
[1/2] RandomForest 모델 학습 중 (HOG)...
 -> 학습 완료 (소요 시간: 32.09초)
[2/2] XGBoost 모델 학습 중 (HOG)...
 -> 학습 완료 (소요 시간: 54.37초)
[3/3] LightGBM 모델 학습 중 (HOG)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 64
[LightGBM] [Info] Start training from score -2.296437
[LightGBM] [Info] Start training from score -2.311290
[LightGBM] [Info] Start training from score -2.322278
[LightGBM] [Info] Start training from score -2.301752
[LightGBM] [Info] Start training from score -2.300421
[LightGBM] [Info] Start training from score -2.325513
[LightGBM] [Info] Start training from score -2.277892
[LightGBM] [Info] Start trai

In [4]:
print("\n--- 3종 모델 학습 시작 (총 9개) ---")

# 학습할 모델과 데이터를 '작업 목록'으로 정의
training_jobs = [
    # RandomForest
    ("RandomForest - Digit", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1), X_train_flat_gray, y_train_digit),
    ("RandomForest - FG Color", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1), X_train_flat_color, y_train_fg),
    ("RandomForest - BG Color", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1), X_train_flat_color, y_train_bg),
    # XGBoost
    ("XGBoost - Digit", xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, objective='multi:softmax', eval_metric='mlogloss', num_class=NUM_CLASSES_DIGIT), X_train_flat_gray, y_train_digit),
    ("XGBoost - FG Color", xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, objective='multi:softmax', eval_metric='mlogloss', num_class=NUM_CLASSES_COLOR), X_train_flat_color, y_train_fg),
    ("XGBoost - BG Color", xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, objective='multi:softmax', eval_metric='mlogloss', num_class=NUM_CLASSES_COLOR), X_train_flat_color, y_train_bg),
    # LightGBM
    ("LightGBM - Digit", lgb.LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1, objective='multiclass', num_class=NUM_CLASSES_DIGIT), X_train_flat_gray, y_train_digit),
    ("LightGBM - FG Color", lgb.LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1, objective='multiclass', num_class=NUM_CLASSES_COLOR), X_train_flat_color, y_train_fg),
    ("LightGBM - BG Color", lgb.LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1, objective='multiclass', num_class=NUM_CLASSES_COLOR), X_train_flat_color, y_train_bg),
]

# 학습된 모델들을 저장할 딕셔너리
trained_models = {}

# TQDM으로 작업 목록을 감싸서 진행도 표시
for name, model, X_train_data, y_train_data in tqdm(training_jobs, desc="Overall Training Progress"):
    model.fit(X_train_data, y_train_data)
    trained_models[name] = model # 학습 완료된 모델을 이름과 함께 저장

print("--- 모든 모델 학습 완료 ---")


--- 3종 모델 학습 시작 (총 9개) ---


Overall Training Progress:   0%|          | 0/9 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.201257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6272
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 784
[LightGBM] [Info] Start training from score -2.296437
[LightGBM] [Info] Start training from score -2.311290
[LightGBM] [Info] Start training from score -2.322278
[LightGBM] [Info] Start training from score -2.301752
[LightGBM] [Info] Start training from score -2.300421
[LightGBM] [Info] Start training from score -2.325513
[LightGBM] [Info] Start training from score -2.277892
[LightGBM] [Info] Start training from score -2.272541
[LightGBM] [Info] Start training from score -2.312972
[LightGBM] [Info] Start training from score -2.306091
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.392822 seconds.
Yo

## 모델 성능 평가

In [5]:
def evaluate_model(model_name, model_obj):
    print(f"\n===== {model_name} =====")

    # 모델 이름에 따라 올바른 테스트 데이터를 선택
    if "Digit" in model_name:
        X_test, y_test = X_test_flat_gray, y_test_digit
    elif "FG Color" in model_name:
        X_test, y_test = X_test_flat_color, y_test_fg
    else: # BG Color
        X_test, y_test = X_test_flat_color, y_test_bg

    y_pred = model_obj.predict(X_test)
    y_proba = model_obj.predict_proba(X_test)

    print(classification_report(y_test, y_pred, digits=4))
    auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')
    print(f"** Weighted-Average AUC: {auc:.4f} **")

print("\n--- 최종 성능 평가 (Test Set) ---")

# 학습된 모델들을 순회하며 평가
for name, model in trained_models.items():
    evaluate_model(name, model)


--- 최종 성능 평가 (Test Set) ---

===== RandomForest - Digit =====
              precision    recall  f1-score   support

           0     0.9726    0.9803    0.9764      1013
           1     0.9652    0.9808    0.9729       989
           2     0.9431    0.9470    0.9451       963
           3     0.9433    0.9297    0.9364      1038
           4     0.9514    0.9444    0.9479       953
           5     0.9639    0.9387    0.9511       995
           6     0.9500    0.9786    0.9641      1028
           7     0.9530    0.9606    0.9568       991
           8     0.9510    0.9135    0.9318      1040
           9     0.9150    0.9354    0.9251       990

    accuracy                         0.9508     10000
   macro avg     0.9508    0.9509    0.9508     10000
weighted avg     0.9509    0.9508    0.9507     10000

** Weighted-Average AUC: 0.9974 **

===== RandomForest - FG Color =====
              precision    recall  f1-score   support

           0     0.9993    1.0000    0.9997      14



              precision    recall  f1-score   support

           0     0.9572    0.9724    0.9647      1013
           1     0.9667    0.9676    0.9672       989
           2     0.9068    0.9294    0.9179       963
           3     0.9366    0.9104    0.9233      1038
           4     0.9272    0.9224    0.9248       953
           5     0.9446    0.9246    0.9345       995
           6     0.9427    0.9611    0.9518      1028
           7     0.9443    0.9405    0.9424       991
           8     0.9265    0.9096    0.9180      1040
           9     0.9057    0.9212    0.9134       990

    accuracy                         0.9359     10000
   macro avg     0.9358    0.9359    0.9358     10000
weighted avg     0.9360    0.9359    0.9359     10000

** Weighted-Average AUC: 0.9959 **

===== LightGBM - FG Color =====




              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000      1433
           1     1.0000    1.0000    1.0000      1435
           2     1.0000    1.0000    1.0000      1426
           3     1.0000    1.0000    1.0000      1441
           4     1.0000    1.0000    1.0000      1438
           5     1.0000    1.0000    1.0000      1371
           6     1.0000    1.0000    1.0000      1456

    accuracy                         1.0000     10000
   macro avg     1.0000    1.0000    1.0000     10000
weighted avg     1.0000    1.0000    1.0000     10000

** Weighted-Average AUC: 1.0000 **

===== LightGBM - BG Color =====




              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000      1426
           1     1.0000    1.0000    1.0000      1383
           2     1.0000    1.0000    1.0000      1407
           3     1.0000    1.0000    1.0000      1468
           4     1.0000    1.0000    1.0000      1464
           5     1.0000    1.0000    1.0000      1424
           6     1.0000    1.0000    1.0000      1428

    accuracy                         1.0000     10000
   macro avg     1.0000    1.0000    1.0000     10000
weighted avg     1.0000    1.0000    1.0000     10000

** Weighted-Average AUC: 1.0000 **


In [10]:
print("\n--- HOG 모델 최종 성능 평가 (Test Set) ---")

def evaluate_model_hog(model, model_name, X_test_hog_data, y_test_digit_data):
    print(f"\n===== {model_name} =====")
    y_pred = model.predict(X_test_hog_data)
    y_proba = model.predict_proba(X_test_hog_data)

    print(classification_report(y_test_digit_data, y_pred, digits=4))
    auc = roc_auc_score(y_test_digit_data, y_proba, multi_class='ovr', average='weighted')
    print(f"** Weighted-Average AUC: {auc:.4f} **")

# HOG로 학습된 두 모델의 성능을 나란히 출력
evaluate_model_hog(digit_model_rf_hog, "RandomForest - Digit (HOG)", X_test_hog, y_test_digit)
evaluate_model_hog(digit_model_xgb_hog, "XGBoost - Digit (HOG)", X_test_hog, y_test_digit)
evaluate_model_hog(digit_model_lgb_hog, "LightGBM - Digit (HOG)", X_test_hog, y_test_digit)


--- HOG 모델 최종 성능 평가 (Test Set) ---

===== RandomForest - Digit (HOG) =====
              precision    recall  f1-score   support

           0     0.9786    0.9911    0.9848      1013
           1     0.9909    0.9899    0.9904       989
           2     0.9790    0.9699    0.9744       963
           3     0.9700    0.9653    0.9676      1038
           4     0.9688    0.9790    0.9739       953
           5     0.9799    0.9779    0.9789       995
           6     0.9788    0.9893    0.9840      1028
           7     0.9763    0.9566    0.9664       991
           8     0.9532    0.9596    0.9564      1040
           9     0.9686    0.9646    0.9666       990

    accuracy                         0.9743     10000
   macro avg     0.9744    0.9743    0.9743     10000
weighted avg     0.9743    0.9743    0.9743     10000

** Weighted-Average AUC: 0.9993 **

===== XGBoost - Digit (HOG) =====
              precision    recall  f1-score   support

           0     0.9911    0.9901    0.9



              precision    recall  f1-score   support

           0     0.9911    0.9891    0.9901      1013
           1     0.9949    0.9889    0.9919       989
           2     0.9853    0.9761    0.9807       963
           3     0.9750    0.9759    0.9754      1038
           4     0.9759    0.9769    0.9764       953
           5     0.9918    0.9779    0.9848       995
           6     0.9827    0.9942    0.9884      1028
           7     0.9725    0.9627    0.9675       991
           8     0.9619    0.9712    0.9665      1040
           9     0.9613    0.9778    0.9695       990

    accuracy                         0.9791     10000
   macro avg     0.9792    0.9791    0.9791     10000
weighted avg     0.9792    0.9791    0.9791     10000

** Weighted-Average AUC: 0.9996 **
