In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("../data/processed/merged_data.csv")

# 데이터 전처리
X=df.drop(columns=['source', 'Label']) 
y=df['Label']

# 데이터 분할
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

# PCA로 차원 축소
n_components=10  # 축소할 차원 수
pca=PCA(n_components=n_components)
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.transform(X_test)

# Random Forest
rf_model=RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_pca, y_train)

# 예측 및 평가
y_pred=rf_model.predict(X_test_pca)

# 정확도 출력
accuracy=accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# 상세 평가 리포트
print(classification_report(y_test, y_pred))

Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    454434
           1       1.00      0.99      0.99      1612
           2       0.93      0.73      0.82      1173
           3       1.00      0.98      0.99      1173
           4       0.99      0.99      0.99      1072
           5       1.00      1.00      1.00     46486
           6       0.99      0.98      0.98      2074
           7       1.00      0.50      0.67         2
           8       0.82      0.74      0.78       308
           9       0.61      0.54      0.57       119
          10       1.00      0.14      0.25         7
          11       0.50      0.40      0.44         5
          12       0.86      0.78      0.82       390
          13       1.00      1.00      1.00     31711
          14       1.00      1.00      1.00     25583

    accuracy                           1.00    566149
   macro avg       0.91      0.78      0.82    566149
weighted av

In [3]:
# 클래스 분포를 확인하는 함수
def check_class_distribution(y, dataset_name="Dataset"):
    class_counts = y.value_counts(normalize=True) * 100  # 각 클래스 비율 계산 (%)
    print(f"Class distribution in {dataset_name}:")
    for cls, proportion in class_counts.items():
        print(f"Class {cls}: {proportion:.2f}%")
    print("-" * 30)

# 전체 데이터 클래스 분포
check_class_distribution(y, "Original Data")

# 훈련 데이터 클래스 분포
check_class_distribution(y_train, "Training Data")

# 테스트 데이터 클래스 분포
check_class_distribution(y_test, "Test Data")


Class distribution in Original Data:
Class 0: 80.30%
Class 5: 8.16%
Class 13: 5.61%
Class 14: 4.52%
Class 6: 0.36%
Class 1: 0.28%
Class 2: 0.21%
Class 3: 0.20%
Class 4: 0.19%
Class 12: 0.07%
Class 8: 0.05%
Class 9: 0.02%
Class 11: 0.00%
Class 10: 0.00%
Class 7: 0.00%
------------------------------
Class distribution in Training Data:
Class 0: 80.31%
Class 5: 8.15%
Class 13: 5.62%
Class 14: 4.52%
Class 6: 0.36%
Class 1: 0.28%
Class 2: 0.21%
Class 3: 0.20%
Class 4: 0.20%
Class 12: 0.07%
Class 8: 0.05%
Class 9: 0.02%
Class 11: 0.00%
Class 10: 0.00%
Class 7: 0.00%
------------------------------
Class distribution in Test Data:
Class 0: 80.27%
Class 5: 8.21%
Class 13: 5.60%
Class 14: 4.52%
Class 6: 0.37%
Class 1: 0.28%
Class 3: 0.21%
Class 2: 0.21%
Class 4: 0.19%
Class 12: 0.07%
Class 8: 0.05%
Class 9: 0.02%
Class 10: 0.00%
Class 11: 0.00%
Class 7: 0.00%
------------------------------
