In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
import time
import joblib
import io

# In-memory text buffer that accumulates every logged message so the full
# run log can be written to a report file at the end.
report_buffer = io.StringIO()


def log_result(message):
    """Emit *message* to standard output and append it to ``report_buffer``.

    Args:
        message: Any object accepted by ``print``; it is written followed by
            a newline to each destination.
    """
    # ``file=None`` makes ``print`` fall back to ``sys.stdout``.
    for destination in (None, report_buffer):
        print(message, file=destination)

# Load the labelled dataset from the working directory.
log_result("데이터 로딩 중...")
data = pd.read_csv('labeled_data.csv')

# Parse the timestamp column, then derive calendar / time-of-day features
# from it through a single reusable ``.dt`` accessor.
data['timestamp'] = pd.to_datetime(data['timestamp'])
timestamp_parts = data['timestamp'].dt
data['day_of_week'] = timestamp_parts.dayofweek
data['hour'] = timestamp_parts.hour
data['minute'] = timestamp_parts.minute
# 15-minute slot within the hour (0-3).
data['quarter_hour'] = data['minute'].floordiv(15)

# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
weekend_days = [5, 6]
# Hours flagged as rush hour (morning 7-9 and evening 17-19).
rush_hours = [7, 8, 9, 17, 18, 19]
data['is_weekend'] = data['day_of_week'].isin(weekend_days).astype(int)
data['is_rush_hour'] = data['hour'].isin(rush_hours).astype(int)

# Model inputs (features) and the three target columns predicted jointly.
features = [
    'hour', 'Holiday', 'enbid_pci', 'day_of_week', 'quarter_hour',
    'is_weekend', 'is_rush_hour',
]
target_columns = ['cell_2100', 'cell_2600_10', 'cell_2600_20']
X = data[features]
y = data[target_columns]

# Train and evaluate one multi-output LightGBM model per cluster.
#
# For every cluster id the rows of that cluster are split 80/20 into train
# and test sets, a MultiOutputClassifier (one LGBMClassifier per target
# column) is fitted, and accuracy plus a classification report are logged.
# The fitted models are kept in ``models`` (keyed by cluster id) and the
# held-out labels / predictions of all clusters are pooled into
# ``overall_y_true`` / ``overall_y_pred`` for a global evaluation.
clusters = [0, 2, 3, 4, 6, 7]  # cluster ids to process
models = {}
# Not used in this cell; retained in case other cells reference it.
predictions = pd.DataFrame()

# Target names are taken from ``y`` so reports and prediction frames always
# agree with the label columns.
target_names = list(y.columns)

# Columns LightGBM should treat as categorical (via pandas ``category`` dtype).
categorical_cols = ['hour', 'Holiday', 'enbid_pci', 'day_of_week', 'quarter_hour']
category_dtypes = {col: 'category' for col in categorical_cols}

# Per-cluster held-out labels / predictions, concatenated once after the loop
# instead of growing a DataFrame inside it.
y_true_parts = []
y_pred_parts = []

for cluster in clusters:
    log_result(f"\n클러스터 {cluster} 처리 중...")

    # Select this cluster's rows. ``astype`` returns a new frame, so the
    # categorical cast never writes into a slice of ``X``, and train/test
    # share identical category sets because the cast happens before the split.
    cluster_mask = data['cluster'] == cluster
    X_cluster = X.loc[cluster_mask].astype(category_dtypes)
    y_cluster = y.loc[cluster_mask]
    X_train, X_test, y_train, y_test = train_test_split(
        X_cluster, y_cluster, test_size=0.2, random_state=42
    )

    # Define and fit the model, timing the fit.
    lgb_model = MultiOutputClassifier(LGBMClassifier(
        n_estimators=100,
        learning_rate=0.01,
        num_leaves=31,
        random_state=42
    ))
    start_time = time.time()
    lgb_model.fit(X_train, y_train)
    train_time = time.time() - start_time
    log_result(f"학습 시간: {train_time:.2f} 초")

    # Predict on the held-out rows.
    y_pred = lgb_model.predict(X_test)

    # Evaluate. NOTE: with multi-label indicator targets scikit-learn's
    # ``accuracy_score`` is subset accuracy (every target of a row must match).
    accuracy = accuracy_score(y_test, y_pred)
    log_result(f"클러스터 {cluster} 정확도: {accuracy:.4f}")
    log_result("\n분류 보고서:")
    # ``zero_division=0`` yields the same scores as the default ("warn")
    # without emitting an UndefinedMetricWarning for every undefined metric.
    log_result(classification_report(
        y_test, y_pred, target_names=target_names, zero_division=0
    ))

    # Keep the fitted model for later persistence.
    models[cluster] = lgb_model

    # Collect held-out labels and predictions; predictions reuse the test
    # index so both pooled frames stay row-aligned.
    y_true_parts.append(y_test)
    y_pred_parts.append(
        pd.DataFrame(y_pred, columns=target_names, index=y_test.index)
    )

# Pool the per-cluster results for the overall evaluation.
overall_y_true = pd.concat(y_true_parts)
overall_y_pred = pd.concat(y_pred_parts)

# Overall evaluation on the held-out rows accumulated across all clusters
# in ``overall_y_true`` / ``overall_y_pred``.
overall_accuracy = accuracy_score(overall_y_true, overall_y_pred)
log_result(f"\n전체 정확도: {overall_accuracy:.4f}")
log_result("\n전체 분류 보고서:")
# ``zero_division=0`` produces the same scores as the default ("warn") but
# does not emit an UndefinedMetricWarning for each undefined metric.
log_result(classification_report(
    overall_y_true,
    overall_y_pred,
    target_names=['cell_2100', 'cell_2600_10', 'cell_2600_20'],
    zero_division=0,
))

# Persist every trained per-cluster model to its own joblib file.
for cluster, model in models.items():
    model_path = f'cluster_{cluster}_model.joblib'
    joblib.dump(model, model_path)
    log_result(f"클러스터 {cluster} 모델이 저장되었습니다.")

log_result("\n모든 처리가 완료되었습니다.")

# Snapshot everything logged so far and write it out as the report file.
report_text = report_buffer.getvalue()
with open('classification_report.txt', mode='w', encoding='utf-8') as report_file:
    report_file.write(report_text)

# Logged after the file is written, so this line is not part of the report.
log_result("결과 보고서가 'classification_report.txt' 파일로 저장되었습니다.")

# Release the in-memory buffer; log_result cannot be used after this point.
report_buffer.close()

데이터 로딩 중...

클러스터 0 처리 중...
[LightGBM] [Info] Number of positive: 88170, number of negative: 613935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009547 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 702105, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.125580 -> initscore=-1.940622
[LightGBM] [Info] Start training from score -1.940622
[LightGBM] [Info] Number of positive: 229080, number of negative: 473025
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009523 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 702105, number of used f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Number of positive: 143872, number of negative: 944086
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 214
[LightGBM] [Info] Number of data points in the train set: 1087958, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.132240 -> initscore=-1.881293
[LightGBM] [Info] Start training from score -1.881293
[LightGBM] [Info] Number of positive: 308788, number of negative: 779170
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 214
[LightGBM] [Info] Number of data points in the train set: 1087958, number of used features: 7
[LightGBM] [In

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Number of positive: 45719, number of negative: 392309
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006262 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 438028, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.104375 -> initscore=-2.149536
[LightGBM] [Info] Start training from score -2.149536
[LightGBM] [Info] Number of positive: 92944, number of negative: 345084
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005755 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 438028, number of used features: 7
[LightGBM] [Info] 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

   cell_2100       1.00      0.70      0.83     11455
cell_2600_10       0.95      0.84      0.89     23153
cell_2600_20       0.99      0.92      0.95     18425

   micro avg       0.97      0.84      0.90     53033
   macro avg       0.98      0.82      0.89     53033
weighted avg       0.97      0.84      0.90     53033
 samples avg       0.35      0.34      0.34     53033


클러스터 4 처리 중...
[LightGBM] [Info] Number of positive: 71698, number of negative: 559413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 143
[LightGBM] [Info] Number of data points in the train set: 631111, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113606 -> initscore=-2.054425
[LightGBM] [Info] Start training from score -

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Number of positive: 60730, number of negative: 467315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 128
[LightGBM] [Info] Number of data points in the train set: 528045, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115009 -> initscore=-2.040566
[LightGBM] [Info] Start training from score -2.040566
[LightGBM] [Info] Number of positive: 269787, number of negative: 258258
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007972 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 128
[LightGBM] [Info] Number of data points in the train set: 528045, number of used features: 7
[LightGBM] [Info]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[LightGBM] [Info] Number of positive: 170690, number of negative: 601970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 772660, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.220912 -> initscore=-1.260359
[LightGBM] [Info] Start training from score -1.260359
[LightGBM] [Info] Number of positive: 389966, number of negative: 382694
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010925 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 772660, number of used features: 7
[LightGBM] [Info

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



전체 정확도: 0.8851

전체 분류 보고서:
              precision    recall  f1-score   support

   cell_2100       0.97      0.53      0.69    145529
cell_2600_10       0.91      0.91      0.91    381830
cell_2600_20       0.94      0.86      0.90    221693

   micro avg       0.93      0.82      0.87    749052
   macro avg       0.94      0.77      0.83    749052
weighted avg       0.93      0.82      0.86    749052
 samples avg       0.50      0.49      0.49    749052

클러스터 0 모델이 저장되었습니다.
클러스터 2 모델이 저장되었습니다.
클러스터 3 모델이 저장되었습니다.
클러스터 4 모델이 저장되었습니다.
클러스터 6 모델이 저장되었습니다.
클러스터 7 모델이 저장되었습니다.

모든 처리가 완료되었습니다.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
