In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
import time

# 데이터 로드
print("데이터 로딩 중...")
data = pd.read_csv('labeled_data.csv')

# timestamp를 datetime으로 변환하고 추가 특성 생성
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['hour'] = data['timestamp'].dt.hour
data['minute'] = data['timestamp'].dt.minute

data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)


# 새로운 특성 추가


data['sin_day'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
data['cos_day'] = np.cos(2 * np.pi * data['day_of_week'] / 7)

# 특성과 타겟 정의
categorical_features = [
    'hour', 'Holiday', 'enbid_pci', 'day_of_week', 'is_weekend'
]
numeric_features = [ 'sin_day', 'cos_day']
features = categorical_features + numeric_features

X = data[features]
y = data[['cell_2100', 'cell_2600_10', 'cell_2600_20']]

# 범주형 변수 처리
for col in categorical_features:
    X[col] = X[col].astype('category')

# 레이블 인코딩
label_encoders = {}
for col in y.columns:
    le = LabelEncoder()
    y[col] = le.fit_transform(y[col])
    label_encoders[col] = le

# 데이터 분할
print("데이터 분할 중...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y['cell_2100'])

# LightGBM 모델 정의
lgb_model = MultiOutputClassifier(LGBMClassifier(
    n_estimators=500,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    categorical_feature=categorical_features,
    class_weight='balanced'
))

# 모델 학습
print("모델 학습 중...")
start_time = time.time()
lgb_model.fit(X_train, y_train)
train_time = time.time() - start_time
print(f"학습 시간: {train_time:.2f} 초")

# 예측
print("예측 중...")
start_time = time.time()
y_pred = lgb_model.predict(X_test)
predict_time = time.time() - start_time
print(f"예측 시간: {predict_time:.2f} 초")



# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
print(f"\n정확도: {accuracy:.4f}")

# 분류 보고서 출력 및 저장
print("\n분류 보고서:")
report = classification_report(y_test, y_pred, target_names=['cell_2100', 'cell_2600_10', 'cell_2600_20'])
print(report)

with open('classification_report.txt', 'w') as f:
    f.write(report)
print("분류 보고서가 'classification_report.txt' 파일로 저장되었습니다.")

# 특성 중요도 계산 및 저장
feature_importance = multi_model.estimators_[0].feature_importances_
feature_importance_dict = dict(zip(X.columns, feature_importance))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

print("\n특성 중요도 (상위 10개):")
for feature, importance in sorted_features[:10]:
    print(f"{feature}: {importance:.4f}")

# 특성 중요도를 DataFrame으로 변환
importance_df = pd.DataFrame(sorted_features, columns=['Feature', 'Importance'])
importance_df.to_csv('feature_importance.csv', index=False)
print("전체 특성 중요도가 'feature_importance.csv' 파일로 저장되었습니다.")

# 모델 저장
import joblib
model_filename = 'lgbm_cell_labeling_model.joblib'
joblib.dump(multi_model, model_filename)
print(f"\n모델이 {model_filename} 파일로 저장되었습니다.")

데이터 로딩 중...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[col] = le.fit_transform(y[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

데이터 분할 중...
모델 학습 중...


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Number of positive: 581126, number of negative: 3578784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 4159910, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Number of positive: 1525706, number of negative: 2634204
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 4159910, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Number of positive: 885599, number of negative: 3274311
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 4159910, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
학습 시간: 106.80 초
예측 중...
예측 시간: 15.48 초

정확도: 0.8250

분류 보고서:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

   cell_2100       0.59      0.92      0.71    145282
cell_2600_10       0.86      0.95      0.90    380615
cell_2600_20       0.84      0.97      0.90    221811

   micro avg       0.79      0.95      0.86    747708
   macro avg       0.76      0.94      0.84    747708
weighted avg       0.80      0.95      0.87    747708
 samples avg       0.49      0.55      0.51    747708

분류 보고서가 'classification_report.txt' 파일로 저장되었습니다.


NameError: name 'multi_model' is not defined