<a href="https://colab.research.google.com/github/rami2ee3/DA_study/blob/main/boram_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# =========================
# 1️⃣ 데이터 불러오기
# =========================
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/elecClassification_train/train.csv", encoding="utf-8")
test  = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/elecClassification_test/test.csv", encoding="utf-8")

# =========================
# 2️⃣ 전처리: 결측치/피처 준비
# =========================
# 불필요한 컬럼 제거
for df in [train, test]:
    for col in ["index", "ID", "Id"]:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)

# 수치형 컬럼만 추출
num_cols = [c for c in train.select_dtypes(include=np.number).columns]
print(f"[INFO] 수치형 컬럼: {len(num_cols)}")

# 결측치 채우기 (중앙값)
train[num_cols] = train[num_cols].fillna(train[num_cols].median())
test[num_cols]  = test[num_cols].fillna(train[num_cols].median())  # train 기준 median

# =========================
# 3️⃣ 파생 피처: 상별 전력 합/역률, 총전력
# =========================
for phase in ["R","S","T"]:
    train[f"{phase}전력합"] = np.sqrt(train[f"{phase}상유효전력"]**2 + train[f"{phase}상무효전력"]**2)
    train[f"{phase}역률"] = train[f"{phase}상유효전력"] / train[f"{phase}전력합"]

train["총전력합"] = train["R전력합"] + train["S전력합"] + train["T전력합"]

# 동일 피처를 test에도 생성
for phase in ["R","S","T"]:
    test[f"{phase}전력합"] = np.sqrt(test[f"{phase}상유효전력"]**2 + test[f"{phase}상무효전력"]**2)
    test[f"{phase}역률"] = test[f"{phase}상유효전력"] / test[f"{phase}전력합"]
test["총전력합"] = test["R전력합"] + test["S전력합"] + test["T전력합"]

# =========================
# 4️⃣ 라벨 매핑
# =========================
label_cols = ["label_역률평균","label_전류고조파평균","label_전압고조파평균"]
label_map  = {"정상":0, "주의":1, "경고":2}

for col in label_cols:
    train[col] = train[col].astype(str).str.strip()
    train[col] = train[col].map(label_map)

# NaN 확인
for col in label_cols:
    print(f"{col} NaN 개수:", train[col].isna().sum())
train = train.dropna(subset=label_cols)  # 라벨 NaN 제거

# =========================
# 5️⃣ 학습/예측 데이터 준비
# =========================
common_cols = [c for c in num_cols if c in test.columns]
X_train = train[common_cols].copy()
y_train = train[label_cols].copy()
X_test  = test[common_cols].copy()

print("[INFO] X_train:", X_train.shape, "y_train:", y_train.shape, "X_test:", X_test.shape)

# =========================
# 6️⃣ 모델 학습/예측 (LightGBM)
# =========================
try:
    import lightgbm as lgb
    BaseClf = lgb.LGBMClassifier
    clf_params = dict(objective="multiclass", num_class=3,
                      class_weight="balanced", n_estimators=300,
                      learning_rate=0.05, random_state=42)
except:
    from sklearn.ensemble import RandomForestClassifier
    BaseClf = RandomForestClassifier
    clf_params = dict(n_estimators=300, class_weight="balanced", random_state=42)

# pipeline
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", BaseClf(**clf_params))
])

# 학습/검증 분리
X_tr, X_va, y_tr_all, y_va_all = train_test_split(
    X_train, y_train, test_size=0.2,
    random_state=42,
    stratify=y_train[label_cols[0]] if y_train[label_cols[0]].nunique()>1 else None
)

reports = {}
pred_labels = {}
pred_probas = {}

for tgt in label_cols:
    pipe.fit(X_tr, y_tr_all[tgt])
    y_va_pred = pipe.predict(X_va)
    reports[tgt] = classification_report(y_va_all[tgt], y_va_pred, digits=4, zero_division=0)
    print(f"\n=== Validation Report: {tgt} ===\n{reports[tgt]}")

    pred_labels[tgt] = pipe.predict(X_test)
    try:
        pred_probas[tgt] = pipe.predict_proba(X_test)
    except:
        pred_probas[tgt] = None

# =========================
# 7️⃣ 결과 저장
# =========================
out = pd.DataFrame(index=X_test.index)
for tgt in label_cols:
    out[f"pred_{tgt}"] = pred_labels[tgt]
    proba = pred_probas[tgt]
    if proba is not None and proba.ndim==2 and proba.shape[1]>=3:
        out[f"{tgt}_정상_%"] = (proba[:,0]*100).round(2)
        out[f"{tgt}_주의_%"] = (proba[:,1]*100).round(2)
        out[f"{tgt}_경고_%"] = (proba[:,2]*100).round(2)

out.to_csv("predictions_model_clean.csv", index=False, encoding="utf-8-sig")
print("[INFO] Saved:", "predictions_model_clean.csv")


[INFO] 수치형 컬럼: 23
label_역률평균 NaN 개수: 0
label_전류고조파평균 NaN 개수: 0
label_전압고조파평균 NaN 개수: 0
[INFO] X_train: (2420565, 23) y_train: (2420565, 3) X_test: (313267, 23)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.227960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5791
[LightGBM] [Info] Number of data points in the train set: 1936452, number of used features: 23
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612





=== Validation Report: label_역률평균 ===
              precision    recall  f1-score   support

           0     0.9919    0.9512    0.9711    263476
           1     0.6605    0.9423    0.7766     30875
           2     0.9931    0.9806    0.9868    189762

    accuracy                         0.9622    484113
   macro avg     0.8818    0.9580    0.9115    484113
weighted avg     0.9712    0.9622    0.9649    484113





[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.138518 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5791
[LightGBM] [Info] Number of data points in the train set: 1936452, number of used features: 23
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612





=== Validation Report: label_전류고조파평균 ===
              precision    recall  f1-score   support

           0     0.9977    0.9825    0.9901    390961
           1     0.9175    0.9673    0.9417     78104
           2     0.8818    0.9839    0.9300     15048

    accuracy                         0.9801    484113
   macro avg     0.9323    0.9779    0.9539    484113
weighted avg     0.9812    0.9801    0.9804    484113





[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.142796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5791
[LightGBM] [Info] Number of data points in the train set: 1936452, number of used features: 23
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612





=== Validation Report: label_전압고조파평균 ===
              precision    recall  f1-score   support

           0     0.9285    0.9107    0.9195    198717
           1     0.8102    0.8550    0.8320    138719
           2     0.9374    0.9128    0.9249    146677

    accuracy                         0.8954    484113
   macro avg     0.8920    0.8928    0.8922    484113
weighted avg     0.8973    0.8954    0.8961    484113





[INFO] Saved: predictions_model_clean.csv
