In [2]:
!pip install -q lightgbm scikit-learn pandas numpy matplotlib
import numpy as np, pandas as pd, matplotlib.pyplot as plt, os, json
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
SRC = "../total_dataset_final.csv"

df = pd.read_csv(SRC)
print(df.shape)
df.head(3)


(17540, 49)


Unnamed: 0,timestamp,SO2,CO,O3,NO2,PM10,PM25,WS,PS,TA,...,RN_t_minus_1,RN_t_minus_2,VS_t_minus_1,VS_t_minus_2,Traffic_t_minus_1,Traffic_t_minus_2,WD_sin_t_minus_1,WD_sin_t_minus_2,WD_cos_t_minus_1,WD_cos_t_minus_2
0,2023-01-01 02:00:00,0.004,1.0,0.002,0.047,52.0,52.0,1.9,1028.7,1.5,...,0.0,0.0,1104.0,1503.0,3950.0,3873.0,-0.939693,-0.34202,-0.34202,-0.939693
1,2023-01-01 03:00:00,0.004,1.3,0.002,0.05,57.0,54.0,1.6,1029.0,1.6,...,0.0,0.0,925.0,1104.0,2389.0,3950.0,-0.939693,-0.939693,-0.34202,-0.34202
2,2023-01-01 04:00:00,0.004,1.3,0.002,0.052,67.0,57.0,1.4,1029.0,1.5,...,0.0,0.0,794.0,925.0,1916.0,2389.0,-0.939693,-0.939693,-0.34202,-0.34202


In [9]:
required = ["timestamp", "PM25_t_plus_1","PM25_t_plus_2","PM25_t_plus_3"]
missing = [c for c in required if c not in df.columns]
assert not missing, f"필수 컬럼 누락: {missing}"


In [10]:
import numpy as np
import pandas as pd

TARGETS = ["PM25_t_plus_1","PM25_t_plus_2","PM25_t_plus_3"]
THRESH = 35.0

def clip_range(s, low=None, high=None):
    s = pd.to_numeric(s, errors="coerce").copy()
    if low is not None:

        s[s < low] = np.nan if low == 0 else low
    if high is not None:
        s[s > high] = high
    return s

def preprocess_and_feature(df):
    df = df.copy()


    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    df = df.dropna(subset=["timestamp"]).sort_values("timestamp").reset_index(drop=True)


    if "HM" in df.columns:
        df["HM"] = clip_range(df["HM"], 0, 100)
    for col in ["PM25","PM10","SO2","NO2","O3","CO","WS","RN"]:
        if col in df.columns:
            df[col] = clip_range(df[col], 0, None)


    candidates = [c for c in df.columns if c not in ["timestamp"] + TARGETS]
    for c in candidates:
        if df[c].dtype == "O":
            df[c] = pd.to_numeric(df[c], errors="coerce")

    num_cols = [c for c in candidates if np.issubdtype(df[c].dtype, np.number)]


    df = df.set_index("timestamp")

    if not df.index.is_monotonic_increasing:
        df = df.sort_index()


    df[num_cols] = df[num_cols].interpolate(method="time", limit_direction="both")
    for c in num_cols:
        df[c] = df[c].fillna(df[c].rolling(3, min_periods=1).median())

    df = df.reset_index()


    df["hour"] = df["timestamp"].dt.hour
    df["dow"] = df["timestamp"].dt.dayofweek
    df["month"] = df["timestamp"].dt.month
    df["is_weekend"] = (df["dow"] >= 5).astype(int)
    df["is_night"] = ((df["hour"] <= 6) | (df["hour"] >= 22)).astype(int)


    base = [c for c in ["PM25","PM10","NO2","O3","SO2","WS","TA","HM"] if c in df.columns]

    df = df.sort_values("timestamp").reset_index(drop=True)

    for col in base:
        df[f"{col}_roll3_mean"] = df[col].rolling(3, min_periods=1).mean()
        df[f"{col}_roll3_std"]  = df[col].rolling(3, min_periods=2).std()
        df[f"{col}_roll6_mean"] = df[col].rolling(6, min_periods=1).mean()
        df[f"{col}_roll6_std"]  = df[col].rolling(6, min_periods=2).std()
        for k in [1,2,3,4,5,6]:
            df[f"{col}_lag{k}"] = df[col].shift(k)

    if set(["WS","WD_sin","WD_cos"]).issubset(df.columns):
        df["WS_WDsin"] = df["WS"] * df["WD_sin"]
        df["WS_WDcos"] = df["WS"] * df["WD_cos"]
    if set(["TA","HM"]).issubset(df.columns):
        df["TAxHM"] = df["TA"] * df["HM"]

    feature_cols = [c for c in df.columns if c not in TARGETS + ["timestamp"]]
    feature_cols = [c for c in feature_cols if "_t_plus_" not in c]

    return df, feature_cols

df, feature_cols = preprocess_and_feature(df)
len(df), len(feature_cols), feature_cols[:10]


(17540,
 133,
 ['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25', 'WS', 'PS', 'TA', 'HM'])

In [11]:
n = len(df)
train_end = int(n*0.70)
valid_end = int(n*0.85)

df["split"] = "test"
df.loc[:train_end-1, "split"] = "train"
df.loc[train_end:valid_end-1, "split"] = "valid"

X_train = df.loc[df["split"]=="train", feature_cols]
X_valid = df.loc[df["split"]=="valid", feature_cols]
X_test  = df.loc[df["split"]=="test",  feature_cols]

y_train = df.loc[df["split"]=="train", TARGETS].copy()
y_valid = df.loc[df["split"]=="valid", TARGETS].copy()
y_test  = df.loc[df["split"]=="test",  TARGETS].copy()

imputer = SimpleImputer(strategy="median")
imputer.fit(X_train)

X_train = pd.DataFrame(imputer.transform(X_train), columns=feature_cols, index=X_train.index)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=feature_cols, index=X_valid.index)
X_test  = pd.DataFrame(imputer.transform(X_test),  columns=feature_cols, index=X_test.index)

X_train.shape, X_valid.shape, X_test.shape


((12278, 133), (2631, 133), (2631, 133))

In [12]:
params = dict(
    n_estimators=2000,
    learning_rate=0.03,
    num_leaves=127,
    max_depth=10,
    min_child_samples=60,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

models = {}
for tgt in TARGETS:
    m = lgb.LGBMRegressor(**params)
    m.fit(
        X_train, y_train[tgt].values,
        eval_set=[(X_valid, y_valid[tgt].values)],
        eval_metric="l1",
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    models[tgt] = m

models.keys()


dict_keys(['PM25_t_plus_1', 'PM25_t_plus_2', 'PM25_t_plus_3'])

In [13]:
def eval_split(X, y, name):
    row = {"split": name}
    for t in TARGETS:
        p = models[t].predict(X)
        row[f"MAE_{t}"] = mean_absolute_error(y[t].values, p)
    row["MAE_macro"] = np.mean([row[f"MAE_{t}"] for t in TARGETS])
    return row

metrics = pd.DataFrame([
    eval_split(X_train, y_train, "train"),
    eval_split(X_valid, y_valid, "valid"),
    eval_split(X_test,  y_test,  "test")
])
metrics


Unnamed: 0,split,MAE_PM25_t_plus_1,MAE_PM25_t_plus_2,MAE_PM25_t_plus_3,MAE_macro
0,train,1.528464,1.914395,2.456707,1.966522
1,valid,2.577445,3.311845,3.963089,3.284126
2,test,2.040394,2.713315,3.261111,2.671607


In [14]:
preds = df[["timestamp","split"]].copy()
for t in TARGETS:
    preds[f"yhat_{t}"] = np.nan

preds.loc[preds["split"]=="train", [f"yhat_{t}" for t in TARGETS]] = np.column_stack(
    [models[t].predict(X_train) for t in TARGETS]
)
preds.loc[preds["split"]=="valid", [f"yhat_{t}" for t in TARGETS]] = np.column_stack(
    [models[t].predict(X_valid) for t in TARGETS]
)
preds.loc[preds["split"]=="test",  [f"yhat_{t}" for t in TARGETS]] = np.column_stack(
    [models[t].predict(X_test) for t in TARGETS]
)

for t in TARGETS:
    preds[t] = df[t]

for t in TARGETS:
    preds[f"vent_ok_{t}"] = (preds[f"yhat_{t}"] <= THRESH).astype(int)

preds.head(5)


Unnamed: 0,timestamp,split,yhat_PM25_t_plus_1,yhat_PM25_t_plus_2,yhat_PM25_t_plus_3,PM25_t_plus_1,PM25_t_plus_2,PM25_t_plus_3,vent_ok_PM25_t_plus_1,vent_ok_PM25_t_plus_2,vent_ok_PM25_t_plus_3
0,2023-01-01 02:00:00,train,53.664579,53.666489,53.150684,54.0,57.0,61.0,0,0,0
1,2023-01-01 03:00:00,train,57.167879,55.874118,44.342773,57.0,61.0,41.0,0,0,0
2,2023-01-01 04:00:00,train,58.896289,47.600144,44.356318,61.0,41.0,34.0,0,0,0
3,2023-01-01 05:00:00,train,45.529256,40.018959,46.228268,41.0,34.0,47.0,0,0,0
4,2023-01-01 06:00:00,train,35.584871,41.85851,45.51875,34.0,47.0,55.0,0,0,0


In [15]:
os.makedirs("outputs", exist_ok=True)


for t in TARGETS:
    imp = models[t].feature_importances_
    imp_df = pd.DataFrame({"feature": feature_cols, "importance": imp}) \
                .sort_values("importance", ascending=False).head(30)
    plt.figure(figsize=(8,10))
    plt.barh(imp_df["feature"][::-1], imp_df["importance"][::-1])  # 색상 지정 X
    plt.title(f"Feature Importance (Top 30) - {t}")
    plt.xlabel("Importance")
    plt.tight_layout()
    plt.savefig(f"outputs/featimp_{t}.png", dpi=150)
    plt.close()


metrics.to_csv("outputs/metrics.csv", index=False)
preds.to_csv("outputs/predictions.csv", index=False)

with open("outputs/report.txt","w",encoding="utf-8") as f:
    f.write("=== LightGBM Ventilation Prediction Report ===\n")
    f.write(f"Rows={len(df)} | Features={len(feature_cols)} | Targets={TARGETS}\n")
    for _, r in metrics.iterrows():
        f.write(f"[{r['split']}] " + ", ".join([f"{t}={r[f'MAE_{t}']:.3f}" for t in TARGETS])
                + f", macro={r['MAE_macro']:.3f}\n")
    f.write(f"\nVentilation threshold: PM2.5 ≤ {THRESH} µg/m³\n")
    f.write("Artifacts: outputs/metrics.csv, outputs/predictions.csv, outputs/featimp_*.png\n")

sorted(os.listdir("outputs"))


['featimp_PM25_t_plus_1.png',
 'featimp_PM25_t_plus_2.png',
 'featimp_PM25_t_plus_3.png',
 'metrics.csv',
 'predictions.csv',
 'report.txt']

In [16]:
import joblib, os

os.makedirs("artifacts", exist_ok=True)
joblib.dump(models, "artifacts/models_lgbm.pkl")
joblib.dump(imputer, "artifacts/imputer_median.pkl")
joblib.dump(feature_cols, "artifacts/feature_cols.pkl")
joblib.dump(THRESH, "artifacts/threshold.pkl")

print("Saved:", os.listdir("artifacts"))


Saved: ['feature_cols.pkl', 'imputer_median.pkl', 'models_lgbm.pkl', 'threshold.pkl']
