In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report


# =========================
# 0) 데이터 로드
# =========================
accidents = pd.read_csv("./data/accidents_clean.csv")
places    = pd.read_csv("./data/places_clean.csv")
users     = pd.read_csv("./data/users_clean.csv")
vehicles  = pd.read_csv("./data/vehicles_clean.csv")


# =========================
# 1) row-level 결합 (집계/EDA 없음)
#    users 중심으로 vehicles를 (AccidentId, VehicleId)로 붙이면
#    카디시안 폭발을 크게 줄이면서도 "집계 없이" raw 정보를 유지 가능
# =========================
row_df = (
    users
    .merge(vehicles, on=["AccidentId", "VehicleId"], how="left", suffixes=("", "_veh"))
    .merge(places,   on="AccidentId", how="left")
    .merge(accidents[["AccidentId", "Gravity"]], on="AccidentId", how="left")
)

# 라벨 없는 행 제거 (test 사고가 섞였거나 join 누락 시 대비)
row_df = row_df.dropna(subset=["Gravity"]).copy()

y = row_df["Gravity"]
groups = row_df["AccidentId"]

X = row_df.drop(columns=["Gravity"], errors="ignore")


# =========================
# 2) Group split (AccidentId 기준, 누수 방지)
# =========================
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
tr_idx, va_idx = next(gss.split(X, y, groups=groups))

X_train = X.iloc[tr_idx].copy()
X_valid = X.iloc[va_idx].copy()
y_train = y.iloc[tr_idx].copy()
y_valid = y.iloc[va_idx].copy()

gid_valid = groups.iloc[va_idx].values  # pooling용

# 피처에서 그룹키 제거
X_train = X_train.drop(columns=["AccidentId"], errors="ignore")
X_valid = X_valid.drop(columns=["AccidentId"], errors="ignore")


# =========================
# 3) 전처리 + Logistic Regression
#    - 수치: median impute + scaling
#    - 범주: most_frequent impute + one-hot
# =========================
num_cols = X_train.select_dtypes(include=["int64", "float64", "Int64"]).columns
cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols),
    ],
    remainder="drop"
)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

clf.fit(X_train, y_train)


# =========================
# 4) 사고 단위 평가: row-level 확률 → AccidentId별 평균(pooling)
# =========================
proba = clf.predict_proba(X_valid)
classes = clf.named_steps["model"].classes_

proba_by_acc = (
    pd.DataFrame(proba, columns=classes)
    .assign(AccidentId=gid_valid)
    .groupby("AccidentId")
    .mean()
)

y_by_acc = (
    pd.DataFrame({"AccidentId": gid_valid, "Gravity": y_valid.values})
    .drop_duplicates("AccidentId")
    .set_index("AccidentId")
    .loc[proba_by_acc.index, "Gravity"]
)

pred_by_acc = proba_by_acc.idxmax(axis=1)

macro_f1 = f1_score(y_by_acc, pred_by_acc, average="macro")

print("Accident-level Macro F1 (Logistic, no-EDA/no-aggregation):", macro_f1)
print("\n[Accident-level] classification report")
print(classification_report(y_by_acc, pred_by_acc))


Accident-level Macro F1 (Logistic, no-EDA/no-aggregation): 0.510307427519219

[Accident-level] classification report
              precision    recall  f1-score   support

      Lethal       0.12      0.70      0.20       516
   NonLethal       0.98      0.70      0.82      9049

    accuracy                           0.70      9565
   macro avg       0.55      0.70      0.51      9565
weighted avg       0.93      0.70      0.79      9565



# LightGBM

In [10]:
# =========================
# 0. 라이브러리
# =========================
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import f1_score, classification_report
from lightgbm import LGBMClassifier


# =========================
# 1. 데이터 불러오기
# =========================
accidents = pd.read_csv("./data/accidents_clean.csv")
places    = pd.read_csv("./data/places_clean.csv")
users     = pd.read_csv("./data/users_clean.csv")
vehicles  = pd.read_csv("./data/vehicles_clean.csv")

# (핵심) 라벨 있는 사고만 남기기: Gravity NaN 제거
acc_train = accidents.dropna(subset=["Gravity"]).copy()


# =========================
# 2. row-level 데이터 결합 (집계/EDA 없음)
# =========================
row_df = (
    users
    .merge(vehicles, on=["AccidentId", "VehicleId"], how="left", suffixes=("", "_veh"))
    .merge(places, on="AccidentId", how="left")
    .merge(acc_train[["AccidentId", "Gravity"]], on="AccidentId", how="inner")  # 라벨 있는 사고만
)

# 타깃 / 그룹 / 입력 변수
y = row_df["Gravity"].copy()
groups = row_df["AccidentId"].copy()
X = row_df.drop(columns=["Gravity", "AccidentId"], errors="ignore").copy()


# =========================
# 3. AccidentId 기준 Group split (누수 방지)
# =========================
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, valid_idx = next(gss.split(X, y, groups=groups))

X_train = X.iloc[train_idx].copy()
X_valid = X.iloc[valid_idx].copy()
y_train = y.iloc[train_idx].copy()
y_valid = y.iloc[valid_idx].copy()
gid_valid = groups.iloc[valid_idx].values

# (안전장치) 혹시라도 남아있으면 제거
mask_tr = y_train.notna()
X_train = X_train.loc[mask_tr].copy()
y_train = y_train.loc[mask_tr].copy()

mask_va = y_valid.notna()
X_valid = X_valid.loc[mask_va].copy()
y_valid = y_valid.loc[mask_va].copy()
gid_valid = gid_valid[mask_va.values]


# =========================
# 4. dtype 정리 + NaN 처리 (LightGBM용 최소 전처리)
# =========================
cat_cols = X_train.select_dtypes(include=["object", "bool", "category"]).columns.tolist()

# category로 변환
for c in cat_cols:
    X_train[c] = X_train[c].astype("category")
    X_valid[c] = X_valid[c].astype("category")

# 범주형 NaN → "Unknown"
for c in cat_cols:
    if "Unknown" not in X_train[c].cat.categories:
        X_train[c] = X_train[c].cat.add_categories(["Unknown"])
    if "Unknown" not in X_valid[c].cat.categories:
        X_valid[c] = X_valid[c].cat.add_categories(["Unknown"])
    X_train[c] = X_train[c].fillna("Unknown")
    X_valid[c] = X_valid[c].fillna("Unknown")

# 수치형 NaN → -1
num_cols = [c for c in X_train.columns if c not in cat_cols]
X_train[num_cols] = X_train[num_cols].fillna(-1)
X_valid[num_cols] = X_valid[num_cols].fillna(-1)


# =========================
# 5. LightGBM 모델 학습
# =========================
lgb_model = LGBMClassifier(
    objective="multiclass",
    num_class=y_train.nunique(),
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    class_weight="balanced"
)

lgb_model.fit(X_train, y_train, categorical_feature=cat_cols)


# =========================
# 6. 사고 단위 평가 (row → AccidentId pooling)
# =========================
proba = lgb_model.predict_proba(X_valid)

proba_by_acc = (
    pd.DataFrame(proba)
    .assign(AccidentId=gid_valid)
    .groupby("AccidentId")
    .mean()
)

y_by_acc = (
    pd.DataFrame({"AccidentId": gid_valid, "Gravity": y_valid.values})
    .drop_duplicates("AccidentId")
    .set_index("AccidentId")
    .loc[proba_by_acc.index, "Gravity"]
)

# 예측 클래스 라벨(문자)로 변환
pred_by_acc = lgb_model.classes_[proba_by_acc.values.argmax(axis=1)]

macro_f1 = f1_score(y_by_acc, pred_by_acc, average="macro")

print("LightGBM (row-level, no aggregation) Accident-level Macro F1:", macro_f1)
print("\nClassification Report")
print(classification_report(y_by_acc, pred_by_acc))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1690
[LightGBM] [Info] Number of data points in the train set: 87004, number of used features: 64
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147
LightGBM (row-level, no aggregation) Accident-level Macro F1: 0.588947698942953

Classification Report
              precision    recall  f1-score   support

      Lethal       0.19      0.34      0.24       516
   NonLethal       0.96      0.92      0.94      9049

    accuracy                           0.88      9565
   macro avg       0.57      0.63      0.59      9565
weighted avg       0.92      0.88      0.90      9565

