In [1]:
import pandas as pd

In [2]:
df_test = pd.read_csv('./data/df_test.csv')
df_train = pd.read_csv('./data/df_train.csv')

In [13]:
df_test.columns.tolist()

['AccidentId',
 'Date',
 'Hour',
 'Light',
 'Department',
 'Commune',
 'InAgglomeration',
 'IntersectionType',
 'Weather',
 'CollisionType',
 'Latitude',
 'Longitude',
 'RoadType',
 'RoadNumber',
 'RoadSecNumber',
 'Circulation',
 'LaneNumber',
 'SpecialLane',
 'Slope',
 'RoadMarkerId',
 'RoadMarkerDistance',
 'Layout',
 'StripWidth',
 'LaneWidth',
 'SurfaceCondition',
 'Infrastructure',
 'Localization',
 'SchoolNear',
 'Vehicle_count_user',
 'Persons',
 'Drivers',
 'Passengers',
 'SeatNan',
 'Safety_used_yes_count',
 'Safety_used_no_count',
 'Safety_used_unknown_count',
 'Safety_used_missing_count',
 'Male_count',
 'Female_count',
 'Gender_unknown',
 'Driver_age_mean',
 'Driver_age_min',
 'Driver_age_max',
 'Driver_age_unknown',
 'Passenger_child',
 'Passenger_youth',
 'Passenger_adult',
 'Passenger_senior',
 'Passenger_elder',
 'Passenger_age_unknown',
 'Category_Bicycle',
 'Category_Bus',
 'Category_Car<=3.5T',
 'Category_Coach',
 'Category_LargeCar+Trailer>3.5T',
 'Category_LargeCa

In [4]:
# 타깃 변수
y = df_train["Gravity"]

# ID / 타깃 제거 (EDA 파생변수 없이 그냥 남아있는 컬럼 전부 사용)
X = df_train.drop(columns=["Gravity", "AccidentId"], errors="ignore")

X_test = df_test.drop(columns=["AccidentId"], errors="ignore")

In [5]:
num_cols = X.select_dtypes(include=["int64", "float64", "Int64"]).columns
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
    ]
)

Logistic Regression 모델 (Baseline)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [9]:
from sklearn.linear_model import LogisticRegression

logistic_model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            n_jobs=-1
        ))
    ]
)

In [10]:
# 모델학습
logistic_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [11]:
# Validation 성능 평가
from sklearn.metrics import classification_report, f1_score

y_pred = logistic_model.predict(X_valid)

print(classification_report(y_valid, y_pred))
print("Macro F1:", f1_score(y_valid, y_pred, average="macro"))

              precision    recall  f1-score   support

      Lethal       0.14      0.70      0.23       532
   NonLethal       0.98      0.75      0.85      9033

    accuracy                           0.75      9565
   macro avg       0.56      0.72      0.54      9565
weighted avg       0.93      0.75      0.81      9565

Macro F1: 0.5409823396017835


# LightGBM 모델

In [17]:
# 타깃
y = df_train["Gravity"]

# 입력 변수 (EDA 파생변수 추가 X, 집계 데이터 그대로)
X = df_train.drop(columns=["Gravity", "AccidentId"], errors="ignore")
X_test = df_test.drop(columns=["AccidentId"], errors="ignore")

In [18]:
num_cols = X.select_dtypes(include=["int64", "float64", "Int64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [20]:
from lightgbm import LGBMClassifier

lgb_model = LGBMClassifier(
    objective="multiclass",
    num_class=y.nunique(),
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    class_weight="balanced"
)

In [22]:
# 1) 범주형 컬럼을 category로 변환 (LightGBM native categorical 처리용)
for c in cat_cols:
    X_train[c] = X_train[c].astype("category")
    X_valid[c] = X_valid[c].astype("category")

# test도 예측까지 할 거면 같이 맞추기
for c in cat_cols:
    X_test[c] = X_test[c].astype("category")

In [23]:
lgb_model.fit(
    X_train,
    y_train,
    categorical_feature=cat_cols
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006630 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3180
[LightGBM] [Info] Number of data points in the train set: 38257, number of used features: 105
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -0.693147


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [24]:
from sklearn.metrics import classification_report, f1_score

y_pred = lgb_model.predict(X_valid)

print(classification_report(y_valid, y_pred))
print("Macro F1:", f1_score(y_valid, y_pred, average="macro"))

              precision    recall  f1-score   support

      Lethal       0.23      0.31      0.26       532
   NonLethal       0.96      0.94      0.95      9033

    accuracy                           0.90      9565
   macro avg       0.59      0.62      0.60      9565
weighted avg       0.92      0.90      0.91      9565

Macro F1: 0.6037616286403978
