In [1]:
# 라이브러리
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score

In [2]:
# 데이터 불러오기 ( train만 사용 )
final_train = pd.read_csv("./data/final_Feature/train_final.csv")

In [3]:
# 타킷의 비율 확인
final_train["Gravity"].value_counts(normalize=True)

Gravity
NonLethal    0.944412
Lethal       0.055588
Name: proportion, dtype: float64

# 1. 최소 변수로 모델 성능 확인하기
- 이번 실험에서는 최소 변수를 사용하여 모델의 성능을 확인 후 새로운 변수를 추가하여 변화되는 모델의 수치를 확인한다.
- 변화가 많은 변수들을 모델이 더 잘 인식할 수 있도록 모델링을 계획하여 실행한다.

## 기초 모델 변수 목록
- Weather 날씨
- SurfaceCondition 노면상태
- Vehicle_count_user 사고에 포함된 차량 수
- Safety_used_yes_count 안전밸트 착용인원
- Safety_used_no_count 안전밸트 미착용 인원
- Persons 사고에 참여한 인원 수

### 기초 변수 만들기

In [4]:
# 변수 컬럼 저장하기
df = final_train[
    ["Weather", "SurfaceCondition", "Vehicle_count_user", "Safety_used_yes_count", "Safety_used_no_count", "Persons"]
    ].copy()

In [5]:
# 데이터 타입 확인
df.dtypes

Weather                   object
SurfaceCondition          object
Vehicle_count_user       float64
Safety_used_yes_count    float64
Safety_used_no_count     float64
Persons                  float64
dtype: object

In [6]:
# dtype 변경할 컬럼 선택
col = ["Weather", "SurfaceCondition"]

In [7]:
# category 타입으로 변환
df[col] = df[col].astype("category")

### LightGBM 모델 실험 exp_01

In [8]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.653436
[100]	valid_0's binary_logloss: 0.646705
[150]	valid_0's binary_logloss: 0.644743
[200]	valid_0's binary_logloss: 0.643395
[250]	valid_0's binary_logloss: 0.641602
[300]	valid_0's binary_logloss: 0.639546
[350]	valid_0's binary_logloss: 0.638905
[400]	valid_0's binary_logloss: 0.638096
[450]	valid_0's binary_logloss: 0.638692
[500]	valid_0's bi

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [9]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6007519421338998
PR-AUC (Average Precision): 0.0958424554864138
F1: 0.13818181818181818

confusion matrix : 
[[6236 2796]
 [ 285  247]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.96      0.69      0.80      9032
      Lethal       0.08      0.46      0.14       532

    accuracy                           0.68      9564
   macro avg       0.52      0.58      0.47      9564
weighted avg       0.91      0.68      0.76      9564



# 2. 변수 추가하여 모델 실험하기
- 변수를 추가하거나 제거하는 실험을 통해 어떤 변수가 관계가 높은지 분석하는 단계이다.
- 종합적으로 관련이 높은 변수를 추가해도 모델이 잘 읽지 못한다면 성능 개선에 의미가 없다고 판단했다.
- 이번 실험에서는 어떤 변수가 영향이 있고 또는 성능을 저하시키는지 살펴보며 실험을 진행한다.

## Date, Hour 변수 추가하기
- 현재 모델은 시간(Date, Hour) 데이터를 단순한 숫자 값으로만 인식하기 때문에 날짜가 가지는 의미나 시간의 개념을 충분히 이해하지 못하는 상태이다.
- 이에 따라 Date와 Hour 변수를 모델이 개념적으로 해석할 수 있도록 피처 엔지니어링을 수행하고 그 결과를 다시 한 번 분석한다.

In [10]:
# 변수 컬럼 저장하기
df = final_train[
    ["Date", "Hour", "Weather", "SurfaceCondition", "Vehicle_count_user", "Safety_used_yes_count", "Safety_used_no_count", "Persons"]
    ].copy()

In [11]:
# 데이터 타입 확인
df.dtypes

Date                      object
Hour                      object
Weather                   object
SurfaceCondition          object
Vehicle_count_user       float64
Safety_used_yes_count    float64
Safety_used_no_count     float64
Persons                  float64
dtype: object

In [12]:
# dtype 변경할 컬럼 선택
col = ["Date", "Hour", "Weather", "SurfaceCondition"]

In [13]:
# category 타입으로 변환
df[col] = df[col].astype("category")

### LightGBM 모델 실험 exp_02

In [14]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.525678
[100]	valid_0's binary_logloss: 0.455196
[150]	valid_0's binary_logloss: 0.405174
[200]	valid_0's binary_logloss: 0.36502
[250]	valid_0's binary_logloss: 0.343165
[300]	valid_0's binary_logloss: 0.321163
[350]	valid_0's binary_logloss: 0.306877
[400]	valid_0's binary_logloss: 0.296242
[450]	valid_0's binary_logloss: 0.288597
[500]	valid_0's binary_logloss: 0.283443
[550]	valid_0's binary_logloss: 0.2805
[

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [15]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5545668450355294
PR-AUC (Average Precision): 0.06995412318157368
F1: 0.06627218934911243

confusion matrix : 
[[8747  285]
 [ 504   28]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.97      0.96      9032
      Lethal       0.09      0.05      0.07       532

    accuracy                           0.92      9564
   macro avg       0.52      0.51      0.51      9564
weighted avg       0.90      0.92      0.91      9564



#### Date, Hour 모델 실험 결과
- 이번에는 Lethal로 예측하는 경우가 상승하여 FP가 증가하고 FN이 감소하는 결과가 나타났다.
- 다만 TN이 감소하였으며 이는 Lethal로 구분하는 경우가 증가했다고 볼 수 있다.
- TP도 12건이 상승했지만 FP가 급 상승하면서 종합적인 성능이 많이 상승했다고 할 수 없다.

## Date 피처 엔지니어링 month, weekday, is_weekend 변수 생성 및 추가
- 월 단위의 변수 생성
- 요일 단위의 변수 생성
- 주말 여부 변수 생성

위 세개의 변수만 생성한 뒤 모델에 실험 결과를 확인한다.

In [16]:
# datetime으로 변환하기
final_train["Date_dt"] = pd.to_datetime(final_train["Date"], errors="coerce")

In [17]:
# 월 단위 컬럼 생성
final_train["month"] = final_train["Date_dt"].dt.month

# 요일 컬럼 생성
final_train["weekday"] = final_train["Date_dt"].dt.day_name().str[:3]

# 주말 여부 컬럼 생성
final_train["is_weekend"] = (final_train["weekday"].isin(["Sat", "Sun"]).astype(int))

In [18]:
# 변수 컬럼 저장하기
df = final_train[
    ["Hour", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_weekend"]
    ].copy()

In [19]:
# 데이터 타입 확인
df.dtypes

Hour                      object
Weather                   object
SurfaceCondition          object
Vehicle_count_user       float64
Safety_used_yes_count    float64
Safety_used_no_count     float64
Persons                  float64
month                      int32
weekday                   object
is_weekend                 int64
dtype: object

In [20]:
# dtype 변경할 컬럼 선택
col = ["Hour", "Weather", "SurfaceCondition", "weekday"]

In [21]:
# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_03 모델 실험

In [22]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001363 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.58975
[100]	valid_0's binary_logloss: 0.559848
[150]	valid_0's binary_logloss: 0.541729
[200]	valid_0's binary_logloss: 0.528358
[250]	valid_0's binary_logloss: 0.515396
[300]	valid_0's binary_logloss: 0.5019
[350]	valid_0's binary_logloss: 0.493561
[400]	valid_0's binary_logloss: 0.484719
[450]	valid_0's binary_logloss: 0.477599
[500]	valid_0's bi

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [23]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5535904711402065
PR-AUC (Average Precision): 0.07176453706026174
F1: 0.09531374106433678

confusion matrix : 
[[8365  667]
 [ 472   60]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.93      0.94      9032
      Lethal       0.08      0.11      0.10       532

    accuracy                           0.88      9564
   macro avg       0.51      0.52      0.52      9564
weighted avg       0.90      0.88      0.89      9564



### Date 변수 생성 모델 실험 결과
- Date 기반 피처 엔지니어링 이후 TN은 증가하고 FP는 감소하여 NonLethal 사고에 대한 분류 안정성은 다소 개선된 것으로 확인되었다.
- 반면 FN은 초기 최소 모델과 유사한 수준으로 다시 증가하였으며 TP는 초기 모델 대비 소폭(3건) 증가하는 데 그쳤다.
- Date 정보가 Lethal 사고를 직접적으로 구분하는 강한 신호라기보다는 전반적인 분류 경계를 안정화하는 보조적 역할에 가까웠음을 시사한다.
- FN 감소 관점에서 Date 변수가 유의미한 영향을 주었다고 판단하기는 어렵다.


## is_weekend 변수 제거하고 실험

In [24]:
# 변수 컬럼 저장하기
df = final_train[
    ["Hour", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [25]:
# dtype 변경할 컬럼 선택
col = ["Hour", "Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_04 모델 실험

In [26]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1051
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.59187
[100]	valid_0's binary_logloss: 0.561936
[150]	valid_0's binary_logloss: 0.545432
[200]	valid_0's binary_logloss: 0.529906
[250]	valid_0's binary_logloss: 0.517356
[300]	valid_0's binary_logloss: 0.505756
[350]	valid_0's binary_logloss: 0.497507
[400]	valid_0's binary_logloss: 0.488589
[450]	valid_0's binary_logloss: 0.481425
[500]	valid_0's b

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [27]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5542647653789035
PR-AUC (Average Precision): 0.07201817560553016
F1: 0.09509433962264151

confusion matrix : 
[[8302  730]
 [ 469   63]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.92      0.93      9032
      Lethal       0.08      0.12      0.10       532

    accuracy                           0.87      9564
   macro avg       0.51      0.52      0.51      9564
weighted avg       0.90      0.87      0.89      9564



### is_weekend 변수 제거 결과
- 생성한 is_weekend 변수를 포함한 모델과 is_weekend를 제거 한 모델과 비교시 AUC와 F1-score가 모두 개선되는 현상이 관찰되었다.
- 요일 변수와 중복 신호이며 Lethal 사고를 직접적으로 구분에 제한적인 설명력을 가졌다고 볼 수 있다.
- 위와 같은 이유로 모델의 판단을 불안정하게 만드는 요일으로 작용했을 가능성이 있다.
- Date관련 최종 모델에서는 **is_weekend 변수를 제외**하고 "month", "weekday" 변수만 사용한다.

## hour_int 변수 생성 및 추가 (Hour 변수 제거)

In [28]:
# timedelta로 변환
final_train["Hour"] = pd.to_timedelta(final_train["Hour"], errors="coerce")

In [29]:
# 시(hour)만 추출 (0~23)
final_train["hour_int"] = (final_train["Hour"].dt.total_seconds() // 3600).astype("Int64")

In [30]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_int", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [31]:
# dtype 변경할 컬럼 선택
col = ["Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [32]:
# dtype 조회
df.dtypes

hour_int                    Int64
Weather                  category
SurfaceCondition         category
Vehicle_count_user        float64
Safety_used_yes_count     float64
Safety_used_no_count      float64
Persons                   float64
month                       int32
weekday                  category
dtype: object

### exp_05 모델 실험

In [33]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 122
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.63847
[100]	valid_0's binary_logloss: 0.62146
[150]	valid_0's binary_logloss: 0.61063
[200]	valid_0's binary_logloss: 0.601907
[250]	valid_0's binary_logloss: 0.593539
[300]	valid_0's binary_logloss: 0.586074
[350]	valid_0's binary_logloss: 0.578564
[400]	valid_0's binary_logloss: 0.572408
[450]	valid_0's binary_logloss: 0.567345
[500]	valid_0's bina

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [34]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5446426490273514
PR-AUC (Average Precision): 0.07487481491913142
F1: 0.11709844559585492

confusion matrix : 
[[7747 1285]
 [ 419  113]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.86      0.90      9032
      Lethal       0.08      0.21      0.12       532

    accuracy                           0.82      9564
   macro avg       0.51      0.54      0.51      9564
weighted avg       0.90      0.82      0.86      9564



## 시간대 그룹 변수 hour_group 생성 및 추가

In [35]:
# 시간 그룹 생성 함수
def hour_group(h):
    if pd.isna(h):
        return pd.NA
    h = int(h)
    if 7 <= h <= 9:
        return "MorningRush"
    elif 10 <= h <= 16:
        return "Daytime"
    elif 17 <= h <= 19:
        return "EveningRush"
    elif 20 <= h <= 23:
        return "Night"
    else:
        return "LateNight"

final_train["hour_group"] = final_train["hour_int"].apply(hour_group)

In [36]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_int","hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [37]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [38]:
# dtype 조회
df.dtypes

hour_int                    Int64
hour_group               category
Weather                  category
SurfaceCondition         category
Vehicle_count_user        float64
Safety_used_yes_count     float64
Safety_used_no_count      float64
Persons                   float64
month                       int32
weekday                  category
dtype: object

### exp_06 모델 실험

In [39]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001324 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 128
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.638106
[100]	valid_0's binary_logloss: 0.620648
[150]	valid_0's binary_logloss: 0.609455
[200]	valid_0's binary_logloss: 0.601172
[250]	valid_0's binary_logloss: 0.592803
[300]	valid_0's binary_logloss: 0.584362
[350]	valid_0's binary_logloss: 0.577027
[400]	valid_0's binary_logloss: 0.570717
[450]	valid_0's binary_logloss: 0.565631
[500]	valid_0's 

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [40]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5439392186178467
PR-AUC (Average Precision): 0.07520831896103695
F1: 0.11833069202324353

confusion matrix : 
[[7783 1249]
 [ 420  112]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.86      0.90      9032
      Lethal       0.08      0.21      0.12       532

    accuracy                           0.83      9564
   macro avg       0.52      0.54      0.51      9564
weighted avg       0.90      0.83      0.86      9564



## 시간대 그룹 변수 hour_group만 적용하여 실험, 시간 추출 변수 hour_int 제외

In [41]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [42]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_07 모델 실험

In [43]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 104
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.640467
[100]	valid_0's binary_logloss: 0.624557
[150]	valid_0's binary_logloss: 0.615001
[200]	valid_0's binary_logloss: 0.607203
[250]	valid_0's binary_logloss: 0.599655
[300]	valid_0's binary_logloss: 0.592488
[350]	valid_0's binary_logloss: 0.58659
[400]	valid_0's binary_logloss: 0.581405
[450]	valid_0's binary_logloss: 0.57746
[500]	valid_0's bin

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [44]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5425118584215188
PR-AUC (Average Precision): 0.077765303982431
F1: 0.1035490605427975

confusion matrix : 
[[7293 1739]
 [ 408  124]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.81      0.87      9032
      Lethal       0.07      0.23      0.10       532

    accuracy                           0.78      9564
   macro avg       0.51      0.52      0.49      9564
weighted avg       0.90      0.78      0.83      9564



## Light 변수 추가하기

In [45]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "Light"]
    ].copy()

In [46]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "Light"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_08 모델 실험

In [47]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.623964
[100]	valid_0's binary_logloss: 0.60591
[150]	valid_0's binary_logloss: 0.596336
[200]	valid_0's binary_logloss: 0.588072
[250]	valid_0's binary_logloss: 0.579965
[300]	valid_0's binary_logloss: 0.572777
[350]	valid_0's binary_logloss: 0.566585
[400]	valid_0's binary_logloss: 0.561056
[450]	valid_0's binary_logloss: 0.556532
[500]	valid_0's b

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [48]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.568918386255719
PR-AUC (Average Precision): 0.08206025550079846
F1: 0.11941659070191431

confusion matrix : 
[[7501 1531]
 [ 401  131]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.83      0.89      9032
      Lethal       0.08      0.25      0.12       532

    accuracy                           0.80      9564
   macro avg       0.51      0.54      0.50      9564
weighted avg       0.90      0.80      0.84      9564



## 'Light' 피처 엔지니어링 is_NightNoStreetLight 변수 생성 및 추가

In [49]:
Light_gravity = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    .groupby("Light")
    .agg(
        total_count=("is_lethal", "size"),
        lethal_count=("is_lethal", "sum"),
        lethal_ratio=("is_lethal", "mean")
    )
    .sort_values("lethal_ratio", ascending=False)
)

Light_gravity

Unnamed: 0_level_0,total_count,lethal_count,lethal_ratio
Light,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NightNoStreetLight,4603,636,0.138171
TwilightOrDawn,3184,201,0.063128
NightStreelightsOff,479,30,0.06263
Daylight,32029,1542,0.048144
NightStreelightsOn,7521,249,0.033107


In [50]:
# NightNoStreetLight 포함 여부 변수 생성

final_train["is_NightNoStreetLight"] = (final_train["Light"] == "NightNoStreetLight").astype(int)

In [51]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "Light", "is_NightNoStreetLight"]
    ].copy()

In [52]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "Light"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_09 모델 실험

In [53]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001619 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.623605
[100]	valid_0's binary_logloss: 0.606467
[150]	valid_0's binary_logloss: 0.595662
[200]	valid_0's binary_logloss: 0.587184
[250]	valid_0's binary_logloss: 0.5787
[300]	valid_0's binary_logloss: 0.571116
[350]	valid_0's binary_logloss: 0.564584
[400]	valid_0's binary_logloss: 0.559076
[450]	valid_0's binary_logloss: 0.554809
[500]	valid_0's bi

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [54]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5718773725167658
PR-AUC (Average Precision): 0.0843616625105853
F1: 0.12126779972439136

confusion matrix : 
[[7519 1513]
 [ 400  132]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.83      0.89      9032
      Lethal       0.08      0.25      0.12       532

    accuracy                           0.80      9564
   macro avg       0.51      0.54      0.50      9564
weighted avg       0.90      0.80      0.84      9564



## is_light_medium (중간 위험) 변수 생성 및 추가

In [55]:
# is_light_medium 변수 생성 (중간 위험)
final_train["is_light_medium"] = (final_train["Light"].isin(["TwilightOrDawn", 
                                                             "NightStreelightsOff"])).astype(int)

In [56]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "Light", "is_NightNoStreetLight", "is_light_medium"]
    ].copy()

In [57]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "Light"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_10 모델 실험

In [58]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.623301
[100]	valid_0's binary_logloss: 0.605895
[150]	valid_0's binary_logloss: 0.595247
[200]	valid_0's binary_logloss: 0.586373
[250]	valid_0's binary_logloss: 0.578584
[300]	valid_0's binary_logloss: 0.571141
[350]	valid_0's binary_logloss: 0.564112
[400]	valid_0's binary_logloss: 0.558651
[450]	valid_0's binary_logloss: 0.553691
[500]	valid_0's 

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [59]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5714238888296916
PR-AUC (Average Precision): 0.08532029861727777
F1: 0.12522851919561243

confusion matrix : 
[[7513 1519]
 [ 395  137]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.83      0.89      9032
      Lethal       0.08      0.26      0.13       532

    accuracy                           0.80      9564
   macro avg       0.52      0.54      0.51      9564
weighted avg       0.90      0.80      0.84      9564



## is_Daylight (Daylight 포함 여부) 변수 생성 및 추가

In [60]:
# Daylight 여부 변수 생성
final_train["is_Daylight"] = (final_train["Light"] == "Daylight").astype(int)

In [61]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons", "Light",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight"]
    ].copy()

In [62]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "Light"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_11 모델 실험

In [63]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 116
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.623737
[100]	valid_0's binary_logloss: 0.606375
[150]	valid_0's binary_logloss: 0.595522
[200]	valid_0's binary_logloss: 0.587097
[250]	valid_0's binary_logloss: 0.579082
[300]	valid_0's binary_logloss: 0.571566
[350]	valid_0's binary_logloss: 0.564625
[400]	valid_0's binary_logloss: 0.55967
[450]	valid_0's binary_logloss: 0.555471
[500]	valid_0's b

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [64]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.5703541751300305
PR-AUC (Average Precision): 0.08637428067458988
F1: 0.12272727272727273

confusion matrix : 
[[7499 1533]
 [ 397  135]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.83      0.89      9032
      Lethal       0.08      0.25      0.12       532

    accuracy                           0.80      9564
   macro avg       0.52      0.54      0.50      9564
weighted avg       0.90      0.80      0.84      9564



## Light 변수 제거 후 재실험

In [65]:
# Light 변수 제거 후 재실험
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight"]
    ].copy()

In [66]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_12 모델 실험

In [67]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.623653
[100]	valid_0's binary_logloss: 0.605719
[150]	valid_0's binary_logloss: 0.595629
[200]	valid_0's binary_logloss: 0.587484
[250]	valid_0's binary_logloss: 0.579368
[300]	valid_0's binary_logloss: 0.572093
[350]	valid_0's binary_logloss: 0.565444
[400]	valid_0's binary_logloss: 0.559712
[450]	valid_0's binary_logloss: 0.554859
[500]	valid_0's 

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [68]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.573183609488735
PR-AUC (Average Precision): 0.08516145277642308
F1: 0.11965025310630464

confusion matrix : 
[[7521 1511]
 [ 402  130]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.83      0.89      9032
      Lethal       0.08      0.24      0.12       532

    accuracy                           0.80      9564
   macro avg       0.51      0.54      0.50      9564
weighted avg       0.90      0.80      0.84      9564



## CollisionType 변수 분석

In [69]:
# CollisionType 심각도 비율 확인
CollisionType_gravity = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    .groupby("CollisionType")
    .agg(
        total_count=("is_lethal", "size"),
        lethal_count=("is_lethal", "sum"),
        lethal_ratio=("is_lethal", "mean")
    )
    .sort_values("lethal_ratio", ascending=False)
)

CollisionType_gravity

Unnamed: 0_level_0,total_count,lethal_count,lethal_ratio
CollisionType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2Vehicles-BehindVehicles-Frontal,4274,502,0.117454
3+Vehicles-Multiple,1569,141,0.089866
Other,16159,1163,0.071972
NoCollision,4333,288,0.066467
2Vehicles-Side,13462,370,0.027485
2Vehicles-Behind,6139,151,0.024597
3+Vehicles-Chain,1880,43,0.022872


## CollisionType 변수 추가

In [70]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "CollisionType"]
    ].copy()

In [71]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "CollisionType"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_13 모델 실험

In [72]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 118
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.60335
[100]	valid_0's binary_logloss: 0.581839
[150]	valid_0's binary_logloss: 0.570077
[200]	valid_0's binary_logloss: 0.560613
[250]	valid_0's binary_logloss: 0.552191
[300]	valid_0's binary_logloss: 0.543821
[350]	valid_0's binary_logloss: 0.537162
[400]	valid_0's binary_logloss: 0.531564
[450]	valid_0's binary_logloss: 0.526421
[500]	valid_0's b

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [73]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6264111271868777
PR-AUC (Average Precision): 0.09827817937714853
F1: 0.152834008097166

confusion matrix : 
[[7739 1293]
 [ 381  151]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.86      0.90      9032
      Lethal       0.10      0.28      0.15       532

    accuracy                           0.82      9564
   macro avg       0.53      0.57      0.53      9564
weighted avg       0.91      0.82      0.86      9564



## 2Vehicles-BehindVehicles-Frontal 포함 여부 컬럼 생성

In [74]:
# 2Vehicles-BehindVehicles-Frontal 여부 변수 생성
final_train["is_CollisionType_Frontal"] = (
    final_train["CollisionType"] == "2Vehicles-BehindVehicles-Frontal").astype(int)

In [75]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "CollisionType", "is_CollisionType_Frontal"]
    ].copy()

In [76]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "CollisionType"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_14 모델 실험

In [77]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.602303
[100]	valid_0's binary_logloss: 0.58239
[150]	valid_0's binary_logloss: 0.570214
[200]	valid_0's binary_logloss: 0.560438
[250]	valid_0's binary_logloss: 0.55166
[300]	valid_0's binary_logloss: 0.543563
[350]	valid_0's binary_logloss: 0.536475
[400]	valid_0's binary_logloss: 0.530659
[450]	valid_0's binary_logloss: 0.525506
[500]	valid_0's bi

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [78]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.62639822402552
PR-AUC (Average Precision): 0.09749928193598825
F1: 0.15010141987829614

confusion matrix : 
[[7740 1292]
 [ 384  148]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.86      0.90      9032
      Lethal       0.10      0.28      0.15       532

    accuracy                           0.82      9564
   macro avg       0.53      0.57      0.53      9564
weighted avg       0.91      0.82      0.86      9564



## 2Vehicles-Side, 2Vehicles-Behind, 3+Vehicles-Chain 제외 포함 여부 변수 생성

In [79]:
# 3+Vehicles-Multiple 여부 변수 생성
final_train["is_CollisionType_Multiple"] = (final_train["CollisionType"] == "3+Vehicles-Multiple").astype(int)

In [80]:
# Other 여부 변수 생성
final_train["is_CollisionType_other"] = (final_train["CollisionType"] == "Other").astype(int)

In [81]:
# NoCollision 여부 변수 생성
final_train["is_CollisionType_NoCollision"] = (final_train["CollisionType"] == "NoCollision").astype(int)

In [82]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "CollisionType", 
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision"]
    ].copy()

In [83]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "CollisionType"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_15 모델 실험

In [84]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.602575
[100]	valid_0's binary_logloss: 0.581226
[150]	valid_0's binary_logloss: 0.568467
[200]	valid_0's binary_logloss: 0.559244
[250]	valid_0's binary_logloss: 0.550468
[300]	valid_0's binary_logloss: 0.542171
[350]	valid_0's binary_logloss: 0.535422
[400]	valid_0's binary_logloss: 0.529556
[450]	valid_0's binary_logloss: 0.524458
[500]	valid_0's 

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [85]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6265384938764094
PR-AUC (Average Precision): 0.09872081619944706
F1: 0.15392456676860347

confusion matrix : 
[[7753 1279]
 [ 381  151]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.86      0.90      9032
      Lethal       0.11      0.28      0.15       532

    accuracy                           0.83      9564
   macro avg       0.53      0.57      0.53      9564
weighted avg       0.91      0.83      0.86      9564



## CollisionType row 신호 여부 컬럼 생성

In [86]:
# is_CollisionType_low 변수 생성

final_train["is_CollisionType_low"] = (final_train["CollisionType"].isin(["2Vehicles-Side",
                                                                          "2Vehicles-Behind",
                                                                          "3+Vehicles-Chain"])).astype(int)

In [87]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "CollisionType", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision"]
    ].copy()

In [88]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "CollisionType"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_16 모델 실험

In [89]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 128
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.603303
[100]	valid_0's binary_logloss: 0.582514
[150]	valid_0's binary_logloss: 0.569953
[200]	valid_0's binary_logloss: 0.560925
[250]	valid_0's binary_logloss: 0.552492
[300]	valid_0's binary_logloss: 0.544537
[350]	valid_0's binary_logloss: 0.537557
[400]	valid_0's binary_logloss: 0.53168
[450]	valid_0's binary_logloss: 0.526161
[500]	valid_0's b

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [90]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6303316070845848
PR-AUC (Average Precision): 0.10044363219161363
F1: 0.15815815815815815

confusion matrix : 
[[7724 1308]
 [ 374  158]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.86      0.90      9032
      Lethal       0.11      0.30      0.16       532

    accuracy                           0.82      9564
   macro avg       0.53      0.58      0.53      9564
weighted avg       0.91      0.82      0.86      9564



## CollisionType 변수 제거 실험

In [91]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision"]
    ].copy()

In [92]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_17 모델 실험

In [93]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001356 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.603271
[100]	valid_0's binary_logloss: 0.582101
[150]	valid_0's binary_logloss: 0.569649
[200]	valid_0's binary_logloss: 0.560389
[250]	valid_0's binary_logloss: 0.551489
[300]	valid_0's binary_logloss: 0.544063
[350]	valid_0's binary_logloss: 0.537312
[400]	valid_0's binary_logloss: 0.531319
[450]	valid_0's binary_logloss: 0.526895
[500]	valid_0's 

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [94]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6200502848685043
PR-AUC (Average Precision): 0.09823670703713458
F1: 0.15353329944077276

confusion matrix : 
[[7748 1284]
 [ 381  151]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.86      0.90      9032
      Lethal       0.11      0.28      0.15       532

    accuracy                           0.83      9564
   macro avg       0.53      0.57      0.53      9564
weighted avg       0.91      0.83      0.86      9564



## ImpactPoint_* 변수 추가 실험
- ImpactPoint 변수는 count된 집계 변수이다.
- 총 ImpactPoint 집계 변수는 10개이다.

In [95]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     "ImpactPoint_Back", "ImpactPoint_Front", "ImpactPoint_LeftBack", "ImpactPoint_LeftFront",
     "ImpactPoint_LeftSide", "ImpactPoint_Missing", "ImpactPoint_Multiple", "ImpactPoint_RightBack",
     "ImpactPoint_RightFront", "ImpactPoint_RightSide"]
    ].copy()

In [96]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_18 모델 실험

In [97]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003815 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 157
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.598194
[100]	valid_0's binary_logloss: 0.574694
[150]	valid_0's binary_logloss: 0.560291
[200]	valid_0's binary_logloss: 0.549233
[250]	valid_0's binary_logloss: 0.538618
[300]	valid_0's binary_logloss: 0.528667
[350]	valid_0's binary_logloss: 0.519823
[400]	valid_0's binary_logloss: 0.512171
[450]	valid_0's binary_logloss: 0.505981
[500]	valid_0's 

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [98]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6565147229233403
PR-AUC (Average Precision): 0.10844235263901417
F1: 0.16005121638924455

confusion matrix : 
[[8127  905]
 [ 407  125]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.90      0.93      9032
      Lethal       0.12      0.23      0.16       532

    accuracy                           0.86      9564
   macro avg       0.54      0.57      0.54      9564
weighted avg       0.91      0.86      0.88      9564



## ImpactPoint_* 변수 포함 여부에 대한 변수 is_Impact_* 생성 실험

In [99]:
# 컬럼 매핑 딕셔너리
impact_cols = [
    "ImpactPoint_Back",
    "ImpactPoint_Front",
    "ImpactPoint_LeftBack",
    "ImpactPoint_LeftFront",
    "ImpactPoint_LeftSide",
    "ImpactPoint_Missing",
    "ImpactPoint_Multiple",
    "ImpactPoint_RightBack",
    "ImpactPoint_RightFront",
    "ImpactPoint_RightSide",
]

In [100]:
# 이진 변수 생성
for col in impact_cols:
    new_col = "is_" + col.replace("ImpactPoint_", "Impact_")
    final_train[new_col] = (final_train[col] > 0).astype(int)

In [101]:
# 생성 확인 - 포함 비율
final_train[[c for c in final_train.columns if c.startswith("is_Impact_")]].mean()

is_Impact_Back          0.142588
is_Impact_Front         0.545926
is_Impact_LeftBack      0.063033
is_Impact_LeftFront     0.224402
is_Impact_LeftSide      0.117304
is_Impact_Missing       0.083131
is_Impact_Multiple      0.022628
is_Impact_RightBack     0.047641
is_Impact_RightFront    0.203133
is_Impact_RightSide     0.101430
dtype: float64

In [102]:
# 치명 비율 확인
impact_lethal_ratio = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    [[c for c in final_train.columns if c.startswith("is_Impact_")] + ["is_lethal"]]
    .groupby("is_lethal")
    .mean()
)

impact_lethal_ratio

Unnamed: 0_level_0,is_Impact_Back,is_Impact_Front,is_Impact_LeftBack,is_Impact_LeftFront,is_Impact_LeftSide,is_Impact_Missing,is_Impact_Multiple,is_Impact_RightBack,is_Impact_RightFront,is_Impact_RightSide
is_lethal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.147261,0.546747,0.064285,0.22439,0.118628,0.083042,0.021502,0.048984,0.204305,0.10244
1,0.063205,0.531979,0.041761,0.224605,0.094808,0.08465,0.041761,0.024831,0.18322,0.084274


In [103]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     "is_Impact_Back", "is_Impact_Front", "is_Impact_LeftBack", "is_Impact_LeftFront",
     "is_Impact_LeftSide", "is_Impact_Missing", "is_Impact_Multiple", "is_Impact_RightBack",
     "is_Impact_RightFront", "is_Impact_RightSide"]
    ].copy()

In [104]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_19 모델 실험

In [105]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002881 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 140
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.598471
[100]	valid_0's binary_logloss: 0.575177
[150]	valid_0's binary_logloss: 0.561576
[200]	valid_0's binary_logloss: 0.55037
[250]	valid_0's binary_logloss: 0.540698
[300]	valid_0's binary_logloss: 0.530885
[350]	valid_0's binary_logloss: 0.522678
[400]	valid_0's binary_logloss: 0.515152
[450]	valid_0's binary_logloss: 0.508631
[500]	valid_0's b

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [106]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6531303485684984
PR-AUC (Average Precision): 0.10618663001744227
F1: 0.14687100893997446

confusion matrix : 
[[8113  919]
 [ 417  115]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.90      0.92      9032
      Lethal       0.11      0.22      0.15       532

    accuracy                           0.86      9564
   macro avg       0.53      0.56      0.54      9564
weighted avg       0.90      0.86      0.88      9564



## ImpactPoint_* 변수 유지 높은 치명율의 신호만 이진변수로 추가하기

In [107]:
# ImpactPoint_* 단독 치명 비율 분석
impact_cols = [
    "ImpactPoint_Back",
    "ImpactPoint_Front",
    "ImpactPoint_LeftBack",
    "ImpactPoint_LeftFront",
    "ImpactPoint_LeftSide",
    "ImpactPoint_Missing",
    "ImpactPoint_Multiple",
    "ImpactPoint_RightBack",
    "ImpactPoint_RightFront",
    "ImpactPoint_RightSide",
]

impact_lethal_summary = []

for col in impact_cols:
    subset = final_train[final_train[col] > 0]
    impact_lethal_summary.append({
        "ImpactPoint": col,
        "total_count": len(subset),
        "lethal_count": (subset["Gravity"] == "Lethal").sum(),
        "lethal_ratio": (subset["Gravity"] == "Lethal").mean()
    })

impact_lethal_summary = (
    pd.DataFrame(impact_lethal_summary)
    .sort_values("lethal_ratio", ascending=False)
)

impact_lethal_summary

Unnamed: 0,ImpactPoint,total_count,lethal_count,lethal_ratio
6,ImpactPoint_Multiple,1082,111,0.102588
5,ImpactPoint_Missing,3975,225,0.056604
3,ImpactPoint_LeftFront,10730,597,0.055638
1,ImpactPoint_Front,26104,1414,0.054168
8,ImpactPoint_RightFront,9713,487,0.050139
9,ImpactPoint_RightSide,4850,224,0.046186
4,ImpactPoint_LeftSide,5609,252,0.044928
2,ImpactPoint_LeftBack,3014,111,0.036828
7,ImpactPoint_RightBack,2278,66,0.028973
0,ImpactPoint_Back,6818,168,0.024641


In [108]:
# 변수 컬럼 저장하기 ( is_Impact_Multiple 만 추가)
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     "ImpactPoint_Back", "ImpactPoint_Front", "ImpactPoint_LeftBack", "ImpactPoint_LeftFront",
     "ImpactPoint_LeftSide", "ImpactPoint_Missing", "ImpactPoint_Multiple", "ImpactPoint_RightBack",
     "ImpactPoint_RightFront", "ImpactPoint_RightSide", "is_Impact_Multiple"]
    ].copy()

In [109]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_20 모델 실험

In [110]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.599636
[100]	valid_0's binary_logloss: 0.57623
[150]	valid_0's binary_logloss: 0.562604
[200]	valid_0's binary_logloss: 0.550645
[250]	valid_0's binary_logloss: 0.539537
[300]	valid_0's binary_logloss: 0.529467
[350]	valid_0's binary_logloss: 0.521244
[400]	valid_0's binary_logloss: 0.513789
[450]	valid_0's binary_logloss: 0.507929
[500]	valid_0's b

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [111]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6604945157401919
PR-AUC (Average Precision): 0.10732020709750051
F1: 0.159846547314578

confusion matrix : 
[[8125  907]
 [ 407  125]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.90      0.93      9032
      Lethal       0.12      0.23      0.16       532

    accuracy                           0.86      9564
   macro avg       0.54      0.57      0.54      9564
weighted avg       0.91      0.86      0.88      9564



## ImpactPoint 그룹화 하기 - Count로 유지

In [112]:
# ImpactPoint 그룹화 (count 합산)

final_train["Impact_Multiple"] = (
    final_train["ImpactPoint_Multiple"]
)

final_train["Impact_Front"] = (
    final_train["ImpactPoint_LeftFront"]
    + final_train["ImpactPoint_Front"]
    + final_train["ImpactPoint_RightFront"]
)

final_train["Impact_Side"] = (
    final_train["ImpactPoint_RightSide"]
    + final_train["ImpactPoint_LeftSide"]
    + final_train["ImpactPoint_LeftBack"]
)

final_train["Impact_Back_high"] = (
    final_train["ImpactPoint_LeftBack"]
)

final_train["Impact_Back"] = (
    final_train["ImpactPoint_RightBack"]
    + final_train["ImpactPoint_Back"]
)

final_train["Impact_Missing"] = (
    final_train["ImpactPoint_Missing"]
)

In [113]:
# 그룹별 치명 비율

for col in [
    "Impact_Multiple",
    "Impact_Front",
    "Impact_Side",
    "Impact_Back_high",
    "Impact_Back",
    "Impact_Missing"
]:
    ratio = final_train.loc[final_train[col] > 0, "Gravity"].eq("Lethal").mean()
    print(col, round(ratio, 4))

Impact_Multiple 0.1026
Impact_Front 0.0544
Impact_Side 0.0448
Impact_Back_high 0.0368
Impact_Back 0.0255
Impact_Missing 0.0566


In [114]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     "Impact_Multiple", "Impact_Front", "Impact_Side", "Impact_Back_high",
     "Impact_Back", "Impact_Missing"]
    ].copy()

In [115]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_21 모델 실험

In [116]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002989 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 145
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.599969
[100]	valid_0's binary_logloss: 0.578712
[150]	valid_0's binary_logloss: 0.56471
[200]	valid_0's binary_logloss: 0.554891
[250]	valid_0's binary_logloss: 0.545737
[300]	valid_0's binary_logloss: 0.536737
[350]	valid_0's binary_logloss: 0.529263
[400]	valid_0's binary_logloss: 0.523318
[450]	valid_0's binary_logloss: 0.517713
[500]	valid_0's b

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [117]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6545327349041337
PR-AUC (Average Precision): 0.10892162924158935
F1: 0.16610549943883277

confusion matrix : 
[[7930 1102]
 [ 384  148]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.88      0.91      9032
      Lethal       0.12      0.28      0.17       532

    accuracy                           0.84      9564
   macro avg       0.54      0.58      0.54      9564
weighted avg       0.91      0.84      0.87      9564



## Maneuver_* 변수 분석

In [118]:
# Maneuver 컬럼 리스트 정의
maneuver_cols = [
    'Maneuver_Avoidance',
    'Maneuver_BetweenLanes',
    'Maneuver_BusLaneSameDirection',
    'Maneuver_BusLaneWrongDirection',
    'Maneuver_ChangeToLeftLane',
    'Maneuver_ChangeToRightLane',
    'Maneuver_CrossLane',
    'Maneuver_CrossStrip',
    'Maneuver_Insertion',
    'Maneuver_Missing',
    'Maneuver_NoDirectionChange',
    'Maneuver_OpenDoor',
    'Maneuver_Park',
    'Maneuver_Parked',
    'Maneuver_PassLeft',
    'Maneuver_PassRight',
    'Maneuver_Reverse',
    'Maneuver_SameDirectionOrLane',
    'Maneuver_Stopped',
    'Maneuver_SwerveToLeft',
    'Maneuver_SwerveToRight',
    'Maneuver_TurnToLeft',
    'Maneuver_TurnToRight',
    'Maneuver_UTurnInLane',
    'Maneuver_WrongWay'
]

maneuver_cols = [c for c in maneuver_cols if c in final_train.columns]

In [119]:
# Maneuver별 심각도 비교

# 전체 데이터 기준 심각 사고 비율 (baseline)
baseline_lethal_ratio = (final_train["Gravity"] == "Lethal").mean()

out = []

for col in maneuver_cols:
    subset = final_train[final_train[col] > 0]

    if len(subset) == 0:
        continue

    lethal_ratio = (subset["Gravity"] == "Lethal").mean()

    out.append({
        "maneuver": col,
        "count": len(subset),
        "lethal_ratio": lethal_ratio,
        "baseline_lethal_ratio": baseline_lethal_ratio,
        "delta_vs_baseline": lethal_ratio - baseline_lethal_ratio,
        "relative_risk": lethal_ratio / baseline_lethal_ratio
    })

maneuver_severity_compare = (
    pd.DataFrame(out)
    .sort_values("delta_vs_baseline", ascending=False)
    .reset_index(drop=True)
)

maneuver_severity_compare

Unnamed: 0,maneuver,count,lethal_ratio,baseline_lethal_ratio,delta_vs_baseline,relative_risk
0,Maneuver_SwerveToLeft,3758,0.1248,0.055588,0.069212,2.245093
1,Maneuver_CrossStrip,115,0.104348,0.055588,0.04876,1.877162
2,Maneuver_WrongWay,823,0.09599,0.055588,0.040402,1.726814
3,Maneuver_SwerveToRight,1628,0.088452,0.055588,0.032864,1.591206
4,Maneuver_NoDirectionChange,27608,0.062989,0.055588,0.007401,1.133138
5,Maneuver_Missing,5603,0.053543,0.055588,-0.002045,0.963205
6,Maneuver_CrossLane,2093,0.053034,0.055588,-0.002554,0.954052
7,Maneuver_Avoidance,1360,0.052206,0.055588,-0.003382,0.939156
8,Maneuver_PassLeft,2681,0.051846,0.055588,-0.003742,0.932688
9,Maneuver_Parked,335,0.050746,0.055588,-0.004842,0.912898


## Maneuver_* 관련 변수 추가

In [120]:
# 변수 컬럼 저장하기
df = final_train[
    ["Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",

     "hour_group", "month", "weekday", 
     
     "is_NightNoStreetLight", "is_light_medium", "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     
     "ImpactPoint_Back", "ImpactPoint_Front", "ImpactPoint_LeftBack", "ImpactPoint_LeftFront",
     "ImpactPoint_LeftSide", "ImpactPoint_Missing", "ImpactPoint_Multiple", "ImpactPoint_RightBack",
     "ImpactPoint_RightFront", "ImpactPoint_RightSide",
    
     "Maneuver_Avoidance", "Maneuver_BetweenLanes", "Maneuver_BusLaneSameDirection", "Maneuver_BusLaneWrongDirection",
     "Maneuver_ChangeToLeftLane", "Maneuver_ChangeToRightLane", "Maneuver_CrossLane", "Maneuver_CrossStrip",
     "Maneuver_Insertion", "Maneuver_Missing", "Maneuver_NoDirectionChange", "Maneuver_OpenDoor",
     "Maneuver_Park", "Maneuver_Parked", "Maneuver_PassLeft", "Maneuver_PassRight", "Maneuver_Reverse",
     "Maneuver_SameDirectionOrLane", "Maneuver_Stopped", "Maneuver_SwerveToLeft", "Maneuver_SwerveToRight",
     "Maneuver_TurnToLeft", "Maneuver_TurnToRight", "Maneuver_UTurnInLane", "Maneuver_WrongWay"]
    ].copy()

In [121]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_22 모델 실험

In [122]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 248
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.591939
[100]	valid_0's binary_logloss: 0.564666
[150]	valid_0's binary_logloss: 0.548391
[200]	valid_0's binary_logloss: 0.535655
[250]	valid_0's binary_logloss: 0.525028
[300]	valid_0's binary_logloss: 0.514504
[350]	valid_0's binary_logloss: 0.5055
[400]	valid_0's binary_logloss: 0.497156
[450]	valid_0's binary_logloss: 0.490134
[500]	valid_0's bi

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [123]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6911271410923233
PR-AUC (Average Precision): 0.11709385305645632
F1: 0.15877862595419848

confusion matrix : 
[[8358  674]
 [ 428  104]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.93      0.94      9032
      Lethal       0.13      0.20      0.16       532

    accuracy                           0.88      9564
   macro avg       0.54      0.56      0.55      9564
weighted avg       0.91      0.88      0.89      9564



## Maneuver_ SwerveToLeft, CrossStrip, WrongWay, SwerveToRight, NoDirectionChange 포함 여부에 따른 is_Maneuver* 변수 생성

In [124]:
# Maneuver_SwerveToLeft 여부 변수 생성
final_train["is_ManeuverSwerveToLeft"] = (final_train["Maneuver_SwerveToLeft"] > 0).astype(int)

In [125]:
# Maneuver_CrossStrip 여부 변수 생성
final_train["is_ManeuverCrossStrip"] = (final_train["Maneuver_CrossStrip"] > 0).astype(int)

In [126]:
# Maneuver_WrongWay 여부 변수 생성
final_train["is_ManeuverWrongWay"] = (final_train["Maneuver_WrongWay"] > 0).astype(int)

In [127]:
# Maneuver_SwerveToRight 여부 변수 생성
final_train["is_ManeuverSwerveToRight"] = (final_train["Maneuver_SwerveToRight"] > 0).astype(int)

In [128]:
# Maneuver_NoDirectionChange 여부 변수 생성
final_train["is_ManeuverNoDirectionChange"] = (final_train["Maneuver_NoDirectionChange"] > 0).astype(int)

In [129]:
# 변수 컬럼 저장하기
df = final_train[
    ["Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",

     "hour_group", "month", "weekday", 
     
     "is_NightNoStreetLight", "is_light_medium", "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     
     "ImpactPoint_Back", "ImpactPoint_Front", "ImpactPoint_LeftBack", "ImpactPoint_LeftFront",
     "ImpactPoint_LeftSide", "ImpactPoint_Missing", "ImpactPoint_Multiple", "ImpactPoint_RightBack",
     "ImpactPoint_RightFront", "ImpactPoint_RightSide",
    
     "is_ManeuverSwerveToLeft", "is_ManeuverCrossStrip", "is_ManeuverWrongWay", "is_ManeuverSwerveToRight",
     "is_ManeuverNoDirectionChange"]
    ].copy()

In [130]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_23 모델 실험

In [131]:
# X, y 준비
y = final_train['Gravity']
X = df

# Train / Valid 분할
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 불균형 가중치 계산
y_tr_bin = (y_tr == "Lethal").astype(int)
pos_rate = y_tr_bin.mean()


# LightGBM 학습
model = LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    class_weight="balanced",
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=0.5,
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="aucpr",
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

# best_iteration 확인 (실제로 멈춘 지점)
# best_iter = getattr(model, "best_iteration_", None)
# print(f"\n[Info] best_iteration_ = {best_iter}")

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 167
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
[50]	valid_0's binary_logloss: 0.593592
[100]	valid_0's binary_logloss: 0.567941
[150]	valid_0's binary_logloss: 0.552279
[200]	valid_0's binary_logloss: 0.540055
[250]	valid_0's binary_logloss: 0.528959
[300]	valid_0's binary_logloss: 0.518624
[350]	valid_0's binary_logloss: 0.509659
[400]	valid_0's binary_logloss: 0.50242
[450]	valid_0's binary_logloss: 0.49553
[500]	valid_0's bi

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [132]:
# 예측 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# 기본 예측
y_pred = model.predict(X_val)

# 평가 지표
# ROC-AUC
y_val_bin = (y_val == "Lethal").astype(int)
print("ROC-AUC:", roc_auc_score(y_val_bin, y_proba))

# PR-AUC(=Average Precision)
print("PR-AUC (Average Precision):", average_precision_score(y_val_bin, y_proba))

# F1
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

ROC-AUC: 0.6634894019259842
PR-AUC (Average Precision): 0.10722067923564191
F1: 0.15741391426563597

confusion matrix : 
[[8253  779]
 [ 420  112]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.91      0.93      9032
      Lethal       0.13      0.21      0.16       532

    accuracy                           0.87      9564
   macro avg       0.54      0.56      0.54      9564
weighted avg       0.91      0.87      0.89      9564



In [133]:
# 노면상태별 빈도 + 치명 비율
surface_gravity = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    .groupby("SurfaceCondition")
    .agg(
        total_count=("is_lethal", "size"),
        lethal_count=("is_lethal", "sum"),
        lethal_ratio=("is_lethal", "mean")
    )
    .sort_values("lethal_ratio", ascending=False)
)

surface_gravity

Unnamed: 0_level_0,total_count,lethal_count,lethal_ratio
SurfaceCondition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Flooded,38,7,0.184211
Mud,20,2,0.1
Ice,187,17,0.090909
Other,244,16,0.065574
Oil,95,6,0.063158
Snow,122,7,0.057377
Puddles,70,4,0.057143
Normal,34057,1924,0.056494
Missing,4384,238,0.054288
Wet,7653,399,0.052136


In [134]:
# risk 별 그룹화 진행
surface_risk_map = {
    "Flooded": "top_risk",
    
    "Mud": "high_risk",
    "Ice": "high_risk",
    
    "Other": "medium_risk",
    "Oil": "medium_risk",
    
    "Snow": "row_risk",
    "Puddles": "row_risk",
    "Normal": "row_risk",
    "Missing": "row_risk",
    "Wet": "row_risk",
    
    "Unknown": "Unknown"
}

# 그룹 컬럼 생성
final_train["surface_risk_group"] = (
    final_train["SurfaceCondition"]
    .map(surface_risk_map)
    .fillna("Unknown")
)

In [135]:
# surface_risk_group × Gravity 빈도 + 비율 계산

surface_risk_summary = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    .groupby("surface_risk_group")
    .agg(
        total_count=("is_lethal", "size"),
        lethal_count=("is_lethal", "sum"),
        lethal_ratio=("is_lethal", "mean")
    )
    .sort_values("lethal_ratio", ascending=False)
)

surface_risk_summary

Unnamed: 0_level_0,total_count,lethal_count,lethal_ratio
surface_risk_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
top_risk,38,7,0.184211
high_risk,207,19,0.091787
medium_risk,339,22,0.064897
row_risk,46286,2572,0.055568
Unknown,946,38,0.040169


In [136]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "surface_risk_group", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [137]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "surface_risk_group", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

### exp_09 모델 실험

In [138]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [139]:
THRESHOLD = 0.30

# Lethal 확률
lethal_idx = list(model.classes_).index("Lethal")
y_proba = model.predict_proba(X_val)[:, lethal_idx]

# threshold 적용
y_pred = np.where(y_proba >= THRESHOLD, "Lethal", "NonLethal")

# AUC (확률 기반 → threshold 영향 없음)
print("AUC:", roc_auc_score((y_val == "Lethal").astype(int), y_proba))

# F1 / confusion matrix / report
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

print("\nconfusion matrix:")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

print("\nClassification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.6099120420626412

F1: 0.06209150326797386

confusion matrix:
[[8971   61]
 [ 513   19]]

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.95      0.99      0.97      9032
      Lethal       0.24      0.04      0.06       532

    accuracy                           0.94      9564
   macro avg       0.59      0.51      0.52      9564
weighted avg       0.91      0.94      0.92      9564

