In [1]:
# 라이브러리
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# 데이터 불러오기 ( train만 사용 )
final_train = pd.read_csv("./data/final_Feature/train_final.csv")

# 1. 최소 변수로 모델 성능 확인하기
- 이번 실험에서는 최소 변수를 사용하여 모델의 성능을 확인 후 새로운 변수를 추가하여 변화되는 모델의 수치를 확인한다.
- 변화가 많은 변수들을 모델이 더 잘 인식할 수 있도록 모델링을 계획하여 실행한다.

## 기초 모델 변수 목록
- Weather 날씨
- SurfaceCondition 노면상태
- Vehicle_count_user 사고에 포함된 차량 수
- Safety_used_yes_count 안전밸트 착용인원
- Safety_used_no_count 안전밸트 미착용 인원
- Persons 사고에 참여한 인원 수

In [3]:
# 변수 컬럼 저장하기
df = final_train[
    ["Weather", "SurfaceCondition", "Vehicle_count_user", "Safety_used_yes_count", "Safety_used_no_count", "Persons"]
    ].copy()

In [4]:
# 데이터 타입 확인
df.dtypes

Weather                   object
SurfaceCondition          object
Vehicle_count_user       float64
Safety_used_yes_count    float64
Safety_used_no_count     float64
Persons                  float64
dtype: object

In [5]:
# dtype 변경할 컬럼 선택
col = ["Weather", "SurfaceCondition"]

In [6]:
# category 타입으로 변환
df[col] = df[col].astype("category")

In [7]:
# Y-train 만들기
y_train = final_train['Gravity']

In [8]:
# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [9]:
# LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [10]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [11]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

AUC: 0.6066053988492044


In [12]:
# F1_score 확인
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

F1: 0.0036429872495446266


In [13]:
# confusion matrix (혼동행렬 확인)
print("confusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

confusion matrix : 
[[9016   16]
 [ 531    1]]


In [14]:
# Classification Report
print("Classification Report")
print(classification_report(y_val, y_pred, labels=["NonLethal", "Lethal"]))

Classification Report
              precision    recall  f1-score   support

   NonLethal       0.94      1.00      0.97      9032
      Lethal       0.06      0.00      0.00       532

    accuracy                           0.94      9564
   macro avg       0.50      0.50      0.49      9564
weighted avg       0.90      0.94      0.92      9564



In [15]:
# 타킷의 비율 확인
final_train["Gravity"].value_counts(normalize=True)

Gravity
NonLethal    0.944412
Lethal       0.055588
Name: proportion, dtype: float64

## 최소 모델 실험 결과
- 대부분 심각하지 않다고 구분하여 심각하다라고 구분한것은 단 한건에 불과하다.
- 이 데이터는 실제로도 NonLethal 비율이 약 94% Lethal 비율이 약 0.55 % 으로 모델 또한 NonLethal로 예측을 더 많이하는것을 확인할 수 있다.

# 2. 변수 추가하여 모델 실험하기
- 변수를 추가하거나 제거하는 실험을 통해 어떤 변수가 관계가 높은지 분석하는 단계이다.
- 종합적으로 관련이 높은 변수를 추가해도 모델이 잘 읽지 못한다면 성능 개선에 의미가 없다고 판단했다.
- 이번 실험에서는 어떤 변수가 영향이 있고 또는 성능을 저하시키는지 살펴보며 실험을 진행한다.

In [16]:
# 변수 컬럼 저장하기
df = final_train[
    ["Date", "Hour", "Weather", "SurfaceCondition", "Vehicle_count_user", "Safety_used_yes_count", "Safety_used_no_count", "Persons"]
    ].copy()

In [17]:
# 데이터 타입 확인
df.dtypes

Date                      object
Hour                      object
Weather                   object
SurfaceCondition          object
Vehicle_count_user       float64
Safety_used_yes_count    float64
Safety_used_no_count     float64
Persons                  float64
dtype: object

In [18]:
# dtype 변경할 컬럼 선택
col = ["Date", "Hour", "Weather", "SurfaceCondition"]

In [19]:
# category 타입으로 변환
df[col] = df[col].astype("category")

In [20]:
# Y-train 만들기
y_train = final_train['Gravity']

In [21]:
# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [22]:
# LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [23]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [24]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

AUC: 0.5485321197146986
F1: 0.03969465648854962


In [25]:
# confusion matrix (혼동행렬 확인)
print("confusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

confusion matrix : 
[[8922  110]
 [ 519   13]]


## Date, Hour 변수 추가하기
- 현재 모델은 시간(Date, Hour) 데이터를 단순한 숫자 값으로만 인식하기 때문에 날짜가 가지는 의미나 시간의 개념을 충분히 이해하지 못하는 상태이다.
- 이에 따라 Date와 Hour 변수를 모델이 개념적으로 해석할 수 있도록 피처 엔지니어링을 수행하고 그 결과를 다시 한 번 분석한다.

## Date, Hour 모델 실험 결과
- 이번에는 Lethal로 예측하는 경우가 상승하여 FP가 증가하고 FN이 감소하는 결과가 나타났다.
- 다만 TN이 감소하였으며 이는 Lethal로 구분하는 경우가 증가했다고 볼 수 있다.
- TP도 12건이 상승했지만 FP가 급 상승하면서 종합적인 성능이 많이 상승했다고 할 수 없다.

### Date
- 월 단위의 변수 생성
- 요일 단위의 변수 생성
- 주말 여부 변수 생성

위 세개의 변수만 생성한 뒤 모델에 실험 결과를 확인한다.

In [26]:
# datetime으로 변환하기
final_train["Date_dt"] = pd.to_datetime(final_train["Date"], errors="coerce")

In [27]:
# 월 단위 컬럼 생성
final_train["month"] = final_train["Date_dt"].dt.month

# 요일 컬럼 생성
final_train["weekday"] = final_train["Date_dt"].dt.day_name().str[:3]

# 주말 여부 컬럼 생성
final_train["is_weekend"] = (final_train["weekday"].isin(["Sat", "Sun"]).astype(int))

In [28]:
final_train["is_weekend"]

0        0
1        0
2        0
3        1
4        0
        ..
47811    0
47812    1
47813    1
47814    1
47815    0
Name: is_weekend, Length: 47816, dtype: int64

In [29]:
# 변수 컬럼 저장하기
df = final_train[
    ["Hour", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_weekend"]
    ].copy()

In [30]:
# 데이터 타입 확인
df.dtypes

Hour                      object
Weather                   object
SurfaceCondition          object
Vehicle_count_user       float64
Safety_used_yes_count    float64
Safety_used_no_count     float64
Persons                  float64
month                      int32
weekday                   object
is_weekend                 int64
dtype: object

In [31]:
# dtype 변경할 컬럼 선택
col = ["Hour", "Weather", "SurfaceCondition", "weekday"]

In [32]:
# category 타입으로 변환
df[col] = df[col].astype("category")

In [33]:
# Y-train 만들기
y_train = final_train['Gravity']

In [34]:
# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [35]:
# exp_03_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)


[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [36]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [37]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

AUC: 0.5730674810365151
F1: 0.01436265709156194


In [38]:
# confusion matrix (혼동행렬 확인)
print("confusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

confusion matrix : 
[[9011   21]
 [ 528    4]]


### Date 변수 생성 모델 실험 결과
- Date 기반 피처 엔지니어링 이후 TN은 증가하고 FP는 감소하여 NonLethal 사고에 대한 분류 안정성은 다소 개선된 것으로 확인되었다.
- 반면 FN은 초기 최소 모델과 유사한 수준으로 다시 증가하였으며 TP는 초기 모델 대비 소폭(3건) 증가하는 데 그쳤다.
- Date 정보가 Lethal 사고를 직접적으로 구분하는 강한 신호라기보다는 전반적인 분류 경계를 안정화하는 보조적 역할에 가까웠음을 시사한다.
- FN 감소 관점에서 Date 변수가 유의미한 영향을 주었다고 판단하기는 어렵다.


In [39]:
# 변수 컬럼 저장하기
df = final_train[
    ["Hour", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [40]:
# dtype 변경할 컬럼 선택
col = ["Hour", "Weather", "SurfaceCondition", "weekday"]

In [41]:
# category 타입으로 변환
df[col] = df[col].astype("category")

In [42]:
# Y-train 만들기
y_train = final_train['Gravity']

In [43]:
# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [44]:
# exp_03_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1051
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [45]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [46]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("F1:", f1_score(y_val, y_pred, pos_label="Lethal"))

AUC: 0.5784800450528447
F1: 0.02857142857142857


In [47]:
# confusion matrix (혼동행렬 확인)
print("confusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

confusion matrix : 
[[9012   20]
 [ 524    8]]


### is_weekend 변수 제거하고 실험

In [48]:
# 변수 컬럼 저장하기
df = final_train[
    ["Hour", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [49]:
# dtype 변경할 컬럼 선택
col = ["Hour", "Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [50]:
# Y-train 만들기
y_train = final_train['Gravity']

In [51]:
# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [52]:
# exp_04_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1051
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [53]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [54]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.5784800450528447

F1: 0.02857142857142857

confusion matrix : 
[[9012   20]
 [ 524    8]]


### is_weekend 변수 제거 결과
- 생성한 is_weekend 변수를 포함한 모델과 is_weekend를 제거 한 모델과 비교시 AUC와 F1-score가 모두 개선되는 현상이 관찰되었다.
- 요일 변수와 중복 신호이며 Lethal 사고를 직접적으로 구분에 제한적인 설명력을 가졌다고 볼 수 있다.
- 위와 같은 이유로 모델의 판단을 불안정하게 만드는 요일으로 작용했을 가능성이 있다.
- Date관련 최종 모델에서는 **is_weekend 변수를 제외**하고 "month", "weekday" 변수만 사용한다.

### Hour
- 시간만 추출한 변수 생성 (hour_int)
- 시간대 그룹 변수 생성
  - MorningRush (출근시간대)
  - Daytime (낮 시간)
  - EveningRush (퇴근시간대)
  - Night (밤 시간)
  - LateNight (밤-새벽시간대)

### hour_int 변수 생성 후 실험

In [55]:
# timedelta로 변환
final_train["Hour"] = pd.to_timedelta(final_train["Hour"], errors="coerce")

In [56]:
# 시(hour)만 추출 (0~23)
final_train["hour_int"] = (final_train["Hour"].dt.total_seconds() // 3600).astype("Int64")

In [57]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_int", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [58]:
# dtype 변경할 컬럼 선택
col = ["Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [59]:
# dtype 조회
df.dtypes

hour_int                    Int64
Weather                  category
SurfaceCondition         category
Vehicle_count_user        float64
Safety_used_yes_count     float64
Safety_used_no_count      float64
Persons                   float64
month                       int32
weekday                  category
dtype: object

In [60]:
# Y-train 만들기
y_train = final_train['Gravity']

In [61]:
# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [62]:
# exp_05_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001296 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 122
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [63]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [64]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.5995431032186312

F1: 0.010889292196007259

confusion matrix : 
[[9016   16]
 [ 529    3]]


### 시간대 그룹 변수 생성 후 실험

In [65]:
# 시간 그룹 생성 함수
def hour_group(h):
    if pd.isna(h):
        return pd.NA
    h = int(h)
    if 7 <= h <= 9:
        return "MorningRush"
    elif 10 <= h <= 16:
        return "Daytime"
    elif 17 <= h <= 19:
        return "EveningRush"
    elif 20 <= h <= 23:
        return "Night"
    else:
        return "LateNight"

final_train["hour_group"] = final_train["hour_int"].apply(hour_group)

In [66]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_int","hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [67]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [68]:
# dtype 조회
df.dtypes

hour_int                    Int64
hour_group               category
Weather                  category
SurfaceCondition         category
Vehicle_count_user        float64
Safety_used_yes_count     float64
Safety_used_no_count      float64
Persons                   float64
month                       int32
weekday                  category
dtype: object

In [69]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [70]:
# exp_06_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 128
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [71]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [72]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.5986715154804637

F1: 0.018148820326678767

confusion matrix : 
[[9018   14]
 [ 527    5]]


### 시간대 그룹 변수만 사용하여 실험, 시간 추출 변수 제외

In [73]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [74]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [75]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [76]:
# exp_07_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001275 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 104
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [77]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [78]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.6000874293239742

F1: 0.025089605734767026

confusion matrix : 
[[9013   19]
 [ 525    7]]


## Light 변수 추가하기

In [79]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "Light"]
    ].copy()

In [80]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "Light"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [81]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [82]:
# exp_08_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001522 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [83]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [84]:
# 노면상태별 빈도 + 치명 비율
surface_gravity = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    .groupby("SurfaceCondition")
    .agg(
        total_count=("is_lethal", "size"),
        lethal_count=("is_lethal", "sum"),
        lethal_ratio=("is_lethal", "mean")
    )
    .sort_values("lethal_ratio", ascending=False)
)

surface_gravity

Unnamed: 0_level_0,total_count,lethal_count,lethal_ratio
SurfaceCondition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Flooded,38,7,0.184211
Mud,20,2,0.1
Ice,187,17,0.090909
Other,244,16,0.065574
Oil,95,6,0.063158
Snow,122,7,0.057377
Puddles,70,4,0.057143
Normal,34057,1924,0.056494
Missing,4384,238,0.054288
Wet,7653,399,0.052136


In [85]:
# risk 별 그룹화 진행
surface_risk_map = {
    "Flooded": "top_risk",
    
    "Mud": "high_risk",
    "Ice": "high_risk",
    
    "Other": "medium_risk",
    "Oil": "medium_risk",
    
    "Snow": "row_risk",
    "Puddles": "row_risk",
    "Normal": "row_risk",
    "Missing": "row_risk",
    "Wet": "row_risk",
    
    "Unknown": "Unknown"
}

# 그룹 컬럼 생성
final_train["surface_risk_group"] = (
    final_train["SurfaceCondition"]
    .map(surface_risk_map)
    .fillna("Unknown")
)

In [86]:
# surface_risk_group × Gravity 빈도 + 비율 계산

surface_risk_summary = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    .groupby("surface_risk_group")
    .agg(
        total_count=("is_lethal", "size"),
        lethal_count=("is_lethal", "sum"),
        lethal_ratio=("is_lethal", "mean")
    )
    .sort_values("lethal_ratio", ascending=False)
)

surface_risk_summary

Unnamed: 0_level_0,total_count,lethal_count,lethal_ratio
surface_risk_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
top_risk,38,7,0.184211
high_risk,207,19,0.091787
medium_risk,339,22,0.064897
row_risk,46286,2572,0.055568
Unknown,946,38,0.040169


In [87]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "surface_risk_group", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday"]
    ].copy()

In [88]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "surface_risk_group", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [89]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [90]:
# exp_08_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [91]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [92]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.609912042062641

F1: 0.018083182640144666

confusion matrix : 
[[9016   16]
 [ 527    5]]


In [93]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.609912042062641

F1: 0.018083182640144666

confusion matrix : 
[[9016   16]
 [ 527    5]]


### 'Light' 파생 변수 생성

In [94]:
Light_gravity = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    .groupby("Light")
    .agg(
        total_count=("is_lethal", "size"),
        lethal_count=("is_lethal", "sum"),
        lethal_ratio=("is_lethal", "mean")
    )
    .sort_values("lethal_ratio", ascending=False)
)

Light_gravity

Unnamed: 0_level_0,total_count,lethal_count,lethal_ratio
Light,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NightNoStreetLight,4603,636,0.138171
TwilightOrDawn,3184,201,0.063128
NightStreelightsOff,479,30,0.06263
Daylight,32029,1542,0.048144
NightStreelightsOn,7521,249,0.033107


In [95]:
# NightNoStreetLight 포함 여부 변수 생성

final_train["is_NightNoStreetLight"] = (final_train["Light"] == "NightNoStreetLight").astype(int)

In [96]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "Light", "is_NightNoStreetLight"]
    ].copy()

In [97]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "Light"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [98]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [99]:
# exp_09_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001800 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 112
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [100]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [101]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.6450201913663698

F1: 0.025089605734767026

confusion matrix : 
[[9013   19]
 [ 525    7]]


In [102]:
# is_light_medium 변수 생성 (중간 위험)
final_train["is_light_medium"] = (final_train["Light"].isin(["TwilightOrDawn", 
                                                             "NightStreelightsOff"])).astype(int)

In [103]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "Light", "is_NightNoStreetLight", "is_light_medium"]
    ].copy()

In [104]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "Light"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [105]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [106]:
# exp_10_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [107]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

In [108]:
# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.6347948522213417

F1: 0.02857142857142857

confusion matrix : 
[[9012   20]
 [ 524    8]]


In [109]:
# Daylight 여부 변수 생성
final_train["is_Daylight"] = (final_train["Light"] == "Daylight").astype(int)

In [110]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons", "Light",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight"]
    ].copy()

In [111]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "Light"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [112]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [113]:
# exp_11_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 116
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [114]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.6319571973001592

F1: 0.021505376344086023

confusion matrix : 
[[9012   20]
 [ 526    6]]


In [115]:
# Light 변수 제거 후 재실험
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight"]
    ].copy()

In [116]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [117]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [118]:
# exp_12_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [119]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.6357414655993394

F1: 0.031914893617021274

confusion matrix : 
[[9009   23]
 [ 523    9]]


## CollisionType 변수 분석

In [120]:
# CollisionType 심각도 비율 확인
CollisionType_gravity = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    .groupby("CollisionType")
    .agg(
        total_count=("is_lethal", "size"),
        lethal_count=("is_lethal", "sum"),
        lethal_ratio=("is_lethal", "mean")
    )
    .sort_values("lethal_ratio", ascending=False)
)

CollisionType_gravity

Unnamed: 0_level_0,total_count,lethal_count,lethal_ratio
CollisionType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2Vehicles-BehindVehicles-Frontal,4274,502,0.117454
3+Vehicles-Multiple,1569,141,0.089866
Other,16159,1163,0.071972
NoCollision,4333,288,0.066467
2Vehicles-Side,13462,370,0.027485
2Vehicles-Behind,6139,151,0.024597
3+Vehicles-Chain,1880,43,0.022872


### CollisionType 변수 추가하여 모델 확인

In [121]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "CollisionType"]
    ].copy()

In [122]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "CollisionType"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [123]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [124]:
# exp_13_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001248 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 118
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [125]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.7054484015064233

F1: 0.028469750889679714

confusion matrix : 
[[9010   22]
 [ 524    8]]


### 2Vehicles-BehindVehicles-Frontal 포함 여부 컬럼 생성

In [126]:
# 2Vehicles-BehindVehicles-Frontal 여부 변수 생성
final_train["is_CollisionType_Frontal"] = (
    final_train["CollisionType"] == "2Vehicles-BehindVehicles-Frontal").astype(int)

In [127]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "CollisionType", "is_CollisionType_Frontal"]
    ].copy()

In [128]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "CollisionType"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [129]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [130]:
# exp_14_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [131]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.7062312279813795

F1: 0.0285204991087344

confusion matrix : 
[[9011   21]
 [ 524    8]]


### 2Vehicles-Side, 2Vehicles-Behind, 3+Vehicles-Chain 제외 포함 여부 변수 생성

In [132]:
# 3+Vehicles-Multiple 여부 변수 생성
final_train["is_CollisionType_Multiple"] = (final_train["CollisionType"] == "3+Vehicles-Multiple").astype(int)

In [133]:
# Other 여부 변수 생성
final_train["is_CollisionType_other"] = (final_train["CollisionType"] == "Other").astype(int)

In [134]:
# NoCollision 여부 변수 생성
final_train["is_CollisionType_NoCollision"] = (final_train["CollisionType"] == "NoCollision").astype(int)

In [135]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "CollisionType", 
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision"]
    ].copy()

In [136]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "CollisionType"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [137]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [138]:
# exp_15_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)


model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [139]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.7073376740678089

F1: 0.0351493848857645

confusion matrix : 
[[9005   27]
 [ 522   10]]


### CollisionType row 신호 여부 컬럼 생성

In [140]:
# is_CollisionType_low 변수 생성

final_train["is_CollisionType_low"] = (final_train["CollisionType"].isin(["2Vehicles-Side",
                                                                          "2Vehicles-Behind",
                                                                          "3+Vehicles-Chain"])).astype(int)

In [141]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "CollisionType", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision"]
    ].copy()

In [142]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday", "CollisionType"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [143]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [144]:
# exp_16_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 128
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [145]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.7069926185592412

F1: 0.03527336860670194

confusion matrix : 
[[9007   25]
 [ 522   10]]


### CollisionType 변수 제거 실험

In [146]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision"]
    ].copy()

In [147]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [148]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [149]:
# exp_17_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [150]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.7036746330507402

F1: 0.03208556149732621

confusion matrix : 
[[9012   20]
 [ 523    9]]


## ImpactPoint_* 변수 분석
- ImpactPoint 변수는 count된 집계 변수이다.
- 총 ImpactPoint 집계 변수는 10개이다.

### ImpactPoint_* 변수 추가 실험

In [151]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     "ImpactPoint_Back", "ImpactPoint_Front", "ImpactPoint_LeftBack", "ImpactPoint_LeftFront",
     "ImpactPoint_LeftSide", "ImpactPoint_Missing", "ImpactPoint_Multiple", "ImpactPoint_RightBack",
     "ImpactPoint_RightFront", "ImpactPoint_RightSide"]
    ].copy()

In [152]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [153]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [154]:
# exp_18_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 157
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [155]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.7149207787515733

F1: 0.0427807486631016

confusion matrix : 
[[9015   17]
 [ 520   12]]


### ImpactPoint_* 변수 포함 여부 (이진화) 생성 실험

In [156]:
# 컬럼 매핑 딕셔너리
impact_cols = [
    "ImpactPoint_Back",
    "ImpactPoint_Front",
    "ImpactPoint_LeftBack",
    "ImpactPoint_LeftFront",
    "ImpactPoint_LeftSide",
    "ImpactPoint_Missing",
    "ImpactPoint_Multiple",
    "ImpactPoint_RightBack",
    "ImpactPoint_RightFront",
    "ImpactPoint_RightSide",
]

In [157]:
# 이진 변수 생성
for col in impact_cols:
    new_col = "is_" + col.replace("ImpactPoint_", "Impact_")
    final_train[new_col] = (final_train[col] > 0).astype(int)

In [158]:
# 생성 확인 - 포함 비율
final_train[[c for c in final_train.columns if c.startswith("is_Impact_")]].mean()

is_Impact_Back          0.142588
is_Impact_Front         0.545926
is_Impact_LeftBack      0.063033
is_Impact_LeftFront     0.224402
is_Impact_LeftSide      0.117304
is_Impact_Missing       0.083131
is_Impact_Multiple      0.022628
is_Impact_RightBack     0.047641
is_Impact_RightFront    0.203133
is_Impact_RightSide     0.101430
dtype: float64

In [159]:
# 치명 비율 확인
impact_lethal_ratio = (
    final_train
    .assign(is_lethal=(final_train["Gravity"] == "Lethal").astype(int))
    [[c for c in final_train.columns if c.startswith("is_Impact_")] + ["is_lethal"]]
    .groupby("is_lethal")
    .mean()
)

impact_lethal_ratio

Unnamed: 0_level_0,is_Impact_Back,is_Impact_Front,is_Impact_LeftBack,is_Impact_LeftFront,is_Impact_LeftSide,is_Impact_Missing,is_Impact_Multiple,is_Impact_RightBack,is_Impact_RightFront,is_Impact_RightSide
is_lethal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.147261,0.546747,0.064285,0.22439,0.118628,0.083042,0.021502,0.048984,0.204305,0.10244
1,0.063205,0.531979,0.041761,0.224605,0.094808,0.08465,0.041761,0.024831,0.18322,0.084274


In [160]:
# 변수 컬럼 저장하기
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     "is_Impact_Back", "is_Impact_Front", "is_Impact_LeftBack", "is_Impact_LeftFront",
     "is_Impact_LeftSide", "is_Impact_Missing", "is_Impact_Multiple", "is_Impact_RightBack",
     "is_Impact_RightFront", "is_Impact_RightSide"]
    ].copy()

In [161]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [162]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [163]:
# exp_19_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002879 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 140
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [164]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.7102471288384824

F1: 0.02882882882882883

confusion matrix : 
[[9017   15]
 [ 524    8]]


### ImpactPoint_* 변수 유지 높은 치명율의 신호만 이진변수로 추가하기

In [165]:
# ImpactPoint_* 단독 치명 비율 분석
impact_cols = [
    "ImpactPoint_Back",
    "ImpactPoint_Front",
    "ImpactPoint_LeftBack",
    "ImpactPoint_LeftFront",
    "ImpactPoint_LeftSide",
    "ImpactPoint_Missing",
    "ImpactPoint_Multiple",
    "ImpactPoint_RightBack",
    "ImpactPoint_RightFront",
    "ImpactPoint_RightSide",
]

impact_lethal_summary = []

for col in impact_cols:
    subset = final_train[final_train[col] > 0]
    impact_lethal_summary.append({
        "ImpactPoint": col,
        "total_count": len(subset),
        "lethal_count": (subset["Gravity"] == "Lethal").sum(),
        "lethal_ratio": (subset["Gravity"] == "Lethal").mean()
    })

impact_lethal_summary = (
    pd.DataFrame(impact_lethal_summary)
    .sort_values("lethal_ratio", ascending=False)
)

impact_lethal_summary

Unnamed: 0,ImpactPoint,total_count,lethal_count,lethal_ratio
6,ImpactPoint_Multiple,1082,111,0.102588
5,ImpactPoint_Missing,3975,225,0.056604
3,ImpactPoint_LeftFront,10730,597,0.055638
1,ImpactPoint_Front,26104,1414,0.054168
8,ImpactPoint_RightFront,9713,487,0.050139
9,ImpactPoint_RightSide,4850,224,0.046186
4,ImpactPoint_LeftSide,5609,252,0.044928
2,ImpactPoint_LeftBack,3014,111,0.036828
7,ImpactPoint_RightBack,2278,66,0.028973
0,ImpactPoint_Back,6818,168,0.024641


In [184]:
# 변수 컬럼 저장하기 ( is_Impact_Multiple 만 추가)
df = final_train[
    ["hour_group", "Weather", "SurfaceCondition", "Vehicle_count_user", 
     "Safety_used_yes_count", "Safety_used_no_count", "Persons",
     "month", "weekday", "is_NightNoStreetLight", "is_light_medium",
     "is_Daylight", "is_CollisionType_Frontal", "is_CollisionType_low",
     "is_CollisionType_Multiple", "is_CollisionType_other", "is_CollisionType_NoCollision",
     "ImpactPoint_Back", "ImpactPoint_Front", "ImpactPoint_LeftBack", "ImpactPoint_LeftFront",
     "ImpactPoint_LeftSide", "ImpactPoint_Missing", "ImpactPoint_Multiple", "ImpactPoint_RightBack",
     "ImpactPoint_RightFront", "ImpactPoint_RightSide", "is_Impact_Multiple"]
    ].copy()

In [185]:
# dtype 변경할 컬럼 선택
col = ["hour_group","Weather", "SurfaceCondition", "weekday"]

# category 타입으로 변환
df[col] = df[col].astype("category")

In [186]:
# Y-train 만들기
y_train = final_train['Gravity']

# 변수 분할

X = df
y = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [187]:
# exp_20_LightGBM 학습
model = LGBMClassifier(n_estimators=500,      
                       learning_rate=0.05,    
                       random_state=42)

model.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 36126, number of negative: 2126
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 159
[LightGBM] [Info] Number of data points in the train set: 38252, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.944421 -> initscore=2.832771
[LightGBM] [Info] Start training from score 2.832771


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [188]:
# 예측 확률 확인
y_proba = model.predict_proba(X_val)[:, 1]

# 이진 예측
y_pred = model.predict(X_val)

# AUC 확인
print("AUC:", roc_auc_score(y_val, y_proba))

# F1_score 확인
print("\nF1:", f1_score(y_val, y_pred, pos_label="Lethal"))

# confusion matrix (혼동행렬 확인)
print("\nconfusion matrix : ")
print(confusion_matrix(y_val, y_pred, labels=["NonLethal", "Lethal"]))

AUC: 0.7149207787515733

F1: 0.0427807486631016

confusion matrix : 
[[9015   17]
 [ 520   12]]
