# 교통사고 위험구역 예측 (정리본)

이 노트북은 **데이터 전처리 → 피처 생성 → 모델 학습/검증 → 모델 저장** 흐름만 남긴 정리본입니다.

- 타깃: `반경500m사고건수` (회귀)
- 기본 지표: MAE
- 검증: Hold-out + KFold CV


In [7]:
# 0) 라이브러리
import math
import numpy as np
import pandas as pd

from numba import jit
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None

pd.set_option("display.max_columns", None)


In [8]:
# 1) 데이터 로드 (경로만 프로젝트에 맞게 수정)
# - 원본 사고 데이터(2019~2023)
accident_2023 = pd.read_csv('data/rawdata/도로교통공단_사망교통사고정보_2023.csv', encoding='cp949')
accident_2022 = pd.read_csv('data/rawdata/도로교통공단_사망교통사고정보_2022.csv', encoding='cp949')
accident_2021 = pd.read_csv('data/rawdata/도로교통공단_사망교통사고정보_2021.csv', encoding='cp949')
accident_2020 = pd.read_csv('data/rawdata/도로교통공단_사망교통사고정보_2020.csv', encoding='cp949')
accident_2019 = pd.read_csv('data/rawdata/도로교통공단_사망교통사고정보_2019.csv', encoding='cp949')

# (예시) 카메라/학교/전광판 raw 파일도 여기서 로드
camera = pd.read_csv('data/rawdata/전국무인교통단속카메라표준데이터.csv', encoding='cp949')
school = pd.read_csv('data/rawdata/재단법인한국지방교육행정연구재단_초중등학교위치.csv')
sign   = pd.read_csv('data/rawdata/전국가변전광표지판_안내전광판_표준데이터.csv', encoding='cp949')

print(accident_2023.shape, accident_2019.shape)


(2468, 23) (3233, 23)


  camera = pd.read_csv('data/rawdata/전국무인교통단속카메라표준데이터.csv', encoding='cp949')


In [9]:
# 2) 사고 데이터 통합 + 기본 전처리
# 2-1) 컬럼 통일 (필요 시)
# 여기서는 원본이 동일한 스키마라고 가정하고 concat
accident = pd.concat([accident_2019, accident_2020, accident_2021, accident_2022, accident_2023], axis=0, ignore_index=True)

# 2-2) 사용할 컬럼만 남기기 (원본 스키마에 맞게 조정)
use_cols = [
    '발생년', '발생년월일시', '주야', '요일',
    '사망자수', '부상자수', '중상자수', '경상자수', '부상신고자수',
    '발생지시도', '발생지시군구',
    '사고유형_대분류', '사고유형_중분류', '사고유형',
    '가해자법규위반', '도로형태_대분류', '도로형태',
    '가해자_당사자종별', '피해자_당사자종별',
    '경도', '위도'
]
accident = accident[use_cols].copy()

# 결측치 확인
print(accident.isna().sum().sort_values(ascending=False).head(10))


발생년       0
발생년월일시    0
주야        0
요일        0
사망자수      0
부상자수      0
중상자수      0
경상자수      0
부상신고자수    0
발생지시도     0
dtype: int64


In [10]:
# 3) 타깃/시간 파생
# 3-1) 사상자수(설명변수) 생성 (원본 노트북 로직 유지)
accident['사상자수'] = accident['사망자수'] + accident['중상자수'] * 0.7 + accident['경상자수'] * 0.3
accident = accident.drop(columns=['사망자수', '중상자수', '경상자수'])

# 3-2) 발생년월일시 -> 발생월/일/시
accident['발생월'] = accident['발생년월일시'].apply(lambda x: int(str(x)[5:7]))
accident['발생일'] = accident['발생년월일시'].apply(lambda x: int(str(x)[8:10]))
accident['발생시'] = accident['발생년월일시'].apply(lambda x: int(str(x)[11:13]))
accident = accident.drop(columns=['발생년월일시'])

accident[['발생년','발생월','발생일','발생시','주야','요일','경도','위도','사상자수']].head()


Unnamed: 0,발생년,발생월,발생일,발생시,주야,요일,경도,위도,사상자수
0,2019,1,1,0,야,화,129.152465,35.157884,1.0
1,2019,1,1,3,야,화,126.857149,37.288292,1.0
2,2019,1,1,16,주,화,128.60923,35.650108,1.0
3,2019,1,1,19,야,화,127.37956,36.355946,1.0
4,2019,1,1,21,야,화,128.915041,35.899506,1.0


In [11]:
# 4) (핵심) 반경 500m 내 시설 수/사고 수 계산
# 주의: 이 단계가 가장 오래 걸림 (좌표 개수에 따라 매우 느릴 수 있음)
# - 반경 단위는 km 기준 (radius=0.5 => 500m)

@jit
def count_function(coord1, coord2, radius=0.5):
    R = 6371.0
    count_data = []
    for (lat1, lon1) in coord1:
        count = 0
        for (lat2, lon2) in coord2:
            dx = (lon2 - lon1) * math.cos((lat1 + lat2) / 2)
            dy = lat2 - lat1
            distance = R * math.sqrt(dx**2 + dy**2)
            if distance <= radius:
                count += 1
        count_data.append(count)
    return count_data

# 좌표 배열 준비 (lat, lon) = (위도, 경도) 주의
acc_coord = list(zip(accident['위도'].astype(float).values, accident['경도'].astype(float).values))
cam_coord = list(zip(camera['위도'].astype(float).values,  camera['경도'].astype(float).values))
sch_coord = list(zip(school['위도'].astype(float).values,  school['경도'].astype(float).values))
sgn_coord = list(zip(sign['위도'].astype(float).values,    sign['경도'].astype(float).values))

# 사고(타깃) 자체도 반경 내 사고 건수로 만들려면, 'accident'끼리 카운팅
accident['반경500m사고건수'] = count_function(acc_coord, acc_coord, radius=0.5)
accident['반경500m카메라수'] = count_function(acc_coord, cam_coord, radius=0.5)
accident['반경500m학교수']  = count_function(acc_coord, sch_coord, radius=0.5)
accident['반경500m전광판수']= count_function(acc_coord, sgn_coord, radius=0.5)

accident[['반경500m사고건수','반경500m카메라수','반경500m학교수','반경500m전광판수']].head()


Unnamed: 0,반경500m사고건수,반경500m카메라수,반경500m학교수,반경500m전광판수
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [12]:
# 5) 모델 학습용 데이터셋 만들기 (원-핫 인코딩)
# - 범주형: 요일, 발생지시도, 사고유형_대분류, 도로형태_대분류, 가해자/피해자 종별, 주야
# - 수치형: 발생년/월/일/시, 경도/위도, 사상자수, 반경500m 시설수
#   (원본 노트북에서는 일부 수치형을 제거했는데, 여기서는 유지합니다)

cat_cols = ['요일','발생지시도','사고유형_대분류','도로형태_대분류','가해자_당사자종별','피해자_당사자종별','주야']
num_cols = ['발생년','발생월','발생일','발생시','경도','위도','사상자수',
            '반경500m카메라수','반경500m학교수','반경500m전광판수']
target_col = '반경500m사고건수'

X_cat = pd.get_dummies(accident[cat_cols], drop_first=False)
X_num = accident[num_cols].copy()
X = pd.concat([X_cat, X_num], axis=1)
y = accident[target_col].copy()

print("X shape:", X.shape, "y shape:", y.shape)


X shape: (14158, 75) y shape: (14158,)


In [13]:
# 6) Hold-out 평가 (baseline)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=156)

baseline = LGBMRegressor(random_state=42)
baseline.fit(X_train, y_train)
pred = baseline.predict(X_test)
mae = mean_absolute_error(y_test, pred)

print("Hold-out Baseline MAE:", mae)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 756
[LightGBM] [Info] Number of data points in the train set: 11326, number of used features: 66
[LightGBM] [Info] Start training from score 1.012096
Hold-out Baseline MAE: 0.03056329779266418


In [14]:
# 7) 모델 비교 (LightGBM를 baseline으로 하게됨)
models = {
    "LinearRegression": LinearRegression(),
    "LightGBM": LGBMRegressor(random_state=42),
}

if XGBRegressor is not None:
    models["XGBoost"] = XGBRegressor(random_state=42)

print("=== Model Comparison (Hold-out MAE) ===")
for name, model in models.items():
    model.fit(X_train, y_train)
    p = model.predict(X_test)
    print(name, mean_absolute_error(y_test, p))


=== Model Comparison (Hold-out MAE) ===
LinearRegression 0.02640202238589598
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 756
[LightGBM] [Info] Number of data points in the train set: 11326, number of used features: 66
[LightGBM] [Info] Start training from score 1.012096
LightGBM 0.03056329779266418
XGBoost 0.03648877143859863


In [15]:
# 8) KFold CV (진짜 성능 확인)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae_list = []

for tr_idx, va_idx in kf.split(X):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    model = LGBMRegressor(random_state=42)
    model.fit(X_tr, y_tr)
    p = model.predict(X_va)
    mae_list.append(mean_absolute_error(y_va, p))

print("MAE list:", mae_list)
print("CV mean MAE:", float(np.mean(mae_list)))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 758
[LightGBM] [Info] Number of data points in the train set: 11326, number of used features: 66
[LightGBM] [Info] Start training from score 1.013156
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 755
[LightGBM] [Info] Number of data points in the train set: 11326, number of used features: 66
[LightGBM] [Info] Start training from score 1.012714
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins

In [16]:
# 9) 개선해 보기 위해 GridSearch - CV 기준으로 튜닝
param_grid = {
    "n_estimators": [300, 500],
    "learning_rate": [0.05, 0.1],
    "num_leaves": [31, 50],
    "max_depth": [-1, 5],
}

grid = GridSearchCV(
    LGBMRegressor(random_state=42),
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X, y)

print("Best Params:", grid.best_params_)
print("Best CV MAE:", -grid.best_score_)
best_model = grid.best_estimator_


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000417 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 14158, number of used features: 67
[LightGBM] [Info] Start training from score 1.012290
Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300, 'num_leaves': 31}
Best CV MAE: 0.028923755940263345


In [None]:
# 10) 최종 모델 학습 + 저장 (API 서빙용)
import joblib

# 전체 데이터로 최종 학습
best_model.fit(X, y)

joblib.dump(best_model, "models/lgbm_model.pkl")
joblib.dump(list(X.columns), "models/feature_columns.pkl")

print("Saved: lgbm_model.pkl, feature_columns.pkl")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 14158, number of used features: 67
[LightGBM] [Info] Start training from score 1.012290
Saved: lgbm_model.pkl, feature_columns.pkl


## 다음 단계: API 서빙 (FastAPI)

- `lgbm_model.pkl` + `feature_columns.pkl`을 로드해서 `/predict` 엔드포인트를 만들면 됩니다.
- 입력은 **원-핫까지 끝난 feature dict** 형태로 받거나,
  더 실무적으로는 **원본 입력(JSON)** → 서버에서 동일 전처리/인코딩 → 예측 흐름으로 구현합니다.


In [18]:
import joblib
joblib.dump(best_model, "lgbm_model.pkl")
joblib.dump(list(X.columns), "feature_columns.pkl")

['feature_columns.pkl']