# 라이브러리 및 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline

from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import  mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Read the CSV and keep every column except the first one, then preview the result.
df = pd.read_csv("교통사고데이터.csv").iloc[:, 1:]
df.head()

Unnamed: 0,월,요일,발생지_시도,기상상태,하루_교통사고_건수합
0,1,일,강원,맑음,427
1,1,일,강원,맑음,427
2,1,일,서울/경기/인천,흐림,427
3,1,일,경남/경북/부산/울산/대구,맑음,427
4,1,일,전남/전북/광주,맑음,427


# 모델링

In [3]:
# Hold out part of the data as a test set (80% of the rows go to training).
train, test = train_test_split(df, train_size=0.8, random_state=10)

# Name the target column; every remaining column is used as a feature.
Target = '하루_교통사고_건수합'
features = train.columns.drop(Target)

X_train, y_train = train[features], train[Target]
X_test, y_test = test[features], test[Target]

# Report the shape of each split as a quick sanity check.
for _caption, _part in (
    ('X_train shape', X_train),
    ('y_train shape', y_train),
    ('X_test shape', X_test),
    ('y_test shape', y_test),
):
    print(_caption, _part.shape)

X_train shape (530466, 4)
y_train shape (530466,)
X_test shape (132617, 4)
y_test shape (132617,)


In [4]:
# Evaluation metrics for regression models
def model_evaluation(y, y_pred):
    """Build a small table of regression scores.

    Args:
        y: Observed target values.
        y_pred: Predicted values to compare against ``y``.

    Returns:
        A DataFrame with the columns ``Metric`` and ``Score`` and four rows,
        in this order: MSE, MAE, RMSE (the square root of MSE) and R2.
    """
    squared_error = mean_squared_error(y, y_pred)
    scores = {
        'MSE': squared_error,
        'MAE': mean_absolute_error(y, y_pred),
        'RMSE': squared_error ** 0.5,
        'R2': r2_score(y, y_pred),
    }
    rows = [[metric, score] for metric, score in scores.items()]
    return pd.DataFrame(rows, columns=['Metric', 'Score'])

In [5]:
# LinearRegression: OrdinalEncoder followed by a linear regression model.
lr_pipe = make_pipeline(OrdinalEncoder(), LinearRegression())

# Fit on the training split and score the predictions on that same split,
# so the printed MAE is a training error, not a generalisation estimate.
y_pred_lr = lr_pipe.fit(X_train, y_train).predict(X_train)
mae_lr = mean_absolute_error(y_train, y_pred_lr)
print(f'훈련 에러: {mae_lr:.2f}')

훈련 에러: 63.41


In [6]:
# DecisionTreeRegressor: OrdinalEncoder followed by a depth-limited tree.
dt_pipe = make_pipeline(
    OrdinalEncoder(),
    DecisionTreeRegressor(random_state=10, max_depth=20),
)

# Fit on the training split and report the MAE measured on that same split.
y_pred_dt = dt_pipe.fit(X_train, y_train).predict(X_train)
mae_dt = mean_absolute_error(y_train, y_pred_dt)
print(f'훈련 에러: {mae_dt:.2f}')

훈련 에러: 42.65


In [7]:
# RandomForestRegressor: OrdinalEncoder followed by a 50-tree forest.
rf_pipe = make_pipeline(
    OrdinalEncoder(),
    RandomForestRegressor(random_state=10, n_estimators=50, max_depth=15),
)

# Fit on the training split and report the MAE measured on that same split.
y_pred_rf = rf_pipe.fit(X_train, y_train).predict(X_train)
mae_rf = mean_absolute_error(y_train, y_pred_rf)
print(f'훈련 에러: {mae_rf:.2f}')

훈련 에러: 42.66


In [8]:
# DecisionTreeRegressor: score the fitted pipeline on the held-out test set.
y_test_pred_dt = dt_pipe.predict(X_test)
dt_test_df = model_evaluation(y=y_test, y_pred=y_test_pred_dt)
dt_test_df

Unnamed: 0,Metric,Score
0,MSE,3541.833127
1,MAE,42.801242
2,RMSE,59.513302
3,R2,0.556291


In [9]:
# RandomForestRegressor: score the fitted pipeline on the held-out test set.
y_test_pred_rf = rf_pipe.predict(X_test)
rf_test_df = model_evaluation(y=y_test, y_pred=y_test_pred_rf)
rf_test_df

Unnamed: 0,Metric,Score
0,MSE,3539.696666
1,MAE,42.790046
2,RMSE,59.49535
3,R2,0.556559


# 모델 부호화

In [None]:
import pickle

# Serialise the fitted random-forest pipeline to disk with pickle.dump.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
with open('rf_model.pkl', mode='wb') as model_file:
    pickle.dump(rf_pipe, file=model_file)