In [1]:
# !pip install dacon_submit_api-0.0.4-py3-none-any.whl

In [1]:
# from pycaret.time_series import *

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import category_encoders as ce

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import matplotlib
import seaborn as sns
import plotly.express as px
%matplotlib inline
matplotlib.rcParams['font.family'] = 'Malgun Gothic' # 한글 패치
# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
# from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRegressor, XGBRFRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, VotingRegressor 
from sklearn.ensemble import StackingClassifier, StackingRegressor
# from sklearn.base import ClassifierMixin

# CatBoost
from catboost import CatBoostRegressor

# PyTorch
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import Parameter
from torch import Tensor
from torch.utils.data import DataLoader

# for Torch hyper parameter tuning
from functools import partial
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss,mean_squared_error

# Utility
import os
import time
import datetime # ⚠️2019년 12월30일과 31일의 week of year가 1인 오류가 있음
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
import holidays

# from bayes_opt import BayesianOptimization
# from num2words import num2words
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

In [2]:
import os
import random
pd.set_option('display.max_columns', None)

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

import warnings
warnings.filterwarnings('ignore')

path = '../data/daegu/'

train_org = pd.read_csv(path+'train.csv')
test_org = pd.read_csv(path+'test.csv')
sample = pd.read_csv(path+'sample_submission.csv')

countrywide = pd.read_csv(path+'/external_open/countrywide_accident.csv')

In [3]:
light_df = pd.read_csv(path+'/external_open/대구 보안등 정보.csv', encoding='cp949')[['설치개수', '소재지지번주소']]
light_df.rename(columns={'설치개수':'보안등개수'},inplace=True)
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
light_df[['도시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)
light_df = light_df.drop(columns=['소재지지번주소', '번지'])
light_df = light_df.groupby(['도시', '구', '동']).sum().reset_index()
light_df.reset_index(inplace=True, drop=True)

child_area_df = pd.read_csv(path + 'external_open/대구 어린이 보호 구역 정보.csv', encoding='cp949').drop_duplicates()
child_area_df['어린이보호구역개수'] = 1
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
child_area_df[['도시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])
child_area_df = child_area_df.groupby(['도시', '구', '동']).sum().reset_index()
child_area_df = child_area_df[['도시','구','동','어린이보호구역개수','CCTV설치대수']]
child_area_df.reset_index(inplace=True, drop=True)

parking_df = pd.read_csv(path+'external_open/대구 주차장 정보.csv', encoding='cp949')[['소재지지번주소', '급지구분']]
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
parking_df[['도시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)
parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])
parking_df = parking_df.groupby(['도시', '구', '동']).sum().reset_index()
parking_df.reset_index(inplace=True, drop=True)


In [4]:
# CCTV
cctv = pd.read_csv(path+'external_open/대구 CCTV 정보.csv',encoding='cp949')
cctv.loc[cctv['소재지도로명주소'] == "대구광역시 중구 종로 17", '소재지지번주소'] = '대구광역시 중구 종로2가'
cctv.loc[cctv['소재지도로명주소'] == "대구광역시 중구 국채보상로 713", '소재지지번주소'] = '대구광역시 중구 동인동4가'
cctv.loc[cctv['소재지도로명주소'] == "대구광역시 달성군 다사읍 세천로 1", '소재지지번주소'] = '대구광역시 달성군 다사읍 세천리 1684-4'
cctv.loc[cctv['소재지도로명주소'] == "대구광역시 달성군 가창면 가창로 1", '소재지지번주소'] = '대구광역시 달성군 가창면 삼산리 산327-9'
cctv.loc[cctv['소재지도로명주소'] == "대구광역시 달성군 논공읍 논공로 818", '소재지지번주소'] = '대구광역시 달성군 논공읍'
cctv.loc[cctv['소재지도로명주소'] == "대구광역시 달성군 논공읍 비슬로 1193", '소재지지번주소'] = '대구광역시 달성군 논공읍 하리'
cctv.loc[cctv['소재지도로명주소'] == "대구광역시 달성군 가창면 헐티로 210", '소재지지번주소'] = '대구광역시 달성군 가창면 정대리'

cctv.loc[cctv['소재지지번주소'] == "대구광역시 중구 남산동912-5", '소재지지번주소'] = '대구광역시 중구 남산동 912-5'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 동구 능성동78-2", '소재지지번주소'] = '대구광역시 동구 능성동 78-2'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 달서구 두류1.2동 1227-19", '소재지지번주소'] = '대구광역시 달서구 두류동'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 서구 비산2.3동 60-3", '소재지지번주소'] = '대구광역시 서구 비산동 60-3'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 중구 서성로1가41-3", '소재지지번주소'] = '대구광역시 중구 서성로1가 41-3'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 달성군 현풍면 신기리 85-10", '소재지지번주소'] = '대구광역시 달성군 현풍읍 신기리 85-10'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 달성군 북리 490-16", '소재지지번주소'] = '대구광역시 달성군 논공읍 논공로 818'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 서구 비산4동 169-1", '소재지지번주소'] = '대구광역시 서구 비산동 169-1'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 동구 신암4동 149-34", '소재지지번주소'] = '대구광역시 동구 신암동 149-34'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 동구 신암1동 592-2", '소재지지번주소'] = '대구광역시 동구 신암동 592-2'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 달성군 용계리 산86-11", '소재지지번주소'] = '대구광역시 달성군 가창면'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 서구 원대동 170", '소재지지번주소'] = '대구광역시 서구 원대동1가 170'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 달성군 유가면 한정리 3-4", '소재지지번주소'] = '대구광역시 달성군 유가읍 한정리 3-4'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 북구 침산2동 333-3", '소재지지번주소'] = '대구광역시 북구 침산동 333-3'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 서구 평리4동 1371-1", '소재지지번주소'] = '대구광역시 서구 평리동 1371-1'
cctv.loc[cctv['소재지지번주소'] == "대구광역시 달성군 매곡리 1125", '소재지지번주소'] = '대구광역시 달성군 다사읍 매곡리 1125'

cctv['도시'] = cctv.소재지지번주소.apply(lambda x : str(x).split(' ')[0])
cctv['구'] = cctv.소재지지번주소.apply(lambda x : str(x).split(' ')[1])
cctv['동'] = cctv.소재지지번주소.apply(lambda x : str(x).split(' ')[2])
cctv = cctv.loc[cctv['구'] != '군위군']
cctv = cctv.loc[cctv['동'] != '신안동']

# data = pd.concat([cctv[['구', '동']], pd.get_dummies(cctv['도로노선방향'], prefix = '방향')], axis = 1).groupby(['구', '동']).sum().reset_index()
# X = data.drop(['구', '동'], axis=1)
# X = X.applymap(lambda x : x*10).applymap(np.log)
# X = X.replace([np.inf, -np.inf, np.nan], 0)
# df = pd.concat([data[['구', '동']], X], axis = 1)

# train_df = train_df.merge(df, on = ['구', '동'], how = 'left').fillna(0)
# test_df = test_df.merge(df, on = ['구', '동'], how = 'left').fillna(0)
# data = pd.concat([cctv[['구', '동']], pd.get_dummies(cctv['도로종류'], prefix = '도로종류')], axis = 1).groupby(['구', '동']).sum().reset_index()
# train_df = train_df.merge(data, on = ['구', '동'], how = 'left').fillna(0)
# test_df = test_df.merge(data, on = ['구', '동'], how = 'left').fillna(0)

speed_cctv = cctv[cctv['제한속도'] > 0].groupby(['동'])[['제한속도']].mean()
speed_cctv  = speed_cctv.reset_index()


In [5]:
train_df = train_org.copy()
test_df = test_org.copy()

time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})'

train_df[['연', '월', '일', '시간']] = train_org['사고일시'].str.extract(time_pattern)
train_df[['연', '월', '일', '시간']] = train_df[['연', '월', '일', '시간']].apply(pd.to_numeric) # 추출된 문자열을 수치화해줍니다
# train_df = train_df.drop(columns=['사고일시']) # 정보 추출이 완료된 '사고일시' 컬럼은 제거합니다

# 해당 과정을 test_x에 대해서도 반복해줍니다
test_df[['연', '월', '일', '시간']] = test_org['사고일시'].str.extract(time_pattern)
test_df[['연', '월', '일', '시간']] = test_df[['연', '월', '일', '시간']].apply(pd.to_numeric)
# test_df = test_df.drop(columns=['사고일시'])

# pd.to_datetime
train_df['사고일시'] = pd.to_datetime(train_df['사고일시'])
test_df['사고일시'] = pd.to_datetime(test_df['사고일시'])

location_pattern = r'(\S+) (\S+) (\S+)'

train_df[['도시', '구', '동']] = train_org['시군구'].str.extract(location_pattern)
# train_df = train_df.drop(columns=['시군구'])

test_df[['도시', '구', '동']] = test_org['시군구'].str.extract(location_pattern)
# test_df = test_df.drop(columns=['시군구'])

In [6]:
# train_df와 test_df에, light_df와 child_area_df, parking_df를 merge하세요.
train_df = pd.merge(train_df, light_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, child_area_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, parking_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, speed_cctv, how='left', on=[ '동'])

test_df = pd.merge(test_df, light_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, child_area_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, parking_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, speed_cctv, how='left', on=[ '동'])


In [7]:
# delete unnecessary objects
del light_df
del child_area_df
del cctv
del speed_cctv
del parking_df

In [8]:
train_df.shape, test_df.shape

((39609, 37), (10963, 22))

In [9]:
train_df

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,가해운전자 연령,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO,연,월,일,시간,도시,구,동,보안등개수,어린이보호구역개수,CCTV설치대수,급지구분_1,급지구분_2,급지구분_3,제한속도
0,ACCIDENT_00000,2019-01-01 00:00:00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,승용,여,51세,상해없음,보행자,여,70세,중상,0,1,0,0,5,2019,1,1,0,대구광역시,중구,대신동,391.0,2.0,13.0,11.0,0.0,0.0,40.000000
1,ACCIDENT_00001,2019-01-01 00:00:00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,승용,남,39세,상해없음,보행자,남,61세,경상,0,0,1,0,3,2019,1,1,0,대구광역시,달서구,감삼동,932.0,,,0.0,1.0,3.0,52.500000
2,ACCIDENT_00002,2019-01-01 01:00:00,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,승용,남,70세,상해없음,보행자,남,38세,경상,0,0,1,0,3,2019,1,1,1,대구광역시,수성구,두산동,473.0,5.0,0.0,,,,55.000000
3,ACCIDENT_00003,2019-01-01 02:00:00,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,승용,남,49세,상해없음,승용,남,36세,중상,0,1,0,0,5,2019,1,1,2,대구광역시,북구,복현동,534.0,11.0,32.0,0.0,9.0,5.0,38.000000
4,ACCIDENT_00004,2019-01-01 04:00:00,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,승용,남,30세,상해없음,승용,남,52세,경상,0,0,1,0,3,2019,1,1,4,대구광역시,동구,신암동,2057.0,,,0.0,1.0,0.0,41.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,2021-12-31 19:00:00,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,측면충돌,신호위반,승용,여,52세,상해없음,이륜,남,28세,경상,0,0,1,0,3,2021,12,31,19,대구광역시,수성구,수성동3가,,1.0,0.0,,,,40.000000
39605,ACCIDENT_39605,2021-12-31 19:00:00,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,측면충돌,안전거리미확보,승용,여,60세,상해없음,승용,남,52세,경상,0,0,1,0,3,2021,12,31,19,대구광역시,달서구,상인동,843.0,,,0.0,0.0,5.0,44.444444
39606,ACCIDENT_39606,2021-12-31 21:00:00,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,측면충돌,교차로운행방법위반,승용,남,60세,중상,승용,남,73세,중상,0,2,0,0,10,2021,12,31,21,대구광역시,달서구,월성동,164.0,,,0.0,1.0,0.0,41.111111
39607,ACCIDENT_39607,2021-12-31 22:00:00,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,추돌,안전운전불이행,승용,남,40세,상해없음,승용,여,57세,경상,0,0,1,0,3,2021,12,31,22,대구광역시,달서구,장동,210.0,,,0.0,0.0,1.0,60.000000


In [9]:
# train_df['도로형태_1'] = train_df['도로형태'].str.split(' - ', expand=True)[0]
# train_df['도로형태_2'] = train_df['도로형태'].str.split(' - ', expand=True)[1]

# test_df['도로형태_1'] = test_df['도로형태'].str.split(' - ', expand=True)[0]
# test_df['도로형태_2'] = test_df['도로형태'].str.split(' - ', expand=True)[1]

# train_df.columns

In [10]:
# ############ 공간 데이터 추가
# timeline = {5:'새벽',6:"새벽",7:"출퇴근",8:"출퇴근",18:"출퇴근",19:"출퇴근",20:"출퇴근",
#             9:"주간",10:"주간",11:"주간",12:"주간",13:"주간",14:"주간",15:"주간",16:"주간",
#             17:"주간",21:"심야",22:"심야",23:"심야",0:"심야",1:"심야",2:"심야",3:"심야",4:"심야"}

season = {3:"봄", 4:"봄",5:"봄",
          6:"여름",7:"여름",8:"여름",
          9:"가을",10:"가을",11:"가을",
          1:"겨울",2:"겨울",12:"겨울"}
# train_df['사고시간대'] = train_df['시간'].map(timeline)
train_df['계절'] = train_df["월"].map(season)
# test_df['사고시간대'] = test_df['시간'].map(timeline)
test_df['계절'] = test_df["월"].map(season)

In [23]:
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Polygon, MultiPolygon, Point
import matplotlib.pyplot as plt

# 횡단보도
# cross_walk = gpd.read_file(path+'A0043325.shp', encoding='cp949')#shx 데이터도 같이 있어야함
# cross_walk.to_crs(epsg = 4326, inplace = True)
# cross_walk.head()

In [12]:
# death_2019 = pd.read_csv(path+'도로교통공단_사망교통사고정보(2019).csv',encoding = 'cp949')
# death_2020 = pd.read_csv(path+'도로교통공단_사망교통사고정보(2020).csv',encoding = 'cp949')
# death_2021 = pd.read_csv(path+'사망교통사고정보(2021).csv',encoding = 'cp949')

# death_2019_deagu=death_2019.loc[death_2019['발생지시도']=="대구"].reset_index()
# death_2020_deagu=death_2020.loc[death_2020['발생지시도']=="대구"].reset_index()
# death_2021_deagu=death_2021.loc[death_2021['발생지시도']=="대구"].reset_index()

# a=pd.concat([death_2019_deagu,death_2020_deagu])
# death_train = pd.concat([a,death_2021_deagu]).reset_index()

# geometry = [Point(lon, lat) for lon, lat in zip(death_train['경도'], death_train['위도'])]
# death_gdf = gpd.GeoDataFrame(death_train, geometry=geometry, crs='EPSG:4326')

# death_gdf.head()

In [13]:
# F1 = path+'대구광역시_가로망주정차현황_주차운영및이용실태정보(SHP)_20211210/'+'대구_가로망주정차.shp'
# D1 = gpd.read_file(F1, encoding='cp949')
# D1.to_crs(epsg = 4326, inplace = True)

# D1.head(3)

In [14]:
import folium
m = folium.Map(location= [35.8714354,128.601445],
               zoom_start = 14)

In [15]:
######## 완료된 코드
# for _ , r in cross_walk.iterrows():
#     # Without simplifying the representation of each borough,
#     # the map might not be displayed
#     sim_geo = gpd.GeoSeries(r["geometry"]).simplify(tolerance=0.001)
#     geo_j = sim_geo.to_json()
#     geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "orange"})
# #     folium.Popup(r["BoroName"]).add_to(geo_j)
#     geo_j.add_to(m)

In [16]:
######## 완료된 코드
# m

In [17]:
######## 완료된 코드
# coords = list(cross_walk.geometry[0].exterior.coords)[0]
# coords

In [12]:
import requests

# # NCP 콘솔에서 복사한 클라이언트ID와 클라이언트Secret 값
# client_id = "y9ig6cz3sg"
# client_secret = "1YHOy21rqo1f1gmJkxkUGD8jTDmIZ7Z81IdtLfcY"

# # 좌표 (경도, 위도)
# coords = f"{coords[0]},{coords[1]}"
# output = "json"
# orders = 'addr'
# endpoint = "https://naveropenapi.apigw.ntruss.com/map-reversegeocode/v2/gc"
# url = f"{endpoint}?coords={coords}&output={output}&orders={orders}"

# # 헤더
# headers = {
#     "X-NCP-APIGW-API-KEY-ID": client_id,
#     "X-NCP-APIGW-API-KEY": client_secret,
# }

# # 요청
# res = requests.get(url, headers=headers)
# # res.json()

In [182]:
# j = res.json()
# j

In [13]:
#### 횡단보도 동별로 개수 구하기
def get_sigudong_of_cross_walk(row):
    polygon_obj = row['geometry']
    latlong = list(polygon_obj.exterior.coords)[0]
    
    # NCP 콘솔에서 복사한 클라이언트ID와 클라이언트Secret 값
    client_id = "y9ig6cz3sg"
    client_secret = "1YHOy21rqo1f1gmJkxkUGD8jTDmIZ7Z81IdtLfcY"

    # 좌표 (경도, 위도)
    coords = f"{latlong[0]},{latlong[1]}"
    output = "json"
    orders = 'addr'
    endpoint = "https://naveropenapi.apigw.ntruss.com/map-reversegeocode/v2/gc"
    url = f"{endpoint}?coords={coords}&output={output}&orders={orders}"

    # 헤더
    headers = {
        "X-NCP-APIGW-API-KEY-ID": client_id,
        "X-NCP-APIGW-API-KEY": client_secret,
    }

    # 요청
    res = requests.get(url, headers=headers)
    j = res.json()
    
    if j['status']['name'] == 'ok':
        si = j['results'][0]['region']['area1']['name']
        gu = j['results'][0]['region']['area2']['name']
        dong = j['results'][0]['region']['area3']['name']
        
    else:
        si = ''
        gu = ''
        dong = ''
    
    row[['도시','구','동']] = [si,gu,dong]
    return row
    

In [90]:
######## 완료된 코드
# cross_walk[['도시','구','동']]=''

# cross_walk = cross_walk.apply(get_sigudong_of_cross_walk,axis=1)


In [14]:
cross_walk = pd.read_csv('../data/daegu/reverse_geocoding/cross_walk.csv',encoding='cp949')
cross_walk.head(2)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,g2_id,uqftid_id,cnstc_se,cnstc_de,geometry,도시,구,동,cross_walk_area,횡단보도개수
0,0,0,0,1,,STS001,20211025,POLYGON ((128.54589292024264 35.86499997997502...,대구광역시,서구,중리동,2.25461e-09,126
1,1,1,1,2,,STS001,20211025,POLYGON ((128.54621118455825 35.86499998387293...,대구광역시,서구,중리동,3.020793e-09,126


In [36]:
cw = pd.DataFrame(cross_walk.groupby('동')['횡단보도개수'].sum())
cw

Unnamed: 0_level_0,횡단보도개수
동,Unnamed: 1_level_1
가창면,11664
가천동,400
각산동,45369
갈산동,6084
감삼동,10000
...,...
호산동,57600
화원읍,104976
화전동,1
황금동,8100


In [211]:
### 완료된 코드
# cross_walk_count = pd.DataFrame(data=cross_walk.groupby('동').count()['도시']).reset_index()
# cross_walk_count.rename(columns={'도시':'횡단보도개수'},inplace=True)

# def get_crosswalk_count(dong):
#     count=cross_walk_count[cross_walk_count['동']==dong]['횡단보도개수']
#     if len(count)>0:
#         count = count.values[0]
#     else:
#         count = 0
#     return count
    
# cross_walk['횡단보도개수'] = cross_walk['동'].map(get_crosswalk_count)

# cross_walk.head(3)

In [116]:
### 완료된 코드
# cross_walk['cross_walk_area'] = cross_walk['geometry'].area
# cross_walk = cross_walk.drop(index=cross_walk['도시']!='대구광역시')
# cross_walk.head(3)

In [160]:
### 완료된 코드
# cross_walk.to_csv('../data/daegu/reverse_geocoding/cross_walk.csv',encoding='cp949')

In [15]:
### 동별, 시간대별 대중교통 통행량
traffic = pd.read_csv(path+'동별_시간별_통행량.csv',encoding='utf-8')
traffic.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,구,동,시간,통행량
0,0,0,남구,대명동,0,492
1,1,1,남구,대명동,1,402


In [37]:
#### 📢 외부 데이터 Merge
traffic = traffic[['구','동','시간','통행량']]

###### 동별, 시간대별 대중교통 교통 통행량 
train_df_merged = pd.merge(train_df,traffic,how='left',on=['구','동','시간'])

###### 횡단보도 개수 Merge 
# 개수로 merge
train_df_merged = pd.merge(train_df_merged, cw, how='left', on=['동'])

# 면적 밀도로 merge (횡단보도 area / 동 area)
# (나중에 동별 면적 구하면 다시)

#######



####### test 에도 Merge
test_df_merged = pd.merge(test_df,traffic,how='left',on=['구','동','시간'])

test_df_merged = pd.merge(test_df_merged, cw, how='left', on=['동'])



train_df_merged.head(3)

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,가해운전자 연령,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO,연,월,일,시간,도시,구,동,보안등개수,어린이보호구역개수,CCTV설치대수,급지구분_1,급지구분_2,급지구분_3,제한속도,계절,통행량,횡단보도개수
0,ACCIDENT_00000,2019-01-01 00:00:00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,승용,여,51세,상해없음,보행자,여,70세,중상,0,1,0,0,5,2019,1,1,0,대구광역시,중구,대신동,391.0,2.0,13.0,11.0,0.0,0.0,40.0,겨울,40.0,5929.0
1,ACCIDENT_00001,2019-01-01 00:00:00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,승용,남,39세,상해없음,보행자,남,61세,경상,0,0,1,0,3,2019,1,1,0,대구광역시,달서구,감삼동,932.0,,,0.0,1.0,3.0,52.5,겨울,214.0,10000.0
2,ACCIDENT_00002,2019-01-01 01:00:00,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,승용,남,70세,상해없음,보행자,남,38세,경상,0,0,1,0,3,2019,1,1,1,대구광역시,수성구,두산동,473.0,5.0,0.0,,,,55.0,겨울,84.0,14641.0


In [38]:
train_df_merged.shape

(39609, 40)

In [29]:
# # delete unnecessary object
# del cross_walk
# del traffic

In [94]:
########### 가로망주정차현황_노상주차면정보
# 노상주차
external = path+'external_open/대구 빅데이터 마트 데이터/'
nosang1 = gpd.read_file(external+
    '6. 건축물/3. 대구광역시_가로망주정차현황_노상주차면정보/노상주차_100.shp',)
nosang2 = gpd.read_file(external+
    '6. 건축물/3. 대구광역시_가로망주정차현황_노상주차면정보/노상주차_300.shp',)
nosang3 = gpd.read_file(external+
    '6. 건축물/3. 대구광역시_가로망주정차현황_노상주차면정보/노상주차_500.shp',)

In [95]:
nosang = pd.concat([nosang1,nosang2,nosang3],axis=0)
nosang = nosang.to_crs(epsg='4326') # crs 정보 위경도 좌표계로 변경
nosang.set_index('id',inplace=True)
nosang.sort_index(inplace=True)
nosang.reset_index(inplace=True)
nosang.head(2)

Unnamed: 0,id,count,PRK_LVL_co,PRK_LVL__1,PRK_LVL__2,PRK_PAY_YN,PRK_PAY__1,PRK_PAY__2,geometry
0,69,,,,,,,,"POLYGON ((128.35675 35.70806, 128.35710 35.708..."
1,70,,,,,,,,"POLYGON ((128.35180 35.70402, 128.35230 35.704..."


In [None]:
nosang = pd.read_csv(path+'reverse_geocoding/nosang.csv',encoding='cp949')
nosang.head(3)


## Insight 반영

In [41]:
##################################### 🔻피처 추가
import holidays
def make_holi(x):
    kr_holi = holidays.KR()
    if x in kr_holi:
        # 공휴일vs비공휴일 eclo가 큰 차이를 보이지 않기 때문에 0.5만
        return 0.5
    else:
        return 0

for df in [train_df_merged,test_df_merged]:
    print('?1')
    # 제주감귤 공휴일 추출 코드 참고
    # 공휴일 변수 추가
#     df["공휴일"] = df["사고일시"].map(lambda x : make_holi(x))
    print('?2')
    # 주말 컬럼 추가
    # 주말vs평일 eclo가 큰 차이를 보이지 않기 때문에 0.5만
    df['주말'] = df['요일'].map(lambda x:0.5 if x in ['토요일','일요일'] else 0)
    print('?3')
    # 연평균 eclo의 감소 추세 반영
    # 일단 대충 넣어두기
    years = dict(zip([2019,2020,2021,2022,2023],[4.8,4.7,4.6,4.5,4.4]))
    df['연평균추세']=df['연'].map(lambda x:years.get(x))
    print('?4')
    # 시야감소 컬럼 추가 (시간이 20시~04시 사이인 경우)
    # 이 시간대와 이 시간대 외의 eclo가 0.5~1.4 차이가 나므로 1값을 준다.
    df['시야지수'] = df['시간'].map(lambda x:1 if (x>=20)|(x<=4) else 0)
    print('?5')
    # 주차장 급지구분 일단 주차장 지수로 계산해서 컬럼 수를 줄임
    # 주차장지수 = (1급지 * 4) + (2급지 * 2) + (3급지 * 1)
    df[['급지구분_1','급지구분_2','급지구분_3']] = df[['급지구분_1','급지구분_2','급지구분_3']].fillna(0)
    df['주차장지수'] = df['급지구분_1']*4 + df['급지구분_2']*2 + df['급지구분_3']
    print('?6')



?1
?2
?3
?4
?5
?6
?1
?2
?3
?4
?5
?6


In [42]:
# 📢📢📢 모든 데이터 준비가 완료되면 이 코드 실행
train_df = train_df_merged
test_df = test_df_merged

## train & pred

In [59]:
### 불필요 컬럼 drop
train_df_ = train_df #.drop(columns=['ID','사고일시'])
test_df_ = test_df.drop(columns=['사고일시','ID','시군구','도시','급지구분_1','급지구분_2',
         '급지구분_3'#,'cross_walk_area'
        ]) 
# 어차피 다 대구광역시라 도시 불필요

test_x = test_df_.copy()

train_x = train_df_[test_x.columns].copy()
# train_y = train_df_['ECLO'].copy()
train_y = np.log1p(train_df_['ECLO'].values).copy() # 타겟 log 변환

In [60]:
categorical_features = list(train_x.dtypes[train_x.dtypes == "object"].index)
numerical_features = list(train_x.dtypes[train_x.dtypes != "object"].index)
# 추출된 문자열 변수 확인
display(categorical_features)

['요일', '기상상태', '도로형태', '노면상태', '사고유형', '구', '동', '계절']

In [53]:
# ohe = OneHotEncoder()


In [61]:
# 📢📢📢 인코딩

# from sklearn.preprocessing import LabelEncoder
from category_encoders.target_encoder import TargetEncoder
# from sklearn.preprocessing import OneHotEncoder

# onehot_cols = ['요일','기상상태','도로형태','노면상태','사고유형','계절']
# target_cols = ['구','동']

# train_x_ = pd.get_dummies(train_x[onehot_cols],dtype=int)
# train_x_ = pd.concat([train_x_,train_x.drop(columns=onehot_cols)], axis=1)
# test_x_ = pd.get_dummies(test_x[onehot_cols],dtype=int)
# test_x_ = pd.concat([test_x_,test_x.drop(columns=onehot_cols)], axis=1)
# ## test에 없는 컬럼 >> 0으로 추가
# test_x_['기상상태_안개'] = 0
# test_x_[['동_내동','동_도남동','동_둔산동',
#         '동_서야동','동_신무동','동_장관동']] = 0


# for c in target_cols:
#     encoder = TargetEncoder()
#     train_x[c] = encoder.fit_transform(train_x[c], train_df['ECLO'])
#     test_x[c] = encoder.transform(test_x[c])

for i in categorical_features:
    encoder = TargetEncoder()
    train_x[i] = encoder.fit_transform(train_x[i], train_df['ECLO'])
    test_x[i] = encoder.transform(test_x[i])

# One-Hot 인코딩용
# train_x = train_x_
# test_x = test_x_
# display(train_x_.head(3))
# display(test_x_.head(3))
# print(train_x_.shape, test_x_.shape)

# Target 인코딩용
display(train_x.head(3))
display(test_x.head(3))
print(train_x.shape, test_x.shape)

Unnamed: 0,요일,기상상태,도로형태,노면상태,사고유형,연,월,일,시간,구,동,보안등개수,어린이보호구역개수,CCTV설치대수,제한속도,계절,통행량,횡단보도개수,주말,연평균추세,시야지수,주차장지수
0,4.627926,4.712888,4.65399,4.712878,3.81765,2019,1,1,0,4.54161,4.282449,391.0,2.0,13.0,40.0,4.659111,40.0,5929.0,0.0,4.8,1,44.0
1,4.627926,4.77915,4.65399,4.712878,3.81765,2019,1,1,0,4.618441,4.738938,932.0,,,52.5,4.659111,214.0,10000.0,0.0,4.8,1,5.0
2,4.627926,4.712888,4.65399,4.712878,3.81765,2019,1,1,1,4.7273,4.842715,473.0,5.0,0.0,55.0,4.659111,84.0,14641.0,0.0,4.8,1,0.0


Unnamed: 0,요일,기상상태,도로형태,노면상태,사고유형,연,월,일,시간,구,동,보안등개수,어린이보호구역개수,CCTV설치대수,제한속도,계절,통행량,횡단보도개수,주말,연평균추세,시야지수,주차장지수
0,4.920811,4.712888,5.006142,4.712878,3.81765,2022,1,1,1,4.7273,4.881657,700.0,5.0,0.0,52.5,4.659111,2.0,9025.0,0.5,4.5,1,0.0
1,4.920811,4.712888,4.65399,4.712878,3.81765,2022,1,1,1,4.7273,4.563008,,10.0,0.0,47.142857,4.659111,38.0,52441.0,0.5,4.5,1,2.0
2,4.920811,4.712888,5.006142,4.712878,4.944597,2022,1,1,4,4.7273,4.945578,,1.0,0.0,40.0,4.659111,1.0,1764.0,0.5,4.5,1,0.0


(39609, 22) (10963, 22)


In [62]:
train_x.fillna(0, inplace=True)
test_x.fillna(0, inplace=True)
print(train_x.shape, test_x.shape)

(39609, 22) (10963, 22)


In [63]:
# 📢📢📢 스케일링
from sklearn.preprocessing import MinMaxScaler

# train_x['CCTV설치대수'] = np.log1p(train_x['CCTV설치대수'].values)
# test_x['CCTV설치대수'] = np.log1p(test_x['CCTV설치대수'].values)

# train_x['제한속도'] = np.log1p(train_x['제한속도'].values)
# test_x['제한속도'] = np.log1p(test_x['제한속도'].values)

# mms = MinMaxScaler()
# train_x['보안등개수'] = mms.fit_transform(np.array(train_x['보안등개수'].values).reshape(-1,1))
# test_x['보안등개수'] = mms.transform(np.array(test_x['보안등개수'].values).reshape(-1,1))

encoders = {}
for col in train_x.columns:
    mms = MinMaxScaler()
    train_x[col] = mms.fit_transform(np.array(train_x[col]).reshape(-1,1))
    test_x[col] = mms.transform(np.array(test_x[col]).reshape(-1,1))
    encoders[col] = mms
    
    

In [64]:
train_x

Unnamed: 0,요일,기상상태,도로형태,노면상태,사고유형,연,월,일,시간,구,동,보안등개수,어린이보호구역개수,CCTV설치대수,제한속도,계절,통행량,횡단보도개수,주말,연평균추세,시야지수,주차장지수
0,0.103513,0.040966,0.330293,0.759902,0.0,0.0,0.0,0.0,0.000000,0.000000,0.206288,0.072717,0.066667,0.122642,0.400000,0.0,0.000015,0.015882,0.0,1.0,1.0,0.453608
1,0.103513,0.320053,0.330293,0.759902,0.0,0.0,0.0,0.0,0.000000,0.144781,0.328357,0.173331,0.000000,0.000000,0.525000,0.0,0.000083,0.026787,0.0,1.0,1.0,0.051546
2,0.103513,0.040966,0.330293,0.759902,0.0,0.0,0.0,0.0,0.043478,0.349916,0.356108,0.087967,0.166667,0.000000,0.550000,0.0,0.000032,0.039218,0.0,1.0,1.0,0.000000
3,0.103513,0.040966,0.330293,0.759902,1.0,0.0,0.0,0.0,0.086957,0.275235,0.186626,0.099312,0.366667,0.301887,0.380000,0.0,0.000000,0.080170,0.0,1.0,1.0,0.237113
4,0.103513,0.040966,0.330293,0.759902,1.0,0.0,0.0,0.0,0.173913,0.655631,0.277590,0.382555,0.000000,0.000000,0.414286,0.0,0.000005,0.156873,0.0,1.0,1.0,0.020619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0.061916,0.040966,0.454309,0.759902,1.0,1.0,1.0,1.0,0.826087,0.349916,0.375015,0.000000,0.033333,0.000000,0.400000,0.0,0.007812,0.007524,0.0,0.0,0.0,0.000000
39605,0.061916,0.040966,0.330293,0.759902,1.0,1.0,1.0,1.0,0.826087,0.144781,0.209367,0.156779,0.000000,0.000000,0.444444,0.0,0.248064,0.269176,0.0,0.0,0.0,0.051546
39606,0.061916,0.040966,0.454309,0.759902,1.0,1.0,1.0,1.0,0.913043,0.144781,0.304271,0.030500,0.000000,0.000000,0.411111,0.0,0.040022,0.213018,0.0,0.0,1.0,0.020619
39607,0.061916,0.040966,0.123209,0.759902,1.0,1.0,1.0,1.0,0.956522,0.144781,0.310458,0.039055,0.000000,0.000000,0.600000,0.0,0.001298,0.021218,0.0,0.0,1.0,0.010309


In [65]:
# GPU device
# GPU 확인
print(torch.cuda.is_available())
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(torch.cuda.get_device_name())
print(torch.cuda.device_count())

True
NVIDIA GeForce RTX 3050 Ti Laptop GPU
1


In [26]:
# # pandas data >> tensor data
# batch_size = 128

# X_train = train_x[:-8000]
# X_val   = train_x[-8000:]
# y_train = train_y[:-8000]
# y_val   = train_y[-8000:]

In [27]:
# # 입력 텐서는 무조건 3차원 이상 텐서로 만든다. (맨 앞 차원은 데이터포인트의 개수를 의미)
# X_train_tensors = Variable(torch.Tensor(X_train.values.astype('float64'))).unsqueeze(1) 
# X_val_tensors = Variable(torch.Tensor(X_val.values.astype('float64'))).unsqueeze(1)
# # 2차원 텐서 또는 1차원 벡터로 만든다.
# y_train_tensors = Variable(torch.Tensor(y_train.astype('float64'))).unsqueeze(1) 
# y_val_tensors = Variable(torch.Tensor(y_val.astype('float64'))).unsqueeze(1)

# # TensorDataset객체에 짝끼리 전달하고, batch_size와 shuffle을 설정하여 DataLoader로 만든다.
# train_dataset = TensorDataset(X_train_tensors, y_train_tensors)
# val_dataset = TensorDataset(X_val_tensors, y_val_tensors)
# train_loader = DataLoader(train_dataset, batch_size=batch_size,shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size,shuffle=True)

# # Test Tensor
# X_test_tensor = Variable(torch.Tensor(test_x.values.astype('float64'))).unsqueeze(1)

## t81_558_class 참고
https://github.com/jeffheaton/app_deep_learning/blob/main/t81_558_class_04_4_batch_norm.ipynb

In [70]:
import copy

# Early stopping (see module 3.4)
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
        elif self.best_loss - val_loss >= self.min_delta:
            self.best_model = copy.deepcopy(model.state_dict())
            self.best_loss = val_loss
            self.counter = 0
            self.status = f"Improvement found, counter reset to {self.counter}"
        else:
            self.counter += 1
            self.status = f"No improvement in the last {self.counter} epochs"
            if self.counter >= self.patience:
                self.status = f"Early stopping triggered after {self.counter} epochs."
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model)
                return True
        return False

class RMSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self,y_pred,y_true):
        rmsle = torch.sqrt(self.mse(torch.log1p(y_pred),torch.log1p(y_true)))
        return rmsle
    
class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.seq = nn.Sequential(nn.Linear(input_dim, 32, bias=True),
                                nn.BatchNorm1d(1),
                                nn.ReLU(),
                                nn.Linear(32,64),
                                nn.BatchNorm1d(1),
                                nn.ReLU(),
                                nn.Linear(64,1))
    def forward(self, x):
        out = self.seq(x)
        return out

In [71]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold

torch.manual_seed(42)

<torch._C.Generator at 0x1509caa37d0>

In [72]:
# prepare tensor data
train_x_tensor = Variable(torch.Tensor(train_x.values.astype('float64'))).unsqueeze(1).to(device)
train_y_tensor = Variable(torch.Tensor(train_y.astype('float64'))).unsqueeze(1).to(device)
test_x_tensor = Variable(torch.Tensor(test_x.values.astype('float64'))).unsqueeze(1).to(device)

In [73]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
patience = 50
fold = 0
for train_idx, val_idx in kf.split(train_x_tensor):
    fold += 1
    print(f'Fold #{fold}')
    x_train, x_val = train_x_tensor[train_idx], train_x_tensor[val_idx]
    y_train, y_val = train_y_tensor[train_idx], train_y_tensor[val_idx]
    
    # DataLoader
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    
    # Create model
    model = Model(train_x_tensor.size(2)).to(device) # = num_features
    
    # Optimizer, criterion
    optimizer = optim.Adam(model.parameters())
    loss_fn = RMSLELoss()
    
    # EarlyStopping
    best_loss = float('inf')
    early_stopping_counter = 0
    
    # Training loop
    EPOCHS = 500
    epoch = 0
    done = False
    es = EarlyStopping(patience=patience)
    
    while not done and epoch<EPOCHS:
        epoch += 1
        model.train() # set as train mode
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            output = model(x_batch)
            loss = loss_fn(output,y_batch)
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval() # set as test mode
        with torch.no_grad():
            val_output = model(x_val)
            val_loss = loss_fn(val_output, y_val)
        if es(model, val_loss):
            done = True
    
    print(f"Epoch: {epoch}/{EPOCHS}, Validation Loss: {val_loss.item()}, {es.status}")

Fold #1
Epoch: 111/500, Validation Loss: 0.17405956983566284, Early stopping triggered after 50 epochs.
Fold #2
Epoch: 174/500, Validation Loss: 0.17597857117652893, Early stopping triggered after 50 epochs.
Fold #3
Epoch: 88/500, Validation Loss: 0.17462781071662903, Early stopping triggered after 50 epochs.
Fold #4
Epoch: 163/500, Validation Loss: 0.1695396453142166, Early stopping triggered after 50 epochs.
Fold #5
Epoch: 118/500, Validation Loss: 0.17274664342403412, Early stopping triggered after 50 epochs.


In [77]:
# Predict
final_pred = model(test_x_tensor)

In [78]:
# check output
final_pred = final_pred.detach().squeeze(1).to('cpu').numpy()
print(final_pred.shape, final_pred[:10])

(10963, 1) [[1.5912888]
 [1.5908034]
 [1.5917732]
 [1.5912142]
 [1.5918236]
 [1.5916592]
 [1.5920973]
 [1.5913432]
 [1.5914123]
 [1.5915413]]


In [80]:
sample['ECLO'] = np.expm1(final_pred)
sample

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,3.910073
1,ACCIDENT_39610,3.907690
2,ACCIDENT_39611,3.912452
3,ACCIDENT_39612,3.909707
4,ACCIDENT_39613,3.912699
...,...,...
10958,ACCIDENT_50567,3.927875
10959,ACCIDENT_50568,3.919285
10960,ACCIDENT_50569,3.916186
10961,ACCIDENT_50570,3.916654


In [81]:
# 저장
sample.to_csv("../data/submits/torch_basic_1203_002.csv", index=False)