<a href="https://colab.research.google.com/github/sgu20191816/jeju_traffic/blob/main/jeju_season(lrc)_3_1206(3_1185).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install haversine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt

import lightgbm as lgb

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold

from haversine import haversine, Unit

In [None]:
# Mount Google Drive at /content/drive; the competition CSV files read later
# in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def get_outlier(df=None, column=None, weight=1.25):
  """Return the index labels of IQR outliers in ``df[column]``.

  A row is an outlier when its value lies strictly below
  ``Q1 - weight * IQR`` or strictly above ``Q3 + weight * IQR``.

  Args:
    df: DataFrame holding the column to inspect.
    column: Name of the column to inspect.
    weight: Multiplier applied to the inter-quartile range.

  Returns:
    A pandas Index with the labels of the outlying rows.
  """
  # Original author's note: start with the columns most correlated with the target.
  series = df[column]
  q1, q3 = np.percentile(series.values, [25, 75])

  margin = (q3 - q1) * weight
  lower_fence = q1 - margin
  upper_fence = q3 + margin

  is_outlier = series.lt(lower_fence) | series.gt(upper_fence)
  return series[is_outlier].index

In [None]:
# CSV -> Parquet conversion.
# Original author's note: using memory-efficient data types greatly reduces
# the file size and makes later work faster.
def csv_to_parquet(csv_path, save_name):
    """Read a CSV file and write it as ``./<save_name>.parquet``.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name (no extension) of the Parquet file, which is
            written relative to the current working directory.
    """
    frame = pd.read_csv(csv_path)
    parquet_path = f'./{save_name}.parquet'
    frame.to_parquet(parquet_path)
    # Drop the only reference and force a collection so the memory held by
    # the frame is released before returning.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [None]:
# Convert both competition CSV files (train first, then test) to Parquet.
data_dir = '/content/drive/MyDrive/DACON/제주도 도로 교통량 예측 AI 경진대회/open (제주 교통)'
for split_name in ('train', 'test'):
    csv_to_parquet(f'{data_dir}/{split_name}.csv', split_name)

train Done.
test Done.


In [None]:
# Load the data.
# NOTE(review): csv_to_parquet writes to './<name>.parquet', so these absolute
# paths only resolve when the working directory is /content.
train = pd.read_parquet('/content/train.parquet')
test = pd.read_parquet('/content/test.parquet')

In [None]:
# Data preprocessing.
# This cell mutates `train` and `test` in place, so running it twice applies
# the outlier removal and the rescaling a second time.

## Inspect the histograms
# Treat date and road as categorical data

# Remove rows whose road_name is '-'
# cst = train[train['road_name'] == '-'].index
# train.drop(cst)

# Remove outliers: drop the train rows whose target is outside the IQR fences
# computed by get_outlier (default weight).
outlier_idx = get_outlier(df=train, column='target')
train.drop(outlier_idx, axis=0, inplace=True)

# Reduce the magnitude (number of digits) of these two columns
train['weight_restricted'] = train['weight_restricted'] * (0.0001)
train['road_rating'] = train['road_rating'] * (0.1)
test['weight_restricted'] = test['weight_restricted'] * (0.0001)
test['road_rating'] = test['road_rating'] * (0.1)

# Label encoding: one encoder per column, fitted on train only.
str_col = ['day_of_week','road_name','start_turn_restricted','end_turn_restricted']
for i in str_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])

    # Labels that occur only in test are appended to the fitted classes so
    # that transform() has a code for them.
    # NOTE(review): this writes to the encoder's fitted classes_ attribute
    # directly - confirm it behaves as intended on the installed sklearn.
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])

# train['month'] = train['month'].astype(str)
# train['base_hour'] = train['base_hour'].astype(str)
# train = pd.get_dummies(train, columns=['month','base_hour'])
# test['month'] = test['month'].astype(str)
# test['base_hour'] = test['base_hour'].astype(str)
# test = pd.get_dummies(test, columns=['month','base_hour'])

# road + latitude, longtitude
#train['road_s_lati'] = train.apply(lambda x : x['road_name']+x['start_latitude'])

print(train.head())

In [None]:
# Show the distinct values present in train['road_name'].
train['road_name'].unique()

In [None]:
# New features
## Distance feature: haversine distance in km between each row's start and end point.
def _segment_km(row):
    """Return the haversine distance (km) between a row's start and end coordinates."""
    start_point = (row['start_latitude'], row['start_longitude'])
    end_point = (row['end_latitude'], row['end_longitude'])
    return haversine(start_point, end_point, unit = 'km')

for frame in (train, test):
    frame['distance'] = frame.apply(_segment_km, axis = 1)

In [None]:
# Re-express every coordinate as (value - origin) * 100, with origin 33 for the
# latitude columns and 126 for the longitude columns, in both train and test.
coord_origins = {
    'start_latitude': 33,
    'end_latitude': 33,
    'start_longitude': 126,
    'end_longitude': 126,
}
for frame in (train, test):
    for coord_col, origin in coord_origins.items():
        frame[coord_col] = (frame[coord_col] - origin) * 100

In [None]:
## Start-point and end-point features: latitude + longitude of each endpoint.
# Plain column arithmetic replaces the row-wise DataFrame.apply(axis=1): it
# yields the same sums without one Python call per row.
for frame in (train, test):
    frame['sLatiLongi'] = frame['start_latitude'] + frame['start_longitude']
    frame['eLatiLongi'] = frame['end_latitude'] + frame['end_longitude']

In [None]:
## weight_restricted + road_type ~ the two are highly correlated
# train['wr_rt'] = train.apply(lambda x: (x['weight_restricted']*0.1) + x['road_type'], axis = 'columns')
# test['wr_rt'] = test.apply(lambda x: (x['weight_restricted']*0.1) + x['road_type'], axis = 'columns')

In [None]:
## maximum_speed_limit and lane_count are highly correlated (author's note):
## combine them as speed limit per lane.
# Column division replaces the row-wise DataFrame.apply(axis=1).
# NOTE(review): pandas division gives inf/NaN for a zero lane_count rather
# than raising - confirm the data has no such rows.
for frame in (train, test):
    frame['msl_lc'] = frame['maximum_speed_limit'] / frame['lane_count']

In [None]:
## Month feature: characters 4-5 of base_date rendered as text, parsed as an int.
# The .str accessor does the same slice as the original row-wise
# DataFrame.apply(axis=1), but for the whole column at once.
for frame in (train, test):
    frame['month'] = frame['base_date'].astype(str).str[4:6].astype(int)

In [None]:
# ## There is no month 8 in the data, so 8 is changed to 7 (July), which is also peak season
# train.loc[train['month'] == 8,'month'] = 7
# test.loc[test['month'] == 8,'month'] = 7

In [None]:
# test.head()

In [None]:
def season(x):
    """Map a month number to an ordinal season code between 1 and 5.

    Codes (group names are the original author's):
        5 - months 7, 8      (expeak)
        4 - months 4, 5, 6   (midpeak_s)
        3 - months 9, 10     (midpeak_h)
        2 - months 1, 2      (unpeak_v)
        1 - anything else; for valid months that is 3, 11 and 12 (unpeak_nv)
    """
    coded_groups = (
        ((7, 8), 5),
        ((4, 5, 6), 4),
        ((9, 10), 3),
        ((1, 2), 2),
    )
    for months, code in coded_groups:
        if x in months:
            return code
    # Fallback shared by months 3, 11, 12 and any unrecognised value.
    return 1

In [None]:
# Derive the season code from the month column for both frames.
for frame in (train, test):
    frame["season"] = frame["month"].apply(season)

In [None]:
from collections import Counter

def print_mode(df, col):
  """Print the most frequent values of ``df[col]`` and the mean of those values.

  Up to ten values are listed, most frequent first, each with its count. The
  final line is the arithmetic mean of the listed values.

  The mean is divided by the number of values actually listed. The previous
  version always divided by 10, which understated the mean whenever the
  column had fewer than ten distinct values.

  Args:
    df: DataFrame (or any mapping of column name to iterable) to inspect.
    col: Name of the column whose values are counted.

  Returns:
    None. Output goes to stdout; nothing is printed for an empty column.
  """
  top_counts = Counter(df[col]).most_common(10)
  if not top_counts:
    # No values, so there is nothing to rank or average.
    return

  for rank, (value, count) in enumerate(top_counts, start=1):
    print(f'{col}의 최빈값 {rank}순위 : {value} & {count}개')

  top_values = [value for value, _ in top_counts]
  print(sum(top_values) / len(top_values))

In [None]:
# For every base_hour present in train (in order of first appearance), print
# the row count followed by the most frequent target values for that hour.
for hour in train['base_hour'].unique():
    hour_rows = train[train['base_hour'] == hour]
    print(f'{hour} 데이터 갯수 : {len(hour_rows)}')
    print_mode(hour_rows, 'target')
    print('=' * 50)

In [None]:
def mode_hour(x):
    """Return the hard-coded per-hour value for hour-of-day ``x``.

    Hours 1-23 have their own entry; any other input (including 0) gets 51.5.
    NOTE(review): the numbers look like they were transcribed from the
    per-hour print_mode report on the target column - confirm the source.
    """
    hour_values = (
        (1, 51.5), (2, 51.5), (3, 52.5), (4, 52.5), (5, 51.5), (6, 50.5),
        (7, 46.4), (8, 48), (9, 41.5), (10, 36.1), (11, 38.1), (12, 35.8),
        (13, 38), (14, 38.1), (15, 36.9), (16, 35.9), (17, 30.6), (18, 36.1),
        (19, 37.1), (20, 39.6), (21, 42.8), (22, 48.7), (23, 49.5),
    )
    for hour, value in hour_values:
        if x == hour:
            return value
    # Default for hour 0 and for anything not listed above.
    return 51.5

In [None]:
# Attach the per-hour lookup value from mode_hour to both frames.
for frame in (train, test):
    frame["modest_hour"] = frame["base_hour"].apply(mode_hour)

In [None]:
# Interaction feature: per-hour value scaled by (encoded day_of_week + 5).
# Column arithmetic replaces the row-wise DataFrame.apply(axis=1): it yields
# the same products without one Python call per row.
for frame in (train, test):
    frame['mod_week_hour'] = frame['modest_hour'] * (frame['day_of_week'] + 5)

In [None]:
# train['mod_maxlmt'] = train.apply(lambda x : abs(x['modest_hour'] - x['maximum_speed_limit']), axis = 1)
# test['mod_maxlmt'] = test.apply(lambda x : abs(x['modest_hour'] - x['maximum_speed_limit']), axis = 1)

In [None]:
# 3.1472  (author's note; presumably the score obtained with this feature - confirm)
# Interaction feature: maximum_speed_limit * road_rating * lane_count.
# Column arithmetic replaces the row-wise DataFrame.apply(axis=1). The
# left-to-right multiplication order of the original is kept.
for frame in (train, test):
    frame['lmt_rat_cnt'] = frame['maximum_speed_limit'] * frame['road_rating'] * frame['lane_count']

In [None]:
# # Jeju Special Self-Governing Province Tourism Association (http://visitjeju.or.kr/web/bbs/bbsList.do?bbsId=TOURSTAT)
# # Uses data for Sep-Dec 2021 and Jan-Aug 2022
# # Aug 2021: COVID-19 social-distancing level 3
# # Sep 2022: 1059157

# def tour(x) :
#     if x == 1 :
#         return np.log1p(1170802)
#     elif x == 2 :
#         return np.log1p(1029503)
#     elif x == 3 :
#         return np.log1p(873086)
#     elif x == 4 :
#         return np.log1p(1178456)
#     elif x == 5 :
#         return np.log1p(1306537)
#     elif x == 6 :
#         return np.log1p(1283470)
#     elif x == 7 :
#         return np.log1p(1263332)
#     elif x == 8 :
#         return np.log1p(1281608)
#     elif x == 9 :
#         return np.log1p(872396)
#     elif x == 10 :
#         return np.log1p(1222094)
#     elif x == 11 :
#         return np.log1p(1204344)
#     else :
#         return np.log1p(1090607)

In [None]:
# train["tourist"] = train["month"].apply(lambda x :tour(x))
# test["tourist"] = test["month"].apply(lambda x : tour(x)*1.3)

In [None]:
# # Jeju Special Self-Governing Province - resident registration population statistics (https://www.jeju.go.kr/open/stats/list/population.htm?year=2021)
# # Uses data for Aug-Dec 2021 and Jan-Jul 2022

# def reside(x) :
#     if x == 1 :
#         return np.log1p(697269)
#     elif x == 2 :
#         return np.log1p(697718)
#     elif x == 3 :
#         return np.log1p(697841)
#     elif x == 4 :
#         return np.log1p(698056)
#     elif x == 5 :
#         return np.log1p(698435)
#     elif x == 6 :
#         return np.log1p(698698)
#     elif x == 7 :
#         return np.log1p(699303)
#     elif x == 8 :
#         return np.log1p(697108)
#     elif x == 9 :
#         return np.log1p(697263)
#     elif x == 10 :
#         return np.log1p(697647)
#     elif x == 11 :
#         return np.log1p(697718)
#     else :
#         return np.log1p(697476)

In [None]:
# train["resident"] = train["month"].apply(lambda x :reside(x))
# test["resident"] = test["month"].apply(lambda x : reside(x)*1.013)

In [None]:
# Separate the regression target from the model inputs.
train_target = train['target']

# Columns removed before training: the target itself plus the columns the
# author chose not to feed to the model.
# Author's note listed 'weight_restricted','road_type' as further candidates.
non_feature_cols = [
    'month', 'road_in_use', 'id', 'base_date', 'target',
    'start_node_name', 'end_node_name',
    'vehicle_restricted', 'height_restricted', 'multi_linked',
]
train_input = train.drop(columns=non_feature_cols)

In [None]:
# Remove from test the same non-feature columns that were removed from train
# (test has no 'target' column to drop).
test = test.drop(columns=[
    'month', 'road_in_use', 'id', 'base_date',
    'start_node_name', 'end_node_name',
    'vehicle_restricted', 'height_restricted', 'multi_linked',
])

In [None]:
# Sanity check: (rows, feature columns) of the training inputs.
train_input.shape

(4698461, 23)

In [None]:
# Sanity check: the column count should match train_input.
test.shape

(291241, 23)

In [None]:
# Hold-out split: 70 % train / 30 % held out, with a fixed seed.
# NOTE(review): X_train/X_test/y_train/y_test are not referenced by the K-fold
# training cell visible in this notebook - confirm this split is still needed.
X_train, X_test, y_train, y_test = train_test_split(train_input, train_target, test_size=0.3, random_state=0)

In [None]:
# Scaling
# ss = StandardScaler()
# X_train_sc = ss.fit_transform(X_train)
# X_val_sc = ss.transform(X_test)

In [None]:
# LightGBM with K-fold cross-validation.
# Author's experiment notes, kept verbatim:
#   3.11473(season{1.5}) // {1.25} // 3.0602{0.75} // 3.116553(season+mod_maxlmt) // 3.026216(month)
X = train_input
y = train_target

# Single source of truth for the fold count. It sets both the number of splits
# and the weight of each fold's test prediction, so y_pred stays a true mean
# of the fold models if the count is changed. It was hard-coded as 3 twice.
n_splits = 3
k_fold = KFold(n_splits = n_splits, shuffle=True, random_state = 10)
y_pred = np.zeros(len(test))

for train_idx, val_idx in k_fold.split(X):
  x_t = X.iloc[train_idx]
  y_t = y.iloc[train_idx]
  x_val = X.iloc[val_idx]
  y_val = y.iloc[val_idx]

  # NOTE(review): LightGBM documents min_child_samples as an alias of
  # min_data_in_leaf, so the two values below conflict - confirm which one
  # is intended and drop the other.
  # NOTE(review): per the LightGBM docs, subsample (bagging_fraction) only
  # takes effect when subsample_freq / bagging_freq is non-zero. It is not
  # set here.
  lgbm = lgb.LGBMRegressor(n_estimators = 8000,
                           learning_rate = 0.0345,
                           max_depth = 36,
                           num_leaves = 7870,
                           min_data_in_leaf = 10,
                           min_child_samples = 220,
                           subsample = 0.97,
                           metric = 'mae',
                           objective = 'regression')
  # Stop when validation MAE has not improved for 25 rounds; log every 50.
  # NOTE(review): early_stopping_rounds / verbose as fit() keyword arguments
  # were removed in LightGBM 4.0 (callbacks replace them) - confirm the
  # installed version.
  lgbm.fit(x_t, y_t, eval_set=[(x_val, y_val)], eval_metric= 'mae', early_stopping_rounds= 25,verbose = 50)
  # Accumulate this fold's share of the averaged test prediction.
  y_pred += lgbm.predict(test) / n_splits

In [None]:
# Visualise the feature importances.
# `lgbm` is whatever model the training loop assigned last, so these
# importances describe the final fold's model only.
import plotly.express as px

feature_importance = pd.DataFrame({
    'feature': train_input.columns,
    'importance': lgbm.feature_importances_
}).sort_values('importance')
print(feature_importance)

# Bar chart, least important feature first.
fig = px.bar(feature_importance, x='feature', y='importance')
fig.show()

> Predictions for submission

In [None]:
# preds = []
# for model in models:
#   preds.append(model.predict(test))
# preds = pd.DataFrame(preds)
# ans = preds.mean()

> Submission

In [None]:
# Fill the sample submission's target column with the fold-averaged
# predictions, rounded to whole numbers, and save it as ./submit.csv
# without the index column.
sample_submission = pd.read_csv('/content/drive/MyDrive/DACON/제주도 도로 교통량 예측 AI 경진대회/open (제주 교통)/sample_submission.csv')
sample_submission['target'] = y_pred.round(0)
sample_submission.to_csv("./submit.csv", index = False)

In [None]:
# Display the submission frame that was just written.
sample_submission