In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [None]:
# Load the training data from the local parquet file and preview the first rows.

train = pd.read_parquet(path='./train_pwm.parquet')
train.head(5)

In [None]:
from sklearn.utils import shuffle

# First date of the hold-out period. `base_date` is compared as a number,
# exactly as the original comparison did (presumably YYYYMMDD -- confirm).
# The original test filter used `> 20220631`, a non-existent calendar date;
# `>= 20220701` selects the same rows and reads correctly.
SPLIT_DATE = 20220701

# Fixed seed so the shuffled row order (and therefore the positional CV folds
# built from it later) is reproducible across kernel restarts.
SEED = 42

# Columns removed from the feature matrix; shared by train and test so the two
# matrices cannot drift apart.
DROP_COLS = [
    'day_of_week', 'multi_linked', 'connect_code', 'height_restricted',
    'id', 'base_date', 'target', 'road_name', 'start_node_name',
    'end_node_name', 'vehicle_restricted',
]

# Build the masks on `train` itself. The original indexed a *sorted copy* with a
# mask computed from the unsorted frame, which forces pandas to re-align the
# mask (and warn). Sorting right before shuffling had no effect on the result,
# so it is dropped.
is_train_period = train['base_date'] < SPLIT_DATE
is_test_period = train['base_date'] >= SPLIT_DATE

tr = shuffle(train[is_train_period], random_state=SEED)
te = shuffle(train[is_test_period], random_state=SEED)

y_train = tr['target']
X_train = tr.drop(DROP_COLS, axis=1)

y_test = te['target']
X_test = te.drop(DROP_COLS, axis=1)


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

## 첫 번째 LightGBM 하이퍼 파라미터 찾기 

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV


# Base estimator for the search; seeded to match the seeded models fitted later.
LR = lgb.LGBMRegressor(random_state=42)

# Candidate hyper-parameter values (3 x 3 x 3 x 3 = 81 combinations).
parameters = {
    'max_depth': [10, 15, 20],
    'num_leaves': [150, 250, 300],
    'learning_rate': [0.3, 0.5, 0.6],
    'n_estimators': [100, 200, 500],
}

# Rank candidates by MAE, the metric this notebook reports on the hold-out set.
# Without `scoring`, GridSearchCV uses the estimator's default score (R^2 for
# regressors), so the "best" parameters were chosen by a different metric than
# the one being evaluated. Scores are *negated* MAE: closer to 0 is better.
grid_lr = GridSearchCV(
    LR,
    param_grid=parameters,
    scoring='neg_mean_absolute_error',
    cv=3,
)

grid_lr.fit(X_train, y_train)

# Per-candidate results: mean score, rank and the three per-fold scores.
scores_df = pd.DataFrame(grid_lr.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score',
           'split0_test_score', 'split1_test_score', 'split2_test_score']]

In [None]:
# Parameters used for the first model below (presumably the grid-search result): {'learning_rate': 0.5, 'max_depth': 20, 'num_leaves': 300}

In [None]:
# Fit LightGBM with the first parameter set and evaluate it on the hold-out period.
LR = lgb.LGBMRegressor(
    random_state=42,
    max_depth=20,
    num_leaves=300,
    learning_rate=0.5,
    n_estimators=100,
).fit(X_train, y_train)

pred = LR.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the value is
# unchanged, but the conventional order keeps this safe if the metric is swapped
# for an asymmetric one later.
mae = mean_absolute_error(y_test, pred)
print(mae)
print(LR.feature_importances_)

In [None]:
# Label each importance value from the fitted model with its feature column,
# then display them in ascending order.
importances = pd.Series(data=LR.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=True)

---

## 2번째 LightGBM 하이퍼 파라미터 찾기 (train2 사용: 위도·경도 컬럼 삭제)

In [None]:
# NOTE(review): X_train2 / y_train2 are not defined in the cells visible here.
# Per the section header they presumably come from `train2` with the
# latitude/longitude columns removed -- confirm they are created before this
# cell, otherwise it fails on a fresh kernel.

# Base estimator for the second search; seeded to match the models fitted later.
LR2 = lgb.LGBMRegressor(random_state=42)

# Same candidate grid as the first search (81 combinations).
parameters_2 = {
    'max_depth': [10, 15, 20],
    'num_leaves': [150, 250, 300],
    'learning_rate': [0.3, 0.5, 0.6],
    'n_estimators': [100, 200, 500],
}

# Rank candidates by (negated) MAE so selection uses the same metric as the
# hold-out evaluation; the GridSearchCV default for regressors is R^2.
grid_lr_2 = GridSearchCV(
    LR2,
    param_grid=parameters_2,
    scoring='neg_mean_absolute_error',
    cv=3,
)

grid_lr_2.fit(X_train2, y_train2)

# Per-candidate results: mean score, rank and the three per-fold scores.
scores_df = pd.DataFrame(grid_lr_2.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score',
           'split0_test_score', 'split1_test_score', 'split2_test_score']]

In [None]:
# Second parameter set: learning_rate=0.3, max_depth=10, n_estimators=100, num_leaves=150

In [None]:
# Fit LightGBM on the second feature set using the *first* parameter set
# (max_depth=20, num_leaves=300, learning_rate=0.5) for comparison.
# The original created a default LGBMRegressor first and immediately overwrote
# it; that dead assignment is removed.
LR2 = lgb.LGBMRegressor(
    random_state=42,
    max_depth=20,
    num_leaves=300,
    learning_rate=0.5,
    n_estimators=100,
).fit(X_train2, y_train2)

pred2 = LR2.predict(X_test2)

# scikit-learn metrics take (y_true, y_pred); MAE is symmetric so the value is unchanged.
mae2 = mean_absolute_error(y_test2, pred2)
print(mae2)
print(LR2.feature_importances_)

In [None]:
# Fit LightGBM on the second feature set using the second parameter set
# (max_depth=10, num_leaves=150, learning_rate=0.3).
# Note: this rebinds LR2 / pred2 / mae2 from the previous cell.
LR2 = lgb.LGBMRegressor(
    random_state=42,
    max_depth=10,
    num_leaves=150,
    learning_rate=0.3,
    n_estimators=100,
).fit(X_train2, y_train2)

pred2 = LR2.predict(X_test2)

# scikit-learn metrics take (y_true, y_pred); MAE is symmetric so the value is unchanged.
mae2 = mean_absolute_error(y_test2, pred2)
print(mae2)
print(LR2.feature_importances_)

---

## RandomForest 모델 사용 (첫 번째)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
# Candidate hyper-parameters for the random forest (1 x 4 x 3 x 3 = 36 combinations).
params = {
    'n_estimators': [100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

# Seed the forest: bootstrap sampling is random, so an unseeded search can pick
# different "best" parameters on every run.
rfr = RandomForestRegressor(random_state=42)

# Rank candidates by (negated) MAE so selection uses the same metric as the
# hold-out evaluation; the GridSearchCV default for regressors is R^2.
grid_cv = GridSearchCV(
    rfr,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=2,
    n_jobs=-1,
)
grid_cv.fit(X_train, y_train)
print(f"Best Param: {grid_cv.best_params_}")

In [None]:
# Parameters used below (presumably the best set reported by the grid search above): max_depth=12, min_samples_leaf=8, min_samples_split=20, n_estimators=100

In [None]:
# Fit the random forest with the chosen parameters and evaluate on the hold-out period.
# random_state is fixed so the reported MAE is reproducible (bootstrap sampling
# makes an unseeded forest differ between runs).
rfr = RandomForestRegressor(
    max_depth=12,
    min_samples_leaf=8,
    min_samples_split=20,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
pred = rfr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); MAE is symmetric so the value is unchanged.
mae = mean_absolute_error(y_test, pred)
print(mae)

## RandomForest 2번째 하이퍼파라미터 찾기

In [None]:
# Searching for the second parameter set (random forest on the second feature set).
# NOTE(review): X_train2 / y_train2 are not defined in the cells visible here --
# confirm they are created before this cell runs.
params_2 = {
    'n_estimators': [100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

# Seeded so the search result is reproducible.
rfr_2 = RandomForestRegressor(random_state=42)

# Rank candidates by (negated) MAE so selection uses the same metric as the
# hold-out evaluation; the GridSearchCV default for regressors is R^2.
grid_cv__2 = GridSearchCV(
    rfr_2,
    param_grid=params_2,
    scoring='neg_mean_absolute_error',
    cv=2,
    n_jobs=-1,
)
grid_cv__2.fit(X_train2, y_train2)

# Report the winner, as the first random-forest search does; the original cell
# never surfaced the result of this search.
print(f"Best Param: {grid_cv__2.best_params_}")

In [None]:
# Fit the random forest on the second feature set and evaluate on its hold-out data.
# random_state is fixed so the reported MAE is reproducible.
rfr_2 = RandomForestRegressor(
    max_depth=12,
    min_samples_leaf=8,
    min_samples_split=20,
    n_estimators=100,
    random_state=42,
).fit(X_train2, y_train2)
pred_2 = rfr_2.predict(X_test2)

# scikit-learn metrics take (y_true, y_pred); MAE is symmetric so the value is unchanged.
mae_2 = mean_absolute_error(y_test2, pred_2)

# Bug fix: the original printed `mae` (the stale score of the first forest)
# instead of the score just computed for this model.
print(mae_2)

In [None]:
# Variant of the second-feature-set forest with min_samples_split=18.
# NOTE(review): 18 is not among the min_samples_split values searched above
# ([8, 16, 20]); it only appears in the min_samples_leaf grid. Confirm the
# value is intentional.
# random_state is fixed so the reported MAE is reproducible.
rfr__2 = RandomForestRegressor(
    max_depth=12,
    min_samples_leaf=8,
    min_samples_split=18,
    n_estimators=100,
    random_state=42,
).fit(X_train2, y_train2)
pred__2 = rfr__2.predict(X_test2)

# scikit-learn metrics take (y_true, y_pred); MAE is symmetric so the value is unchanged.
mae__2 = mean_absolute_error(y_test2, pred__2)
print(mae__2)

---

## 최종 발표 후 피드백 진행

### One-hot encoding 

In [None]:
# One-hot encoding: after this preprocessing step, run the data-splitting code below.

# Six-hour buckets of `base_hour`.
dawn = [0, 1, 2, 3, 4, 5]
morning = [6, 7, 8, 9, 10, 11]
daytime = [12, 13, 14, 15, 16, 17]
night = [18, 19, 20, 21, 22, 23]

# hour -> bucket label: 0 = dawn, 1 = morning, 2 = afternoon, 3 = evening.
hour_to_label = {
    hour: label
    for label, hours in enumerate((dawn, morning, daytime, night))
    for hour in hours
}

# Vectorised lookup replaces the row-by-row Python loop. Hours that are not in
# any bucket map to NaN.
mapped_hours = train['base_hour'].map(hour_to_label)

# The original loop silently skipped unknown hours, leaving a list shorter than
# the frame and failing later with an opaque length-mismatch error. Fail here
# instead, naming the offending values.
unmapped = mapped_hours.isna()
if unmapped.any():
    bad_values = train.loc[unmapped, 'base_hour'].unique()[:5].tolist()
    raise ValueError(
        f"base_hour contains values outside 0-23 (examples: {bad_values})"
    )

# Kept as a plain list of ints, as the original built it.
hour_label = mapped_hours.astype(int).tolist()

train['hour_label'] = hour_label

# Preview only the first rows instead of rendering the whole frame.
train.head()

In [None]:
# One-hot encoding (hour) test.
# NOTE(review): this cell uses X_train / X_test as they currently exist in the
# kernel. No visible cell re-runs the train/test split after `hour_label` is
# added to `train`, so on a fresh top-to-bottom run the new column would not be
# part of these matrices -- confirm the split is re-executed first.

# NOTE(review): these three values are not used in this cell; presumably they
# are bookkeeping for a later manual parameter-search loop -- confirm.
min_num = 10
min_stack = 0
stop_num = 3

LR = lgb.LGBMRegressor(
    random_state=42,
    max_depth=19,
    num_leaves=187,
    n_estimators=1000,
    learning_rate=0.3,
).fit(X_train, y_train)

pred = LR.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); MAE is symmetric so the value is unchanged.
mae = mean_absolute_error(y_test, pred)


# If the MAE comes out close to 3.55, I will run another quick parameter test
# with a for loop. Let me know if it does.
print('in_holidays : ', mae)

## One-hot encoding 적용 후 MAE가 약 3.55로 양호하게 나타남.