In [17]:
import pandas as pd
import seaborn as sns
import numpy as np
import lightgbm as lgb
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Both inputs are flattened to 1-D float arrays before the computation so
    that a single-column DataFrame or an ``(n, 1)`` array target can be
    compared element-wise with 1-D predictions. Without this, pandas
    alignment raises for a DataFrame target and NumPy broadcasting silently
    produces an ``(n, n)`` difference matrix for an ``(n, 1)`` array target.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values. Negative predictions are
            floored at 0 before the log transform, because ``log1p`` is NaN
            below -1 and a single such value would turn the score into NaN.

    Returns:
        float: The RMSLE over all elements.

    Raises:
        ValueError: If the flattened inputs do not have the same length.
    """
    true_arr = np.asarray(y_true, dtype=float).ravel()
    pred_arr = np.asarray(y_pred, dtype=float).ravel()

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {true_arr.size} and {pred_arr.size}"
        )

    # Floor predictions at 0 so the log transform is always defined.
    pred_arr = np.clip(pred_arr, 0, None)

    log_diff = np.log1p(pred_arr) - np.log1p(true_arr)
    return float(np.sqrt(np.mean(log_diff ** 2)))


In [13]:
# Load the preprocessed (label-encoded) data.
# The targets are squeezed along the column axis: a single-column CSV becomes a
# 1-D Series, so it can be compared element-wise with 1-D model predictions.
# A target file with more than one column is left as a DataFrame.
X_train = pd.read_csv('TestData/train_data_label_encoded.csv')
y_train = pd.read_csv('TestData/train_target.csv').squeeze('columns')
X_val = pd.read_csv('TestData/val_data_label_encoded.csv')
y_val = pd.read_csv('TestData/val_target.csv').squeeze('columns')

# Drop the columns that are not used for training.
# One shared list keeps the train and validation feature sets in sync, and the
# non-mutating drop(columns=...) form avoids in-place modification of the frames.
UNUSED_COLS = ['id', 'date', 'date_type', 'day']
X_train = X_train.drop(columns=UNUSED_COLS)
X_val = X_val.drop(columns=UNUSED_COLS)

In [None]:
# Create the base LightGBM model.

# LightGBM parameters.
# NOTE: 'rmsle' is not one of LightGBM's documented built-in metric names, so the
# built-in 'rmse' is used here. RMSLE itself is computed outside the model by the
# custom scorer / rmsle() function.
# num_leaves and learning_rate are only defaults: the grid search overrides them
# for every candidate it evaluates.
params = {
    'metric': 'rmse',
    'num_leaves': 255,
    'learning_rate': 0.01,
    'random_state': 10
}

# Build the regressor straight from the parameter dict so the two cannot drift apart.
model = lgb.LGBMRegressor(**params)

In [None]:
# Hyperparameter grid explored by GridSearchCV when tuning the LGBMRegressor.
# 3 values for each of 4 parameters -> 81 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    num_leaves=[31, 63, 127],
    max_depth=[-1, 10, 20],
)


In [18]:
# Build the RMSLE scorer used for model selection.
def _rmsle_nonneg(y_true, y_pred):
    """RMSLE on flattened targets with predictions floored at 0.

    This mirrors the clipping applied in the final validation evaluation below,
    so cross-validation and the reported score measure the same quantity, and a
    negative prediction cannot turn a fold score into NaN.
    """
    return rmsle(np.ravel(y_true), np.clip(np.ravel(y_pred), a_min=0, a_max=None))


# greater_is_better=False: sklearn negates the value so that a higher score is
# always better internally; best_score_ is therefore the NEGATIVE of the RMSLE.
rmsle_scorer = make_scorer(_rmsle_nonneg, greater_is_better=False)

# Create the GridSearchCV object (5-fold CV, all cores).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=rmsle_scorer,
    verbose=1,
    n_jobs=-1,
)

# Run the grid search. The target is flattened to 1-D so that a single-column
# DataFrame target is handed to the estimator and scorer as a plain vector.
grid_search.fit(X_train, np.ravel(y_train))

# Report the best hyperparameters and the corresponding CV error.
# The sign is flipped back so the printed number is the actual (positive) RMSLE.
print("Best Parameters:", grid_search.best_params_)
print("Best CV RMSLE:", -grid_search.best_score_)

# Evaluate the best model (refit on the full training set) on the validation data.
best_model = grid_search.best_estimator_
y_pred = np.clip(best_model.predict(X_val), a_min=0, a_max=None)  # no negative predictions
score = rmsle(np.ravel(y_val), y_pred)
print("RMSLE:", score)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
