In [21]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values (expected to be non-negative).
    y_pred : array-like
        Predicted values. Values below 0 are clipped to 0 before the log
        transform, because ``log1p`` is NaN for inputs below -1 and strongly
        negative for inputs in (-1, 0). An unbounded linear model can emit
        such values, which would otherwise make the score NaN or distorted.

    Returns
    -------
    float
        The RMSLE; lower is better.
    """
    # Compare positionally as plain arrays so that a pandas index on one of
    # the inputs cannot trigger label alignment during the subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))


In [33]:
# Load the preprocessed train/validation splits.
# NOTE(review): the "oh_scaled" file names suggest one-hot encoded + scaled
# features rather than label encoding — confirm against the preprocessing step.
# Columns that are not used as model features.
_unused_columns = ['id', 'date', 'date_type', 'day']

# Features: read each split and return a new frame without the unused columns.
X_train = pd.read_csv('TestData/train_data_oh_scaled.csv').drop(columns=_unused_columns)
X_val = pd.read_csv('TestData/val_data_oh_scaled.csv').drop(columns=_unused_columns)

# Targets: the first column of each target file, as a Series.
y_train = pd.read_csv('TestData/train_target.csv').iloc[:, 0]
y_val = pd.read_csv('TestData/val_target.csv').iloc[:, 0]

In [34]:
# Create the Ridge regression model with its default constructor arguments.
model = Ridge()

In [35]:
# Hyperparameter grid that GridSearchCV searches to tune the Ridge model.
# alpha is the regularization strength.
param_grid = dict(alpha=[0.1, 1, 10, 100, 1000])

In [36]:
# Wrap rmsle as a scikit-learn scorer. greater_is_better=False makes the
# scorer return the NEGATED RMSLE, so that "higher is better" holds for the
# search; scores closer to 0 are better.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

# Grid search over param_grid with 5-fold cross-validation (n_jobs=-1: use all
# available processors). During CV the scorer is fed the model's raw
# predictions; no clipping is applied in this cell at that stage.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=rmsle_scorer, verbose=1, n_jobs=-1)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Report the best hyperparameters and their mean CV score.
# best_score_ is negative because of the scorer's sign convention above.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the held-out validation split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
# RMSLE applies log1p, so floor the predictions at 0 first.
y_pred = np.clip(y_pred, a_min=0, a_max=None)

# Hand plain arrays to the metric. The previous isinstance(..., pd.DataFrame)
# checks never matched (y_val is a Series, y_pred an ndarray), so they did
# nothing. y_val itself is left unchanged for any later cells.
score = rmsle(np.asarray(y_val), np.asarray(y_pred))
print("RMSLE:", score)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'alpha': 1}
Best Score: -2.8047628592159053
RMSLE: 2.115593237960575
