<a href="https://colab.research.google.com/github/dltpffldk/quest/blob/main/ex02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import missingno as msno
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [3]:
# Folder that holds the competition CSV files.
data_dir = '/content/ex2'

# Locations of the training and test tables inside data_dir.
train_data_path, test_data_path = (
    join(data_dir, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [9]:
# Load the competition tables from the CSV paths configured earlier.
X_train = pd.read_csv(train_data_path)  # (15035, 19) per the original author's note
X_test = pd.read_csv(test_data_path)    # (6468, 19) per the original author's note

# Reduce each date string to its first six characters and store it as an
# integer (presumably a YYYYMM prefix -- confirm against the raw data).
# The vectorised .str accessor replaces a per-row Python lambda.
X_train['date'] = X_train['date'].str[:6].astype(int)
X_test['date'] = X_test['date'].str[:6].astype(int)

# Split the label off the feature table; pop() returns the column and
# removes it from the frame in a single step.
y_target = X_train.pop('price')  # (15035,) per the original author's note

# Remove the id column from both tables so it is not fed to the models.
del X_train['id']
del X_test['id']

# Models are trained on log1p(price); rmse() and the final prediction step
# map values back with expm1.
y_target = np.log1p(y_target)

# Single seed shared by every estimator and by the hold-out split.
random_state = 2023

# One default-configured regressor per algorithm, all seeded identically.
gboost, xgboost, lightgbm, rdforest = (
    estimator_cls(random_state=random_state)
    for estimator_cls in (
        GradientBoostingRegressor,
        XGBRegressor,
        LGBMRegressor,
        RandomForestRegressor,
    )
)

# Candidates compared in the baseline evaluation.
models = [gboost, xgboost, lightgbm, rdforest]

def rmse(y_test, y_pred):
    """Return the root-mean-squared error after undoing the log1p transform.

    Both arguments are passed through ``np.expm1`` before the error is
    computed, so they are expected to be on the log1p scale.

    Args:
        y_test: True target values on the log1p scale.
        y_pred: Predicted target values on the log1p scale.

    Returns:
        Square root of the mean squared error between the back-transformed
        values.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

# Hold-out split used to compare the baseline models. It depends only on the
# data and the fixed seed, so it is computed once instead of once per model.
x_train, x_test, y_train, y_test = train_test_split(
    X_train, y_target, test_size=0.2, random_state=random_state
)

# Maps estimator class name -> hold-out RMSE as computed by rmse().
df = {}
for model in models:
    model_name = model.__class__.__name__

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    df[model_name] = rmse(y_test, y_pred)

# Tabular view of the same scores, largest error first. Built once after the
# loop; the per-iteration rebuilds only ever kept the last value.
score_df = pd.DataFrame(df, index=['RMSE']).T.sort_values('RMSE', ascending=False)

print(df)

# Hyper-parameter values to try: 2 x 2 = 4 candidate combinations.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}

model = LGBMRegressor(random_state=random_state)

# 5-fold grid search scored by negative MSE on the log1p target.
grid_model = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=5,
)
grid_model.fit(X_train, y_target)

# Per-candidate parameter dicts and their mean cross-validated scores.
params = grid_model.cv_results_['params']
score = grid_model.cv_results_['mean_test_score']

# One row per candidate. The score is a negated MSE on the log1p target, so
# sqrt(-score) is labelled RMSLE. Rows are ordered best (lowest) first.
result = (
    pd.DataFrame(params)
    .assign(score=score)
    .assign(RMSLE=lambda frame: np.sqrt(-1 * frame['score']))
    .sort_values('RMSLE')
)
print(result)

# Settings for the final LightGBM model.
final_params = dict(
    max_depth=10,
    n_estimators=100,
    learning_rate=0.1,
    feature_fraction=0.5,
    random_state=random_state,
)

# Fit on the full training table (log1p target).
model = LGBMRegressor(**final_params)
model.fit(X_train, y_target)

# Predict on the test table and undo the log1p transform with expm1.
prediction = np.expm1(model.predict(X_test))
print(prediction)

# Read the submission template and fill its price column with the predictions.
submission_data_path = join(data_dir, 'sample_submission.csv')
submission_data = pd.read_csv(submission_data_path)
submission_data['price'] = prediction

# Write to a separate file so the template is never overwritten: clobbering
# sample_submission.csv destroys the original input of this step.
submission_output_path = join(data_dir, 'submission.csv')
submission_data.to_csv(submission_output_path, index=False)

{'GradientBoostingRegressor': 127924.7314171738, 'XGBRegressor': 120604.4876395353, 'LGBMRegressor': 115213.99759418188, 'RandomForestRegressor': 140575.48248281624}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
   max_depth  n_estimators     score     RMSLE
3         10           100 -0.027027  0.164399
2         10            50 -0.029177  0.170814
1          1           100 -0.055020  0.234564
0          1            50 -0.073394  0.270914
[ 519143.89859889  477670.74786551 1328484.39943923 ...  432906.20840912
  314835.76664976  420600.78981096]
