# KernelRidge, GradientBoost, XGBoost, LGBM
- 참고 : https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard#Modelling

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

#한글폰트 설정
import matplotlib.font_manager as fm
path = 'C:\\Users\\myksh\\AppData\\Local\\Microsoft\\Windows\\Fonts\\NanumGothic.ttf'
# path = 'C:\\Users\\myksh\\AppData\\Local\\Microsoft\\Windows\\Fonts\\NanumSquare.ttf'
font_name = fm.FontProperties(fname=path).get_name()
print(font_name)
plt.rc('font', family=font_name)

plt.rcParams['font.family'] = 'NanumGothic'

#마이너스가 깨질 것을 방지
plt.rcParams['axes.unicode_minus'] = False

NanumGothic


# 데이터 로드 및 전처리

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [3]:
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [4]:
#연대 정리해주는 함수
def tail_year(x):
    if 0<=x<10:
        return '00'
    elif 10<=x<20:
        return '10'
    elif 20<=x<30:
        return '20'
    elif 30<=x<40:
        return '30'
    elif 40<=x<50:
        return '40'
    elif 50<=x<60:
        return '50'
    elif 60<=x<70:
        return '60'
    elif 70<=x<80:
        return '70'
    elif 80<=x<90:
        return '80'
    elif 90<=x<100:
        return '90'
def year_processing(x):
    xx = str(x)
    if xx[:2] == '18':
        return '18' + tail_year(int(xx[2:]))
    elif xx[:2] == '19':
        return '19' + tail_year(int(xx[2:]))
    elif xx[:2] == '20':
        return '20' + tail_year(int(xx[2:]))

In [5]:
#연대별로 변경
train['Year Built'] = train['Year Built'].apply(lambda x:year_processing(x))
train['Year Built'] = train['Year Built'].astype(int)

test['Year Built'] = test['Year Built'].apply(lambda x:year_processing(x))
test['Year Built'] = test['Year Built'].astype(int)

#연대별로 정리
train['Year Remod/Add'] = train['Year Remod/Add'].apply(lambda x:year_processing(x))
train['Year Remod/Add'] = train['Year Remod/Add'].astype(int)

test['Year Remod/Add'] = test['Year Remod/Add'].apply(lambda x:year_processing(x))
test['Year Remod/Add'] = test['Year Remod/Add'].astype(int)

In [6]:
#차고 자리 개수와 차고 면적은 의미가 비슷하므로 자리 개수를 drop
train = train.drop('Garage Cars', axis=1)
test = test.drop('Garage Cars', axis=1)

In [7]:
#2207년 데이터 삭제
train = train.drop(train[train['Garage Yr Blt']>=2022].index)

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cols = ['Exter Qual','Kitchen Qual','Bsmt Qual']
for i in cols:
    train[i] = le.fit_transform(train[i])
    test[i] = le.fit_transform(test[i])

# Modeling

In [9]:
# 대회 규칙의 평가 산식 함수를 그대로 사용
def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [10]:
X = train.drop('target', axis=1)
y = np.log1p(train['target'])

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

((1079, 12), (270, 12))

In [117]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LassoCV, ElasticNetCV, RidgeCV, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [118]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [119]:
from tqdm.auto import tqdm
def gridSearchCV(models,params):
    kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
    best_models=[]
    for i in tqdm(range(0,len(models))):
        model_grid = GridSearchCV(models[i], params[i],n_jobs = -1, verbose=1, cv=kfolds)
        model_grid.fit(X_train, y_train)
        best_models.append(model_grid.best_estimator_)
    return best_models

In [120]:
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]

In [121]:
kfolds = KFold(n_splits=5, shuffle=True, random_state=42)

In [122]:
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
ENet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, l1_ratio=e_l1ratio, random_state=42, cv=kfolds))
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lr = LinearRegression(normalize=True)

In [38]:
# GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
#                                    max_depth=4, max_features='sqrt',
#                                    min_samples_leaf=15, min_samples_split=10, 
#                                    loss='huber', random_state =5)
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [103]:
gbr_params = {'loss' : ['huber', 'quantile'],
             'learning_rate':[0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15],
             'n_estimators':[1000, 2000, 3000],
              'max_depth':[5,7,10,15,20],
              'max_features':['sqrt'],
              'min_samples_leaf':[15],
              'min_samples_split':[10]}
xgb_params = {'n_estimators' : [1000, 2000],
              "early_stopping_rounds" : [1, 2],
              'learning_rate':[0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15]}

gbr = GradientBoostingRegressor()
model_xgb = xgb.XGBRegressor()
grid = gridSearchCV([gbr, model_xgb], [gbr_params, xgb_params])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

Fitting 5 folds for each of 330 candidates, totalling 1650 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 28.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 44.2min
[Parallel(n_jobs=-1)]: Done 1650 out of 1650 | elapsed: 59.1min finished


Fitting 5 folds for each of 44 candidates, totalling 220 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 220 out of 220 | elapsed:  2.0min finished


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.





In [104]:
GBoost = grid[0]
model_xgb = grid[1]

In [175]:
model_xgb

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=1,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.11, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=8,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [106]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [123]:
score = rmsle_cv(lasso)
print("Lasso score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(ridge)
print("ridge score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
# score = rmsle_cv(model_xgb)
# print("Xgboost score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})" .format(score.mean(), score.std()))

Lasso score: 0.1453 (0.0266)
ridge score: 0.1455 (0.0272)
ElasticNet score: 0.1454 (0.0266)
Kernel Ridge score: 0.1436 (0.0200)
Gradient Boosting score: 0.1461 (0.0265)
LGBM score: 0.1425 (0.0251)


In [142]:
KRR

KernelRidge(alpha=0.6, coef0=2.5, degree=2, kernel='polynomial')

In [125]:
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)
ENet.fit(X_train, y_train)
KRR.fit(X_train, y_train)
GBoost.fit(X_train, y_train)
# model_xgb.fit(X_train, y_train)
model_lgb.fit(X_train, y_train)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
              feature_fraction=0.2319, feature_fraction_seed=9,
              learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
              min_sum_hessian_in_leaf=11, n_estimators=720, num_leaves=5,
              objective='regression')

In [126]:
ridge_pred = ridge.predict(X_test)
lasso_pred = lasso.predict(X_test)
enet_pred = ENet.predict(X_test)
krr_pred = KRR.predict(X_test)
gb_pred = GBoost.predict(X_test)
xgb_pred = model_xgb.predict(X_test)
lgb_pred = model_lgb.predict(X_test)

In [135]:
# final_pred = ((lasso_pred*0.4) + (enet_pred*0.3) + (krr_pred*0.3))*0.6 + ((ridge_pred*0.5)+(lgb_pred*0.5))*0.4
final_pred = ((ridge_pred*0.5)+(lgb_pred*0.5))

In [139]:
NMAE(y_test, final_pred)

0.00837709738855337

In [58]:
NMAE(y_test, final_pred)

0.00821083425712647

In [60]:
lasso_pred = lasso.predict(test)
enet_pred = ENet.predict(test)
krr_pred = KRR.predict(test)
gb_pred = GBoost.predict(test)
xgb_pred = model_xgb.predict(test)
lgb_pred = model_lgb.predict(test)

final_pred = ((lasso_pred*0.4) + (enet_pred*0.3) + (krr_pred*0.3))*0.6 + ((gb_pred*0.5)+(lgb_pred*0.5))*0.4

In [61]:
sub = pd.read_csv('./data/sample_submission.csv')
sub['target'] = np.expm1(final_pred1)
sub

Unnamed: 0,id,target
0,1,346439.374835
1,2,124849.475923
2,3,170880.716007
3,4,243356.425448
4,5,129385.819667
...,...,...
1345,1346,330427.811054
1346,1347,123317.596863
1347,1348,75590.443907
1348,1349,180893.333334


In [63]:
# sub.to_csv('./submission_data/ensemble_submission4.csv', index=False)

# Voting

In [None]:
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
ENet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, l1_ratio=e_l1ratio, random_state=42, cv=kfolds))
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

In [None]:
GBoost = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [232]:
rf_params = {'n_estimators':[10, 20, 30, 1000, 2000, 3000],
             "max_features": ["auto", "sqrt", "log2"],
             "bootstrap": [True, False],
             "min_samples_split" : [2,4,8],
              'max_depth':[5,7,10,15,20]}

rf = RandomForestRegressor()
rf_grid = gridSearchCV([rf], [rf_params])[0]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed: 28.7min finished





In [233]:
rf_grid

RandomForestRegressor(max_depth=20, max_features='sqrt', min_samples_split=4,
                      n_estimators=2000)

In [245]:
from sklearn.ensemble import VotingRegressor
models = [
    ('rf', RandomForestRegressor(max_depth=20, max_features='sqrt', min_samples_split=4, n_estimators=2000)),
    ('krr', KernelRidge(alpha=0.6, coef0=2.5, degree=2, kernel='polynomial')),
    ('gbr', GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                       max_depth=10, max_features='sqrt',
                                       min_samples_leaf=15, min_samples_split=10, 
                                       loss='huber', random_state =5)),
    ('lgbm', lgb.LGBMRegressor(learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
                              min_sum_hessian_in_leaf=11, n_estimators=720, num_leaves=5,
                              objective='regression'))
]

voting_rg = VotingRegressor(estimators=models)
voting_rg.fit(X_train, y_train)



VotingRegressor(estimators=[('rf',
                             RandomForestRegressor(max_depth=20,
                                                   max_features='sqrt',
                                                   min_samples_split=4,
                                                   n_estimators=2000)),
                            ('krr',
                             KernelRidge(alpha=0.6, coef0=2.5, degree=2,
                                         kernel='polynomial')),
                            ('gbr',
                             GradientBoostingRegressor(learning_rate=0.05,
                                                       loss='huber',
                                                       max_depth=10,
                                                       max_features='sqrt',
                                                       min_samples_leaf=15,
                                                       min_samples_split=10,
                                 

In [246]:
NMAE(y_test, voting_rg.predict(X_test))

0.007903898812490536

In [247]:
pred = voting_rg.predict(test)
sub = pd.read_csv('./data/sample_submission.csv')
sub['target'] = np.expm1(pred)
sub

Unnamed: 0,id,target
0,1,345558.817782
1,2,123504.614308
2,3,174845.413270
3,4,243110.508826
4,5,130882.924565
...,...,...
1345,1346,329308.652637
1346,1347,125758.743120
1347,1348,77250.396398
1348,1349,180757.513253


In [248]:
#rf(grid), krr, gbr, lgbm voting
sub.to_csv('./submission_data/voting_submission7.csv', index=False)

# Adaboost

In [250]:
# from sklearn.ensemble import AdaBoostRegressor
# adaboost = AdaBoostRegressor(base_estimator=voting_rg)
# adaboost.fit(X_train, y_train)

In [251]:
# NMAE(y_test, adaboost.predict(X_test))