In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

train_data = pd.read_csv('../_mlxtend/data_preprocessed/train_data_preprocessed.csv',index_col=0)
test_data = pd.read_csv('../_mlxtend/data_preprocessed/test_data_preprocessed.csv',index_col=0)
train_target = pd.read_csv('../_mlxtend/data_preprocessed/train_target.csv',index_col=0)

In [2]:
from sklearn.preprocessing import RobustScaler
rbs = RobustScaler()
train_data_rbs=rbs.fit_transform(train_data)
test_data_rbs=rbs.fit_transform(test_data)

from sklearn.model_selection import train_test_split,cross_val_score,KFold
kfold = KFold(n_splits=3,shuffle=True)
x_train,x_test,y_train,y_test = train_test_split(train_data_rbs,train_target)

from sklearn.metrics import mean_squared_error

In [3]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor

models = [RandomForestRegressor(),GradientBoostingRegressor(),ExtraTreesRegressor(),HistGradientBoostingRegressor(),XGBRegressor()]
for model in models:
    cv_score = cross_val_score(
        model,
        x_train,
        y_train,
        cv=kfold,
        scoring='neg_mean_squared_error'
    )
    # model.fit(x_train,y_train)
    # y_pred = model.predict(x_test)
    print(str(model).split('(')[0])
    print(np.sqrt(abs(np.mean(cv_score))))

RandomForestRegressor
0.1449869176578134
GradientBoostingRegressor
0.13170445043575568
ExtraTreesRegressor
0.1413067227591072
HistGradientBoostingRegressor
0.12845278401128135
XGBRegressor
0.14490019984378658


### GradientBoostingRegressor

In [59]:
import optuna

model = GradientBoostingRegressor()
def grb_object(trial):
    prms = {
        'learning_rate':trial.suggest_float('learning_rate',1e-5,1),
        'n_estimators':trial.suggest_int('n_estimators',100,800),
        'criterion':trial.suggest_categorical('criterion',['friedman_mse', 'squared_error', 'mse']),
        'min_samples_split':trial.suggest_int('min_samples_split',2,10),
        'min_weight_fraction_leaf':trial.suggest_float('min_weight_fraction_leaf',0,1e-3),
        'max_depth':trial.suggest_int('max_depth',3,20)
    }
    cv_score = cross_val_score(
        model,
        train_data_rbs,
        train_target,
        cv=kfold,
        scoring='neg_mean_squared_error'
    )
    return np.sqrt(abs(np.mean(cv_score)))

study_gbr=optuna.create_study(direction='minimize')
study_gbr.optimize(grb_object,n_trials=20,show_progress_bar=True)

[32m[I 2022-11-20 20:45:35,903][0m A new study created in memory with name: no-name-306d9a7f-c3cd-4254-92c5-bd6768bee078[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-11-20 20:45:36,958][0m Trial 0 finished with value: 0.12365157392977011 and parameters: {'learning_rate': 0.4318211889773778, 'n_estimators': 540, 'criterion': 'squared_error', 'min_samples_split': 9, 'min_weight_fraction_leaf': 0.0009353114482478614, 'max_depth': 9}. Best is trial 0 with value: 0.12365157392977011.[0m
[32m[I 2022-11-20 20:45:37,988][0m Trial 1 finished with value: 0.12838856831743145 and parameters: {'learning_rate': 0.7104657828874403, 'n_estimators': 303, 'criterion': 'squared_error', 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.00020104835487930062, 'max_depth': 19}. Best is trial 0 with value: 0.12365157392977011.[0m
[32m[I 2022-11-20 20:45:39,012][0m Trial 2 finished with value: 0.1252301534507219 and parameters: {'learning_rate': 0.7997787694316617, 'n_estimators': 431, 'criterion': 'friedman_mse', 'min_samples_split': 4, 'min_weight_fraction_leaf': 0.0003295155855490142, 'max_depth': 10}. Best is trial 0 with value: 0.12365157392977

In [60]:
gbr_best_prms=study_gbr.best_params
gbr_best_prms

{'learning_rate': 0.4099558598404632,
 'n_estimators': 174,
 'criterion': 'squared_error',
 'min_samples_split': 6,
 'min_weight_fraction_leaf': 0.0005967595718821075,
 'max_depth': 16}

In [52]:
study_gbr.best_value

0.12344906333372502

In [15]:
optuna.visualization.plot_param_importances(study_gbr)

### HistGradientBoostingRegressor

In [20]:
import optuna

model = HistGradientBoostingRegressor()
def hgrb_object(trial):
    prms = {
        'learning_rate':trial.suggest_float('learning_rate',1e-5,1),
        'loss':trial.suggest_categorical('loss',['squared_error', 'poisson']),
        'max_iter':trial.suggest_int('max_iter',100,800),
        'max_leaf_nodes':trial.suggest_int('max_leaf_nodes',30,50),
        'l2_regularization':trial.suggest_float('l2_regularization',0,1e-1),
        'max_bins':trial.suggest_int('max_bins',255,500)   
    }
    cv_score = cross_val_score(
        model,
        train_data_rbs,
        train_target,
        cv=kfold,
        scoring='neg_mean_squared_error'
    )
    return np.sqrt(abs(np.mean(cv_score)))

study_hgbr=optuna.create_study(direction='minimize')
study_hgbr.optimize(hgrb_object,n_trials=20,show_progress_bar=True)

[32m[I 2022-11-20 20:28:17,139][0m A new study created in memory with name: no-name-f3aae29b-e456-4975-b6a0-fd5c026a3247[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2022-11-20 20:28:35,393][0m Trial 0 finished with value: 0.13172368523393915 and parameters: {'learning_rate': 0.5901705116749807, 'loss': 'squared_error', 'max_iter': 234, 'max_leaf_nodes': 30, 'l2_regularization': 0.05410920514347051, 'max_bins': 394}. Best is trial 0 with value: 0.13172368523393915.[0m
[32m[I 2022-11-20 20:28:53,677][0m Trial 1 finished with value: 0.12852328181590017 and parameters: {'learning_rate': 0.9079221396217, 'loss': 'poisson', 'max_iter': 572, 'max_leaf_nodes': 30, 'l2_regularization': 0.01805659326310959, 'max_bins': 282}. Best is trial 1 with value: 0.12852328181590017.[0m
[32m[I 2022-11-20 20:29:13,134][0m Trial 2 finished with value: 0.1299508114921602 and parameters: {'learning_rate': 0.3499153431618396, 'loss': 'squared_error', 'max_iter': 119, 'max_leaf_nodes': 47, 'l2_regularization': 0.01048684966109651, 'max_bins': 494}. Best is trial 1 with value: 0.12852328181590017.[0m
[32m[I 2022-11-20 20:29:31,336][0m Trial 3 finished with 

In [21]:
hgbr_best_prms=study_hgbr.best_params
hgbr_best_prms
hgbr_best_prms['max_bins']=255

{'learning_rate': 0.998249013133991,
 'loss': 'poisson',
 'max_iter': 160,
 'max_leaf_nodes': 38,
 'l2_regularization': 0.0747531119135251,
 'max_bins': 283}

In [22]:
study_hgbr.best_value

0.12682565928458514

In [53]:
gbr_raw = GradientBoostingRegressor()
gbr=GradientBoostingRegressor(**gbr_best_prms)
hgbr = HistGradientBoostingRegressor(**hgbr_best_prms)

In [44]:
from mlxtend.regressor import StackingRegressor
stack_rg = StackingRegressor(
    regressors=[gbr_raw,hgbr],
    meta_regressor=gbr
)

In [46]:
model = stack_rg
cv_score = cross_val_score(
        model,
        train_data_rbs,
        train_target,
        cv=kfold,
        scoring='neg_mean_squared_error'
    )
print(np.sqrt(abs(np.mean(cv_score))))

0.20696864851380462


In [54]:
model = hgbr
cv_score = cross_val_score(
        model,
        train_data_rbs,
        train_target,
        cv=kfold,
        scoring='neg_mean_squared_error'
    )
print(np.sqrt(abs(np.mean(cv_score))))

KeyboardInterrupt: 

In [57]:
model = gbr
cv_score = cross_val_score(
        model,
        train_data_rbs,
        train_target,
        cv=kfold,
        scoring='neg_mean_squared_error'
    )
print(np.sqrt(abs(np.mean(cv_score))))

0.1557386439985514


In [61]:
model = GradientBoostingRegressor(criterion='mse')
cv_score = cross_val_score(
        model,
        train_data_rbs,
        train_target,
        cv=kfold,
        scoring='neg_mean_squared_error'
    )
print(np.sqrt(abs(np.mean(cv_score))))

0.12436063158172501


In [62]:
model.fit(train_data_rbs,train_target)

In [63]:
ans=model.predict(test_data_rbs)

In [66]:
test_data['SalePrice']=np.expm1(ans)

In [69]:
pd.DataFrame(test_data.loc[:,'SalePrice']).to_csv('gbr_with_mse.csv')