In [1]:
"""
# scikit_learn based process

```
1. scikit learn main model fit predict 
2. pick some fine model
3. do parameter tuning by optuna
4. combination by mlxtend
```
"""

'\n# scikit_learn based process\n\n```\n1. scikit learn main model fit predict \n2. pick some fine model\n3. do parameter tuning by optuna\n4. combination by mlxtend\n```\n'

In [2]:
import pandas as pd 
import numpy as np 
import mlxtend
import optuna
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
train_data = pd.read_csv('../_mlxtend/data_preprocessed/train_data_preprocessed.csv',index_col=0)
test_data = pd.read_csv('../_mlxtend/data_preprocessed/test_data_preprocessed.csv',index_col=0)
train_target = pd.read_csv('../_mlxtend/data_preprocessed/train_target.csv',index_col=0)

In [5]:
# 일단 RobustScaler 진행
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
rbs = RobustScaler()
train_data_rbs=rbs.fit_transform(train_data)
test_data_rbs=rbs.transform(test_data)

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(train_data_rbs,
                                                 train_target,
                                                 test_size=0.2,
                                                 shuffle=True)

In [7]:
for model in [RandomForestRegressor(),XGBRegressor(),AdaBoostRegressor()]:
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(str(model).split('(')[0])
    result=cross_val_score(
        model,
        train_data_rbs,
        train_target,
        scoring='neg_mean_squared_error',
        cv=KFold(n_splits=5,shuffle=True)
    )
    print(np.sqrt(np.mean(result)*-1))
    
for model in [GradientBoostingRegressor(), ExtraTreesRegressor()]:
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(str(model).split('(')[0])
    result=cross_val_score(
        model,
        train_data_rbs,
        train_target,
        scoring='neg_mean_squared_error',
        cv=KFold(n_splits=5,shuffle=True)
    )
    print(np.sqrt(np.mean(result)*-1))

RandomForestRegressor
0.13838992570090486
XGBRegressor
0.133800876421534
AdaBoostRegressor
0.16935690392888839
GradientBoostingRegressor
0.12078684574581879
ExtraTreesRegressor
0.13239482764375315


In [10]:
train_target.to_numpy()

(1456, 1)

In [56]:
# gradient, randomforest, xgbregressor, extraregressor 사용하자

# gradientboosting hyper parmetertuning

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate

def objective(trial, X, y, cv, scoring):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 5000, step=100),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9, step=0.1),
        "max_features": trial.suggest_categorical(
            "max_features", ["auto", "sqrt", "log2"]
        ),
        "random_state": 1121218,
        "n_iter_no_change": 50,  # early stopping
        "validation_fraction": 0.05,
    }
    # Perform CV
    gr_reg = GradientBoostingRegressor(**params)
    scores = cross_validate(gr_reg, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    # Compute RMSLE
    rmsle = np.sqrt(-scores["test_score"].mean())

    return rmsle


# Create study that minimizes
study = optuna.create_study(direction="minimize")

# Wrap the objective inside a lambda with the relevant arguments
kf = KFold(n_splits=3, shuffle=True, random_state=1121218)
# Pass additional arguments inside another function
func = lambda trial: objective(trial, train_data_rbs, train_target.to_numpy().ravel(), cv=kf, scoring="neg_mean_squared_error")

# Start optimizing with 100 trials
study.optimize(func, n_trials=10)

print(f"Optimized RMSLE: {study.best_value:.5f}")

[32m[I 2022-11-16 22:30:26,013][0m A new study created in memory with name: no-name-a8a3ca30-d09c-456e-a8f0-c75b0cc78f33[0m
[32m[I 2022-11-16 22:30:27,123][0m Trial 0 finished with value: 0.1287283508368892 and parameters: {'n_estimators': 3200, 'learning_rate': 0.003070555283990485, 'max_depth': 7, 'subsample': 0.8, 'max_features': 'log2'}. Best is trial 0 with value: 0.1287283508368892.[0m
[32m[I 2022-11-16 22:30:27,357][0m Trial 1 finished with value: 0.12948407533358086 and parameters: {'n_estimators': 1600, 'learning_rate': 0.13255431061095146, 'max_depth': 5, 'subsample': 0.7, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.1287283508368892.[0m
[32m[I 2022-11-16 22:30:29,600][0m Trial 2 finished with value: 0.1375149899200377 and parameters: {'n_estimators': 4000, 'learning_rate': 0.02694657099498981, 'max_depth': 9, 'subsample': 0.8, 'max_features': 'auto'}. Best is trial 0 with value: 0.1287283508368892.[0m
[32m[I 2022-11-16 22:30:30,590][0m Trial 3 finishe

Optimized RMSLE: 0.12001


In [57]:
best_gbr_prms=study.best_params
best_gbr_prms

{'n_estimators': 4800,
 'learning_rate': 0.04127859055165332,
 'max_depth': 3,
 'subsample': 0.5,
 'max_features': 'sqrt'}

In [58]:
# gradient, randomforest, xgbregressor, extraregressor 사용하자

# RandomFroestRegressor hyper parmetertuning

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

def objective(trial, X, y, cv, scoring):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000, step=100),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 1000),
        "max_depth": trial.suggest_int("max_depth", 10, 20),
    }
    # Perform CV
    rf_reg = RandomForestRegressor(**params)
    scores = cross_validate(rf_reg, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    # Compute RMSLE
    rmsle = np.sqrt(-scores["test_score"].mean())

    return rmsle


# Create study that minimizes
study = optuna.create_study(direction="minimize")

# Wrap the objective inside a lambda with the relevant arguments
kf = KFold(n_splits=3, shuffle=True, random_state=1121218)
# Pass additional arguments inside another function
func = lambda trial: objective(trial, train_data_rbs, train_target.to_numpy().ravel(), cv=kf, scoring="neg_mean_squared_error")

# Start optimizing with 100 trials
study.optimize(func, n_trials=10)

print(f"Optimized RMSLE: {study.best_value:.5f}")

[32m[I 2022-11-16 22:30:37,035][0m A new study created in memory with name: no-name-7ac350dc-3600-4c53-b898-22ca93ed930b[0m
[32m[I 2022-11-16 22:30:40,668][0m Trial 0 finished with value: 0.14628696679786957 and parameters: {'n_estimators': 600, 'max_leaf_nodes': 60, 'max_depth': 19}. Best is trial 0 with value: 0.14628696679786957.[0m
[32m[I 2022-11-16 22:30:55,887][0m Trial 1 finished with value: 0.14058665448528296 and parameters: {'n_estimators': 1700, 'max_leaf_nodes': 796, 'max_depth': 20}. Best is trial 1 with value: 0.14058665448528296.[0m
[32m[I 2022-11-16 22:31:08,265][0m Trial 2 finished with value: 0.14141587615963852 and parameters: {'n_estimators': 1700, 'max_leaf_nodes': 640, 'max_depth': 10}. Best is trial 1 with value: 0.14058665448528296.[0m
[32m[I 2022-11-16 22:31:15,012][0m Trial 3 finished with value: 0.14111582173366724 and parameters: {'n_estimators': 900, 'max_leaf_nodes': 163, 'max_depth': 12}. Best is trial 1 with value: 0.14058665448528296.[0m


Optimized RMSLE: 0.14034


In [59]:
best_rfr_prms=study.best_params
best_rfr_prms

{'n_estimators': 2000, 'max_leaf_nodes': 291, 'max_depth': 14}

In [60]:
# gradient, randomforest, xgbregressor, extraregressor 사용하자

# XGBRegressor hyper parmetertuning

from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate

def objective(trial, X, y, cv, scoring):
    params = {
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        "n_estimators": trial.suggest_int("n_estimators", 100, 5000, step=100),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    # Perform CV
    xgb_reg = XGBRegressor(**params)
    scores = cross_validate(xgb_reg, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    # Compute RMSLE
    rmsle = np.sqrt(-scores["test_score"].mean())

    return rmsle


# Create study that minimizes
study = optuna.create_study(direction="minimize")

# Wrap the objective inside a lambda with the relevant arguments
kf = KFold(n_splits=3, shuffle=True, random_state=1121218)
# Pass additional arguments inside another function
func = lambda trial: objective(trial, train_data_rbs, train_target.to_numpy().ravel(), cv=kf, scoring="neg_mean_squared_error")

# Start optimizing with 100 trials
study.optimize(func, n_trials=10)

print(f"Optimized RMSLE: {study.best_value:.5f}")

[32m[I 2022-11-16 22:32:20,847][0m A new study created in memory with name: no-name-96d85042-81c9-4aea-b76b-6dde7f93a00b[0m
[32m[I 2022-11-16 22:32:26,773][0m Trial 0 finished with value: 0.16457826788690222 and parameters: {'lambda': 2.3742371713355803, 'alpha': 5.293100780049855, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.02, 'n_estimators': 4400, 'max_depth': 5, 'min_child_weight': 172}. Best is trial 0 with value: 0.16457826788690222.[0m
[32m[I 2022-11-16 22:32:27,804][0m Trial 1 finished with value: 0.17742119400751852 and parameters: {'lambda': 0.013851415619290397, 'alpha': 0.007737787612032798, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.012, 'n_estimators': 400, 'max_depth': 11, 'min_child_weight': 100}. Best is trial 0 with value: 0.16457826788690222.[0m
[32m[I 2022-11-16 22:32:30,402][0m Trial 2 finished with value: 0.16828192455830607 and parameters: {'lambda': 0.009064115832410544, 'alpha': 0.0030975887938759317, 'colsample_b

Optimized RMSLE: 0.12539


In [61]:
best_xgbr_prms=study.best_params
best_xgbr_prms

{'lambda': 0.10252576383026227,
 'alpha': 0.43737951603549957,
 'colsample_bytree': 0.9,
 'subsample': 0.8,
 'learning_rate': 0.018,
 'n_estimators': 1400,
 'max_depth': 13,
 'min_child_weight': 35}

In [62]:
# gradient, randomforest, xgbregressor, extraregressor 사용하자

# extraregressor hyper parmetertuning

from sklearn.ensemble import ExtraTreesRegressor

def objective(trial, X, y, cv, scoring):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000, step=100),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 1000),
        "max_depth": trial.suggest_int("max_depth", 10, 20),
    }
    # Perform CV
    et_reg = ExtraTreesRegressor(**params)
    scores = cross_validate(et_reg, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    # Compute RMSLE
    rmsle = np.sqrt(-scores["test_score"].mean())

    return rmsle


# Create study that minimizes
study = optuna.create_study(direction="minimize")

# Wrap the objective inside a lambda with the relevant arguments
kf = KFold(n_splits=3, shuffle=True, random_state=1121218)
# Pass additional arguments inside another function
func = lambda trial: objective(trial, train_data_rbs, train_target.to_numpy().ravel(), cv=kf, scoring="neg_mean_squared_error")

# Start optimizing with 100 trials
study.optimize(func, n_trials=10)

print(f"Optimized RMSLE: {study.best_value:.5f}")

[32m[I 2022-11-16 22:33:01,983][0m A new study created in memory with name: no-name-e61d61bf-33d8-47b0-aa48-91fb00b88de1[0m
[32m[I 2022-11-16 22:33:13,489][0m Trial 0 finished with value: 0.13559434971055873 and parameters: {'n_estimators': 1700, 'max_leaf_nodes': 691, 'max_depth': 13}. Best is trial 0 with value: 0.13559434971055873.[0m
[32m[I 2022-11-16 22:33:16,251][0m Trial 1 finished with value: 0.13862885308554787 and parameters: {'n_estimators': 500, 'max_leaf_nodes': 712, 'max_depth': 10}. Best is trial 0 with value: 0.13559434971055873.[0m
[32m[I 2022-11-16 22:33:28,305][0m Trial 2 finished with value: 0.1356224614215705 and parameters: {'n_estimators': 1700, 'max_leaf_nodes': 985, 'max_depth': 14}. Best is trial 0 with value: 0.13559434971055873.[0m
[32m[I 2022-11-16 22:33:36,955][0m Trial 3 finished with value: 0.13555316467278172 and parameters: {'n_estimators': 1100, 'max_leaf_nodes': 867, 'max_depth': 18}. Best is trial 3 with value: 0.13555316467278172.[0m

Optimized RMSLE: 0.13555


In [63]:
best_etr_prms=study.best_params
best_etr_prms

{'n_estimators': 1100, 'max_leaf_nodes': 867, 'max_depth': 18}

In [102]:
# gradient, randomforest, xgbregressor, extraregressor 사용하자

gbr = GradientBoostingRegressor(**best_gbr_prms)
gb = GradientBoostingRegressor(**best_gbr_prms)
rfr=RandomForestRegressor(**best_rfr_prms)
xgbr=XGBRegressor(**best_xgbr_prms)
extr=ExtraTreesRegressor(**best_etr_prms)

In [103]:
from sklearn.ensemble import VotingRegressor
vtr=VotingRegressor(
    [('gbr',gbr),
     ('rfr',rfr),
     ('xgbr',xgbr),
     ('extr',extr)]
)

In [92]:
gbr = GradientBoostingRegressor()
gb = GradientBoostingRegressor()
rfr=RandomForestRegressor()
xgbr=XGBRegressor()
extr=ExtraTreesRegressor()

In [69]:
for model in [gbr,rfr,xgbr,extr]:
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(str(model).split('(')[0])
    print(mean_squared_error(y_pred,y_test,squared=False))

GradientBoostingRegressor
0.10899359385895453
RandomForestRegressor
0.1182312982791943
XGBRegressor
0.11371639455011129
ExtraTreesRegressor
0.11213718004868929


In [93]:
from mlxtend.regressor import StackingRegressor
stregr=StackingRegressor(regressors=[gb,xgbr,rfr,extr],
                         meta_regressor=gbr)


In [97]:
stregr.fit(train_data_rbs,train_target)

In [95]:
from sklearn.model_selection import GridSearchCV

prms={
    'meta_regressor__n_estimators': [4800],
    'meta_regressor__learning_rate': [0.04127859055165332],
    'meta_regressor__max_depth': [3],
    'meta_regressor__subsample': [0.5],
    'meta_regressor__max_features': ['sqrt'],
    
    'GradientBoostingRegressor__n_estimators': [4800],
    'GradientBoostingRegressor__learning_rate': [0.04127859055165332],
    'GradientBoostingRegressor__max_depth': [3],
    'GradientBoostingRegressor__subsample': [0.5],
    'GradientBoostingRegressor__max_features': ['sqrt'],
    
    'RandomForestRegressor__n_estimators': [2000], 
    'RandomForestRegressor__max_leaf_nodes': [291],
    'RandomForestRegressor__max_depth': [14],
    
    'ExtraTreesRegressor__n_estimators': [1100], 
    'ExtraTreesRegressor__max_leaf_nodes': [867], 
    'ExtraTreesRegressor__max_depth': [18],
    
    'XGBRegressor__lambda': [0.10252576383026227],
    'XGBRegressor__alpha': [0.43737951603549957],
    'XGBRegressor__colsample_bytree': [0.9],
    'XGBRegressor__subsample': [0.8],
    'XGBRegressor__learning_rate': [0.018],
    'XGBRegressor__n_estimators': [1400],
    'XGBRegressor__max_depth': [13],
    'XGBRegressor__min_child_weight': [35]
}

grid = GridSearchCV(estimator = stregr, 
                    param_grid=prms,
                    cv=KFold(n_splits=3,shuffle=True),
                    refit=True, 
                    verbose=1,
                    n_jobs=-1,
                    scoring="neg_mean_squared_error")

grid.fit(train_data_rbs, train_target)
grid_best = grid.best_estimator_
print(grid_best)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


ValueError: Invalid parameter 'ExtraTreesRegressor' for estimator StackingRegressor(meta_regressor=GradientBoostingRegressor(),
                  regressors=[GradientBoostingRegressor(),
                              XGBRegressor(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=None, gpu_id=None,
                                           grow_policy=No...
                                           interaction_constraints=None,
                                           learning_rate=None, max_bin=None,
                                           max_cat_threshold=None,
                                           max_cat_to_onehot=None,
                                           max_delta_step=None, max_depth=None,
                                           max_leaves=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           predictor=None, random_state=None, ...),
                              RandomForestRegressor(), ExtraTreesRegressor()]). Valid parameters are: ['meta_regressor', 'multi_output', 'refit', 'regressors', 'store_train_meta_features', 'use_features_in_secondary', 'verbose'].

In [104]:
vtr.fit(train_data_rbs,train_target)
y_pred=vtr.predict(test_data_rbs)

In [105]:
my_ans=test_data.copy()
my_ans['SalePrice'] = np.expm1(y_pred)

In [106]:
my_ans.loc[:,'SalePrice'].to_csv('vtr_tuned.csv')