In [None]:
"""
# scikit_learn based process

```
1. fit and evaluate the main scikit-learn models
2. pick the best-performing models
3. tune their hyperparameters with Optuna
4. combine the tuned models with mlxtend
```
"""

In [1]:
import pandas as pd 
import numpy as np 
import mlxtend
import optuna
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Read the preprocessed feature tables and the training target; the first
# CSV column of every file is used as the row index.
train_data, test_data, train_target = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../_mlxtend/data_preprocessed/train_data_preprocessed.csv',
        '../_mlxtend/data_preprocessed/test_data_preprocessed.csv',
        '../_mlxtend/data_preprocessed/train_target.csv',
    )
)

In [3]:
# Start by scaling the features with RobustScaler.
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor

# The scaler is fitted on the full training table and then reused, unchanged,
# for the test table.
# NOTE(review): the fit happens before the hold-out split below, so hold-out
# rows contribute to the scaling statistics.
rbs = RobustScaler()
train_data_rbs=rbs.fit_transform(train_data)
test_data_rbs=rbs.transform(test_data)

# Hold out 20% of the training rows for validation. random_state pins the
# shuffle so a Restart & Run All reproduces the same split (and therefore the
# same tuning results further down).
x_train,x_test,y_train,y_test = train_test_split(train_data_rbs,
                                                 train_target,
                                                 test_size=0.2,
                                                 shuffle=True,
                                                 random_state=42)

In [5]:
# Baseline comparison: 5-fold cross-validated RMSE of every candidate model
# with its default hyperparameters.
candidate_models = [
    RandomForestRegressor(),
    XGBRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    HistGradientBoostingRegressor(),
]

for model in candidate_models:
    print(type(model).__name__)
    # cross_val_score clones and refits the estimator on every fold, so no
    # separate fit/predict on the hold-out split is needed here.
    result=cross_val_score(
        model,
        train_data_rbs,
        train_target,
        scoring='neg_mean_squared_error',
        # Fixed seed: every model is scored on exactly the same folds.
        cv=KFold(n_splits=5,shuffle=True,random_state=42)
    )
    # The scorer returns negated MSE per fold; flip the sign of the mean and
    # take the square root to report an RMSE.
    print(np.sqrt(np.mean(result)*-1))

RandomForestRegressor
0.1401667477042479
XGBRegressor
0.1341032819973866
AdaBoostRegressor
0.16892854079467953
GradientBoostingRegressor
0.1240604213391761
HistGradientBoostingRegressor
0.13073578080764361


In [6]:
# GradientBoostingRegressor had the lowest cross-validated RMSE above, so look
# at its default hyperparameters before tuning it.
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
# Bare last expression so the cell displays the parameter dictionary.
gbr.get_params()



{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [None]:
# [RandomForestRegressor(),XGBRegressor()]

# 1. RandomForestRegressor hyperparameter tuning

# 2. XGBRegressor hyperparameter tuning

In [None]:
# Baseline Optuna study: tune an XGBRegressor against the hold-out split.
import optuna

def my_objective(trial):
    """
    Score one XGBRegressor configuration proposed by Optuna.

    1. make parameter dictionary
    2. define model for the parameter
    3. fit model
    4. get score based on purpose (hold-out RMSE, lower is better)
    5. return

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE of the fitted model on (x_test, y_test).
    """
    #1
    prms={
    'booster': trial.suggest_categorical('booster',['gbtree','gblinear','dart']),
    # Strictly positive and log-scaled: a learning rate of 0 cannot learn.
    'learning_rate': trial.suggest_float('learning_rate',1e-3,1,log=True),
    'max_delta_step': trial.suggest_float('max_delta_step',0,100),
    # Start at 1: in XGBoost a max_depth of 0 means "no depth limit".
    'max_depth': trial.suggest_int('max_depth',1,100),
    'min_child_weight': trial.suggest_float('min_child_weight',0,10),
    'n_estimators': trial.suggest_int('n_estimators',100,300)
    }
    # 2. The target is continuous, so this has to be a regressor.
    xgbr = XGBRegressor(random_state=42, **prms)
    # 3.
    xgbr.fit(x_train,y_train)
    y_pred=xgbr.predict(x_test)
    # 4. sqrt(MSE) instead of squared=False, which newer scikit-learn
    #    releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    # 5.
    return rmse

# RMSE is an error metric, so the study minimises it; the seeded sampler makes
# the sequence of suggested trials repeatable.
my_study = optuna.create_study(direction='minimize',
                               sampler=optuna.samplers.TPESampler(seed=42))
my_study.optimize(my_objective,n_trials=20)

In [None]:
# [GradientBoostingRegressor(),HistGradientBoostingRegressor()]

# 1. GradientBoostingRegressor hyperparameter tuning
def my_objective(trial):
    """
    Score one GradientBoostingRegressor configuration proposed by Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE of the fitted model on (x_test, y_test); lower is better.
    """
    #1
    prms={
    # Strictly positive and log-scaled: a learning rate of 0 would leave the
    # model at its initial constant prediction.
    'learning_rate': trial.suggest_float('learning_rate',1e-3,1,log=True),
    'n_estimators': trial.suggest_int('n_estimators',100,3000),
    # scikit-learn requires alpha strictly inside (0, 1). It is only used by
    # the huber/quantile losses, so it is inert with the default loss.
    'alpha': trial.suggest_float('alpha',0.01,0.99),
    'max_depth': trial.suggest_int('max_depth',1,100),
    # tol only matters when n_iter_no_change is set (early stopping).
    'tol': trial.suggest_float('tol',0,1),
    }

    # 2.
    gbr = GradientBoostingRegressor(random_state=42, **prms)
    # 3.
    gbr.fit(x_train,y_train)
    y_pred=gbr.predict(x_test)
    # sqrt(MSE) instead of squared=False, which newer scikit-learn releases
    # no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    return rmse

# Seeded sampler so the 20 suggested trials are the same on every run.
my_gbr_study = optuna.create_study(direction='minimize',
                                   sampler=optuna.samplers.TPESampler(seed=42))
my_gbr_study.optimize(my_objective,n_trials=20)

# 2. HistGradientBoostingRegressor hyperparameter tuning
def my_objective(trial):
    """
    Score one HistGradientBoostingRegressor configuration proposed by Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE of the fitted model on (x_test, y_test); lower is better.
    """
    #1
    prms={
    # This estimator rejects learning_rate <= 0, so keep the lower bound
    # strictly positive; log scale spreads trials across magnitudes.
    'learning_rate': trial.suggest_float('learning_rate',1e-3,1,log=True),
    'l2_regularization': trial.suggest_float('l2_regularization',1e-5,1e-2),
    'max_depth': trial.suggest_int('max_depth',1,100),
    # tol is only consulted when early stopping is active.
    'tol': trial.suggest_float('tol',0,1),
    }

    # 2.
    hgbr = HistGradientBoostingRegressor(random_state=42, **prms)
    # 3.
    hgbr.fit(x_train,y_train)
    y_pred=hgbr.predict(x_test)
    # sqrt(MSE) instead of squared=False, which newer scikit-learn releases
    # no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    return rmse

# Seeded sampler so the 20 suggested trials are the same on every run.
my_hgbr_study = optuna.create_study(direction='minimize',
                                    sampler=optuna.samplers.TPESampler(seed=42))
my_hgbr_study.optimize(my_objective,n_trials=20)


In [None]:
# Pull the winning hyperparameter dictionaries out of the two finished studies.
gbr_best_prms, hgbr_best_prms = (
    my_gbr_study.best_params,
    my_hgbr_study.best_params,
)

In [None]:
# Rebuild both boosters from the tuned hyperparameters (not fitted yet).
best_gbr, best_hgbr = (
    GradientBoostingRegressor(**gbr_best_prms),
    HistGradientBoostingRegressor(**hgbr_best_prms),
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

# Fresh hold-out split for comparing the tuned models with their ensemble.
# random_state pins the shuffle so the printed scores are reproducible.
# NOTE(review): the hyperparameters were tuned on an earlier split of the same
# rows, so these scores are likely optimistic.
x_train,x_test,y_train,y_test = train_test_split(train_data_rbs,
                                                 train_target,
                                                 random_state=42)

# Instantiate the Regressor: averages the predictions of both tuned boosters.
voting_reg = VotingRegressor(
    estimators=[('hgbr', best_hgbr), ('gbr', best_gbr)],
    )

for model in [best_gbr,best_hgbr,voting_reg]:
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(type(model).__name__)
    # RMSE as sqrt(MSE): squared=False is no longer accepted by newer
    # scikit-learn releases. Arguments are in (y_true, y_pred) order.
    print(np.sqrt(mean_squared_error(y_test,y_pred)))

In [None]:
# Refit the ensemble on every labelled row, then predict the scaled test set.
y_pred = voting_reg.fit(train_data_rbs, train_target).predict(test_data_rbs)

In [None]:
# Copy the test table and attach the predictions, mapped through expm1, as
# the 'SalePrice' column.
my_ans = test_data.assign(SalePrice=np.expm1(y_pred))

In [None]:
# Write only the SalePrice column, keyed by the frame's index, to the submission file.
my_ans['SalePrice'].to_csv('voting_reg.csv')