In [1]:
# load data
import pandas as pd
data=pd.read_csv('../data/preprocessed_data/weighted_data.csv', encoding='cp949', index_col=0)
target=pd.read_csv('../data/preprocessed_data/y_data.csv', encoding='cp949', index_col=0)

In [2]:
# import models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

models_li = [RandomForestClassifier(), GradientBoostingClassifier(), ExtraTreesClassifier(), XGBClassifier()]

In [3]:
# standard_scaling
from sklearn.preprocessing import StandardScaler
ssc = StandardScaler()
data_ssc=ssc.fit_transform(data)

In [4]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,KFold
kfold = KFold(n_splits=3,random_state=42,shuffle=True)

In [5]:
# RandomForestClassifier_parameter_tuning
# based on optuna
# trial = 20 but for explain trial = 2 

trial = 2 
import optuna

def rfc_object(trial):
    prms={
    'bootstrap': trial.suggest_categorical('bootstrap',[True, False]),
    'max_depth': trial.suggest_int('max_depth',10,1000),
    'max_features': trial.suggest_categorical('max_features',['log2','sqrt']),
    'min_samples_leaf': trial.suggest_int('min_samples_leaf',1,5),
    'min_samples_split': trial.suggest_int('min_samples_split',2,10),
    'n_estimators': trial.suggest_int('n_estimators',200,2000,step=200)
    }
    model =RandomForestClassifier(**prms,n_jobs=-1)
    result = cross_val_score(
        model,
        data_ssc,
        target['10일 뒤 종가'],
        cv=kfold
    )
    return np.mean(result)

study = optuna.create_study(direction="minimize")
study.optimize(rfc_object,n_trials=trial,n_jobs=-1)

[32m[I 2022-11-19 12:54:42,639][0m A new study created in memory with name: no-name-2373bebb-024b-463d-84d1-f6a3cecf6634[0m
  warn(
  warn(
  warn(
[32m[I 2022-11-19 12:54:45,609][0m Trial 0 finished with value: 0.5115286748874536 and parameters: {'bootstrap': False, 'max_depth': 261, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 600}. Best is trial 0 with value: 0.5115286748874536.[0m
[32m[I 2022-11-19 12:54:45,732][0m Trial 1 finished with value: 0.5191818359757291 and parameters: {'bootstrap': False, 'max_depth': 730, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 1200}. Best is trial 0 with value: 0.5115286748874536.[0m


In [7]:
rfc_best_prms=study.best_params
rfc_best_prms

{'bootstrap': False,
 'max_depth': 261,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 8,
 'n_estimators': 600}

In [10]:
# GradientBoostingClassifier_parameter_tuning
# based on optuna

trial = 2 
def gbc_object(trial):
    prms={
    'n_estimators': trial.suggest_int('n_estimators',200,2000,step=200),
    'max_depth': trial.suggest_int('max_depth',10,1000),
    "learning_rate": trial.suggest_float("learning_rate",1e-5,1),
    }
    model =GradientBoostingClassifier(**prms)
    result = cross_val_score(
        model,
        data_ssc,
        target['10일 뒤 종가'],
        cv=kfold
    )
    return np.mean(result)

study = optuna.create_study(direction="minimize")
study.optimize(gbc_object,n_trials=trial)

[32m[I 2022-11-19 12:55:42,356][0m A new study created in memory with name: no-name-a543999f-0109-4bee-92b5-f49ffa40595a[0m
[32m[I 2022-11-19 12:55:45,116][0m Trial 0 finished with value: 0.5065766294773929 and parameters: {'n_estimators': 1800, 'max_depth': 676, 'learning_rate': 0.5519041742687582}. Best is trial 0 with value: 0.5065766294773929.[0m
[32m[I 2022-11-19 12:55:49,991][0m Trial 1 finished with value: 0.5193384223918575 and parameters: {'n_estimators': 1400, 'max_depth': 873, 'learning_rate': 0.14431979548768067}. Best is trial 0 with value: 0.5065766294773929.[0m


In [11]:
gbc_best_prms=study.best_params
gbc_best_prms

{'n_estimators': 1800, 'max_depth': 676, 'learning_rate': 0.5519041742687582}

In [14]:
# ExtraTreesClassifier_parameter_tuning
# based on optuna
trial = 2 

def ettc_object(trial):
    prms={
    'n_estimators': trial.suggest_int('n_estimators',200,2000,step=200),
    'max_depth': trial.suggest_int('max_depth',10,1000),
    # "learning_rate": trial.suggest_float("learning_rate",1e-5,1),
    }
    model =ExtraTreesClassifier(**prms)
    result = cross_val_score(
        model,
        data_ssc,
        target['10일 뒤 종가'],
        cv=kfold
    )
    return np.mean(result)

study = optuna.create_study(direction="minimize")
study.optimize(ettc_object,n_trials=trial)

[32m[I 2022-11-19 12:57:03,742][0m A new study created in memory with name: no-name-be4607fa-1230-442d-b505-c9ec1fa31820[0m
[32m[I 2022-11-19 12:57:06,882][0m Trial 0 finished with value: 0.5523977294969661 and parameters: {'n_estimators': 2000, 'max_depth': 57}. Best is trial 0 with value: 0.5523977294969661.[0m
[32m[I 2022-11-19 12:57:08,801][0m Trial 1 finished with value: 0.5319240555881777 and parameters: {'n_estimators': 1200, 'max_depth': 898}. Best is trial 1 with value: 0.5319240555881777.[0m


In [15]:
ettc_best_prms=study.best_params
ettc_best_prms

{'n_estimators': 1200, 'max_depth': 898}

In [16]:
# XGBClassifier_parameter_tuning
# based on optuna
trial = 2 

def xgbc_object(trial):
    prms={
    'min_child_weight': trial.suggest_int('min_child_weight',1,10),
    'gamma': trial.suggest_float('gamma',0.5,5),
    "subsample": trial.suggest_float("subsample",1e-1,1),
    'colsample_bytree': trial.suggest_float('colsample_bytree',1e-1,1),
    'max_depth': trial.suggest_int('max_depth', 5, 30)
    }
    model =XGBClassifier(**prms,n_jobs=-1)
    result = cross_val_score(
        model,
        data_ssc,
        target['10일 뒤 종가'],
        cv=kfold
    )
    return np.mean(result)

study = optuna.create_study(direction="minimize")
study.optimize(xgbc_object,n_trials=trial,n_jobs=-1)

[32m[I 2022-11-19 12:57:58,098][0m A new study created in memory with name: no-name-ac82d1c1-aaf9-4d85-b69b-7d1e7d02bbd1[0m
[32m[I 2022-11-19 12:57:59,387][0m Trial 0 finished with value: 0.4551967116852613 and parameters: {'min_child_weight': 5, 'gamma': 4.662535007797089, 'subsample': 0.1044570723498921, 'colsample_bytree': 0.6528742991907969, 'max_depth': 14}. Best is trial 0 with value: 0.4551967116852613.[0m
[32m[I 2022-11-19 12:58:00,250][0m Trial 1 finished with value: 0.4987864552750049 and parameters: {'min_child_weight': 6, 'gamma': 1.483904049608391, 'subsample': 0.3792930389785073, 'colsample_bytree': 0.5699951595715426, 'max_depth': 14}. Best is trial 0 with value: 0.4551967116852613.[0m


In [18]:
xgbc_best_prms=study.best_params
xgbc_best_prms

{'min_child_weight': 5,
 'gamma': 4.662535007797089,
 'subsample': 0.1044570723498921,
 'colsample_bytree': 0.6528742991907969,
 'max_depth': 14}

In [38]:
gbc = GradientBoostingClassifier(**gbc_best_prms)
xgbc = XGBClassifier(**xgbc_best_prms)
rfc = RandomForestClassifier(**rfc_best_prms)
ettc_raw = ExtraTreesClassifier()
ettc_best = ExtraTreesClassifier(**ettc_best_prms)

from mlxtend.classifier import StackingClassifier

stc = StackingClassifier(
    classifiers=[gbc,xgbc,rfc,ettc_raw],
    meta_classifier=ettc_best
)

In [39]:
stc.get_params()

{'average_probas': False,
 'classifiers': [GradientBoostingClassifier(learning_rate=0.5519041742687582, max_depth=676,
                             n_estimators=1800),
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=0.6528742991907969, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=4.662535007797089, gpu_id=None, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=None, max_bin=None, max_cat_threshold=None,
                max_cat_to_onehot=None, max_delta_step=None, max_depth=14,
                max_leaves=None, min_child_weight=5, missing=nan,
                monotone_constraints=None, n_estimators=100, n_jobs=None,
                num_parallel_tree=None, predictor=None, random_state=None, ...),
  RandomForestClassifier(bootstrap=

In [41]:
rfc

In [42]:
stc = StackingClassifier(
    classifiers=[gbc,xgbc,rfc,ettc_raw],
    meta_classifier=ettc_best
)

In [40]:
prms={
    'xgbclassifier__objective':['binary:logistic','multi:softprob'],
    'xgbclassifier__n_estimators':[100,500],
    'randomforestclassifier__bootstrap':[True,False],
    'randomforestclassifier__criterion':['gini','entropy','log_loss'],

    }


In [44]:
from sklearn.model_selection import GridSearchCV
grs_cv = GridSearchCV(
    stc,
    param_grid=prms,
    verbose=2
)
grs_cv.fit(data,target['10일 뒤 종가'])

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END randomforestclassifier__bootstrap=True, randomforestclassifier__criterion=gini, xgbclassifier__n_estimators=100, xgbclassifier__objective=binary:logistic; total time=   2.4s
[CV] END randomforestclassifier__bootstrap=True, randomforestclassifier__criterion=gini, xgbclassifier__n_estimators=100, xgbclassifier__objective=binary:logistic; total time=   2.3s
[CV] END randomforestclassifier__bootstrap=True, randomforestclassifier__criterion=gini, xgbclassifier__n_estimators=100, xgbclassifier__objective=binary:logistic; total time=   2.3s
[CV] END randomforestclassifier__bootstrap=True, randomforestclassifier__criterion=gini, xgbclassifier__n_estimators=100, xgbclassifier__objective=binary:logistic; total time=   2.3s
[CV] END randomforestclassifier__bootstrap=True, randomforestclassifier__criterion=gini, xgbclassifier__n_estimators=100, xgbclassifier__objective=binary:logistic; total time=   2.3s
[CV] END randomforestcl

In [47]:
grs_cv.best_params_

{'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__criterion': 'entropy',
 'xgbclassifier__n_estimators': 100,
 'xgbclassifier__objective': 'multi:softprob'}