In [23]:
#import lightgbm as lgb
import joblib
from lightgbm import LGBMClassifier
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import make_scorer
import optuna
import optuna.integration.lightgbm as lgb
from optuna.integration import LightGBMPruningCallback
from lightgbm import LGBMClassifier
#from optuna.integration._lightgbm_tuner.sklearn import LGBMClassifier
from utils import g_score

In [29]:
X_train = np.load('X_train.npy',allow_pickle=True)
y_train = np.load('y_train.npy')
feature_names = list(np.load('feature_names.npy'))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [30]:
X_train = X_train[:20000]
y_train = y_train[:20000]

In [15]:
def objective(trial, X_train, y_train, cv):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }
    
    model = LGBMClassifier(**param_grid)
    return cross_val_score(model, X_train, y_train, scoring=g_score, cv=cv, n_jobs=-1).mean()

In [31]:
optuna.logging.set_verbosity(optuna.logging.INFO)
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", study_name="LGBM_ht", sampler=sampler)
joblib.dump(study, "LGBM_ht.pkl")
func = lambda trial: objective(trial, X_train, y_train, cv)
try:
    study.optimize(func, n_jobs= -1, timeout= 100, show_progress_bar=True)  #n_trials=20
    joblib.dump(study, "LGBM_ht.pkl")
except Exception as error:
    joblib.dump(study, "LGBM_ht.pkl")
    print(error)



[32m[I 2023-02-10 06:35:50,261][0m A new study created in memory with name: LGBM_ht[0m




[32m[I 2023-02-10 06:36:19,570][0m Trial 0 finished with value: 17904000.0 and parameters: {'n_estimators': 10000, 'learning_rate': 0.17837287633030227, 'num_leaves': 140, 'max_depth': 10, 'min_data_in_leaf': 8800, 'lambda_l1': 0, 'lambda_l2': 45, 'min_gain_to_split': 3.8058627535561356, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 0 with value: 17904000.0.[0m




[32m[I 2023-02-10 06:37:26,259][0m Trial 3 finished with value: 17904000.0 and parameters: {'n_estimators': 10000, 'learning_rate': 0.1955233250224563, 'num_leaves': 1560, 'max_depth': 5, 'min_data_in_leaf': 4000, 'lambda_l1': 20, 'lambda_l2': 35, 'min_gain_to_split': 13.952728375181495, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 0 with value: 17904000.0.[0m








[32m[I 2023-02-10 06:38:56,864][0m Trial 1 finished with value: 18407200.0 and parameters: {'n_estimators': 10000, 'learning_rate': 0.10561605271803372, 'num_leaves': 2600, 'max_depth': 11, 'min_data_in_leaf': 4700, 'lambda_l1': 45, 'lambda_l2': 60, 'min_gain_to_split': 10.16832626602109, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 1 with value: 18407200.0.[0m




[32m[I 2023-02-10 06:41:13,216][0m Trial 4 finished with value: 17904000.0 and parameters: {'n_estimators': 10000, 'learning_rate': 0.05134445766466694, 'num_leaves': 200, 'max_depth': 11, 'min_data_in_leaf': 8900, 'lambda_l1': 100, 'lambda_l2': 20, 'min_gain_to_split': 11.969011090852904, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 1 with value: 18407200.0.[0m




[32m[I 2023-02-10 06:42:05,330][0m Trial 2 finished with value: 18256800.0 and parameters: {'n_estimators': 10000, 'learning_rate': 0.061485924907781216, 'num_leaves': 2760, 'max_depth': 6, 'min_data_in_leaf': 2300, 'lambda_l1': 65, 'lambda_l2': 50, 'min_gain_to_split': 12.12310600597895, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 1 with value: 18407200.0.[0m




[32m[I 2023-02-10 06:44:38,339][0m Trial 5 finished with value: 18866800.0 and parameters: {'n_estimators': 10000, 'learning_rate': 0.0818879444738594, 'num_leaves': 80, 'max_depth': 10, 'min_data_in_leaf': 2200, 'lambda_l1': 5, 'lambda_l2': 30, 'min_gain_to_split': 8.380538283610717, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 5 with value: 18866800.0.[0m


In [32]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_bagging_fraction,params_bagging_freq,params_feature_fraction,params_lambda_l1,params_lambda_l2,params_learning_rate,params_max_depth,params_min_data_in_leaf,params_min_gain_to_split,params_n_estimators,params_num_leaves,state
0,0,17904000.0,2023-02-10 06:35:50.277806,2023-02-10 06:36:19.569576,0 days 00:00:29.291770,0.5,1,0.2,0,45,0.178373,10,8800,3.805863,10000,140,COMPLETE
1,1,18407200.0,2023-02-10 06:35:50.559298,2023-02-10 06:38:56.862807,0 days 00:03:06.303509,0.7,1,0.8,45,60,0.105616,11,4700,10.168326,10000,2600,COMPLETE
2,2,18256800.0,2023-02-10 06:35:50.566691,2023-02-10 06:42:05.329282,0 days 00:06:14.762591,0.9,1,0.3,65,50,0.061486,6,2300,12.123106,10000,2760,COMPLETE
3,3,17904000.0,2023-02-10 06:35:50.569035,2023-02-10 06:37:26.258477,0 days 00:01:35.689442,0.3,1,0.7,20,35,0.195523,5,4000,13.952728,10000,1560,COMPLETE
4,4,17904000.0,2023-02-10 06:36:19.578848,2023-02-10 06:41:13.215669,0 days 00:04:53.636821,0.6,1,0.5,100,20,0.051344,11,8900,11.969011,10000,200,COMPLETE
5,5,18866800.0,2023-02-10 06:37:26.268246,2023-02-10 06:44:38.338480,0 days 00:07:12.070234,0.8,1,0.4,5,30,0.081888,10,2200,8.380538,10000,80,COMPLETE


In [26]:
print(f"\tBest value (ganancia): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (ganancia): 832000.00000
	Best params:
		n_estimators: 10000
		learning_rate: 0.2594876362814767
		num_leaves: 2140
		max_depth: 9
		min_data_in_leaf: 700
		lambda_l1: 50
		lambda_l2: 85
		min_gain_to_split: 0.18124776275421972
		bagging_fraction: 0.30000000000000004
		bagging_freq: 1
		feature_fraction: 0.8
