This notebook shows how to use the hyperopt and FLAML libraries to tune hyperparameters for LightGBM.

If you're interested, @mlconsult also published a great notebook on [Tuning lightgbm with optuna](https://www.kaggle.com/mlconsult/how-to-tune-lgbm-with-optuna)

### Reference

Most of this notebook is inspired by these wonderful GitHub repositories:

1. [ml course ai hyperopt](https://github.com/Yorko/mlcourse.ai/blob/master/jupyter_english/tutorials/hyperparameters_tunning_ilya_larchenko.ipynb)

2. [Flaml github](https://github.com/microsoft/FLAML)


Thanks to @devinanzelmo for the [wifi features](https://www.kaggle.com/devinanzelmo/wifi-features) notebook, which shows how to use wifi features. Here I'm using 1000 as the minimum count for a wifi BSSID.

In [None]:
!pip install flaml[notebook]

## Libraries import

In [None]:
# common imports
import os
import glob
import random
import numpy as np
import pandas as pd
from scipy.stats import randint

# models libraries
#from lightgbm.sklearn import LGBMRegressor
from lightgbm import LGBMRegressor,LGBMClassifier
from sklearn.svm import SVR

# sklearn imports 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

# hyperopt imports to perform bayesian optimisation 
from hyperopt import Trials, anneal, fmin, hp, space_eval, tpe

%matplotlib inline

In [None]:
# AutoML class from the flaml package; this single instance is the one fitted
# later in the FLAML tuning cell.
from flaml import AutoML
automl = AutoML()

## Helper functions

In [None]:
# competition metric: mean planar distance error plus a floor-mismatch penalty
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Return the mean per-sample error between predictions and ground truth.

    Each sample contributes the Euclidean distance between (xhat, yhat) and
    (x, y) plus 15 times the absolute difference |fhat - f|. The contributions
    are summed and divided by the number of predictions (``xhat.shape[0]``).
    """
    planar_error = np.sqrt(np.power(xhat - x, 2) + np.power(yhat - y, 2))
    floor_penalty = 15 * np.abs(fhat - f)
    per_sample = planar_error + floor_penalty
    return per_sample.sum() / xhat.shape[0]

def seed_everything(seed=42):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # torch seeding is kept disabled (torch is not imported in this notebook)
    #torch.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    #torch.backends.cudnn.deterministic = True

# single seed shared by the RNG seeding, the models and the searches below
SEED = 42
seed_everything(SEED)


# cv strategy: group-wise K-fold, so rows sharing a group value stay in the same fold
N_FOLDS = 5
folds = GroupKFold(n_splits=N_FOLDS)

# which optimisation to perform (each flag gates the corresponding cells further down)
perform_RandomCVSearch = False
perform_hyperoptParsenEstimator = False
perform_hyperoptSimpleAnnealing = False
perfom_flaml = True


# number of experiments to perform for hyperopt (also used as n_iter of the randomized search)
n_iter = 100

# target time for flaml, in seconds
timeLimit = 30 

## Read sample data

In [None]:
# root folder of the precomputed wifi feature CSVs
feature_dir = "../input/indoor-navigation-and-location-wifi-features/wifi_features"

# collect the train and test feature files; sorting keeps the file order stable between runs
train_files = glob.glob(os.path.join(feature_dir, 'train/*_train.csv'))
train_files.sort()
test_files = glob.glob(os.path.join(feature_dir, 'test/*_test.csv'))
test_files.sort()

# sample submission, indexed by its first column
ssubm = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv', index_col=0)
print(len(train_files),len(test_files))

In [None]:
# select one training file by its position in train_files and load it
# (the first CSV column becomes the index); the target coordinate is picked in a later cell
e = 0
data = pd.read_csv(train_files[e], index_col=0)
print(data.shape)
data.head(3)

## Prepare model inputs and outputs

In [None]:
# Columns are addressed by position: everything except the last four columns is
# the feature matrix; columns -4, -3 and -2 are the three targets.
# NOTE(review): going by the variable names these are x, y and floor - confirm
# against the feature files.
groups = data["path"]  # grouping key for GroupKFold
x_train = data.iloc[:, :-4].values.astype(int)
y_trainx = data.iloc[:, -4].values.astype(float)
y_trainy = data.iloc[:, -3].values.astype(float)
y_trainf = data.iloc[:, -2].values.astype(float)

In [None]:
# normalise inputs: fit the scaler on the feature matrix, then replace it with its scaled version
stdScaler = StandardScaler().fit(x_train)
x_train = stdScaler.transform(x_train)

## Baseline Lightgbm and SVR model 

In [None]:
%%time
# baseline lightgbm model
model = LGBMRegressor(n_estimators=125, num_leaves=90, random_state=SEED)
results = -cross_val_score(model, X=x_train, y=y_trainy, groups=groups, 
                              scoring="neg_mean_squared_error", cv=folds, n_jobs=-1)
print(f"Cross val score for y coordinate is {results.mean()}")
print(results)

In [None]:
# %%time
# baseline svm model
# svrModel = SVR(C=100.0, epsilon=0.01)
# results = -cross_val_score(svrModel, X=x_train, y=y_trainy, groups=groups, 
#                              scoring="neg_mean_squared_error", cv=folds, n_jobs=-1)
# print(f"Cross val score for y coordinate is {results.mean()}")
# print(results)

## Randomized grid search

In [None]:
%%time
if perform_RandomCVSearch == True:

    param_grid_rand = {
    "learning_rate": np.logspace(-5, 0, 100),
    "max_depth": randint(2, 20),
    "n_estimators": randint(100, 2000),
    "random_state": [SEED],
    }
    
    rs = RandomizedSearchCV(model,
        param_grid_rand,
        n_iter=n_iter,
        scoring="neg_mean_squared_error",
        #fit_params=None,
        n_jobs=-1,
        cv=folds,
        verbose=True,
        random_state=SEED,
    )

    rs.fit(x_train, y_trainy, groups=groups)
    print("Best MSE {:.3f} params {}".format(-rs.best_score_, rs.best_params_))

In [None]:
# Plot the randomized-search trials: one subplot per column, in trial order.
if perform_RandomCVSearch:
    cv_results = rs.cv_results_
    # Build the frame column by column and cast to float, so the columns are
    # numeric up front instead of relying on pandas inferring numbers from an
    # all-object array at plot time.
    rs_results_df = pd.DataFrame(
        {
            "score": -cv_results["mean_test_score"],  # negated scorer -> plain MSE
            "learning_rate": cv_results["param_learning_rate"].data,
            "max_depth": cv_results["param_max_depth"].data,
            "n_estimators": cv_results["param_n_estimators"].data,
        }
    ).astype(float)
    rs_results_df.plot(subplots=True, figsize=(10, 10))

## Hyperopt tuning methods
### Tree-structured Parzen Estimator and Simple Annealing

In [None]:
def gb_mse_cv(params, X=x_train, y=y_trainy, cv=folds, random_state=SEED, groups=groups):
    """Return the cross-validated MSE of a LightGBM regressor for one parameter sample.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with the keys "num_leaves", "feature_fraction",
        "bagging_fraction", "learning_rate", "n_estimators", "lambda_l1",
        "lambda_l2" and "min_child_samples".
    X, y : array-like
        Features and target to cross-validate on (default: the notebook's
        training matrix and y target).
    cv : cross-validation splitter
        Splitter passed to ``cross_val_score`` (default: the notebook's folds).
    random_state : int
        Seed passed to the LGBMRegressor.
    groups : array-like
        Group labels forwarded to the splitter; must line up with ``X``/``y``.

    Returns
    -------
    float
        Mean squared error averaged over the folds (lower is better).
    """
    # the function gets a set of variable parameters in "params"
    lgb_params = {
        "objective": "regression",
        "metric": "l2",
        "verbosity": -1,

        # fixed params
        "boosting_type": "gbdt",
        "subsample_freq": 20,
        "max_depth": 6,

        # variable parameters (hyperopt's quniform yields floats, hence the int casts)
        "num_leaves": int(params["num_leaves"]),
        "feature_fraction": float(params["feature_fraction"]),
        "bagging_fraction": float(params["bagging_fraction"]),
        "learning_rate": float(params["learning_rate"]),
        "n_estimators": int(params["n_estimators"]),
        "lambda_l1": float(params["lambda_l1"]),
        "lambda_l2": float(params["lambda_l2"]),
        "min_child_samples": int(params["min_child_samples"]),
    }

    # honour the caller-supplied seed instead of always using the global SEED
    model = LGBMRegressor(random_state=random_state, **lgb_params)

    # honour the caller-supplied splitter and groups; the scorer returns negated
    # MSE, so flip the sign of the fold average
    fold_scores = cross_val_score(model, X=X, y=y, groups=groups,
                                  scoring="neg_mean_squared_error", cv=cv, n_jobs=-1)
    return -fold_scores.mean()

In [None]:
# possible values of parameters (hyperopt search space consumed by gb_mse_cv)
space = {
        # variable parameters
        # quniform yields floats rounded to a multiple of q; gb_mse_cv casts them to int
        "num_leaves": hp.quniform("num_leaves", 10, 100, 1),
        # hp.choice over the three values produced by np.linspace(0.4, 0.7, 3);
        # note that fmin's result reports a choice as the index of the chosen option
        "feature_fraction": hp.choice('feature_fraction', np.linspace(0.4, 0.7, 3,dtype=float)),
        "bagging_fraction": hp.choice('bagging_fraction', np.linspace(0.4, 0.7, 3,dtype=float)),        
        # loguniform bounds are given in log space: exp(-2)..exp(-1), roughly 0.135..0.368
        "learning_rate": hp.loguniform("learning_rate", -2, -1), 
        "n_estimators": hp.quniform("n_estimators", 500, 10000, 1),
        # exp(-6)..exp(1), roughly 0.0025..2.72
        "lambda_l1": hp.loguniform("lambda_l1", -6, 1.0), 
        "lambda_l2": hp.loguniform("lambda_l2", -6, 1.0), 
        "min_child_samples": hp.quniform("min_child_samples", 5, 100, 1)
        }

# trials will contain logging information
trials = Trials()

In [None]:
# hyperopt suggestion algorithm handed to fmin; stays None when nothing is enabled
tuningAlgorithm = None

# choice of tuning algorithm
# Annealing takes precedence over TPE when both hyperopt flags are set. The
# 'flaml' label is only used when no hyperopt search is selected, so enabling
# FLAML alongside hyperopt no longer replaces the algorithm passed to fmin
# with a plain string.
if perform_hyperoptSimpleAnnealing:
    tuningAlgorithm = anneal.suggest
elif perform_hyperoptParsenEstimator:
    tuningAlgorithm = tpe.suggest
elif perfom_flaml:
    tuningAlgorithm = 'flaml'

In [None]:
# show which tuning algorithm was selected
print(tuningAlgorithm)

In [None]:
%%time
if((perform_hyperoptParsenEstimator == True) or (perform_hyperoptSimpleAnnealing == True)):
    best = fmin(
        fn=gb_mse_cv,                       # function to optimize
        space=space,                        # search space
        algo=tuningAlgorithm,               # optimization algorithm, hyperotp will select its parameters automatically
        max_evals=n_iter,                   # maximum number of iterations
        trials=trials,                      # logging
        show_progressbar=True,
        rstate=np.random.RandomState(SEED), # fixing random state for the reproducibility
    )
    print("Best MSE {:.3f} params {}".format(gb_mse_cv(best), best))

## Plot optimizer results

In [None]:
# Plot the loss and the sampled hyperparameters of every hyperopt trial, in trial order.
if perform_hyperoptParsenEstimator or perform_hyperoptSimpleAnnealing:
    results_columns = ["score", "n_estimators", "learning_rate", "num_leaves", "feature_fraction",
                       "bagging_fraction", "lambda_l1", "lambda_l2", "min_child_samples"]
    param_columns = results_columns[1:]

    trial_rows = []
    for trial in trials.trials:
        # each trial stores its sampled values as one-element lists keyed by label
        sampled = {label: values[0] for label, values in trial["misc"]["vals"].items()}
        # hp.choice parameters are stored as option indices; space_eval turns
        # them back into the actual feature_fraction / bagging_fraction values
        resolved = space_eval(space, sampled)
        trial_rows.append([trial["result"]["loss"]] + [resolved[name] for name in param_columns])

    optimizer_results = np.array(trial_rows)

    # create a df to plot
    optimizer_results_df = pd.DataFrame(optimizer_results, columns=results_columns)
    optimizer_results_df.plot(subplots=True, figsize=(10, 10));

In [None]:
# FLAML search on the y target within the configured time budget (opt-in via flag)
if perfom_flaml == True:
    settings = dict(
        metric='mse',                                 # primary metrics for regression can be chosen from: ['mae','mse','r2']
        task='regression',                            # task type
        log_file_name='lightgbm_ycoorindate.log',     # flaml log file
        estimator_list=['lgbm', 'xgboost'],           # list of ML learners to search over
        time_budget=timeLimit,                        # total running time in seconds
        eval_method='cv',
        n_splits=N_FOLDS,
    )

    # fit algorithms
    # NOTE(review): `groups` is not passed here, so this cv presumably does not
    # use the path grouping that GroupKFold applies in the other searches - confirm
    automl.fit(X_train=x_train, y_train=y_trainy, **settings)

    # summary of the best configuration found
    print('Best hyperparmeter config:', automl.best_config)
    print('Best mse on validation data: {0:.4g}'.format(automl.best_loss))
    print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

    print(automl.model)