# In [None]:
import time
import numpy as np
import pandas as pd   # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb

from tune_sklearn import TuneSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
import ray

# Attach this driver to the Ray instance already running on the local node
# (address='auto' connects to an existing cluster rather than starting one).
# ignore_reinit_error=True makes a second execution in the same process
# (e.g. re-running the notebook cell) a no-op instead of raising.
ray.init(address='auto', ignore_reinit_error=True)

# Hyperparameter tuning with the tune-sklearn library.
# The decorator turns the function into a Ray remote function (task).
@ray.remote
def tune_search_tuning(csv_path: str = "train.csv",
                       dataset_size: int = 1000,
                       n_trials: int = 25) -> dict:
    """Tune an XGBoost multiclass classifier with ``TuneSearchCV``.

    Reads the first ``dataset_size`` rows of ``csv_path``, holds out 5% of
    them for validation, runs the hyperparameter search on the remaining rows
    and prints the cross-validation results, the best combination and the
    accuracy obtained on the held-out rows.

    Args:
        csv_path: CSV file containing a ``label`` column; every other column
            is used as a feature. A relative path is resolved against the
            working directory of the process that executes the task.
        dataset_size: Number of leading rows of the CSV to use.
        n_trials: Number of hyperparameter combinations to evaluate.

    Returns:
        The best hyperparameter combination found (``best_params_``).
    """
    # only the first `dataset_size` rows are used to keep the search short
    train_df = pd.read_csv(csv_path).iloc[:dataset_size]

    y = train_df["label"].values
    x = train_df.drop("label", axis=1).values

    # define the train set and the validation set
    # the validation data is only used for one final accuracy check,
    # so we minimize its size to just 5%.
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.05)
    print("Shapes - X_train: ", x_train.shape, ", X_val: ", x_val.shape, ", y_train: ", y_train.shape, ", y_val: ", y_val.shape)

    # numpy arrays are not accepted in params attributes,
    # so we use python comprehension notation to build lists
    # (the loop variable is deliberately not called `x`, which is the
    # feature matrix above)
    # NOTE(review): num_class is fixed to 10 - presumably the label column
    # holds 10 distinct classes; confirm against the data set.
    params = {'max_depth': [3, 6, 10, 15],
              'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
              'subsample': [0.5 + step / 100 for step in range(10, 50, 10)],
              'colsample_bytree': [0.5 + step / 100 for step in range(10, 50, 10)],
              'colsample_bylevel': [0.5 + step / 100 for step in range(10, 50, 10)],
              'n_estimators': [100, 500, 1000],
              'num_class': [10]
              }

    # define the booster classifier indicating the objective as
    # multiclass "multi:softmax" and try to speed up execution
    # by setting parameter tree_method = "hist"
    xgbclf = xgb.XGBClassifier(objective="multi:softmax",
                               tree_method="hist")

    # TuneSearchCV is used in place of RandomizedSearchCV
    # n_trials sets the number of iterations (different hyperparameter
    # combinations) that will be evaluated
    # verbosity can be set from 0 to 3 (debug level).
    tune_search = TuneSearchCV(estimator=xgbclf,
                               param_distributions=params,
                               scoring='accuracy',
                               n_trials=n_trials,
                               verbose=1)

    # perform hyperparameter tuning
    tune_search.fit(x_train, y_train)

    print("cv results: ", tune_search.cv_results_)

    best_combination = tune_search.best_params_
    print("Best parameters:", best_combination)

    # evaluate accuracy based on the held-out validation rows, using the
    # best estimator that the search refits on the whole training split
    # (refit is left at its default)
    val_accuracy = metrics.accuracy_score(y_val, tune_search.predict(x_val))
    print("Validation accuracy:", val_accuracy)

    return best_combination

if __name__ == "__main__":

    # wall-clock timestamp taken right before the task is submitted
    start_time = time.time()

    # submit the tuning function as a Ray task
    remote_clf = tune_search_tuning.remote()

    # wait for the task and fetch the value it returned
    best_params = ray.get(remote_clf)

    # wall-clock timestamp taken once the result is available
    stop_time = time.time()
    elapsed = stop_time - start_time
    print("Stopping at :", stop_time)
    print("Total elapsed time: ", elapsed)

    print("Best params from main function: ", best_params)