In [None]:
import time
import numpy as np
import pandas as pd   # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb

from tune_sklearn import TuneSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
import ray

# Attach to the Ray instance that is already running on this node.
# ignore_reinit_error=True turns the call into a no-op when this kernel is
# already connected (cell re-run, or another cell called ray.init first)
# instead of raising an error.
ray.init(address='auto', ignore_reinit_error=True)

# function to perform the tuning using tune-search library
# add function decorator
@ray.remote
def tune_search_tuning(train_csv_path="/home/ubuntu/de2/lab3/train.csv",
                       dataset_size=1000,
                       n_trials=25):
    """Tune an XGBoost multiclass classifier with TuneSearchCV inside a Ray task.

    Args:
        train_csv_path: CSV file with the samples. It must contain a ``label``
            column; every other column is used as a feature.
        dataset_size: number of leading rows of the CSV that are kept.
        n_trials: number of hyperparameter combinations TuneSearchCV evaluates.

    Returns:
        The ``best_params_`` of the fitted search (best hyperparameter
        combination found).
    """
    train_df = pd.read_csv(train_csv_path)
    train_df = train_df.iloc[0:dataset_size, :]

    y = train_df.label.values
    x = train_df.drop('label', axis=1).values

    # Hold out 5% of the samples. They take no part in the search itself and
    # are only used for the accuracy check at the end.
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.05)
    print("Shapes - X_train: ", x_train.shape, ", X_val: ", x_val.shape, ", y_train: ", y_train.shape, ", y_val: ", y_val.shape)

    # numpy arrays are not accepted in params attributes,
    # so we use python comprehension notation to build lists.
    # The comprehension variable is named `step` so it does not reuse the
    # name of the feature matrix `x`.
    params = {'max_depth': [3, 6, 10, 15],
              'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
              'subsample': [0.5 + step / 100 for step in range(10, 50, 10)],
              'colsample_bytree': [0.5 + step / 100 for step in range(10, 50, 10)],
              'colsample_bylevel': [0.5 + step / 100 for step in range(10, 50, 10)],
              'n_estimators': [100, 500, 1000],
              'num_class': [10]
              }

    # define the booster classifier indicating the objective as
    # multiclass "multi:softmax" and try to speed up execution
    # by setting parameter tree_method = "hist"
    xgbclf = xgb.XGBClassifier(objective="multi:softmax",
                               tree_method="hist")

    # TuneSearchCV takes the place of RandomizedSearchCV.
    # n_trials sets the number of iterations (different hyperparameter
    # combinations) that will be evaluated.
    # verbosity can be set from 0 to 3 (debug level).
    tune_search = TuneSearchCV(estimator=xgbclf,
                               param_distributions=params,
                               scoring='accuracy',
                               n_trials=n_trials,
                               verbose=1)

    # perform hyperparameter tuning
    tune_search.fit(x_train, y_train)

    print("cv results: ", tune_search.cv_results_)

    best_combination = tune_search.best_params_
    print("Best parameters:", best_combination)

    # Evaluate accuracy on the held-out split, the same way the non-remote
    # variant of this function does on its test set.
    predictions = tune_search.predict(x_val)
    accuracy = metrics.accuracy_score(y_val, predictions)
    print("Accuracy: ", accuracy)

    return best_combination

# Driver: submit the tuning function as a single Ray task, block until it
# finishes, and report the wall-clock time of the whole round trip.
if __name__ == '__main__':

    # time.time() returns seconds since the epoch, so the difference printed
    # below is the elapsed wall-clock time in seconds
    start_time = time.time()

    # create the task: .remote() schedules it and immediately returns an
    # object reference instead of the result
    remote_clf = tune_search_tuning.remote()

    # get the task result: ray.get blocks until the task has finished
    best_params = ray.get(remote_clf)

    stop_time = time.time()
    print("Stopping at :", stop_time)
    print("Total elapsed time: ", stop_time - start_time)

    print("Best params from main function: ", best_params)

In [None]:
import time
import numpy as np
import pandas as pd   # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb

from tune_sklearn import TuneSearchCV
from sklearn import metrics

# function to perform the tuning using tune-search library
def tune_search_tuning(train_csv_path="/home/ubuntu/train.csv",
                       test_csv_path="/home/ubuntu/test.csv",
                       dataset_size=1000,
                       n_trials=20,
                       n_jobs=8):
    """Tune an XGBoost multiclass classifier with TuneSearchCV and score it.

    NOTE(review): this definition has the same name as the Ray-remote function
    in the first cell; running both cells in one kernel rebinds the name to
    whichever cell ran last.

    Args:
        train_csv_path: CSV file with the training samples.
        test_csv_path: CSV file with the evaluation samples.
            Both files must contain a ``label`` column; every other column is
            used as a feature.
        dataset_size: number of leading rows kept from each file.
        n_trials: number of hyperparameter combinations TuneSearchCV evaluates.
        n_jobs: ``n_jobs`` value handed to TuneSearchCV.

    Returns:
        The ``best_params_`` of the fitted search (best hyperparameter
        combination found).
    """
    train_df = pd.read_csv(train_csv_path)
    test_df = pd.read_csv(test_csv_path)

    # limit both datasets to the first `dataset_size` samples
    train_df = train_df.iloc[0:dataset_size, :]
    test_df = test_df.iloc[0:dataset_size, :]

    print("Reduced dataset size: ", train_df.shape)

    y_train = train_df.label.values
    x_train = train_df.drop('label', axis=1).values

    y_test = test_df.label.values
    x_test = test_df.drop('label', axis=1).values

    params = {'max_depth': [6, 10],
              'learning_rate': [0.1, 0.3, 0.4],
              'subsample': [0.6, 0.7, 0.8, 0.9, 1],
              'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
              'colsample_bylevel': [0.6, 0.7, 0.8, 0.9, 1],
              'n_estimators': [500, 1000],
              'num_class': [10]
              }

    # the timer covers estimator construction and the search itself,
    # not the CSV loading above
    start_time = time.time()
    print("starting at: ", start_time)

    # define the booster classifier indicating the objective
    # as multiclass "multi:softmax" and try to speed up execution
    # by setting parameter tree_method = "hist"
    xgbclf = xgb.XGBClassifier(objective="multi:softmax",
                               tree_method="hist")

    # TuneSearchCV takes the place of RandomizedSearchCV.
    # n_trials sets the number of iterations (different hyperparameter
    # combinations) that will be evaluated.
    # verbosity can be set from 0 to 3 (debug level).
    tune_search = TuneSearchCV(estimator=xgbclf,
                               param_distributions=params,
                               scoring='accuracy',
                               n_trials=n_trials,
                               n_jobs=n_jobs,
                               verbose=2)

    # perform hyperparameter tuning
    tune_search.fit(x_train, y_train)

    stop_time = time.time()
    print("Stopping at :", stop_time)
    print("Total elapsed time: ", stop_time - start_time)

    # evaluate accuracy based on the test dataset
    predictions = tune_search.predict(x_test)

    accuracy = metrics.accuracy_score(y_test, predictions)
    print("Accuracy: ", accuracy)

    return tune_search.best_params_

# Driver: run the search synchronously in this process (no Ray task is
# created here) and print the best hyperparameter combination it returns.
if __name__ == '__main__':

    best_params = tune_search_tuning()
    print("Best parameters:", best_params)

[2m[1m[36m(scheduler +33m46s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[2m[1m[36m(scheduler +33m46s)[0m Adding 1 nodes of type local.cluster.node.
[2m[1m[36m(scheduler +34m1s)[0m Resized to 4 CPUs.
[2m[1m[36m(scheduler +50m18s)[0m Removing 1 nodes of type local.cluster.node (idle).
[2m[1m[36m(scheduler +50m28s)[0m Resized to 3 CPUs.


In [None]:
from sklearn.ensemble import RandomForestClassifier
import sklearn
import time
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
import statistics
import pandas as pd
# Attach to the running Ray cluster. An earlier cell of this notebook already
# calls ray.init, so ignore_reinit_error=True is needed for this cell to run in
# the same kernel without raising on the second initialisation.
ray.init(address="auto", ignore_reinit_error=True)

def train(config):
    """Cross-validate a RandomForestClassifier built from one trial's config.

    Args:
        config: dict of concrete RandomForestClassifier keyword arguments
            (e.g. ``max_depth``, ``n_estimators``, ``ccp_alpha``). When the
            function is launched through ``tune.run`` each trial receives one
            resolved combination of the search space.

    Returns:
        dict with the key ``mean_accuracy``: the mean of the cross-validation
        test scores. Returning a dict from a Tune function trainable reports
        it as the trial's result.
    """
    # NOTE(review): the data comes from a file called test.csv and the class
    # is taken from the LAST column, while the XGBoost cells select a column
    # named `label` - confirm this is the intended file and layout.
    # The path must also be readable by whichever process runs the trial.
    train_df = pd.read_csv("/home/ubuntu/test.csv")
    train_features = train_df.iloc[:, :-1]  # all rows, all but last column
    train_classes = train_df.iloc[:, -1]    # all rows, only last column

    # Only the configured forest is evaluated; the extra cross-validation of a
    # default forest was removed because its result was never used.
    rf_classifier = RandomForestClassifier(**config)
    cv_results = sklearn.model_selection.cross_validate(
        rf_classifier, train_features, train_classes)

    return {"mean_accuracy": float(cv_results["test_score"].mean())}


if __name__ == '__main__':
    search_space = {
        "max_depth": tune.grid_search([5 * i for i in range(1, 10)]),
        "n_estimators": tune.grid_search([10, 100, 200, 300]),
        "ccp_alpha": tune.grid_search([0.0, 0.1, 0.2]),
    }
    start_time = time.time()

    # The grid_search entries are placeholders that only Tune can expand.
    # tune.run creates one trial per combination and calls train() with plain
    # values; calling train(search_space) directly would hand the placeholder
    # dicts to RandomForestClassifier as hyperparameters.
    analysis = tune.run(train, config=search_space)

    stop_time = time.time()
    print("Stopping at :", stop_time)
    print("Total elapsed time: ", stop_time - start_time)

    # pick the configuration whose trial reported the highest mean accuracy
    best_params = analysis.get_best_config(metric="mean_accuracy", mode="max")
    print("Best parameters:", best_params)