## GPU based PySpark XGBoost

##### Importing XGBoost, Hyperopt, scikit-learn, pandas, and other helper packages

In [3]:
import xgboost as xgb

from hyperopt import hp, fmin, tpe, STATUS_OK, SparkTrials

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd

import os
import shutil
import tempfile

## Data Loading

##### For a large dataset, broadcasting the data would consume significant cluster resources. Instead, we store the data on DBFS and load it back on the workers via the DBFS local file interface.

See Databricks best practices for HyperOpt: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html

In [6]:
def load(path):
    """
    Loads saved data (a tuple of numpy arrays) from an ``.npz`` archive.

    Args:
        path: Location of the ``.npz`` file, as written by ``np.savez``.

    Returns:
        A list holding the stored arrays, in the order reported by the
        archive's ``files`` attribute.

    Reference: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html
    """
    # np.load returns an NpzFile that keeps the underlying file open; the
    # context manager closes it once every array has been read into memory,
    # so repeated calls do not leak file handles.
    with np.load(path) as archive:
        return [archive[name] for name in archive.files]
    
def save_to_dbfs(data, dbfs_tmp_dir="/dbfs/ml/tmp/hyperopt"):
    """
    Saves input data (a tuple of numpy arrays) to a temporary file on DBFS and returns its path.

    Args:
        data: Iterable of array-likes; each one is stored positionally in a
            single ``.npz`` archive via ``np.savez``.
        dbfs_tmp_dir: Directory under which a fresh, uniquely named
            sub-directory is created to hold the archive. Defaults to the
            shared DBFS location used for hyperopt data.

    Returns:
        Path of the written ``data.npz`` file. The file sits alone in its own
        temporary directory, so removing that parent directory cleans up
        everything this function created.

    Reference: https://docs.databricks.com/applications/machine-learning/automl/hyperopt/hyperopt-best-practices.html
    """
    data_filename = "data.npz"
    # Save data to a local file first. The staging directory is removed
    # automatically when the block exits, so nothing is left behind locally.
    with tempfile.TemporaryDirectory() as local_data_dir:
        local_data_path = os.path.join(local_data_dir, data_filename)
        np.savez(local_data_path, *data)
        # Move it to DBFS, which is shared among cluster nodes.
        os.makedirs(dbfs_tmp_dir, exist_ok=True)
        dbfs_data_dir = tempfile.mkdtemp(dir=dbfs_tmp_dir)
        dbfs_data_path = os.path.join(dbfs_data_dir, data_filename)
        shutil.move(local_data_path, dbfs_data_path)
    return dbfs_data_path

##### Preparing XGBoost Data

In [8]:
def prepare_xgb_data(data, id_col="Id", label_col="Label", test_size=0.2):
    """
    Prepare data for xgboost training.

    Args:
        data: DataFrame containing the id column, the label column and the
            feature columns. It is not modified.
        id_col: Name of the row-identifier column; excluded from the features.
        label_col: Name of the target column.
        test_size: Passed to ``train_test_split`` to size the held-out split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split`` with a fixed ``random_state`` of 21.
    """
    # Select the label and the features directly rather than appending a
    # temporary column to `data`, which would leak a "<label>Temp" column
    # into the caller's DataFrame.
    y = data[label_col]
    X = data.drop([id_col, label_col], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=21)
    return X_train, X_test, y_train, y_test
    
def get_raw_data(file_name, sample_size, base_dir="/dbfs/FileStore/tables/", random_state=None):
    """
    Read a CSV file, drop incomplete rows and return a random sample of rows.

    Args:
        file_name: Name of the CSV file, relative to ``base_dir``.
        sample_size: Number of rows to sample. If fewer complete rows are
            available, all of them are returned (in shuffled order) instead
            of raising.
        base_dir: Directory that contains ``file_name``. Defaults to the
            DBFS FileStore tables directory.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for
            reproducible sampling; ``None`` keeps sampling non-deterministic.

    Returns:
        A pandas DataFrame with the sampled rows, free of missing values.
    """
    # Strip leading slashes so os.path.join cannot discard base_dir when the
    # file name is given with a leading "/".
    input_file_location = os.path.join(base_dir, file_name.lstrip("/"))
    pdf = pd.read_csv(input_file_location).dropna()
    if sample_size is not None:
        # DataFrame.sample raises when asked for more rows than exist
        # (without replacement), so cap the request at what is available.
        sample_size = min(sample_size, len(pdf))
    return pdf.sample(n=sample_size, random_state=random_state)

## XGBoost Regression with Hyperopt + Spark Trials

In [10]:
def objective(space):
    """
    Hyperopt objective: fit one XGBoost regressor for the sampled
    hyperparameters and report its error on the held-out split.

    Args:
        space: Mapping with the keys 'n_estimators', 'colsample_bytree',
            'learning_rate', 'max_depth', 'alpha' and 'tree_method'.

    Returns:
        Dict with 'loss' (mean squared error on the test split) and
        'status' (``STATUS_OK``), as consumed by ``fmin``.

    The training data is read from the module-level ``data_large_path``.
    """
    # Integer-valued settings are cast because the search space may supply
    # them as floats.
    model_kwargs = {
        'objective': 'reg:squarederror',
        'n_estimators': int(space['n_estimators']),
        'colsample_bytree': space['colsample_bytree'],
        'learning_rate': space['learning_rate'],
        'max_depth': int(space['max_depth']),
        'alpha': space['alpha'],
        'tree_method': space['tree_method'],
    }
    regressor = xgb.XGBRegressor(**model_kwargs)

    # Fetch the four arrays stored at the shared data path.
    arrays = load(data_large_path)
    X_train, X_test = arrays[0], arrays[1]
    y_train, y_test = arrays[2], arrays[3]

    # Fit while monitoring RMSE on both splits; training stops early after
    # 10 rounds without improvement.
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric="rmse",
        early_stopping_rounds=10,
        verbose=False,
    )

    # Score the fitted model on the held-out split.
    test_mse = mean_squared_error(y_test, regressor.predict(X_test))
    return {'loss': test_mse, 'status': STATUS_OK}

def run_hyperopt(df, treemethod, parallelism, max_evals):
    """
    Search the XGBoost hyperparameter space with hyperopt on Spark and
    return the best setting found.

    Args:
        df: Accepted for interface compatibility; not used by this function.
        treemethod: Value placed under 'tree_method' in the search space
            (a fixed setting, not a sampled one).
        parallelism: Forwarded to ``SparkTrials``; when ``None`` the
            ``SparkTrials`` default is used.
        max_evals: Number of evaluations ``fmin`` is allowed to run.

    Returns:
        The dict returned by ``fmin``; it is also printed.
    """
    # Only pass `parallelism` through when the caller actually supplied one.
    trials_kwargs = {} if parallelism is None else {'parallelism': parallelism}

    # Ranges explored by the TPE algorithm.
    search_space = {
        'max_depth': hp.quniform('max_depth', 4, 16, 1),
        'alpha': hp.uniform('alpha', 1, 10),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1),
        'learning_rate': hp.uniform('learning_rate', 0.1, 1),
        'n_estimators': hp.quniform('n_estimators', 25, 500, 25),
        'tree_method': treemethod,
    }
    spark_trials = SparkTrials(**trials_kwargs)

    # Run the search.
    best_param = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=spark_trials,
    )
    print(best_param)

    return best_param

## Train

##### The parallelism parameter is set to 2 for 2 GPUs, which effectively uses 2 GPUs in parallel. Each new hyperparameter setting tested is chosen based on previous results. Setting parallelism between 1 and max_evals lets you trade off scalability (getting results faster) against adaptiveness (sometimes getting better models). For GPU training, it is advised to set parallelism to the number of GPUs used for training.

In [13]:
# Dataset
file_name = "your_file_name.csv"  # dataset file name
id_col = "unique_id_column_name"  # unique id for each row
label_col = "label_column_name"  # label column name

# Load data, split it, and stage the split on DBFS so workers can read it.
df = get_raw_data(file_name=file_name, sample_size=10000)
data_large = prepare_xgb_data(df, id_col=id_col, label_col=label_col, test_size=0.2)
data_large_path = save_to_dbfs(data_large)

try:
    # Run training
    best_param = run_hyperopt(df, treemethod='gpu_hist', parallelism=2, max_evals=10)  # Set parallelism = Number of GPUs
finally:
    # Cleanup. data_large_path points at the .npz file itself; rmtree only
    # removes directories, and with ignore_errors=True that failure would go
    # unnoticed. Remove the dedicated temporary directory that save_to_dbfs
    # created around the file instead. Doing it in `finally` ensures the
    # staged data is removed even when the search fails.
    shutil.rmtree(os.path.dirname(data_large_path), ignore_errors=True)