In [None]:
############ ANALYSIS OF PARALLELIZATION OF XGBOOST-ON-RAY #############

FINDINGS:
    
- BEST SETUP:
    - USE GPUs
        - as many actors as GPUs
        - USE MAX(AVAILABLE! cpus_per_actor)
    - USE ray.init() at the beginning of the code
    - USE as many file shards as GPUs or a multiple of the number of GPUs (possibly some minor improvement achieved when using a lot of smaller files)
    - USE RayDMatrix(..., distributed=True)

- GENERAL:
    - USE ray.init() at the beginning of the code:
        - for some reason it does speed up execution, 
        - although it is discussed nowhere in the documentation or other on-line help materials
    - DATA SOURCE:
        - RECOMMENDED - (MULTIPLE) PARQUET files
            - Thus installation of pyarrow
            - number of shards defines the available parallelization level
        - INPUT SIZE:
            - the bigger the input, the clearer the impact of parallelization
    - USE RayDMatrix(..., distributed=True)
    - USE as many file shards as GPUs or a multiple of the number of GPUs (possibly some minor improvement achieved when using a lot of smaller files)
    - n_jobs & num_actors do not improve execution time -- raising them often makes it even worse, probably due to communication overhead
        
- CPU SPECIFIC:
    - "no need to specify the cpus_per_actor as RAY utilizes multithreading and thus uses all available CPUs by default":
        - Lost reference but made by a member of the RAY.CORE team
        - indeed ON CPU when working with non-distributed dataset specifying this parameter has no significant impact on execution time
    - num_actors * cpus_per_actor <= num_CPU_cores:

- GPU SPECIFIC:
    - EVEN SINGLE GPU HAS FASTER EXECUTION THAN MULT_CPU INSTANCE
    - 4 GPUs has 3x SPEEDUP over 1 GPU
    
- ESTIMATORS:
    - RayXGBClassifier(n_jobs=???) - n_jobs param has no impact on execution time
    

In [None]:
# ENV PREPARATION & ANALYSIS

!pip install "xgboost_ray"
!pip install "pyarrow"

#display(ray.cluster_resources())
#print(ray.cluster_resources()['CPU'], 
#ray.cluster_resources()['GPU'])
#display(dir(ray_params.get_tune_resources()))
#ray_params.get_tune_resources().head_cpus, ray_params.get_tune_resources().bundles, 

In [None]:
# TEST 1 -- one train() call on an in-memory, artificially enlarged dataset

import xgboost_ray as xr
import ray
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer
import time
import numpy as np

ray.init()

train_x, train_y = load_breast_cancer(return_X_y=True)

# The raw dataset trains too quickly to be timed, so it is enlarged:
# every pass doubles the rows AND the columns of train_x, and doubles train_y.
for i in range(6): # 6 = x64  # on AWS c5.xLarge 6 is MAX
    train_x = np.concatenate((train_x, train_x), axis=0)
    train_x = np.concatenate((train_x, train_x), axis=1)
    train_y = np.concatenate((train_y, train_y))

print(train_x.shape, train_y.shape)

train_set = RayDMatrix(train_x, train_y)

#print(train_x.shape, type(train_x))

evals_result = {}

start_time = time.time()

# NOTE(review): one GPU is reserved per actor but the params set no GPU
# tree_method -- confirm whether the GPU is actually used by this run.
bst = train(
    {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    },
    train_set,
    evals_result=evals_result,
    evals=[(train_set, "train")],
    verbose_eval=True,
    ray_params=RayParams(
        num_actors=1,  # Number of Remote Actors
        gpus_per_actor=1,
        cpus_per_actor=1,
    ),
)

elapsed = time.time() - start_time
print("TRAIN TIME", elapsed)

bst.save_model("model.xgb")

# Prints the last recorded "error" followed by the last recorded "logloss".
train_metrics = evals_result["train"]
print("Final training error: {:.4f}, {:.4f}".format(
    train_metrics["error"][-1],
    train_metrics["logloss"][-1]))

ray.shutdown()

In [None]:
### M1

To utilize parallelism you need to add ray.init(num_cpus = XXX) or just ray.init()
# num_actors=1, cpu_per_actor = Not Specified <-- BEST (no need to specify the cpu_per_actor as RAY utilizes 
                                                        multithreading and thus uses all available CPUs by default)

WITH ray.init() # NICE SPEEDUP 
TRAIN TIME 2.66068696975708
Final training error: 0.0000, 0.0253


NO ray.init()
# num_actors=1, cpu_per_actor = 1
TRAIN TIME 6.129012107849121
Final training error: 0.0000
    
# num_actors=1, cpu_per_actor = 10
TRAIN TIME 6.171634197235107
Final training error: 0.0000
    
# num_actors=2, cpu_per_actor = 1
TRAIN TIME 7.246088027954102
Final training error: 0.0000
    
# num_actors=2, cpu_per_actor = 5
TRAIN TIME 6.13xxxxxxxxxxxxx
Final training error: 0.0000 
    
# num_actors=3, cpu_per_actor = 3
TRAIN TIME 7.325688123703003
Final training error: 0.0000
    
# num_actors=10, cpu_per_actor = 1
TRAIN TIME 10.49834394454956
Final training error: 0.0000



### AWS c5.xlarge -- 4 CPU, 0 GPU, NO ray.init()
-- HERE CPU per actor works as expected ALTHOUGH not big boost

# num_actors=1, cpu_per_actor = 1 - range(4)
TRAIN TIME 11.816567659378052
Final training error: 0.0000, 0.0253

# num_actors=1, cpu_per_actor = 1 - range(6)
TRAIN TIME 63.819782972335815
Final training error: 0.0000, 0.0238


# num_actors=1, cpu_per_actor = 4 - range(4)
TRAIN TIME 9.398769855499268
Final training error: 0.0000, 0.0253

# num_actors=1, cpu_per_actor = 4 - range(6)
TRAIN TIME 54.842820167541504
Final training error: 0.0000, 0.0238



### AWS p3.2xlarge -- 4 CPU, 1 GPU, NO ray.init()

# num_actors=1, cpu_per_actor = 1 - range(6)
TRAIN TIME 36.15526247024536
Final training error: 0.0000, 0.0238

# num_actors=1, cpu_per_actor = 4 - range(6)
TRAIN TIME 32.72776222229004
Final training error: 0.0000, 0.0238

# num_actors=1, cpu_per_actor = 1, GPU = 1 - range(6)
TRAIN TIME 35.24531555175781
Final training error: 0.0000, 0.0238

In [None]:
# TEST 2 -- distributed prediction with the model file saved by a training cell

# NOT SURE IF I RECORDED RESULTS -- PROBABLY A SEARCH FOR A SPECIFICATION THAT RESULTS IN VISIBLE PARALLELIZATION

# `time` and `ray` are imported here so the cell does not depend on an
# earlier cell having imported them.
import time

import ray
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from xgboost_ray import RayDMatrix, RayParams, predict

train_x, train_y = load_breast_cancer(return_X_y=True)

# Disabled: artificial enlargement of the dataset (re-enabling it also needs
# `import numpy as np`).
# # NEED TO ARTIFICIALLY BOOST MODEL SIZE DUE TO TOO QUICK TRAINING
# for i in range(4): # 6 = x64
#     train_x = np.vstack([train_x,train_x])
#     train_x = np.hstack([train_x,train_x])
#     train_y = np.hstack([train_y, train_y])

print(train_x.shape, train_y.shape)

dpred = RayDMatrix(train_x, train_y)

# NOTE(review): "model.xgb" is written by the training cells; presumably it has
# to come from a run whose feature count matches train_x -- confirm before use.
bst = xgb.Booster(model_file="model.xgb")

start_time = time.time()

pred_ray = predict(bst, dpred, ray_params=RayParams(num_actors=4))

# This cell measures prediction, not training, so the label says so.
print("PREDICT TIME", time.time() - start_time)

print(pred_ray)

ray.shutdown()

In [None]:
# TEST 3

# Builds the synthetic train/eval RayDMatrix objects used by the next cell.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost_ray import RayDMatrix, RayParams, train

import time
import ray

# Synthetic binary-classification problem: 1000 features, 50 of them informative.
X, y = make_classification(n_samples=20000, #20000
                           n_features=1000,
                           n_informative=50,
                           n_redundant=0,
                           random_state=1)
print(X.shape, y.shape)

# Half of the samples go to training, the other half to evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=1)

train_set, eval_set = RayDMatrix(X_train, y_train), RayDMatrix(X_test, y_test)
print("DONE!")

In [None]:
#ray.init(num_cpus = 2, num_gpus = 1) #, ignore_reinit_error=True)

# Trains on `train_set` / `eval_set`, which the previous cell must have created.
evals_result = {}

start_time = time.time()

bst = train(
    {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "tree_method": "gpu_hist",
    },
    train_set,
    num_boost_round=10,
    evals_result=evals_result,
    evals=[(train_set, "train"), (eval_set, "eval")],
    verbose_eval=True,
    ray_params=RayParams(
        num_actors=1,
        gpus_per_actor=1,
        cpus_per_actor=1,  # Divide evenly across actors per machine
    ),
)

elapsed = time.time() - start_time
print("TRAIN TIME", elapsed)

bst.save_model("model.xgb")

# Report the last recorded "error" value of each evaluation set.
final_train_error = evals_result["train"]["error"][-1]
final_eval_error = evals_result["eval"]["error"][-1]
print("Final training error: {:.4f}".format(final_train_error))
print("Final validation error: {:.4f}".format(final_eval_error))

ray.shutdown()

In [None]:
### M1
#num_actors = 1/cpus_per_actor = 1
TRAIN TIME 7.220772981643677
Final training error: 0.0723
Final validation error: 0.1279
    
#2/1
TRAIN TIME 7.784131050109863
Final training error: 0.0730
Final validation error: 0.1317    
    
#10/1 -- Failed

#1/1
TRAIN TIME 9.191377878189087
Final training error: 0.0568
Final validation error: 0.1448
    
#1/2
TRAIN TIME 8.250745058059692
Final training error: 0.0568
Final validation error: 0.1448
    
#1/4
TRAIN TIME 8.249265909194946
Final training error: 0.0568
Final validation error: 0.1448  
    
#1/10    
TRAIN TIME 8.197720766067505
Final training error: 0.0568
Final validation error: 0.1448



### AWS c5.xlarge -- For this task PARALLELIZATION DOES NOT WORK?!

#1/1, n_samples = 20000
TRAIN TIME 17.756388425827026
Final training error: 0.0568
Final validation error: 0.1448

#1/4, 20000
TRAIN TIME 18.528228521347046
Final training error: 0.0568
Final validation error: 0.1448

#4/1, 20000 -- FAILED DUE TO MEMORY 
TRAIN TIME 18.528228521347046
Final training error: 0.0568
Final validation error: 0.1448

#4/1, 10000
TRAIN TIME 31.477298498153687
Final training error: 0.0342
Final validation error: 0.1736

#1/1, 10000
TRAIN TIME 17.673475980758667
Final training error: 0.0302
Final validation error: 0.1650



### AWS p3.2xlarge

#1/1, 10000
TRAIN TIME 19.729962587356567
Final training error: 0.0302
Final validation error: 0.1650

#1/1, 20000
TRAIN TIME 21.833100080490112
Final training error: 0.0568
Final validation error: 0.1448

#1/8, 20000
TRAIN TIME 18.813719272613525
Final training error: 0.0568
Final validation error: 0.1448

#1/8/1 GPU, 20000
TRAIN TIME 9.853558540344238
Final training error: 0.0576
Final validation error: 0.1478

#1/1/1 GPU, 20000
TRAIN TIME 9.803324222564697
Final training error: 0.0576
Final validation error: 0.1478

In [None]:
#TEST 4 -- scikit-learn style estimator (RayXGBClassifier)

import time

# `ray` is imported here because the cell calls ray.shutdown() below.
import ray
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost_ray import RayXGBClassifier, RayParams

seed = 42

X, y = load_breast_cancer(return_X_y=True)
# Use the shared `seed` instead of repeating the literal 42.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.25,
                                                    random_state=seed)

clf = RayXGBClassifier(n_jobs=1,  # In XGBoost-Ray, n_jobs sets the number of actors
                       random_state=seed
                      )

# scikit-learn API will automatically convert the data
# to RayDMatrix format as needed.
# You can also pass X as a RayDMatrix, in which case
# y will be ignored.

# Disabled variant: fit using the n_jobs given at construction time.
# start_time = time.time()
# clf.fit(X_train, y_train)
# print("TRAIN TIME", time.time() - start_time)

# Disabled: predictions with the fitted classifier.
# pred_ray = clf.predict(X_test)
# print("PREDICTION", pred_ray)
#
# pred_proba_ray = clf.predict_proba(X_test)
# print("PREDICTION_PRBABILITY", pred_proba_ray)

# It is also possible to pass a RayParams object
# to fit/predict/predict_proba methods - will override
# n_jobs set during initialization

start_time = time.time()
clf.fit(X_train, y_train, ray_params=RayParams(num_actors=1))
print("TRAIN TIME", time.time() - start_time)

# Disabled: prediction with an explicit RayParams override.
# pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=2))
# print("PREDICTION", pred_ray)

ray.shutdown()

print("DONE!")

In [None]:
# TIMING IS CONTRARY TO EXPECTATIONS BUT WE CAN SEE A VISIBLE IMPACT OF actors/jobs ON IT.

n_jobs = 1
TRAIN TIME 3.673287868499756 -- it was also as much as 7.xxx

n_jobs = 2
TRAIN TIME 7.201511859893799

n_jobs = 4
TRAIN TIME 7.115774154663086

n_jobs = 10
TRAIN TIME 8.667701005935669


num_actors = 1
TRAIN TIME 7.2665088176727295

num_actors = 2
TRAIN TIME 7.180140018463135

num_actors = 4
TRAIN TIME 6.277163028717041

num_actors = 10
TRAIN TIME 8.599561929702759

In [None]:
#TEST 5 -- hyperparameter search with Ray Tune

import time

import ray
from ray import tune
from sklearn.datasets import load_breast_cancer
from xgboost_ray import RayDMatrix, RayParams, train

num_actors = 1 # number of actors per Tune job
               # (e.g. with 10 CPU cores and num_actors = 2 only 5 Tune jobs can run in parallel)
num_cpus_per_actor = 1

ray_params = RayParams(num_actors=num_actors,
                       gpus_per_actor=1,
                       cpus_per_actor=num_cpus_per_actor)

def train_model(config):
    """Train one model for a single Tune trial and save it as "model.xgb".

    Args:
        config: XGBoost parameter dict for this trial; passed to `train`
            unchanged as `params`.

    Uses the module-level `ray_params` for the actor/resource layout.
    """
    train_x, train_y = load_breast_cancer(return_X_y=True)
    train_set = RayDMatrix(train_x, train_y)

    evals_result = {}

    bst = train(params=config,
                dtrain=train_set,
                evals_result=evals_result,
                evals=[(train_set, "train")],
                verbose_eval=False,
                ray_params=ray_params)

    bst.save_model("model.xgb")

# Specify the hyperparameter search space.
# The original dict listed "tree_method" twice ("approx", then "gpu_hist");
# the later value silently won, so only the effective entry is kept.
config = {"tree_method": "gpu_hist",
          "objective": "binary:logistic",
          "eval_metric": ["logloss", "error"],
          "eta": tune.loguniform(1e-4, 1e-1),
          "subsample": tune.uniform(0.5, 1.0),
          "max_depth": tune.randint(1, 9),
         }

# Make sure to use the `get_tune_resources` method to set the `resources_per_trial`

start_time = time.time()
# NOTE(review): `train_model` never reports "train-error" explicitly; presumably
# xgboost_ray reports the evaluation metrics to Tune itself -- confirm.
analysis = tune.run(train_model,
                    config=config,
                    metric="train-error",
                    mode="min",
                    num_samples=20,
                    resources_per_trial=ray_params.get_tune_resources())

print("TRAIN TIME", time.time() - start_time)

display("Best hyperparameters", analysis.best_config)

ray.shutdown()

print("DONE!")

In [None]:
num_actors: # number of actors per tune_job 
            # (i.e. if you have 10 CPU cores and the value is 2 you will be able to run only 5 Tune Jobs in parallel)
            # (i.e. for value 1 you will be able to run 10 Tune Jobs in parallel)

            
### M1

num_actors = 1, num_CPUs = 1, num_samples_4, ray_params.get_tune_resources()
TRAIN TIME 12.8049880027771

num_actors = 1, num_CPUs = 1, num_samples_10, ray_params.get_tune_resources()
TRAIN TIME 19.9549880027771

num_actors = 1, num_CPUs = 1, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 34.25311517715454

num_actors = 2, num_CPUs = 1, num_samples_4, ray_params.get_tune_resources()
TRAIN TIME 13.603897094726562

num_actors = 2, num_CPUs = 1, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 39.90977382659912

num_actors = 10, num_CPUs = 1, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 138.29227209091187

num_actors = 1, num_CPUs = 2, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 32.94475817680359

num_actors = 1, num_CPUs = 4, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 47.004079818725586

num_actors = 1, num_CPUs = 10, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 70.54199194908142



### AWS c5.xlarge -- 4CPUs

num_actors = 1, num_CPUs = 1, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 68.40848469734192

num_actors = 1, num_CPUs = 4, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 102.29872679710388

num_actors = 4, num_CPUs = 1, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 162.59184432029724



### AWS p3.2xlarge -- 8CPUs, 1 GPU

num_actors = 1, num_CPUs = 1, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 49.652567625045776

num_actors = 1, num_CPUs = 1, num_GPUs = 1, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 179.6664788722992

num_actors = 1, num_CPUs = 8, num_GPUs = 1, num_samples_20, ray_params.get_tune_resources()
TRAIN TIME 184.71277117729187


In [None]:
import xgboost_ray as xr
import ray
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer
import time, os, glob
import numpy as np
import pandas as pd

parallel_num = 1
#ray.init(num_cpus = parallel_num)
ray.init(ignore_reinit_error=True)

train_x, train_y = load_breast_cancer(return_X_y=True)

# Enlarge the dataset: every pass doubles the rows (and the labels).
# Columns are doubled only while `i <= -1`, i.e. never with the current
# threshold; both numbers are the knobs varied between benchmark runs.
for i in range(15): # 6 = x64 #@@@@ tunable: number of row doublings
    train_x = np.concatenate((train_x, train_x), axis=0)
    #print("1", train_x.shape)
    if i <= -1: #@@@@ tunable: last pass that also doubles the columns
        train_x = np.concatenate((train_x, train_x), axis=1)
        #print("2", train_x.shape)
    train_y = np.concatenate((train_y, train_y))
    print("3", i, train_y.shape)

print(train_x.shape, train_y.shape)

# Features and label side by side; the label becomes the last column.
train_data = np.column_stack((train_x, train_y))
print(train_data.shape)

train_data_DF = pd.DataFrame(train_data)

col_names = train_data_DF.columns
print("col_names", col_names)

# Column labels as strings, with the last (label) column called "Target".
col_names_str = list(map(str, col_names))
col_names_str[-1] = "Target"
print(col_names_str)

train_data_DF.columns = col_names_str

train_data_DF.to_parquet('train_data.parquet',
                         #compression = 'brotli',
                         index = False )

# Size of the written parquet file in bytes.
print(os.path.getsize('train_data.parquet'))

def get_parquet_files(path, size = 10):
    """Return exactly `size` file paths matching the glob pattern `path`.

    The sorted matches are repeated cyclically when fewer than `size` files
    exist, and truncated when more exist.

    Args:
        path: glob pattern, e.g. "./*.parquet".
        size: number of paths to return.

    Returns:
        List of paths (possibly containing repeats).

    Raises:
        FileNotFoundError: if paths are requested but nothing matches `path`
            (the previous implementation looped forever in that case).
    """
    files = sorted(glob.glob(path))
    if size > len(files):
        if not files:
            raise FileNotFoundError(f"no files match pattern {path!r}")
        # Ceiling division: smallest number of repeats that reaches `size`.
        repeats = -(-size // len(files))
        files = files * repeats
    return files[0:size]

# Glob pattern for the parquet shards in the working directory.
data_path = "./*.parquet"

# Shard lists of fixed length, one per benchmarked shard count. When fewer
# files exist than requested, get_parquet_files repeats the same paths.
data_files_1 = get_parquet_files(data_path, size = 1) #@@@@ tunable
data_files_2 = get_parquet_files(data_path, size = 2)
data_files_3 = get_parquet_files(data_path, size = 3)
data_files_4 = get_parquet_files(data_path, size = 4)
data_files_5 = get_parquet_files(data_path, size = 5)
data_files_10 = get_parquet_files(data_path, size = 10)
data_files_13 = get_parquet_files(data_path, size = 13)
data_files_16 = get_parquet_files(data_path, size = 16)
data_files_100 = get_parquet_files(data_path, size = 100)

# This cell is the CPU benchmark: the RayParams used below assign no GPUs,
# so a CPU tree method is required ("gpu_hist" needs a GPU on every actor).
xgboost_params = {"tree_method": "approx", # use "gpu_hist" only together with gpus_per_actor
                 "objective": "binary:logistic",
                 "eval_metric": ["logloss", "error"]}

def train_xgboost(config, files, ray_params, progress_bar=False,
                  num_boost_round=100, distributed=False):
    """Train an XGBoost-on-Ray model on parquet files and print the wall time.

    Args:
        config: XGBoost parameter dict, passed to `train` as `params`.
        files: list of parquet file paths; the label column is "Target".
        ray_params: RayParams describing actors and per-actor resources.
        progress_bar: currently unused (the callback line is commented out).
        num_boost_round: number of boosting rounds (previously hard-coded).
        distributed: forwarded to RayDMatrix(..., distributed=...)
            (previously hard-coded to False).

    Returns:
        The booster returned by `train`.
    """
    target_column = "Target"

    train_set = RayDMatrix(files, target_column, distributed=distributed)
    # Only used if the commented-out `evals` arguments below are re-enabled.
    test_set = RayDMatrix(files[0], target_column)

    evals_results = {}

    # Only the train() call is timed; building the RayDMatrix objects is not.
    start_time = time.time()
    bst = train(params = config,
               dtrain = train_set,
               #evals = [(test_set, "eval")],
               #evals_result = evals_results,  # kwarg name as used by the earlier train() calls
               #verbose_eval = False,
               num_boost_round = num_boost_round,
               #callbacks = [TqdmCallback(10)] if progress_bar else [],
               ray_params = ray_params)
    print("TRAIN TIME", time.time() - start_time)

    return bst

# Benchmark run: the shard list and the RayParams values are the knobs varied between runs.
bst = train_xgboost(
    config=xgboost_params,
    files=data_files_1,
    ray_params=RayParams(num_actors=1, cpus_per_actor=16),
) #@@@@ tunable

ray.shutdown()


In [None]:
# M1

range(9), <=3, num_boost_round = 1, data_files_1, RayParams(num_actors = 1, cpus_per_actor = 1))

range(8), <=3, num_boost_round = 1, data_files_2, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 5.729285955429077

range(8), <=3, num_boost_round = 10, data_files_2, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 11.767457962036133

range(8), <=3, num_boost_round = 10, data_files_2, RayParams(num_actors = 2, cpus_per_actor = 1))
TRAIN TIME 10.743774890899658

range(8), <=3, num_boost_round = 10, data_files_3, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 17.33546018600464

range(8), <=3, num_boost_round = 10, data_files_3, RayParams(num_actors = 2, cpus_per_actor = 1))
TRAIN TIME 14.88601303100586

range(8), <=3, num_boost_round = 10, data_files_3, RayParams(num_actors = 3, cpus_per_actor = 1))
TRAIN TIME 14.014294862747192

range(8), <=3, num_boost_round = 10, data_files_3, RayParams(num_actors = 3, cpus_per_actor = 3))
TRAIN TIME 12.830633401870728

range(7), <=4, num_boost_round = 10, data_files_3, RayParams(num_actors = 3, cpus_per_actor = 3))
TRAIN TIME 14.921172857284546

range(7), <=4, num_boost_round = 10, data_files_3, RayParams(num_actors = 3, cpus_per_actor = 1))
TRAIN TIME 15.9930899143219

range(7), <=4, num_boost_round = 10, data_files_3, RayParams(num_actors = 2, cpus_per_actor = 1))
TRAIN TIME 15.79627275466919

range(7), <=4, num_boost_round = 10, data_files_3, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 15.83592414855957

range(9), <=2, num_boost_round = 10, data_files_3, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 16.924689769744873

range(9), <=2, num_boost_round = 10, data_files_3, RayParams(num_actors = 3, cpus_per_actor = 3))
TRAIN TIME 12.809518098831177

range(12), <=-1, num_boost_round = 10, data_files_3, RayParams(num_actors = 3, cpus_per_actor = 3))
TRAIN TIME 14.860911130905151

range(12), <=-1, num_boost_round = 10, data_files_3, RayParams(num_actors = 3, cpus_per_actor = 3))
TRAIN TIME 19.94079089164734

range(10), <=-1, num_boost_round = 10, data_files_10, RayParams(num_actors = 10, cpus_per_actor = 1))
TRAIN TIME 11.22438383102417

range(10), <=-1, num_boost_round = 10, data_files_10, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 16.732402801513672

range(10), <=-1, num_boost_round = 10, data_files_13, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 22.021054983139038

range(10), <=-1, num_boost_round = 10, data_files_13, RayParams(num_actors = 10, cpus_per_actor = 1))
TRAIN TIME 14.368117094039917

range(10), <=-1, num_boost_round = 10, data_files_13, RayParams(num_actors = 5, cpus_per_actor = 1))
TRAIN TIME 14.076416015625

range(10), <=-1, num_boost_round = 10, data_files_13, RayParams(num_actors = 5, cpus_per_actor = 2))
TRAIN TIME 14.095832109451294

range(10), <=-1, num_boost_round = 10, data_files_13, RayParams(num_actors = 2, cpus_per_actor = 1))
TRAIN TIME 16.76328182220459

range(10), <=-1, num_boost_round = 10, data_files_13, RayParams(num_actors = 2, cpus_per_actor = 5))
TRAIN TIME 14.764201879501343

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 10, cpus_per_actor = 1))
TRAIN TIME 84.82407999038696

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 153.25520396232605

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 2, cpus_per_actor = 1))
TRAIN TIME 117.27706384658813

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 2, cpus_per_actor = 5))
TRAIN TIME 103.20557403564453

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 5, cpus_per_actor = 2))
TRAIN TIME 86.4503219127655

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 5, cpus_per_actor = 1))
TRAIN TIME 89.2873420715332

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 4, cpus_per_actor = 2))
TRAIN TIME 88.2671947479248

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 3, cpus_per_actor = 2))
TRAIN TIME 94.20934391021729

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 3, cpus_per_actor = 3))
TRAIN TIME 93.2304048538208

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 6, cpus_per_actor = 1))
TRAIN TIME 87.40560507774353

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 7, cpus_per_actor = 1))
TRAIN TIME 83.48936223983765

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 8, cpus_per_actor = 1))
TRAIN TIME 82.53302001953125

range(10), <=-1, num_boost_round = 100, data_files_13, RayParams(num_actors = 9, cpus_per_actor = 1))
TRAIN TIME 84.2537407875061


### AWS c5.xlarge -- 4CPUs

range(10), <=-1, num_boost_round = 100, data_files_4, RayParams(num_actors = 4, cpus_per_actor = 1))
TRAIN TIME 225.0817904472351

range(10), <=-1, num_boost_round = 100, data_files_5, RayParams(num_actors = 4, cpus_per_actor = 1))
TRAIN TIME 298.0817904472351

range(10), <=-1, num_boost_round = 100, data_files_6, RayParams(num_actors = 4, cpus_per_actor = 1))
TRAIN TIME 350.51361751556396

range(11), <=-1, num_boost_round = 100, data_files_4, RayParams(num_actors = 4, cpus_per_actor = 1))
TRAIN TIME 450.78320503234863

range(11), <=-1, num_boost_round = 100, data_files_4, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 475.6997878551483

range(11), <=-1, num_boost_round = 100, data_files_4, RayParams(num_actors = 1, cpus_per_actor = 4))
TRAIN TIME 429.6034173965454



### AWS c5.4xlarge -- 16CPUs
range(11), <=-1, num_boost_round = 100, data_files_16, RayParams(num_actors = 16, cpus_per_actor = 1))
TRAIN TIME 536.9499912261963

range(11), <=-1, num_boost_round = 100, data_files_16, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 770.5534617900848

range(11), <=-1, num_boost_round = 100, data_files_16, RayParams(num_actors = 1, cpus_per_actor = 16))
TRAIN TIME 579.2549285888672

range(11), <=-1, num_boost_round = 100, data_files_1, RayParams(num_actors = 1, cpus_per_actor = 16))
TRAIN TIME 33.595520973205566

range(11), <=-1, num_boost_round = 100, data_files_1, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 47.637054204940796

range(11), <=-1, num_boost_round = 100, data_files_16, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 535.7039926052094


range(15), <=-1, num_boost_round = 1, data_files_1, distributed=False, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 34.72002863883972

range(15), <=-1, num_boost_round = 1, data_files_1, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 35.39907503128052

range(15), <=-1, num_boost_round = 1, data_files_1, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 16))
TRAIN TIME 33.274383783340454


range(15), <=-1, num_boost_round = 100, data_files_1, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 16)) # BEST
TRAIN TIME 580.9696714878082

range(15), <=-1, num_boost_round = 100, data_files_1, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 769.5239207744598

range(15), <=-1, num_boost_round = 100, data_files_1, distributed=False, RayParams(num_actors = 1, cpus_per_actor = 1))
TRAIN TIME 770.5029118061066

range(15), <=-1, num_boost_round = 100, data_files_1, distributed=False, RayParams(num_actors = 1, cpus_per_actor = 16))
TRAIN TIME 592.8946852684021

In [None]:
# GPUs

import xgboost_ray as xr
import ray
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer
import time, os, glob
import numpy as np
import pandas as pd

parallel_num = 1
#ray.init(num_cpus = parallel_num)
ray.init(ignore_reinit_error=True)

train_x, train_y = load_breast_cancer(return_X_y=True)

# Enlarge the dataset: every pass doubles the rows (and the labels).
# Columns are doubled only while `i <= -1`, i.e. never with the current
# threshold; both numbers are the knobs varied between benchmark runs.
for i in range(15): # 6 = x64 #@@@@ tunable: number of row doublings
    train_x = np.concatenate((train_x, train_x), axis=0)
    #print("1", train_x.shape)
    if i <= -1: #@@@@ tunable: last pass that also doubles the columns
        train_x = np.concatenate((train_x, train_x), axis=1)
        #print("2", train_x.shape)
    train_y = np.concatenate((train_y, train_y))
    print("3", i, train_y.shape)

print(train_x.shape, train_y.shape)

# Features and label side by side; the label becomes the last column.
train_data = np.column_stack((train_x, train_y))
print(train_data.shape)

train_data_DF = pd.DataFrame(train_data)

col_names = train_data_DF.columns
print("col_names", col_names)

# Column labels as strings, with the last (label) column called "Target".
col_names_str = list(map(str, col_names))
col_names_str[-1] = "Target"
print(col_names_str)

train_data_DF.columns = col_names_str

train_data_DF.to_parquet('train_data.parquet',
                         #compression = 'brotli',
                         index = False )

# Size of the written parquet file in bytes.
print(os.path.getsize('train_data.parquet'))

def get_parquet_files(path, size = 10):
    """Return exactly `size` file paths matching the glob pattern `path`.

    The sorted matches are repeated cyclically when fewer than `size` files
    exist, and truncated when more exist.

    Args:
        path: glob pattern, e.g. "./*.parquet".
        size: number of paths to return.

    Returns:
        List of paths (possibly containing repeats).

    Raises:
        FileNotFoundError: if paths are requested but nothing matches `path`
            (the previous implementation looped forever in that case).
    """
    files = sorted(glob.glob(path))
    if size > len(files):
        if not files:
            raise FileNotFoundError(f"no files match pattern {path!r}")
        # Ceiling division: smallest number of repeats that reaches `size`.
        repeats = -(-size // len(files))
        files = files * repeats
    return files[0:size]

# Glob pattern for the parquet shards in the working directory.
data_path = "./*.parquet"

# Shard lists of fixed length, one per benchmarked shard count (tunable: #@@@@).
(data_files_1, data_files_2, data_files_3, data_files_4, data_files_5,
 data_files_8, data_files_10, data_files_13, data_files_16, data_files_100) = (
    get_parquet_files(data_path, size=shard_count)
    for shard_count in (1, 2, 3, 4, 5, 8, 10, 13, 16, 100)
)

# XGBoost parameters for the GPU benchmark ("approx" was the alternative tree method).
xgboost_params = dict(
    tree_method="gpu_hist",
    objective="binary:logistic",
    eval_metric=["logloss", "error"],
)

def train_xgboost(config, files, ray_params, progress_bar=False,
                  num_boost_round=100, distributed=True):
    """Train an XGBoost-on-Ray model on parquet files and print the wall time.

    Args:
        config: XGBoost parameter dict, passed to `train` as `params`.
        files: list of parquet file paths; the label column is "Target".
        ray_params: RayParams describing actors and per-actor resources.
        progress_bar: currently unused (the callback line is commented out).
        num_boost_round: number of boosting rounds (previously hard-coded).
        distributed: forwarded to RayDMatrix(..., distributed=...)
            (previously hard-coded to True).

    Returns:
        The booster returned by `train`.
    """
    target_column = "Target"

    train_set = RayDMatrix(files, target_column, distributed=distributed)
    # Only used if the commented-out `evals` arguments below are re-enabled.
    test_set = RayDMatrix(files[0], target_column)

    evals_results = {}

    # Only the train() call is timed; building the RayDMatrix objects is not.
    start_time = time.time()
    bst = train(params = config,
               dtrain = train_set,
               #evals = [(test_set, "eval")],
               #evals_result = evals_results,  # kwarg name as used by the earlier train() calls
               #verbose_eval = False,
               num_boost_round = num_boost_round,
               #callbacks = [TqdmCallback(10)] if progress_bar else [],
               ray_params = ray_params)
    print("TRAIN TIME", time.time() - start_time)

    return bst

# Benchmark run: the shard list and the RayParams values are the knobs varied between runs.
bst = train_xgboost(
    config=xgboost_params,
    files=data_files_16,
    ray_params=RayParams(num_actors=1, cpus_per_actor=8, gpus_per_actor=1),
) #@@@@ tunable

ray.shutdown()




In [None]:
# p3.2xlarge  == 1 GPU
- range() can go as high as 16, but I keep 15 for broader comparison

range(15), <=-1, num_boost_round = 100, data_files_1, distributed=False, RayParams(num_actors = 1, cpus_per_actor = 1, gpus_per_actor = 1))
TRAIN TIME 28.9706072807312, TRAIN TIME 28.95785355567932 !!!!

range(15), <=-1, num_boost_round = 100, data_files_1, distributed=False, RayParams(num_actors = 1, cpus_per_actor = 8, gpus_per_actor = 1))
TRAIN TIME 30.178287029266357

range(15), <=-1, num_boost_round = 100, data_files_1, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 1, gpus_per_actor = 1))
TRAIN TIME 30.19754147529602

range(15), <=-1, num_boost_round = 100, data_files_1, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 8, gpus_per_actor = 1))
TRAIN TIME 30.209967136383057

range(15), <=-1, num_boost_round = 100, data_files_2, distributed=False, RayParams(num_actors = 1, cpus_per_actor = 1, gpus_per_actor = 1))
TRAIN TIME 55.98624062538147, TRAIN TIME 56.40909028053284

range(15), <=-1, num_boost_round = 100, data_files_2, distributed=False, RayParams(num_actors = 1, cpus_per_actor = 8, gpus_per_actor = 1))
TRAIN TIME 57.961164712905884

range(15), <=-1, num_boost_round = 100, data_files_2, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 8, gpus_per_actor = 1))
TRAIN TIME 55.60385990142822

range(15), <=-1, num_boost_round = 100, data_files_2, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 1, gpus_per_actor = 1))
TRAIN TIME 54.5383083820343


# p3.8xlarge  == 4 GPU

range(15), <=-1, num_boost_round = 100, data_files_1, distributed=False, RayParams(num_actors = 1, cpus_per_actor = 1, gpus_per_actor = 1))
TRAIN TIME 28.587210178375244

range(15), <=-1, num_boost_round = 100, data_files_4, distributed=False, RayParams(num_actors = 4, cpus_per_actor = 1, gpus_per_actor = 1))
TRAIN TIME 73.05642509460449

range(15), <=-1, num_boost_round = 100, data_files_4, distributed=False, RayParams(num_actors = 4, cpus_per_actor = 4, gpus_per_actor = 1))
TRAIN TIME 72.00212550163269

range(15), <=-1, num_boost_round = 100, data_files_4, distributed=True, RayParams(num_actors = 4, cpus_per_actor = 4, gpus_per_actor = 1))
TRAIN TIME 33.4779314994812

range(15), <=-1, num_boost_round = 100, data_files_4, distributed=True, RayParams(num_actors = 4, cpus_per_actor = 1, gpus_per_actor = 1))
TRAIN TIME 34.28415489196777

range(15), <=-1, num_boost_round = 100, data_files_4, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 1, gpus_per_actor = 1))
TRAIN TIME 97.59232831001282

range(15), <=-1, num_boost_round = 100, data_files_4, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 8, gpus_per_actor = 1))
TRAIN TIME 98.0595326423645

range(15), <=-1, num_boost_round = 100, data_files_8, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 8, gpus_per_actor = 1))
TRAIN TIME 197.67186951637268

range(15), <=-1, num_boost_round = 100, data_files_8, distributed=True, RayParams(num_actors = 4, cpus_per_actor = 2, gpus_per_actor = 1))
TRAIN TIME 63.57376527786255

range(15), <=-1, num_boost_round = 100, data_files_8, distributed=True, RayParams(num_actors = 1, cpus_per_actor = 8, gpus_per_actor = 1))

range(15), <=-1, num_boost_round = 100, data_files_8, distributed=True, RayParams(num_actors = 4, cpus_per_actor = 2, gpus_per_actor = 1))
TRAIN TIME 136.5337245464325

In [None]:
# Ratio of the 4-actor time to the 1-actor time on p3.8xlarge, per shard count
# (the numbers are copied from the results recorded in the previous cell).
(
    "8 Shards", 63.57376527786255 / 197.67186951637268,
    "4 Shardes", 33.4779314994812 / 97.59232831001282,
)