# Ray Tune - Comparison of RandomizedSearchCV and TuneSearchCV with XGBoost

<img src = "../images/tune/driver.png" align="center" height=300 width=300>

[Porto Seguro](https://www.portoseguro.com.br/), one of Brazil’s largest auto and homeowner insurance companies, agrees that careful drivers should not have to pay for the claims of careless ones. Inaccuracies in a car insurance company’s claim predictions raise the cost of insurance for good drivers and reduce the price for bad ones.

A data set from Porto Seguro was used in a [Kaggle machine learning competition](https://www.kaggle.com/c/porto-seguro-safe-driver-prediction). The data set is used to build a classification model that predicts the probability that a driver will initiate an auto insurance claim in the next year. The predictions can be used to further tailor insurance prices, and hopefully make auto insurance coverage more accessible to more drivers.

In this exercise we show two things:

1. Composability of different algorithms and hyperparameter tuning using sklearn and xgboost
2. Tune's drop-in replacements (or wrappers) for RandomizedSearchCV, using Optuna as the search algorithm.

Although drop-in replacements were introduced earlier in [03-Ray-Tune-with-Sklearn](03-Ray-Tune-with-Sklearn.ipynb), this notebook demonstrates
them with a larger dataset and a real-life use case.

We need some Python packages, so let's install them:

In [None]:
!pip install -U boto3 plotly xgboost optuna tune-sklearn scikit-learn

In [None]:
import boto3
from io import BytesIO
import joblib
import numpy as np
import pandas as pd
import pandas as pd
import plotly.express as px
import ray
from ray import tune
from ray.tune.callback import Callback
from ray.tune.suggest.bohb import TuneBOHB
from ray.tune.schedulers import HyperBandForBOHB
from scipy.stats import loguniform, randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import tqdm
from tqdm.notebook import trange, tqdm
from tune_sklearn import TuneSearchCV
from xgboost import XGBClassifier

from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster
import ray 

# Start a Ray cluster with 2 worker nodes of 4 CPUs each; this must run before
# the ray.init() call below.
setup_ray_cluster(
  num_worker_nodes=2,
  num_cpus_per_node=4,
  # placeholder path: point this at a real location before running
  collect_log_to_path="/dbfs/path/to/ray_collected_logs"
)
# NOTE(review): presumably attaches this notebook to the cluster started
# above -- confirm against the Ray-on-Spark documentation.
ray.init()

In [None]:
import logging
import warnings

# Suppress log records at WARNING level and below for every logger.
# logging.disable() sets a single process-wide threshold, so one call with the
# highest level is enough (a preceding disable(INFO) would just be overwritten).
logging.disable(logging.WARNING)

# Hide Python warnings for the rest of the notebook. Registered before xgboost
# is imported so the filter is already active during that import.
warnings.filterwarnings("ignore")

import xgboost as xgb

# xgboost's own verbosity scale: 0 silent, 1 warning, 2 info, 3 debug.
xgb.set_config(verbosity=1)

In [None]:
import os

# Ray Tune settings supplied through environment variables.
os.environ.update(
    {
        "TUNE_DISABLE_AUTO_CALLBACK_SYNCER": "1",
        # desired maximum number of concurrent trials
        "TUNE_MAX_PENDING_TRIALS_PG": "64",
    }
)

class TqdmCallback(Callback):
    """Tune callback that mirrors experiment progress in a tqdm bar."""

    def setup(self, stop=None, num_samples=None, total_num_samples=None, **info):
        """Create the progress bar, sized to ``total_num_samples``."""
        progress_bar = tqdm(total=total_num_samples)
        self.pbar = progress_bar

    def on_trial_complete(self, **info):
        """Advance the bar by one step for the completed trial."""
        self.pbar.update(1)

    def on_experiment_end(self, **info):
        """Close the bar once the experiment is over."""
        self.pbar.close()

In [None]:
def plot_cv_score(analysis):
    """Plot the best cross-validation score reached so far against time.

    Args:
        analysis: Object exposing a ``results_df`` DataFrame with the columns
            ``average_test_score`` and ``timestamp``; ``timestamp`` is parsed
            as seconds since the epoch.
    """
    # Work on an explicit copy: assigning into a column selection is the
    # pattern pandas flags with SettingWithCopyWarning, and the copy makes it
    # unambiguous that ``analysis.results_df`` itself is left untouched.
    scores = analysis.results_df[["average_test_score", "timestamp"]].copy()
    scores["timestamp"] = pd.to_datetime(scores["timestamp"], unit="s")
    scores = scores.set_index("timestamp").sort_index()

    # Running maximum = best score seen up to each point in time.
    scores["cummax_cv_score"] = scores["average_test_score"].cummax()

    # Several rows may share a timestamp; keep the last one so the index is
    # unique before resampling.
    scores = scores[~scores.index.duplicated(keep="last")]

    # One point per second, gaps filled backwards. "1s" is the current
    # spelling of the per-second alias (upper-case "S" is deprecated in
    # recent pandas releases).
    best_so_far = scores["cummax_cv_score"].resample("1s").bfill()

    fig = px.line(best_so_far, y="cummax_cv_score")
    fig.show()

### Step 1: Read our data from S3

In [None]:
%%time
from sklearn.model_selection import train_test_split

DATA_URL = "https://ray-ci-higgs.s3.us-west-2.amazonaws.com/" \
                      "safe_driver.csv"
print("Reading data from S3...")
train_df = pd.read_csv(DATA_URL, dtype={'id': np.int32, 'target': np.int8})

# Label vector and feature matrix ('id' and 'target' are not features).
y = train_df['target'].values
X = train_df.drop(['target', 'id'], axis=1)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1234)

# Report the shape of the actual training split: train_df.shape would also
# count the held-out test rows and the 'id'/'target' columns.
t, f = X_train.shape
print(f'training set: {t}, features:{f}')

#### Define some utility functions

In [None]:
def print_test_score(model, X_test, y_test):
    """Print the ROC AUC of ``model`` on the given test data.

    The score is computed from column 1 of ``model.predict_proba(X_test)``
    against ``y_test``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    print(f"**************** roc_auc score: {auc} ****************")

def train_model_and_print_test_score(model, X_train, y_train, X_test, y_test):
    """Cross-validate ``model`` with its current settings and print its test ROC AUC.

    A ``RandomizedSearchCV`` with an empty parameter distribution and a single
    iteration is used as a plain 3-fold stratified cross-validation runner;
    its ``best_estimator_`` is then scored on ``X_test`` / ``y_test``.
    """
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)
    search_kwargs = dict(
        param_distributions={},  # no parameters distribution
        n_iter=1,
        scoring='roc_auc',
        n_jobs=-1,
        cv=folds.split(X_train, y_train),
        verbose=1,
        random_state=1001,
    )
    baseline_search = RandomizedSearchCV(model, **search_kwargs)
    baseline_search.fit(X_train, y_train)
    print_test_score(baseline_search.best_estimator_, X_test, y_test)

### Step 2: Define our XGBoost classifier 

In [None]:
# Base XGBoost classifier reused by every search in this notebook.
model = XGBClassifier(objective='binary:logistic',
                      n_jobs=1,  # one thread per model; the searches below set their own n_jobs
                      eval_metric='auc',
                      random_state=1234,  # fixed seed
                      verbosity=1,
                      # NOTE(review): use_label_encoder is deprecated in recent
                      # xgboost releases -- confirm it is still needed.
                      use_label_encoder=False)

### Step 3: Use vanilla scikit-learn RandomizedSearchCV without a parameter search space

In [None]:
%%time
# Baseline: cross-validate the untuned model and print its test-set ROC AUC.
train_model_and_print_test_score(model, X_train, y_train, X_test, y_test)

### Step 4: Use vanilla scikit-learn RandomizedSearchCV with a parameter search space

In [None]:
# Search space for scikit-learn's RandomizedSearchCV.
#
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale]: the second
# argument is the WIDTH of the interval, not its upper bound. The fractions
# below must stay within (0, 1] for XGBoost, so the scale is chosen to make
# each interval end exactly at 1.0.
params = {
    "max_depth": randint(1, 5),                      # integers 1..4 (upper bound exclusive)
    "min_child_weight": loguniform(0.001, 128),
    "subsample": uniform(loc=0.1, scale=0.9),           # [0.1, 1.0]
    "colsample_bylevel": uniform(loc=0.01, scale=0.99),  # [0.01, 1.0]
    "colsample_bytree": uniform(loc=0.01, scale=0.99),   # [0.01, 1.0]
    "reg_alpha": loguniform(1 / 1024, 10.0),
    "reg_lambda": loguniform(1 / 1024, 10.0),
    "scale_pos_weight": [1, 26],                     # a list is sampled uniformly
}
number_of_cv_splits = 3

In [None]:
%%time
# Randomized search over `params`: 100 sampled candidates, each scored by
# cross-validated ROC AUC.
gs = RandomizedSearchCV(
    model, 
    params,
    cv=number_of_cv_splits,
    n_iter=100, 
    scoring='roc_auc', 
    n_jobs=-1, # use all cores in a single node
    verbose=1,
)

# Run the search on the training split.
gs.fit(X_train, y_train)

In [None]:
# Report some time and performance statistics.
# cv_results_['mean_fit_time'] has one entry per sampled candidate.
mean_fit_times = gs.cv_results_['mean_fit_time']
total_tuning_compute_time = np.sum(mean_fit_times)
average_train_time = np.mean(mean_fit_times)
# Print the total on the "total" line (the average was printed here before)
# and format with two decimals (':02f' is a width spec, not a precision).
print(f'Sklearn total tuning time search took: {total_tuning_compute_time:.2f} seconds')
print(f'Sklearn average fit time per candidate: {average_train_time:.2f} seconds')
print(f'Best score for AUC: {gs.best_score_:.3f}')

### Step 5: Let's try with Ray Tune

Taking an existing scikit-learn program and converting it to Ray Tune, using its drop-in replacement, takes only a few lines of code changes.

**Note**: Running this locally on Ray will take a long time. We suggest using a Ray cluster on Anyscale or AWS.

In [None]:
CONNECT_TO_ANYSCALE = True

# Drop any existing connection first. ray.is_initialized is a function and
# must be called: the bare attribute is always truthy.
if ray.is_initialized():
    ray.shutdown()

# Connect regardless of whether a previous session existed.
if CONNECT_TO_ANYSCALE:
    ray.init("anyscale://jsd-tune-scikit-learn-xgboost-demo", cluster_env="jsd-tune-demo-env:10")
    # Alternative, to attach to an already running Ray cluster:
    #ray.init(address="auto", ignore_reinit_error=True)
else:
    ray.init(ignore_reinit_error=True)                       # Runs locally on my laptop

In [None]:
# Display the resources Ray reports for the connected cluster (cell output).
ray.cluster_resources()

### Define our hyperparameter config space using tune

In [None]:
# Search space for TuneSearchCV: the same hyperparameters as `params`,
# expressed with Ray Tune's sampling functions.
# NOTE(review): tune.uniform / tune.loguniform presumably take
# (lower, upper) bounds -- confirm against the Ray Tune search-space docs.
tune_config_params = {
        "max_depth": tune.randint(1, 5),
        "min_child_weight": tune.loguniform(0.001, 128),
        "subsample": tune.uniform(0.1, 1.0),
        "colsample_bylevel": tune.uniform(0.01, 1.0),
        "colsample_bytree": tune.uniform(0.01, 1.0),
        "reg_alpha": tune.loguniform(1 / 1024, 10.0),
        "reg_lambda": tune.loguniform(1 / 1024, 10.0),
        "scale_pos_weight": tune.choice([1, 26]),
}

### Use Tune's drop-in replacement: TuneSearchCV

In [None]:
%%time
# Same search as the scikit-learn one above, run through Ray Tune's drop-in
# wrapper: 100 trials, 3-fold CV, ROC AUC scoring.
tune_gs = TuneSearchCV(
    model, 
    tune_config_params,
    cv=3,
    n_trials=100, 
    scoring='roc_auc', 
    n_jobs=100,  # change to 40 if running on a ray cluster 
                 # or equal to total number of CPUs 
    verbose=1,
    # Keyword arguments specific to TuneSearchCV
    early_stopping=True,
    max_iters=10,   # equivalent to epochs in a neural network
    loggers = ["tensorboard"],
    # Use Optuna to suggest configurations.
    # NOTE(review): ASHA is presumably the default early-stopping *scheduler*,
    # not the default search algorithm -- confirm in the tune-sklearn docs.
    search_optimization="optuna",
    name="tune-experiment"
)

# Run the search; the callback shows a progress bar while trials run.
tune_gs.fit(X_train, y_train, tune_params=dict(callbacks=[TqdmCallback()]))

In [None]:
# Report the best configuration found by the Tune search and its score.
print(f'Best parameters: {tune_gs.best_params_}')
print(f'Best AUC score : {tune_gs.best_score_}')

#### Plot the times

In [None]:
# Plot the best CV score over time for the Tune search.
plot_cv_score(tune_gs.analysis_)

In [None]:
# Tear down the Ray cluster started with setup_ray_cluster().
shutdown_ray_cluster()