# Criteo 1 TiB benchmark - Spark.ML random forest

Specialization of the experimental notebook for Spark.ML random forest.

# Table of contents

* [Configuration](#Configuration)
* [Distributed training](#Distributed-training)
* [End](#End)

In [None]:
%load_ext autotime
%matplotlib inline

from __future__ import print_function

## Configuration
[_(back to toc)_](#Table-of-contents)

Paths:

In [None]:
# Root of the LibSVM-formatted data; the train/test path templates are built
# from it and the resulting paths are read through ``sc.textFile``.
libsvm_data_remote_path = 'criteo/libsvm'
# Local working directory; the notebook creates it and writes its log file there.
local_runtime_path = 'criteo/runtime'

In [None]:
import os


# Path templates for the train and test samples; the '{}' placeholder is
# filled with the shortened sample name when a sample is loaded.
libsvm_train_template = os.path.join(libsvm_data_remote_path, 'train', '{}')
libsvm_test_template = os.path.join(libsvm_data_remote_path, 'test', '{}')

In [None]:
def ensure_directory_exists(path):
    """Create directory *path* (including missing parents) if it is not there yet.

    The directory is created first and the "already exists" case is handled
    afterwards, so a directory created concurrently between a check and the
    creation cannot make this function fail.

    Raises:
        OSError: if the directory cannot be created, including the case
            where *path* exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError:
        # makedirs also fails when the target is already present; that is
        # only acceptable when the target really is a directory.
        if not os.path.isdir(path):
            raise

Samples to take:

In [None]:
# Train sample sizes (number of lines) to iterate over, smallest first.
# Each size is turned into a sample name that is substituted into the
# train path template.
train_samples = [
    10000, 30000,  # tens of thousands
    100000, 300000,  # hundreds of thousands
    1000000, 3000000,  # millions
    10000000, 30000000,  # tens of millions
    100000000, 300000000,  # hundreds of millions
    1000000000, 3000000000,  # billions
]

# Test sample sizes; the training loop uses only the last entry.
test_samples = [1000000]

Spark configuration and initialization:

In [None]:
# Cluster sizing knobs: they feed the Spark executor settings and the number
# of partitions used when loading data.
executor_instances = 64  # value of 'spark.executor.instances'
executor_cores = 4  # value of 'spark.executor.cores'
memory_per_core = 4  # multiplied by executor_cores and suffixed with 'G' for 'spark.executor.memory'

In [None]:
# Application name passed to the Spark session builder.
app_name = 'Criteo experiment'

# Spark master passed to the session builder.
master = 'yarn'

# Spark properties applied one by one to the session builder; all values are strings.
settings = {
    # NOTE(review): no unit suffix here - presumably interpreted as seconds; confirm against the Spark docs.
    'spark.network.timeout': '600',
    
    'spark.driver.cores': '16',
    'spark.driver.maxResultSize': '16G',
    'spark.driver.memory': '32G',
    
    # Executor sizing is derived from the knobs defined in the previous cell.
    'spark.executor.cores': str(executor_cores),
    'spark.executor.instances': str(executor_instances),
    'spark.executor.memory': str(memory_per_core * executor_cores) + 'G',
    
    'spark.speculation': 'true',
    
    # Cluster-specific YARN queue name.
    'spark.yarn.queue': 'root.HungerGames',
}

In [None]:
from pyspark.sql import SparkSession


# Assemble the session builder: application name, master, then every entry
# of the settings dictionary.
builder = SparkSession.builder.appName(app_name).master(master)
for option_name, option_value in settings.items():
    builder = builder.config(option_name, option_value)

# Reuse a running session if there is one, otherwise start a new one.
spark = builder.getOrCreate()
sc = spark.sparkContext

# Keep Spark's own log output down to errors.
sc.setLogLevel('ERROR')

Logging:

In [None]:
import sys
import logging
# ``reload`` is a builtin only on Python 2; on Python 3 it lives in importlib.
try:
    reload
except NameError:
    from importlib import reload

# Start from a freshly initialised logging module.
# NOTE(review): presumably done so that re-running this cell does not stack
# duplicate handlers on the root logger - confirm.
reload(logging)


# Console handler: timestamped messages on stdout.
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('[%(asctime)s] %(message)s')
handler.setFormatter(formatter)

# File handler: the same format, appended to a log file in the local runtime
# directory (created first if missing).
ensure_directory_exists(local_runtime_path)
file_handler = logging.FileHandler(filename=os.path.join(local_runtime_path, 'mylog.log'), mode='a')
file_handler.setFormatter(formatter)

# Both handlers are attached to the root logger, which is opened up to DEBUG.
logger = logging.getLogger()
logger.addHandler(handler)
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)

In [None]:
# Record the Spark version in the log next to the measurements.
logger.info('Spark version: %s.', spark.version)

Plot measurements:

In [None]:
import pandas


def extract_data_for_plotting(df, what):
    """Pivot measurements into one column per engine, keyed by 'Train size'.

    Args:
        df: DataFrame with at least the columns 'Engine', 'Train size' and
            the column named by ``what``.
        what: name of the measurement column to extract.

    Returns:
        DataFrame with a 'Train size' column and one column per distinct
        engine (named after the engine) holding that engine's ``what``
        values. Engines are combined with an outer merge, so a train size
        missing for some engine yields NaN in that engine's column.

    Raises:
        TypeError: if ``df`` contains no engines at all (nothing to merge).
    """
    # ``reduce`` is a builtin only on Python 2; functools provides it on
    # both Python 2 and Python 3.
    from functools import reduce

    per_engine = [
        df[df.Engine == name][['Train size', what]].rename(columns={what: name})
        for name in df.Engine.unique()
    ]
    return reduce(
        lambda left, right: pandas.merge(left, right, how='outer', on='Train size'),
        per_engine,
    )

def plot_stuff(df, what, ylabel=None, **kwargs):
    """Plot measurement ``what`` against train size, one line per engine.

    Args:
        df: measurements accepted by ``extract_data_for_plotting``.
        what: measurement column to plot; also used as the plot title.
        ylabel: optional label for the y axis; left untouched when None.
        **kwargs: forwarded to ``DataFrame.plot`` (for example ``logx``,
            ``loglog`` or ``ylim``).
    """
    per_engine = extract_data_for_plotting(df, what)
    indexed = per_engine.set_index('Train size')
    axes = indexed.plot(  # xlim=(1e4, 4e9)
        marker='o',
        figsize=(6, 6),
        title=what,
        grid=True,
        linewidth=2.0,
        **kwargs
    )
    if ylabel is None:
        return
    axes.set_ylabel(ylabel)

Let's name samples using a shortened "engineering" notation, where every group of three trailing zeros becomes a `k` - e.g. 1e5 is `100k` and 1e6 is `1kk`:

In [None]:
def sample_name(sample):
    """Return the shortened name of a sample size, e.g. 100000 -> '100k'.

    The decimal representation is scanned from its right end and every
    non-overlapping run of three zeros found that way is replaced by 'k'.
    """
    reversed_digits = str(sample)[::-1]
    # Splitting on '000' and re-joining with 'k' substitutes each run.
    shortened = 'k'.join(reversed_digits.split('000'))
    return shortened[::-1]

## Distributed training
[_(back to toc)_](#Table-of-contents)

Loading of LibSVM data as Spark.ML dataset:

In [None]:
from pyspark.ml.linalg import SparseVector


def parse_libsvm_line_for_rf(line, num_features=40):
    """Parse one LibSVM text line into a ``(label, features)`` pair.

    The expected layout is ``<label> <index>:<value> <index>:<value> ...``
    with fields separated by whitespace. Leading, trailing and repeated
    whitespace is tolerated, and a line carrying only a label yields an
    all-zero vector.

    Args:
        line: one line of LibSVM-formatted text.
        num_features: size of the produced sparse vector (default 40).

    Returns:
        Tuple of the integer label and a ``SparseVector`` of size
        ``num_features``.

    Raises:
        ValueError: if the line is blank, or the label or an
            ``index:value`` token is malformed.
    """
    parts = line.split()
    if not parts:
        raise ValueError('Empty LibSVM line')

    label = int(parts[0])

    # Build real lists: lazy ``map`` objects (Python 3) have no length.
    indices = []
    values = []
    for token in parts[1:]:
        index, value = token.split(':')
        indices.append(int(index))
        # Parse as float so that non-integer feature values are accepted too.
        values.append(float(value))

    return (label, SparseVector(num_features, indices, values))

In [None]:
task_splitting = 1  # tasks per core


def load_ml_data_for_rf(template, sample):
    """Load a LibSVM sample as a DataFrame with 'label' and 'features' columns.

    Args:
        template: path template with a '{}' placeholder for the sample name.
        sample: sample size; converted to its shortened name to fill the
            template.

    Returns:
        DataFrame repartitioned into
        ``executor_cores * executor_instances * task_splitting`` partitions.
    """
    source_path = template.format(sample_name(sample))
    parsed_rows = sc.textFile(source_path).map(parse_libsvm_line_for_rf)
    partition_count = executor_cores * executor_instances * task_splitting
    return parsed_rows.toDF(['label', 'features']).repartition(partition_count)

Evaluating a model:

In [None]:
from matplotlib import pyplot
from sklearn.metrics import auc, log_loss, roc_curve


def calculate_roc(predictions):
    """Compute the ROC curve, ROC AUC and log loss of a predictions DataFrame.

    Every row's label and ``probability[1]`` score is collected to the
    driver, and the metrics are then computed locally with scikit-learn.

    Args:
        predictions: DataFrame with 'label' and 'probability' columns.

    Returns:
        Tuple ``(fpr, tpr, roc_auc, log_loss)``.
    """
    label_score_pairs = predictions.rdd.map(
        lambda row: (row.label, row.probability[1])
    ).collect()
    labels, scores = zip(*label_score_pairs)
    fpr, tpr, _ = roc_curve(labels, scores)
    return fpr, tpr, auc(fpr, tpr), log_loss(labels, scores)

def evaluate_model(name, model, test, train=None):
    """Score a fitted model, draw its ROC curves and return the metrics.

    Args:
        name: label appended to the plot title.
        model: fitted model exposing ``transform``.
        test: test dataset; always evaluated.
        train: optional train dataset; evaluated and plotted only when given.

    Returns:
        Dict with 'test_roc_auc' and 'test_log_loss', plus 'train_roc_auc'
        and 'train_log_loss' when ``train`` is provided.
    """
    figure = pyplot.figure(figsize=(6, 6))
    axes = figure.gca()
    axes.set_title('ROC - ' + name)

    metrics = {}

    def score_and_draw(dataset, auc_key, log_loss_key, legend_template):
        # Predict, compute the metrics, record them and add the ROC curve.
        fpr, tpr, roc_auc, ll = calculate_roc(model.transform(dataset))
        metrics[auc_key] = roc_auc
        metrics[log_loss_key] = ll
        axes.plot(fpr, tpr, linewidth=2.0, label=legend_template.format(roc_auc))

    if train is not None:
        score_and_draw(train, 'train_roc_auc', 'train_log_loss', 'train (auc = {:.3f})')
    score_and_draw(test, 'test_roc_auc', 'test_log_loss', 'test (auc = {:.3f})')

    # Diagonal reference line from (0, 0) to (1, 1).
    axes.plot([0.0, 1.0], [0.0, 1.0], linestyle='--', c='gray')
    axes.legend()
    pyplot.show()

    return metrics

Models to work on:

In [None]:
from pyspark.ml.classification import (
    RandomForestClassifier, 
)

In [None]:
# Estimators to benchmark, keyed by a short name; the training loop fits
# each one on every train sample and reports the key as the 'Engine'.
classifiers = {
    'rf': RandomForestClassifier(featureSubsetStrategy='sqrt', impurity='entropy', minInstancesPerNode=3, maxBins=64, maxDepth=10, numTrees=160),
}

Monkey-patch DataFrames so that they can be used as context managers that persist on entry and unpersist on exit:

In [None]:
import pyspark


def enter_method(self):
    """Context-manager entry hook: persist the object and return it.

    Returning ``self`` makes ``with obj as alias:`` bind the persisted
    object instead of None.
    """
    self.persist()
    return self

def exit_method(self, exc_type, exc, traceback):
    """Context-manager exit hook: unpersist the object.

    The exception arguments are ignored and nothing is returned, so an
    exception raised inside the ``with`` body keeps propagating.
    """
    self.unpersist()


# Install the hooks on the DataFrame class so that any DataFrame can be used
# in a ``with`` statement: persisted on entry, unpersisted on exit.
pyspark.sql.dataframe.DataFrame.__enter__ = enter_method
pyspark.sql.dataframe.DataFrame.__exit__ = exit_method

Do distributed training:

In [None]:
import time

from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Only the last configured test sample is used for evaluation.
test_sample = test_samples[-1]

# Spark-side ROC AUC, computed from the 'probability' column, reported next
# to the scikit-learn based metrics.
evaluator = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='probability', metricName='areaUnderROC')

# Results are accumulated in two forms: dict rows (for the pandas plots) and
# tab-separated strings (for printing).
new_quality_data = []
new_data_rows = []

logger.info('Loading "%s" test samples for rf.', test_sample)
test_df = load_ml_data_for_rf(libsvm_test_template, test_sample)
# ``with`` works on DataFrames thanks to the __enter__/__exit__ hooks patched
# onto the class: the data is persisted inside the block and unpersisted after.
with test_df:
    # NOTE(review): besides logging the size, count() presumably forces the
    # persisted data to be materialized before any timing starts - confirm.
    logger.info('Loaded "%s" lines.', test_df.count())

    for train_sample in train_samples:

        logger.info('Working on "%s" train sample.', train_sample)

        logger.info('Loading "%s" train samples for rf.', train_sample)
        train_df = load_ml_data_for_rf(libsvm_train_template, train_sample)
        with train_df:
            logger.info('Loaded "%s" lines.', train_df.count())

            for classifier_name, classifier in classifiers.items():

                logger.info('Training a model "%s" on sample "%s".', classifier_name, train_sample)

                # Wall-clock time of fit() only; loading and evaluation are excluded.
                start = time.time()
                model = classifier.fit(train_df)
                duration = time.time() - start

                logger.info('Training a model "%s" on sample "%s" took "%g" seconds.', classifier_name, train_sample, duration)

                logger.info('Evaluating the model "%s" trained on sample "%s".', classifier_name, train_sample)
                # Train-set metrics are computed only for samples of up to one
                # million lines; evaluation collects all predictions to the driver.
                metrics = evaluate_model(classifier_name + ' - ' + sample_name(train_sample), model, test_df, train=(train_df if train_sample <= 1000000 else None))

                # Score the test set once more to feed the Spark evaluator.
                test_predictions = model.transform(test_df)
                ml_metric_value = evaluator.evaluate(test_predictions)

                logger.info(
                    'For the model "%s" trained on sample "%s" metrics are: "%s"; ROC AUC calculated by Spark is "%s".',
                    classifier_name,
                    train_sample,
                    metrics,
                    ml_metric_value,
                )

                # One measurement row per (engine, train size) pair; only the
                # test metrics are recorded.
                data_row = {
                    'Train size': train_sample,
                    'ROC AUC': metrics['test_roc_auc'],
                    'Log loss': metrics['test_log_loss'],
                    'Duration': duration,
                    'Engine': classifier_name,
                }
                new_quality_data.append(data_row)
                data_row_string = '\t'.join(str(data_row[field]) for field in ['Engine', 'Train size', 'ROC AUC', 'Log loss', 'Duration'])
                new_data_rows.append(data_row_string)
                logger.info('Data row: "%s".', data_row_string)

Plot metrics:

In [None]:
# Collect the measurement rows into a DataFrame ordered by train size and
# draw one plot per metric.
measurements_df = pandas.DataFrame(new_quality_data).sort_values(by=['Train size'])
plot_stuff(measurements_df, 'ROC AUC', logx=True)
# The y range of the log-loss plot is fixed to (0.135, 0.145).
plot_stuff(measurements_df, 'Log loss', logx=True, ylim=(0.135, 0.145))
# Training duration uses logarithmic scales on both axes; the y axis is labelled 's'.
plot_stuff(measurements_df, 'Duration', loglog=True, ylabel='s')

In [None]:
# Print the tab-separated result rows (Engine, Train size, ROC AUC, Log loss, Duration).
for row in new_data_rows:
    print(row)

## End
[_(back to toc)_](#Table-of-contents)

Work done, stop Spark:

In [None]:
# Stop the Spark session now that all measurements have been taken.
spark.stop()