In [1]:
from spark.spark_tuner import SparkTuner
from spark.config.config_set import UniversalConfigSet
from spark.config.config_set import ConfigSet
from spark.config.parameter import Parameter
from spark.config.domain import IntRangeDomain

In [2]:
# Two hand-written observations used to seed the tuner. Memory values are
# spelled as multiples of 1024.
training_sample_1 = {
    "spark.executor.memory": 5 * 1024,
    "spark.sql.shuffle.partitions": 100,
    "spark.executor.cores": 4,
    "spark.driver.memory": 1024,
}
training_sample_2 = {
    "spark.executor.memory": 10 * 1024,
    "spark.sql.shuffle.partitions": 400,
    "spark.executor.cores": 8,
    "spark.driver.memory": 3 * 1024,
}

# Tuner over the universal config set.
config_set = UniversalConfigSet(10, 10 * 1024)
tuner = SparkTuner(config_set)

# The second argument is the value observed for that configuration
# (presumably a measured runtime -- TODO confirm the unit).
tuner.add_sample_to_train_data(training_sample_1, 12)
tuner.add_sample_to_train_data(training_sample_2, 4)

# Last expression of the cell, so the suggested configuration is displayed.
tuner.get_next_best_config()


OrderedDict([('spark.sql.shuffle.partitions', 1610.0),
             ('spark.executor.memory', 9113.0),
             ('spark.driver.memory', 1024.0),
             ('spark.executor.cores', 7.0)])

### Example with TPC-DS Q17


|conf|value|
|:-|:-|
|spark.driver.memory|2g|
|spark.executor.cores|3|
|spark.executor.memory|2g|
|spark.sql.shuffle.partitions|400|


<h4 align="center">Timing for the query: q17 - 806750 </h4>

================================================================================================================================================
<br>


|conf|value|
|:-|:-|
|spark.driver.memory|4g|
|spark.executor.cores|8|
|spark.executor.memory|5g|
|spark.sql.shuffle.partitions|600|


<h4 align="center">Timing for the query: q17 - 1191319 </h4> 

================================================================================================================================================
<br>


|conf|value|
|:-|:-|
|spark.driver.memory|1g|
|spark.executor.cores|2|
|spark.executor.memory|2g|
|spark.sql.shuffle.partitions|100|


<h4 align="center">Timing for the query: q17 - 1138390 </h4> 


In [3]:
# Hand-built search space, used here in place of the alternative
#   config_set = UniversalConfigSet(20, 152500)
# Each parameter gets an IntRangeDomain; the first two arguments are the
# min and max, the third is presumably the step -- TODO confirm.
config_set = ConfigSet()
(
    config_set
    .add_param(Parameter(
        'spark.sql.shuffle.partitions',
        IntRangeDomain(10, 2000, 50),
    ))
    .add_param(Parameter(
        'spark.executor.memory',
        IntRangeDomain(
            1000,   # min executor memory
            28672,  # max executor memory
            512,
        ),
    ))
    .add_param(Parameter(
        'spark.driver.memory',
        IntRangeDomain(512, 15200, 256),
    ))
    .add_param(Parameter(
        'spark.executor.cores',
        IntRangeDomain(1, 4, 1),
    ))
)

tuner = SparkTuner(config_set)

# The three q17 runs listed in the markdown tables of this notebook.
training_sample_1 = {
    "spark.executor.memory": 2000,
    "spark.sql.shuffle.partitions": 400,
    "spark.executor.cores": 3,
    "spark.driver.memory": 2000,
}
# NOTE(review): 8 executor cores is outside the IntRangeDomain(1, 4, 1)
# declared for spark.executor.cores in this cell, and the markdown table for
# this run lists spark.executor.memory as 5g while 4000 is used here -- confirm.
training_sample_2 = {
    "spark.executor.memory": 4000,
    "spark.sql.shuffle.partitions": 600,
    "spark.executor.cores": 8,
    "spark.driver.memory": 4000,
}
# NOTE(review): the markdown table for this run lists spark.executor.memory
# as 2g while 1000 is used here -- confirm which one is right.
training_sample_3 = {
    "spark.executor.memory": 1000,
    "spark.sql.shuffle.partitions": 100,
    "spark.executor.cores": 2,
    "spark.driver.memory": 1000,
}

# Second argument: the q17 timing reported for each run.
tuner.add_sample_to_train_data(training_sample_1, 806750)
tuner.add_sample_to_train_data(training_sample_2, 1191319)
tuner.add_sample_to_train_data(training_sample_3, 1138390)

In [4]:
# Ask the tuner for the configuration to try next, based on the samples added
# to it so far; as the cell's last expression the result is displayed below.
tuner.get_next_best_config()

OrderedDict([('spark.sql.shuffle.partitions', 1910.0),
             ('spark.executor.memory', 10728.0),
             ('spark.driver.memory', 10752.0),
             ('spark.executor.cores', 4.0)])

In [5]:
# Output observed when running with the predicted config above
# (presumably the q17 timing, in the same unit as the training samples -- TODO confirm).
1066135

1066135

In [None]:
from hyperopt import fmin, tpe, hp, Trials
import numpy as np
import math

def minimize_training_loss(params):
    """Hyperopt objective: summed absolute prediction error on the training data.

    Reads the module-level ``tuner``. Each of the scalars 'beta', 'gamma' and
    'theta' is expanded to a constant array with one entry per parameter in
    ``tuner.model.config_set``; ``tuner.model.predict`` is then evaluated on
    every sampled config and the absolute differences to the matching entries
    of ``tuner.model.training_out`` are added up.

    Args:
        params: dict with the scalar entries 'alpha', 'beta', 'gamma', 'theta'.

    Returns:
        ``{'loss': <sum>, 'status': 'ok'}`` on success. If anything raises, the
        trial is reported as failed instead of aborting the search:
        ``{'loss': 0, 'status': 'fail', 'error': repr(exception)}``. The
        'error' entry keeps the cause available in the trial result rather
        than discarding it.
    """
    try:
        # ToDo: Tune each dimension of beta, gamma and theta individually
        n_params = tuner.model.config_set.get_size()  # loop-invariant, read once
        alpha = params['alpha']
        beta = np.ones((n_params, 1), float) * params['beta']  # column vector
        gamma = np.ones(n_params, float) * params['gamma']
        theta = np.ones(n_params, float) * params['theta'] * 0.01

        loss = 0.0
        for config_value, actual_out in zip(tuner.model.get_sampled_configs(),
                                            tuner.model.training_out):
            out = tuner.model.predict(config_value, alpha, beta, gamma, theta)
            loss = loss + abs(out - actual_out)

        return {'loss': loss, 'status': 'ok'}
    except Exception as e:
        # Deliberately best-effort: a failing trial must not stop the search,
        # but the reason is recorded instead of being silently dropped.
        return {'loss': 0, 'status': 'fail', 'error': repr(e)}

# Sampling ranges for the four scalars that minimize_training_loss reads;
# every one is drawn uniformly from its interval.
space = {
    'alpha': hp.uniform('alpha', 0, 1000),
    'beta': hp.uniform('beta', 10 ** 2, 10 ** 7),
    'gamma': hp.uniform('gamma', 0, 10),
    'theta': hp.uniform('theta', 0, 10),
}

# Number of objective evaluations the TPE search is allowed to make.
number_of_experiments = 1000

best = fmin(
    minimize_training_loss,
    space=space,
    algo=tpe.suggest,
    max_evals=number_of_experiments,
)

 51%|█████▏    | 514/1000 [00:04<00:05, 84.72it/s, best loss: 1219335.23851] 

In [12]:
# Show the best parameter values found by the search. The call form prints the
# same text on Python 2 and is also valid on Python 3.
print(best)

{'theta': 7.230629701585487, 'beta': 412696.6304998827, 'alpha1': 998.0269377497508, 'gamma': 0.5985143784548778}
