In [22]:
from spark.spark_tuner import SparkTuner
from spark.config.config_set import UniversalConfigSet
from spark.config.config_set import ConfigSet
from spark.config.parameter import Parameter
from spark.config.domain import IntRangeDomain

In [10]:
# Minimal walkthrough: build a tuner over a UniversalConfigSet, feed it two
# observed samples, then ask for the next configuration to try.
# NOTE(review): the meaning of the two UniversalConfigSet arguments is not
# visible in this notebook — confirm against its constructor.
config_set = UniversalConfigSet(10, 10 * 1024)
tuner = SparkTuner(config_set)

# Each sample maps a Spark configuration key to the value used for one run.
training_sample_1 = {
    "spark.executor.memory": 5 * 1024,
    "spark.sql.shuffle.partitions": 100,
    "spark.executor.cores": 4,
    "spark.driver.memory": 1024,
}
training_sample_2 = {
    "spark.executor.memory": 10 * 1024,
    "spark.sql.shuffle.partitions": 400,
    "spark.executor.cores": 8,
    "spark.driver.memory": 3 * 1024,
}

# Pair every sample with the value observed for it and register them in order.
# NOTE(review): the second argument is presumably a cost such as runtime
# (lower is better) — confirm against SparkTuner.add_sample_to_train_data.
observed_runs = (
    (training_sample_1, 12),
    (training_sample_2, 4),
)
for sample, observed_value in observed_runs:
    tuner.add_sample_to_train_data(sample, observed_value)

# Last expression of the cell: displays the tuner's suggested configuration.
tuner.get_next_best_config()


OrderedDict([('spark.sql.shuffle.partitions', 1610.0),
             ('spark.executor.memory', 9113.0),
             ('spark.driver.memory', 1024.0),
             ('spark.executor.cores', 7.0)])

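### Example with TPC-DS Q17

Each table below lists the Spark configuration used for one run of TPC-DS Q17, followed by the timing recorded for that run.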


|conf|value|
|:-|:-|
|spark.driver.memory|2g|
|spark.executor.cores|3|
|spark.executor.memory|2g|
|spark.sql.shuffle.partitions|400|


<h4 align="center">Timing for the query: q17 - 806750</h4>

================================================================================================================================================
<br>


|conf|value|
|:-|:-|
|spark.driver.memory|4g|
|spark.executor.cores|8|
|spark.executor.memory|5g|
|spark.sql.shuffle.partitions|600|


<h4 align="center">Timing for the query: q17 - 1191319 </h4> 

================================================================================================================================================
<br>


|conf|value|
|:-|:-|
|spark.driver.memory|1g|
|spark.executor.cores|2|
|spark.executor.memory|2g|
|spark.sql.shuffle.partitions|100|


<h4 align="center">Timing for the query: q17 - 1138390 </h4> 


In [19]:
# config_set = UniversalConfigSet(20, 152500)
# Hand-built search space for the Q17 experiment. For spark.executor.memory
# the first two IntRangeDomain arguments are the minimum and maximum.
# NOTE(review): the third argument is presumably the step size — confirm
# against IntRangeDomain.
config_set = ConfigSet()
(
    config_set
    .add_param(Parameter('spark.sql.shuffle.partitions',
                         IntRangeDomain(10, 2000, 50)))
    .add_param(Parameter('spark.executor.memory',
                         IntRangeDomain(1000,    # min executor memory
                                        28672,   # max executor memory
                                        512)))
    .add_param(Parameter('spark.driver.memory',
                         IntRangeDomain(512, 15200, 256)))
    .add_param(Parameter('spark.executor.cores',
                         IntRangeDomain(1, 4, 1)))
)

tuner = SparkTuner(config_set)

# The three Q17 runs reported in the tables above, expressed as tuner samples.
training_sample_1 = {
    "spark.executor.memory": 2000,
    "spark.sql.shuffle.partitions": 400,
    "spark.executor.cores": 3,
    "spark.driver.memory": 2000,
}
# NOTE(review): spark.executor.cores is 8 here, which is above the upper bound
# of the IntRangeDomain(1, 4, 1) declared for it. The table for this run also
# lists spark.executor.memory as 5g, whereas 4000 is used here — confirm.
training_sample_2 = {
    "spark.executor.memory": 4000,
    "spark.sql.shuffle.partitions": 600,
    "spark.executor.cores": 8,
    "spark.driver.memory": 4000,
}
# NOTE(review): the table for this run lists spark.executor.memory as 2g,
# whereas 1000 is used here — confirm which value was actually run.
training_sample_3 = {
    "spark.executor.memory": 1000,
    "spark.sql.shuffle.partitions": 100,
    "spark.executor.cores": 2,
    "spark.driver.memory": 1000,
}

# Register each sample together with the q17 timing reported for that run.
q17_runs = (
    (training_sample_1, 806750),
    (training_sample_2, 1191319),
    (training_sample_3, 1138390),
)
for sample, q17_timing in q17_runs:
    tuner.add_sample_to_train_data(sample, q17_timing)

In [20]:
# Ask the tuner for its next suggested configuration and display it.
suggested_config = tuner.get_next_best_config()
suggested_config

OrderedDict([('spark.sql.shuffle.partitions', 1910.0),
             ('spark.executor.memory', 10728.0),
             ('spark.driver.memory', 10752.0),
             ('spark.executor.cores', 4.0)])

In [24]:
# Output with the predicted config.
# NOTE(review): presumably the q17 timing measured with the configuration
# suggested above, in the same units as the training timings — confirm.
1066135

1066135