In [1]:
from spark.config.config_set import UniversalConfigSet
from spark.model.gaussian_model import GaussianModel
from spark.model.training_data import TrainingData
import numpy as np

In [2]:
training_data = TrainingData()
config_set = UniversalConfigSet(10, 1024 * 10)
model = GaussianModel(config_set, training_data)

In [3]:
training_sample_1 = {
    "spark.executor.memory": 1024 * 5,
    "spark.sql.shuffle.partitions": 100,
    "spark.executor.cores": 4,
    "spark.driver.memory": 1024
}
training_sample_2 = {
    "spark.executor.memory": 1024 * 10,
    "spark.sql.shuffle.partitions": 400,
    "spark.executor.cores": 8,
    "spark.driver.memory": 1024 * 3
}
model.add_sample_to_train_data(training_sample_1, 12)
model.add_sample_to_train_data(training_sample_2, 4)

In [7]:
model.train()
print model.get_training_pairwise_correlation()
print model.get_best_config()

[[1.         0.96131309]
 [0.96131309 1.        ]]
{'spark.executor.memory': 5017.0, 'spark.driver.memory': 768.0, 'spark.executor.cores': 7.0, 'spark.sql.shuffle.partitions': 60.0}


In [35]:
print model.get_training_pairwise_correlation()

[[1.         0.96131309]
 [0.96131309 1.        ]]


In [6]:
normalized_values = model.get_sampled_configs()
model.train()

In [7]:
# DEBUG Paragraph
# Mean Calculation
config = model.get_sampled_configs()[0]
print config
print model.beta
term1 = np.dot(config, model.beta)
print "term1 %s" % term1
term2 = model.training_out - np.dot(model.get_training_params(), model.beta)
print "term2 %s" % term2
term31 = model.get_correlation_with_train_data(config).transpose()
term32 = np.linalg.inv(model.get_training_pairwise_correlation())
term3 = np.dot(term31, term32)
print "term3 %s" % term3
term4 = np.dot(term3, term2)
print "term4 %s" % term4
print term1 + term4

[0.8461538461538461, 0.1875, -0.0, -0.0]
[[1.e-06]
 [1.e-06]
 [1.e-06]
 [1.e-06]]
term1 [1.03365385e-06]
term2 [[11.99999819]
 [ 3.99999425]]
term3 [[0.93987784 0.03812624]]
term4 [[11.43103713]]
[[11.43103816]]


In [13]:
# DEBUG Paragraph
# Variance calculation
config = model.get_sampled_configs()[0]
corr_with_train_data = model.get_correlation_with_train_data(config)
corr_pairwise_train_data = model.get_training_pairwise_correlation()
print corr_with_train_data
print corr_pairwise_train_data
term1 = np.dot(corr_with_train_data.transpose(), np.linalg.inv(corr_pairwise_train_data))
term2 = np.dot(term1, corr_with_train_data)
term3 = 1 - term2
print term3
print np.linalg.det(term3)

[[0.9765291 ]
 [0.94164311]]
[[1.         0.96131309]
 [0.96131309 1.        ]]
[[0.04628063]]
0.046280627946319715


In [6]:
#DEBUG Paragraph
import sys
from spark.discretizer.normalizer import ConfigNormalizer
normalized_values = model.get_sampled_configs()
best_config_value = None
best_config = {}
best_out = sys.maxint
# for config in list(itertools.product(*normalized_values)):
for config in normalized_values:
    out = model.predict(config)
    if out < best_out:
        best_out = out
        best_config_value = config
print best_out
print best_config_value
# model.normalizer.denormalize_config(best_config_value)
i = 0
for param in model.normalizer.get_params():
    print ConfigNormalizer.denormalize_value(param, best_config_value[i])

[4.62752883e-205]
[0.02564102564102564, 0.5, 0.6666666666666666, 0.625]
60.0
1131.0
276.0
2.0


In [8]:
#DEBUG Paragraph
import sys
from spark.discretizer.normalizer import ConfigNormalizer

normalized_values = model.get_sampled_configs()
best_config_value = None
best_config = {}
best_out = sys.maxint
# for config in list(itertools.product(*normalized_values)):
for config in normalized_values:
    out = model.predict(config)
    if out < best_out:
        best_out = out
        best_config_value = config

denorm_best_config = model.normalizer.denormalize_config(best_config_value)
print denorm_best_config
for param in model.normalizer.get_params():
    print param.get_name() 

[60.0, 5017.0, 768.0, 7.0]
spark.sql.shuffle.partitions
spark.executor.memory
spark.driver.memory
spark.executor.cores


In [4]:
from spark.spark_tuner import SparkTuner
tuner = SparkTuner(config_set)

In [6]:
training_sample_1 = {
    "spark.executor.memory": 1024 * 5,
    "spark.sql.shuffle.partitions": 100,
    "spark.executor.cores": 4,
    "spark.driver.memory": 1024
}
training_sample_2 = {
    "spark.executor.memory": 1024 * 10,
    "spark.sql.shuffle.partitions": 400,
    "spark.executor.cores": 8,
    "spark.driver.memory": 1024 * 3
}
tuner.add_sample_to_train_data(training_sample_1, 12)
tuner.add_sample_to_train_data(training_sample_2, 4)

In [7]:
tuner.get_next_best_config()

{'spark.driver.memory': 768.0,
 'spark.executor.cores': 7.0,
 'spark.executor.memory': 5017.0,
 'spark.sql.shuffle.partitions': 60.0}

In [1]:
from spark.config.config_set import UniversalConfigSet
from spark.model.gaussian_model import GaussianModel
from spark.model.training_data import TrainingData
from spark.config.config import Config
from spark.discretizer.normalizer import ConfigNormalizer

In [2]:
training_data = TrainingData()
config_set = UniversalConfigSet(4, 26544)
model = GaussianModel(config_set, training_data)
training_sample_1 = {
    "spark.executor.memory": 11945,
    "spark.sql.shuffle.partitions": 200,
    "spark.executor.cores": 2,
    "spark.driver.memory": 1024 * 4
}
training_sample_2 = {
    "spark.executor.memory": 5972,
    "spark.sql.shuffle.partitions": 300,
    "spark.executor.cores": 1,
    "spark.driver.memory": 1024 * 2
}
training_sample_3 = {
    "spark.executor.memory": 11945,
    "spark.sql.shuffle.partitions": 460,
    "spark.executor.cores": 2,
    "spark.driver.memory": 1024 * 4
}
training_sample_4 = {
    "spark.executor.memory": 10068,
    "spark.sql.shuffle.partitions": 1660,
    "spark.executor.cores": 1,
    "spark.driver.memory": 1024
}
model.add_sample_to_train_data(training_sample_1, 131)
model.add_sample_to_train_data(training_sample_2, 143)
model.add_sample_to_train_data(training_sample_3, 155)
model.add_sample_to_train_data(training_sample_4, 343)
model.train()
config = Config(4, 26544)
params = config_set.get_params()
# for param in params:
#     if param.get_name() == 'spark.executor.memory':
#         config.add_param(param, 10068)
#     elif param.get_name() == 'spark.sql.shuffle.partitions':
#         config.add_param(param, 1660)
#     elif param.get_name() == 'spark.executor.cores':
#         config.add_param(param, 1)
#     elif param.get_name() == 'spark.driver.memory':
#         config.add_param(param, 1024)
for param in params:
    if param.get_name() == 'spark.executor.memory':
        config.add_param(param, 10068)
    elif param.get_name() == 'spark.sql.shuffle.partitions':
        config.add_param(param, 1660)
    elif param.get_name() == 'spark.executor.cores':
        config.add_param(param, 1)
    elif param.get_name() == 'spark.driver.memory':
        config.add_param(param, 1024)


In [3]:
# print config.get_all_param_names()
# print config.get_all_param_values()
norm_value = model.normalizer.normalize_config(config.get_all_param_values())
print norm_value
# print model.normalizer.denormalize_config(norm_value)
model.predict(norm_value)

[0.8461538461538461, 0.23529411764705882, 0.0, 0.0]


array([147.39096822])

In [22]:
i = 0
for param in model.normalizer.get_params():
    print param.get_name()
    print config.get_all_param_values()[i]
    print param.get_domain().get_min()
    print param.get_domain().get_max()    
    a = ConfigNormalizer.normalize_value(param, config.get_all_param_values()[i])
    print a
    print ConfigNormalizer.denormalize_value(param, a)
    i = i + 1
    print ">>>>>>>>"

spark.sql.shuffle.partitions
1660
10
1960
0.153846153846
310.0
>>>>>>>>
spark.executor.memory
10068
5972
23380
0.764705882353
19284.0
>>>>>>>>
spark.driver.memory
1024
1024
6400
1.0
6400.0
>>>>>>>>
spark.executor.cores
1
1
4
1.0
4.0
>>>>>>>>


In [24]:
(1660 - 10) / float(1960 - 10)

0.8461538461538461