In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.decomposition import FactorAnalysis, PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, Normalizer, StandardScaler, MaxAbsScaler, KBinsDiscretizer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, RationalQuadratic, Matern, DotProduct, WhiteKernel

from sklearn.metrics import pairwise_distances_argmin_min, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.base import clone

In [3]:
class DataRepo:
    '''
    Data repository. Contains methods to prune metrics and preprocess knobs.
    '''
    def __init__(self, params, offline_path='./data/train/offline_workload.csv'):
        self.OFFLINE_WL_PATH = offline_path
        self.METRICS_START_IDX = 14
        self.LATENCY_IDX = 13
        self.INT_KNOBS_IDXS = [9, 10, 11, 12]
        self.CONT_KNOBS_IDXS = [1, 2, 3, 4, 5, 6, 8]
        self.BOOL_KNOS_IDX = 7
        self.pruned_metrics_idxs = None
        self.pruned_metrics_names = None
        
        # Hyperparameters
        self.dim_reducer = None
        self.kmeans = None
        self.int_encoder = None
        self.cont_encoder = None 
        
        self.__set_hyperparams(params)
        
    def __set_hyperparams(self, params):
        self.dim_reducer = params['dim_reducer']
        self.kmeans = params['kmeans']
        self.int_encoder = params['int_encoder']
        self.cont_encoder = params['cont_encoder']
    
    def _build(self):
        '''
        Run only once by OtterTune object.
        Prunes metrics and preprocesses knobs in offline workloads.
        Final processed data is not saved, rather returned to 
        OtterTune to create Workload objects.
        '''
        print('Pruning metrics and pre-processing knobs...')
        pruned_data = self.__prune_offline_metrics(self.OFFLINE_WL_PATH)
        processed_data = self.__preprocess_workload_knobs(pruned_data)
        return processed_data
    
    def process_online_workload(self, raw_workload):
        '''
        Prune metrics and preprocess knobs of online workloads.
        '''
        pruned_data = self.__prune_online_metrics(raw_workload)
        return self.__preprocess_workload_knobs(pruned_data, online=True)
    
    def process_test_knobs(self, test_knobs):
        '''
        Preprocess test knobs.
        '''
        return self.__preprocess_workload_knobs(test_knobs, online=True, only_knobs=True)
          
    def __prune_offline_metrics(self, file_path=None):
        '''
        Prune offline workloads metrics using FA + KMeans.
        NOTE: Modularize to use any technique.
        '''
        data = pd.read_csv(file_path)
        metrics = data.to_numpy()[:, self.METRICS_START_IDX:].T

        metric_factors = self.dim_reducer.fit_transform(metrics)
        kmeans = self.kmeans.fit(metric_factors)
        closest_idxs, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, metric_factors)
        self.pruned_metrics_idxs = closest_idxs
        closest_idxs_raw = [self.METRICS_START_IDX + idx for idx in closest_idxs]
        self.pruned_metrics_names = data.columns[closest_idxs_raw].tolist()
        
        pruned_metrics = metrics[self.pruned_metrics_idxs].T
        n_cols = data.shape[1]
        metric_cols = np.linspace(self.METRICS_START_IDX, n_cols - 1, n_cols - self.METRICS_START_IDX, dtype=int)
        data.drop(data.columns[metric_cols], axis=1, inplace=True)
        pruned_data = pd.concat([data, pd.DataFrame(pruned_metrics)], axis=1)
        return pruned_data
        
    def __prune_online_metrics(self, raw_workload):
        '''
        Prune online workloads metrics using identified
        non-redundant metrics from offline workloads.
        '''
        data = raw_workload.reset_index(drop=True)
        metrics = data.to_numpy()[:, self.METRICS_START_IDX:].T
        pruned_metrics = metrics[self.pruned_metrics_idxs].T
        
        n_cols = data.shape[1]
        metric_cols = np.linspace(self.METRICS_START_IDX, n_cols - 1, n_cols - self.METRICS_START_IDX, dtype=int)
        data.drop(data.columns[metric_cols], axis=1, inplace=True)
        pruned_data = pd.concat([data, pd.DataFrame(pruned_metrics)], axis=1)
        return pruned_data
    
    def __preprocess_workload_knobs(self, pruned_data, online=False, only_knobs=False):
        '''
        Preprocess knobs.
        If online is True, transform using fitted encoders (online knobs)
        Otherwise, fit and then transform (offline knobs)
        For test knobs, only_knobs is True.
        '''
        col_names = pruned_data.columns.tolist()
        pruned_n = pruned_data.to_numpy()
        int_knobs = self.INT_KNOBS_IDXS
        cont_knobs = self.CONT_KNOBS_IDXS
        bool_knob = self.BOOL_KNOS_IDX
        
        if only_knobs:
            int_knobs = [idx - 1 for idx in int_knobs]
            cont_knobs = [idx - 1 for idx in cont_knobs]
            bool_knob = bool_knob - 1
            online = True
        
        if not online:
            if self.int_encoder:
                pruned_n[:, int_knobs] = self.int_encoder.fit_transform(pruned_n[:, int_knobs])
            if self.cont_encoder:
                pruned_n[:, cont_knobs] = self.cont_encoder.fit_transform(pruned_n[:, cont_knobs])
        else:
            if self.int_encoder:
                pruned_n[:, int_knobs] = self.int_encoder.transform(pruned_n[:, int_knobs])
            if self.cont_encoder:
                pruned_n[:, cont_knobs] = self.cont_encoder.transform(pruned_n[:, cont_knobs])
        
        pruned_n[:, bool_knob] = pruned_n[:, bool_knob].astype(int)
        return pd.DataFrame(pruned_n, columns=col_names)
    

In [4]:
class OtterTune:
    '''
    Main OtterTune system. Contains methods to perform workload mapping and predicting latency.
    '''
    def __init__(self, repo, params):
        self.repo = repo
        self.metric_model = None
        self.metric_binner = None
        self.N_METRICS = None
        
        self.workloads = []
        self.__set_hyperparams(params)
        self.__build_workloads()
        
    def __set_hyperparams(self, params):
        self.metric_model = params['metric_model']
        self.metric_binner = params['metric_binner']
        
    def __build_workloads(self):
        '''
        Run only once at OtterTune object creation.
        Creates Workload objects and build metric models on each.
        '''
        data = self.repo._build()        
        latency_idx = self.repo.LATENCY_IDX
        wl_ids = data['workload id'].unique()
        
        for wl_id in tqdm(wl_ids, desc='Building Offline Workloads'):
            wl_data = data[data['workload id'] == wl_id].to_numpy()
            knobs = wl_data[:, 1:latency_idx]
            metrics = wl_data[:, latency_idx:]
            if not self.N_METRICS:
                self.N_METRICS = metrics.shape[1]
            workload = Workload(wl_id, knobs, metrics, self.metric_model)
            workload.build_metric_models()
            self.workloads.append(workload)    
    
    def predict(self, raw_workload, test_knobs):
        '''
        Predicts latency for test knobs given online workload.
        Uses helper functions for workload mapping and to
        augment online workload with matched offline workload.
        '''
        processed_wl = self.repo.process_online_workload(raw_workload)
        processed_wl_metrics = processed_wl.iloc[:, 13:]
        processed_wl_knobs = processed_wl.iloc[:, 1:13]
        processed_test_knobs = self.repo.process_test_knobs(test_knobs)

        best_wl_idx = self.__get_best_workload(processed_wl_knobs, processed_wl_metrics)
        aug_wl = self.__get_augmented_workload(best_wl_idx, processed_wl)
        
        gpr = clone(self.metric_model)
        gpr.fit(aug_wl[:, :-1], aug_wl[:, -1])
        preds = gpr.predict(processed_test_knobs)
        return preds, self.workloads[best_wl_idx].wl_id
    
    def __get_augmented_workload(self, best_wl_idx, processed_wl):
        '''
        Given matched workload, augment current online workload data.
        '''
        w = self.workloads[best_wl_idx]
        w_knobs, w_latency = w.knobs, w.metrics[:, 0].reshape(-1, 1)
        offline = np.concatenate((w_knobs, w_latency), 1)
        
        online = processed_wl.iloc[:, 1:14].to_numpy()
        aug_wl = np.concatenate((offline, online), 0)
        return aug_wl
        
    def __get_best_workload(self, wl_knobs, wl_metrics):
        '''
        Performs workload mapping given online workload (knobs, metrics).
        '''
        n_wls, n_configs = len(self.workloads), len(wl_knobs)
        S = self.__build_distance_matrix(wl_knobs)
        
        binned_S, transf = self.__bin_metrics(S)
        online_metrics = self.__bin_online_metrics(wl_metrics, transf)
        
        best_wl_idx = np.argmin(np.mean(np.sqrt(np.sum((binned_S - online_metrics)**2, axis=2)), axis=0))
        return best_wl_idx
    
    def __build_distance_matrix(self, train_knobs):
        '''
        Build distance matrix S (paper section 6.1).
        Helps efficiently calculate closest offline workload.
        '''
        n_wls, n_configs = len(self.workloads), len(train_knobs)
        S = np.zeros((self.N_METRICS, n_wls, n_configs))
        for metric_idx in range(self.N_METRICS):
            for wl_idx, w in enumerate(self.workloads):
                row = w.predict_metric(metric_idx, train_knobs)
                S[metric_idx, wl_idx, :] = row
        return S

    def __bin_metrics(self, S):
        '''
        Normalizes metrics with bin number using deciles.
        Needed to perform accurate distance comparisons.
        '''
        n_metrics, n_wls, n_configs = S.shape
        sr = S.reshape(n_wls*n_configs, n_metrics)
        metric_binner = clone(self.metric_binner)
        sr = metric_binner.fit_transform(sr)
        S = sr.reshape(n_metrics, n_wls, n_configs)
        return S, metric_binner
        
    def __bin_online_metrics(self, wl_metrics, metric_binner):
        '''
        Normalizes online metrics with bin number using deciles.
        Uses previsouly used encoder (transf).
        '''
        online_metrics = metric_binner.transform(wl_metrics).T
        online_metrics = np.repeat(online_metrics[:, np.newaxis, :], len(self.workloads), axis=1)
        return online_metrics

In [5]:
class Workload:
    '''
    Models each workload. Contains methods to train GPR models on each metric.
    Predicts latency (metric index 0).
    '''
    def __init__(self, wl_id, knobs, metrics, metric_model):
        self.wl_id = wl_id
        self.knobs = knobs
        self.metrics = metrics
        self.metric_model = metric_model
        self.models = {}
        self.N_METRICS = metrics.shape[1]
        
    def build_metric_models(self):
        '''
        Train GPR models on each metric.
        '''
        for metric_idx in range(self.N_METRICS):
            model = clone(self.metric_model)
            model.fit(self.knobs, self.metrics[:, metric_idx])
            self.models[metric_idx] = model
        
    def predict_metric(self, metric_idx, knobs):
        '''
        Predict a metric using existing model.
        '''
        return self.models[metric_idx].predict(knobs)
    

In [6]:
class Tester:
    '''
    Driver class to run val/test workloads and report performance.
    Each workload required 3 files.
    online_path - Online workloads file
    test_path - Test Knobs file
    true_path - True Latency for test knobs file
    '''
    def __init__(self, ottertune, mode='val'):
        self.mode = mode # either 'val' or 'test'
        self.ONLINE_PATH = None # online workload
        self.TEST_PATH = None # test knobs (like test.csv)
        self.TRUE_PATH = None # true latency (only for 'val')
        self.RESULT_PATH = None # To write out results
        
        self.o = ottertune
        self.online_workloads = {}
        self.test_knobs = {}
        self.true_preds = None
        self.wl_ids = None
        
        self.__set_file_paths()
        self.__load_data()
        
    def __set_file_paths(self):
        '''
        Sets file paths based on val/test dataset.
        '''
        path = './data/' + self.mode + '/'
        self.ONLINE_PATH = path + 'online_workload.csv' 
        self.TEST_PATH = path + 'test_knobs.csv' 
        
        # Not a CSV as the MSE is added to file name later
        self.RESULT_PATH = './data/out/' + self.mode + '_results'
        if self.mode == 'val' or self.mode == 'sanity':
            self.TRUE_PATH = path + 'true_latency.csv'
        
    def __load_data(self):
        '''
        Run only once at creation of Tester.
        Loads all 3 required files.
        '''
        online = pd.read_csv(self.ONLINE_PATH)
        knobs = pd.read_csv(self.TEST_PATH)
        if self.mode == 'val' or self.mode == 'sanity':
            self.true_preds = pd.read_csv(self.TRUE_PATH, header=None).to_numpy().reshape(-1)
        wl_ids = online['workload id'].unique().tolist()
        self.wl_ids = wl_ids
        for wl_id in tqdm(wl_ids, desc='Loading Online Workloads'):
            w = online[online['workload id'] == wl_id]
            k = knobs[knobs['workload id'] == wl_id].iloc[:, 1:]
            self.online_workloads[wl_id] = w
            self.test_knobs[wl_id] = k
                
    def run(self):
        '''
        Runs each workload to predict latency for each test knob.
        Saves result file with true/pred workload id and latency.
        Prints MSE across all workloads.
        '''
        preds_arr = []
        pi = 0
        for wl_id in tqdm(self.wl_ids, desc='Running Target Workloads'):
            online_wl = self.online_workloads[wl_id]
            test_knobs = self.test_knobs[wl_id]
            preds, best_wl_id = self.o.predict(online_wl, test_knobs)
            for p in preds:
                if self.mode == 'val' or self.mode == 'sanity':
                    preds_arr.append([wl_id, best_wl_id, self.true_preds[pi], p])
                elif self.mode == 'test':
                    preds_arr.append([wl_id, best_wl_id, p])                    
                pi += 1
        
        if self.mode == 'val' or self.mode == 'sanity':
            column_names = ['wl_id', 'mapped_wl_id', 'true_latency', 'latency_pred']
            df = pd.DataFrame(preds_arr, columns=column_names)
            mae = mean_absolute_error(self.true_preds, df.iloc[:, -1].to_numpy())
            df.to_csv(self.RESULT_PATH + '_(' + str(round(mae, 2)) + ').csv')
            print('MAE:', mae)
        elif self.mode == 'test':
            column_names = ['wl_id', 'mapped_wl_id', 'latency_pred']
            df = pd.DataFrame(preds_arr, columns=column_names)
            df.to_csv(self.RESULT_PATH + '.csv')
        

# Workflow

#### DataRepo
* Load offline workloads, prune metrics, pre-process knobs.
* Saves pruned metrics, knob encoders to be used later to transform online workloads.

#### OtterTune 
* Create offline Workload objects from processed offline data from DataRepo  
* Given online workload and test knobs, performs workload mapping
* Augments current workload with mapped workload
* Predict latency for test knobs  
* Uses pruned metric info and trained knob encoders in DataRepo to transform online workloads

#### Workload
* Models a single workload
* Contains GPR models trained on each metric (knobs -> GPR -> latency/metric)

#### Tester
* Works given a mode ('val' or 'test')
* Loads online workloads and their respective test knobs
* If 'val', loads true latencies to report mean absolute error (MAE)
* Saves a result file under `./data/out/` with mapped workload info and predicted latency
* Result file name `val_results_({MAE}).csv`

#### Datasets
* Train - offline_workload.csv (makes our DataRepo)
* Val – online_workload_B.csv (100 workloads with 6 configs each) split into 3 files
    1. `online_workload.csv` (100 workloads with 5 configs each, randomly chosen)
    2. `test_knobs.csv` (100 workloads with left out 1 config each)
    3. `true_latency.csv` (True latency values for each test knob in test_knobs.csv)
* Test – online_workload_C.csv and provided test knobs

## Set Parameters

In [12]:
kernel = DotProduct() + WhiteKernel() # Some reason RBF predicts "0" values, RationalQuadratic works well sometimes
params = {
    'dim_reducer': PCA(n_components=2),
    'kmeans': KMeans(n_clusters=10),
    'int_encoder': None,
    'cont_encoder': None,
    'metric_model': GaussianProcessRegressor(kernel=kernel),
    'metric_binner': KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
}

## Validation Run

### NOTE
* Results show lots of '0' value predictions for latency.  
* Will create a dummy val dataset from offline workload so that we know what the correct workload mapping is. I think this would be a good sanity check before we tune all the hyperparameters for better MAE.

In [13]:
repo = DataRepo(params)
o = OtterTune(repo, params)
t = Tester(o, mode='val')

Pruning metrics and pre-processing knobs...


Building Offline Workloads: 100%|██████████| 58/58 [00:34<00:00,  1.67it/s]
Loading Online Workloads: 100%|██████████| 100/100 [00:00<00:00, 355.73it/s]


In [14]:
t.run()

Running Target Workloads: 100%|██████████| 100/100 [01:20<00:00,  1.25it/s]

MAE: 47.14169693982604





## Workload Mapping Sanity Check
Online workloads are same as offline, just different configs. Wanted to see if mapped workloads are correct.

MAE is ~28. I rigged prediction by directly providing it the right workload mapping and MAE was ~20. I think at that point, MAE could be improved by tuning GPR's. However, workload mapping is not great. Lots of incorrect mappings (check `./data/out/`). I looked a bit into this, turns out implementation is right however, the pruned metrics are just not helping to differentiate them enough to make the distance comparison between workloads accurate.

Some ideas to improve workload mapping I could come up with
* Tuning no. of clusters (increasing it helped as more pruned metrics are involved, but increasing more didn't)
* Some of the pruned metrics are in the range 10^8 - 10^12. Maybe we should think about scaling these before we train GPR's? I looked into some predictions are I guess because of the large range, predictions are off by a lot compared to say latency.
* Choice of kernel had big difference. DotProduct + WhiteKernel or RationalQuadratic worked well. RBF was terrible with lots of '0' value predictions for some reason.


In [10]:
repo = DataRepo(params, offline_path='./data/sanity/offline_workload.csv')
o = OtterTune(repo, params)
t = Tester(o, mode='sanity')

Pruning metrics and pre-processing knobs...


Building Offline Workloads: 100%|██████████| 58/58 [00:31<00:00,  1.83it/s]
Loading Online Workloads: 100%|██████████| 58/58 [00:00<00:00, 391.30it/s]


In [11]:
t.run()

Running Target Workloads: 100%|██████████| 58/58 [00:44<00:00,  1.30it/s]

MAE: 28.468814798389925



