# Load data 

#### First, we import some libraries

In [1]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
# high-level plots
import seaborn as sns

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# Decision Tree
from sklearn.tree import DecisionTreeRegressor, plot_tree
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# ElasticNet is a hybrid method between Ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans
# get interactions of features
from sklearn.preprocessing import PolynomialFeatures
# loss functions and metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# we use it to interact with the file system
import os
# compute time
from time import time

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# no warning
import warnings
warnings.filterwarnings("ignore")

### Import data

In [2]:
# root directory of the measurements, one sub-directory per software system
data_dir = "../data/"

# the name of the systems we are testing
name_systems = ["nodejs", "poppler", "xz", "x264", "gcc", "lingeling", "sqlite", "imagemagick"]

# final results: data[system, input index] -> dataframe of measurements
data = dict()

# per-system metadata filled while loading
inputs_name = dict()
inputs_count = dict()
inputs_prop = dict()
inputs_num = dict()

# name of the performance properties
inputs_perf = {
    "gcc": ["size", "ctime", "exec"],
    "imagemagick": ["size", "time"],
    "lingeling": ["conflicts", "cps", "reductions"],
    "nodejs": ["ops"],
    "poppler": ["size", "time"],
    "sqlite": ["q"+str(k) for k in range(1, 16)],
    "x264": ["size", "kbs", "fps", "etime", "cpu"],
    "xz": ["size", "time"],
}

# name of features (configuration options) for each system
inputs_feat = {
    "gcc": ["optim","-floop-interchange","-fprefetch-loop-arrays","-ffloat-store","-fno-asm"],
    "imagemagick": ["memory_r", "posterize_r", "gaussian-blur", "thread", "quality"],
    "lingeling": ["--boost", "--carduse", "--decompose", "--gluescale", "--lkhd", "--memlim",
                  "--minimize", "--prbsimple", "--sweepirr", "--sweepred"],
    "nodejs": ["--jitless", "--experimental-wasm-modules", "--experimental-vm-modules",
               "--preserve-symlinks-main","--no-warnings","--node-memory-debug"],
    "poppler": ["format","j","jp2","jbig2","ccitt"],
    "sqlite": ["-deserialize", "-memtrace", "-maxsize", "-append", "-output"],
    "x264": ["cabac", "ref", "deblock", "analyse", "me", "subme", "mixed_ref", "me_range", "trellis",
             "8x8dct", "fast_pskip", "chroma_qp_offset", "bframes", "b_pyramid", "b_adapt", "direct",
             "weightb", "open_gop", "weightp", "scenecut", "rc_lookahead", "mbtree", "qpmax", "aq-mode"],
    "xz": ["memory","format","level","depth"],
}

# the options that have categorical values are isolated here,
# because they are more difficult to handle for ML algorithms
# (they are one-hot encoded below)
inputs_categ = {
    "gcc": ["optim"],
    "imagemagick": [],
    "lingeling": [],
    "nodejs": [],
    "poppler": ["format"],
    "sqlite": [],
    "x264": ['analyse', 'me', 'direct', 'deblock', 'b_adapt', 'b_pyramid',
             'open_gop', 'rc_lookahead', 'scenecut', 'weightb'],
    "xz": ['memory', 'format'],
}

for ns in name_systems:

    data_path = data_dir+ns+'/'

    # properties of the inputs of this system, indexed by input id
    properties = pd.read_csv(data_path+'/others/properties.csv').set_index('id')
    inputs_prop[ns] = properties

    # one csv file of measurements per input
    inputs = [str(name)+'.csv' for name in properties['name']]
    inputs_name[ns] = inputs
    inputs_count[ns] = len(inputs)

    # numerical options = all the options minus the categorical ones
    categ_cols = inputs_categ[ns]
    num_cols = np.setdiff1d(inputs_feat[ns], categ_cols)
    inputs_num[ns] = num_cols

    for i, input_file in enumerate(inputs):
        df = pd.read_csv(data_path+input_file)
        # numerical options first, then the dummies of the categorical
        # options (if any), then the performance properties
        features = df[num_cols]
        if categ_cols:
            features = features.join(pd.get_dummies(df[categ_cols]))
        data[ns, i] = features.join(df[inputs_perf[ns]])

### Example of a dataframe

> **Usage** :  ```data["name_of_my_software_system", id_of_my_input]``` returns the dataframe of measurements for this software system on this input

In [3]:
# e.g. for x264 and its first input (inputs are indexed from 0,
# in the order of the rows of the system's properties.csv file)
data["x264", 0]

Unnamed: 0,8x8dct,aq-mode,bframes,cabac,chroma_qp_offset,fast_pskip,mbtree,me_range,mixed_ref,qpmax,...,scenecut_0,scenecut_40,scenecut_None,weightb_1,weightb_None,size,kbs,fps,etime,cpu
0,0,0,0,0,0,1,0,16,0,69,...,1,0,0,0,1,403085,161.07,375.22,2.14,434
1,1,1,8,1,0,1,1,16,1,69,...,1,0,0,1,0,234157,93.57,217.07,3.40,734
2,1,0,8,1,0,1,1,16,1,69,...,1,0,0,1,0,159836,63.87,293.42,2.71,739
3,1,0,8,1,0,1,1,16,1,69,...,1,0,0,1,0,163586,65.37,276.79,2.78,858
4,1,1,3,1,-2,1,0,24,1,69,...,0,1,0,1,0,218392,87.27,287.79,2.74,699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,1,1,3,1,0,1,1,16,1,69,...,0,1,0,1,0,183183,73.20,254.20,3.01,716
197,1,1,3,1,0,1,1,24,0,69,...,0,1,0,1,0,195730,78.21,269.38,2.88,644
198,1,1,3,1,-2,0,1,16,1,69,...,0,1,0,1,0,178146,71.19,263.64,2.92,927
199,1,1,3,0,-2,1,0,16,1,69,...,0,1,0,1,0,234018,93.51,267.21,2.89,685


#### Loss function

In [4]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like of numbers
        The reference (measured) values.
    y_pred : array-like of numbers
        The predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / max(|y_true|, 1e-4)) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # The magnitude of the reference value is floored at 1e-4 to avoid a
    # division by zero. Taking the absolute value first matters: clamping the
    # signed value would replace every negative reference by 1e-4 and blow up
    # the error.
    denominator = np.maximum(np.abs(y_true), 1e-4)
    return np.mean(np.abs((y_true - y_pred) / denominator)) * 100

#### Model Shift (MS) is a transfer learning approach defined by Valov et al. 
First, it trains a performance model on the source input and predicts the performance distribution of the source input. 
Then, it trains a shifting function, predicting the performances of the target input based on the performances of the source. 
Finally, it applies the shifting function to the predictions of the source. 

In [13]:
class MS:
    """Model Shift (MS) transfer learning between the inputs of a system.

    A performance model is trained on a *source* input, a shifting function
    is trained to map source performances to *target* performances, and the
    shifting function is finally applied to the predictions of the source
    model to predict the performances of the target.

    The class reads the module-level dictionaries ``inputs_name``,
    ``inputs_perf``, ``inputs_prop``, ``inputs_count`` and ``data``.
    """

    def __init__(self, ns, performance, nb_input_clusters = 4):
        """Prepare the data of one software system for one performance.

        Parameters
        ----------
        ns : str
            Name of the software system (a key of the module-level dicts).
        performance : str
            Name of the performance property to predict.
        nb_input_clusters : int
            Number of groups of inputs computed from the input properties
            (capped by the number of inputs).
        """
        # the name of the current software system
        self.ns = ns

        # the performance to predict
        self.performance = performance

        # see the loading of the data
        self.inputs_name = inputs_name[self.ns]
        self.inputs_perf = inputs_perf[self.ns]
        self.inputs_prop = inputs_prop[self.ns].drop(['name'], axis = 1)

        # the total number of inputs for this software system
        self.nb_inputs = inputs_count[self.ns]

        # the data to work with - removing the other performance properties
        other_perfs = np.setdiff1d(self.inputs_perf, [self.performance])
        self.data = {i: data[self.ns, i].drop(other_perfs, axis = 1)
                     for i in range(self.nb_inputs)}

        # random state, i.e. a seed to split the source and the target datasets
        # by using the same set of configurations - needed to compare
        # the performance of configurations used in the training set of the target
        self.random_state = np.random.randint(0,1000)

        # the total number of configurations measured per input
        self.nb_config = len(data[self.ns, 0].index)

        # compute groups of inputs thanks to input properties
        self.prop_dummies = pd.get_dummies(self.inputs_prop).fillna(0).astype(float)
        # we normalize the different properties so the clustering algorithm
        # is not influenced by the scale of the properties
        for col in self.prop_dummies.columns:
            centered = self.prop_dummies[col] - np.mean(self.prop_dummies[col])
            spread = np.std(self.prop_dummies[col])
            # a property with the same value for all inputs has a null
            # standard deviation: only center it, dividing would produce NaN
            # values that the clustering algorithm cannot handle
            self.prop_dummies[col] = centered/spread if spread > 0 else centered
        # computing the clusters/groups of inputs
        # (there cannot be more clusters than inputs)
        clust_alg = KMeans(n_clusters = min(nb_input_clusters, self.nb_inputs))
        clust_alg.fit(self.prop_dummies)
        self.inputs_grps = clust_alg.predict(self.prop_dummies)

    def _random_other_input(self, candidates = None):
        """Draw at random an input id of `candidates` that is not the target.

        `candidates` defaults to all the inputs. Returns None when no
        candidate differs from the target.
        """
        if candidates is None:
            candidates = range(self.nb_inputs)
        others = [i for i in candidates if i != self.target_id]
        if not others:
            return None
        return others[np.random.randint(0, len(others))]

    def _sample_potential_sources(self):
        """Sample the inputs available as source, the target being excluded."""
        if self.nb_inputs < 2:
            return []
        # train_test_split needs at least one element on each side
        nb_sampled = min(max(self.nb_available_source_inputs, 1), self.nb_inputs - 1)
        sampled, _ = train_test_split(list(range(self.nb_inputs)), train_size = nb_sampled)
        # choosing the target as source would be unfair
        return [ps for ps in sampled if ps != self.target_id]

    def _closest_properties(self, potential_sources):
        """Return the potential source whose properties are the closest to the target's."""
        tgt_prop = self.prop_dummies.iloc[self.target_id]
        # the mean absolute error between the input properties of the sources and the target
        diff_src_tgt = [mean_absolute_error(tgt_prop, self.prop_dummies.iloc[ps])
                        for ps in potential_sources]
        # we select the source for which the difference of properties is minimal
        return potential_sources[np.argmin(diff_src_tgt)]

    def _max_perf_corr(self, potential_sources):
        """Return the potential source the most correlated with the target.

        The correlation is computed on the training set of configurations only.
        """
        y_tgt = np.array(self.data[self.target_id][self.performance], dtype=float)
        y_tgt_train, _ = train_test_split(y_tgt,
                                          train_size=self.train_size,
                                          random_state=self.random_state)
        corr_src_tgt = []
        for ps in potential_sources:
            y_src = np.array(self.data[ps][self.performance], dtype=float)
            # same seed and same size: same configurations as for the target
            y_src_train, _ = train_test_split(y_src,
                                              train_size=self.train_size,
                                              random_state=self.random_state)
            corr = np.corrcoef(y_src_train, y_tgt_train)[0,1]
            # the correlation is undefined (NaN) for a constant performance;
            # such a source must never win the argmax
            corr_src_tgt.append(-np.inf if np.isnan(corr) else corr)
        # we select the source having the greater correlation with the target
        return potential_sources[np.argmax(corr_src_tgt)]

    def choose_best_source(self, method):
        """Choose the id of the source input for the current target.

        Reads ``self.target_id`` and, depending on the method,
        ``self.nb_available_source_inputs`` and ``self.train_size``
        (all set by `learn`).

        Parameters
        ----------
        method : str
            'random': any input but the target;
            'same_input_grp': a random input of the group of the target;
            'closest_properties': among a random sample of inputs, the one
            whose properties are the closest to those of the target;
            'max_perf_corr': among a random sample of inputs, the one whose
            performances are the most correlated with those of the target.

        Returns
        -------
        int
            The id of the source input. It differs from the target
            whenever the system has at least two inputs.

        Raises
        ------
        ValueError
            If `method` is not one of the four methods above.
        """
        if method == 'random':
            output_id = self._random_other_input()
        elif method == 'same_input_grp':
            target_grp = self.inputs_grps[self.target_id]
            target_grp_inputs = [i for i, grp in enumerate(self.inputs_grps)
                                 if grp == target_grp]
            output_id = self._random_other_input(target_grp_inputs)
        elif method in ('closest_properties', 'max_perf_corr'):
            # select a set of potential source inputs
            potential_sources = self._sample_potential_sources()
            if not potential_sources:
                output_id = None
            elif method == 'closest_properties':
                output_id = self._closest_properties(potential_sources)
            else:
                output_id = self._max_perf_corr(potential_sources)
        else:
            raise ValueError("unknown source selection method: " + repr(method))

        if output_id is None:
            # no eligible source (e.g. the target is alone in its group):
            # fall back to any other input rather than to the target itself
            output_id = self._random_other_input()
        if output_id is None:
            # the system has a single input: only the target is available
            output_id = self.target_id
        return output_id

    def learn(self,
              target_id,
              proportion_training_config,
              method,
              prop_available_source_inputs,
              learning_algorithm = RandomForestRegressor,
              shift_function = RandomForestRegressor):
        """Transfer a performance model to the target and evaluate it.

        Parameters
        ----------
        target_id : int
            The id of the target input.
        proportion_training_config : float
            Proportion of the configurations used to train the shifting function.
        method : str
            The source selection method, see `choose_best_source`.
        prop_available_source_inputs : float
            Proportion of the inputs that can be considered as source.
        learning_algorithm : callable
            Called without argument, returns the model trained on the source
            (must expose ``fit`` and ``predict``).
        shift_function : callable
            Called without argument, returns the model mapping source
            performances to target performances.

        Returns
        -------
        float
            The mean absolute percentage error between the measured
            performances of the target (test configurations) and the
            shifted predictions.
        """
        # the number of configurations used for the training of the transfer
        self.train_size = int(proportion_training_config*self.nb_config)

        # the number of potential inputs that we can use as source
        self.nb_available_source_inputs = int(prop_available_source_inputs*self.nb_inputs)

        # the target id
        self.target_id = target_id

        # choose the source id
        source_id = self.choose_best_source(method)

        # we define the source input, and split it into train-test
        source = self.data[source_id]
        X_src = source.drop([self.performance], axis = 1)
        y_src = np.array(source[self.performance], dtype=float)
        X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_src,
                                                                            y_src,
                                                                            train_size=self.train_size,
                                                                            random_state=self.random_state)

        # we define the target input, and split it into train-test
        # (same seed and same size: same configurations as for the source)
        target = self.data[self.target_id]
        X_tgt = target.drop([self.performance], axis = 1)
        y_tgt = np.array(target[self.performance], dtype=float)
        X_tgt_train, X_tgt_test, y_tgt_train, y_tgt_test = train_test_split(X_tgt,
                                                                            y_tgt,
                                                                            train_size=self.train_size,
                                                                            random_state=self.random_state)

        # the learning algorithm, training on the source input
        # NOTE(review): the model is fitted on all the configurations of the
        # source, test split included - presumably because the source input
        # is considered as entirely measured; to be confirmed
        lf = learning_algorithm()
        lf.fit(X_src, y_src)
        y_src_pred_test = np.array(lf.predict(X_src_test)).reshape(-1,1)

        # the shift function, to transfer the prediction from the source to the target
        shift = shift_function()
        shift.fit(np.array(y_src_train).reshape(-1,1), y_tgt_train)
        # the shift is applied to the predictions of the source model,
        # not to the measured performances of the source
        y_tgt_pred_test = shift.predict(y_src_pred_test)

        # we return the mean absolute percentage error
        # between the real values of y_test from target (the reference)
        # and the predictions shifted
        # np.argmin(y_tgt_pred_test) to predict the best config
        return mean_absolute_percentage_error(y_tgt_test, y_tgt_pred_test)
    

In [16]:
# defined above :  
# name_systems = ["nodejs", "poppler", "xz", "x264", "gcc", "lingeling", "sqlite", "imagemagick"]

# proportions of configurations used to train the transfer
prop_train_config = [0.1* k for k in range(1,10)]
# proportions of inputs available as potential sources
proportion_inputs = [0.1* k for k in range(1,10)]
# source selection methods, see MS.choose_best_source
methods = ['random', 'same_input_grp', 'closest_properties', 'max_perf_corr']
# number of runs per combination, each with a randomly chosen target
repetitions = 10

for ns in name_systems:

    # create the output directory before the (long) computation,
    # so the results are not lost at writing time if it is missing
    results_dir = '../results/'+ns
    os.makedirs(results_dir, exist_ok=True)

    for perf in inputs_perf[ns]:
        
        print(ns,perf)
        
        ms = MS(ns, perf)
        
        # (proportion of configs, method, proportion of inputs, repetition)
        # -> (mape, elapsed time in seconds)
        res = dict()

        for ptc in prop_train_config:
            for nbi in proportion_inputs:
                for m in methods:
                    for r in range(repetitions):
                        # we randomly chose the index of the target input
                        index_target = np.random.randint(inputs_count[ns])
                        # start timer
                        s = time()
                        val = ms.learn(target_id = index_target, 
                                       proportion_training_config = ptc, 
                                       method = m,
                                       prop_available_source_inputs = nbi)
                        # end timer
                        e = time()
                        res[ptc, m, nbi, r] = (val, e-s)

        # one row per run, in the order of the runs
        final_results = [key + value for key, value in res.items()]

        df = pd.DataFrame(final_results,
                          columns = ['proportion_train_config',  'source_selection_method', 'prop_inputs_source',
                                     'id repetition', 'mape', 'training_time'])
        df.to_csv(results_dir+'/MS_'+perf+'.csv')

nodejs ops
poppler size
poppler time
xz size
xz time
x264 size
x264 kbs
x264 fps
x264 etime
x264 cpu
gcc size
gcc ctime
gcc exec
lingeling conflicts
lingeling cps
lingeling reductions
sqlite q1
sqlite q2
sqlite q3
sqlite q4
sqlite q5
sqlite q6
sqlite q7
sqlite q8
sqlite q9
sqlite q10
sqlite q11
sqlite q12
sqlite q13
sqlite q14
sqlite q15
imagemagick size
imagemagick time


In [None]:
# display the raw results dictionary; `res` is re-created for every
# (system, performance) pair, so it only holds the runs of the last pair
res