# Load data 

#### First, we import some libraries

In [1]:
# for arrays
import numpy as np

# for dataframes
import pandas as pd

# plots
import matplotlib.pyplot as plt
# high-level plots
import seaborn as sns

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# machine learning library
# Principal Component Analysis - determine new axis for representing data
from sklearn.decomposition import PCA
# Random Forests -> vote between decision trees
# Gradient boosting -> instead of a vote, upgrade the same tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
# Decision Tree
from sklearn.tree import DecisionTreeRegressor, plot_tree
# To add interactions in linear regressions models
from sklearn.preprocessing import PolynomialFeatures
# ElasticNet is a hybrid method between Ridge and Lasso
from sklearn.linear_model import LinearRegression, ElasticNet
# To separate the data into training and test
from sklearn.model_selection import train_test_split
# Simple clustering (iterative steps)
from sklearn.cluster import KMeans
# get interactions of features
from sklearn.preprocessing import PolynomialFeatures


# we use it to interact with the file system
import os
# compute time
from time import time

# statistics
import scipy.stats as sc
# hierarchical clustering, clusters
from scipy.cluster.hierarchy import linkage, cut_tree, leaves_list
from scipy import stats
# statistical tests
from scipy.stats import mannwhitneyu

# Silence warnings for the rest of the session: filterwarnings("ignore") is
# called without any category/message/module restriction, so it matches
# every warning raised after this point.
# NOTE(review): this also hides deprecation warnings coming from the libraries
# imported above — consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")


from common import load_data, split_data

### Import data

In [2]:
# Directory handed to load_data. The path is relative, so it is resolved
# against the notebook's working directory.
data_dir = "../data/"

# load_data is the project helper imported from `common`; its result is indexed
# by the string keys listed below.
full_data = load_data(data_dir)

# Bind each entry of the bundle to a short module-level alias. The i-th name on
# the left receives the value stored under the i-th key on the right.
(
    data,
    inputs_perf,
    inputs_feat,
    inputs_categ,
    inputs_num,
    inputs_feat_cols,
    inputs_prop,
    inputs_name,
    inputs_count,
) = (
    full_data[key]
    for key in (
        "data",
        "performance_properties",
        "features",
        "features_categorical",
        "features_numerical",
        "feature_columns",
        "input_properties",
        "input_names",
        "input_counts",
    )
)

### Example of a dataframe

> **Usage** :  ```data["name_of_my_software_system", id_of_my_input]``` returns the dataframe of measurements for this software system on this input

In [3]:
# Example: measurements of the x264 system on its first input (input id 0).
# The key is the (system name, input id) pair.
data[("x264", 0)]

Unnamed: 0,8x8dct,aq-mode,bframes,cabac,chroma_qp_offset,fast_pskip,mbtree,me_range,mixed_ref,qpmax,...,scenecut_0,scenecut_40,scenecut_None,weightb_1,weightb_None,size,kbs,fps,etime,cpu
0,0,0,0,0,0,1,0,16,0,69,...,1,0,0,0,1,403085,161.07,375.22,2.14,434
1,1,1,8,1,0,1,1,16,1,69,...,1,0,0,1,0,234157,93.57,217.07,3.40,734
2,1,0,8,1,0,1,1,16,1,69,...,1,0,0,1,0,159836,63.87,293.42,2.71,739
3,1,0,8,1,0,1,1,16,1,69,...,1,0,0,1,0,163586,65.37,276.79,2.78,858
4,1,1,3,1,-2,1,0,24,1,69,...,0,1,0,1,0,218392,87.27,287.79,2.74,699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,1,1,3,1,0,1,1,16,1,69,...,0,1,0,1,0,183183,73.20,254.20,3.01,716
197,1,1,3,1,0,1,1,24,0,69,...,0,1,0,1,0,195730,78.21,269.38,2.88,644
198,1,1,3,1,-2,0,1,16,1,69,...,0,1,0,1,0,178146,71.19,263.64,2.92,927
199,1,1,3,0,-2,1,0,16,1,69,...,0,1,0,1,0,234018,93.51,267.21,2.89,685


### Separation training & validation

> **Usage** :  ```train["name_of_my_software_system", id_of_my_input, p]``` returns the dataframe of training measurements for this software system on this input when a fraction p of the measurements (p between 0.1 and 0.9, e.g. 0.1 for 10%) is used for training, and ```val["name_of_my_software_system", id_of_my_input, p]``` returns the dataframe of the remaining measurements, used for validation, for the same p

In [5]:
# Settings for this split: the software system to split and the seed value
# passed to split_data (presumably used there for reproducibility — the helper
# lives in `common`, confirm against its implementation).
system, random_seed = "gcc", 100

# split_data returns four values; only the first two (train and test
# measurements) are kept, the remaining two are discarded into `_`.
train_data, test_data, _, _ = split_data(data, system, inputs_count, inputs_feat_cols, random_seed)


In [6]:
# Test-side measurements produced by split_data for gcc, input id 1.
test_data[("gcc", 1)]

Unnamed: 0,-ffloat-store,-floop-interchange,-fno-asm,-fprefetch-loop-arrays,optim_-O0,optim_-O1,optim_-O2,optim_-Ofast,optim_-Og,size,ctime,exec
0,1,1,0,0,0,0,0,1,0,19112,0.17,9.196411
1,1,1,1,1,1,0,0,0,0,17752,0.069,37.248072
2,1,1,0,0,1,0,0,0,0,17752,0.072,37.099344
3,1,0,0,1,0,0,1,0,0,17552,0.123,9.234749
4,1,1,0,0,0,0,1,0,0,17552,0.171,8.988491
5,1,0,0,1,0,0,0,1,0,19112,0.139,9.675731
6,1,1,1,0,1,0,0,0,0,17752,0.067,35.000599
7,0,1,1,0,1,0,0,0,0,17752,0.07,34.973631


In [7]:
# Seed NumPy's global RNG so the shuffles below are repeatable:
# train_test_split is called without random_state, so it draws from that
# global generator. The seed reuses `random_seed` (set to 100 in the
# split_data cell above) instead of repeating the literal, so both splits stay
# in sync if the seed is ever changed.
np.random.seed(random_seed)

# Training fractions 0.1, 0.2, ..., 0.9. Each value is a single division, so it
# equals the corresponding float literal and can be looked up as e.g.
# train[..., 0.3].
valid_training_percentages = [(k + 1) / 10 for k in range(9)]

# Both dictionaries are keyed by (system name, input id, training fraction).
train = {}
val = {}

for dkey in data.keys():
    soft, input_id = dkey
    for p in valid_training_percentages:
        # A float train_size is the proportion of rows put in the training
        # part; the remaining rows form the validation part.
        X_train, X_test = train_test_split(data[soft, input_id], train_size=p)
        train[soft, input_id, p] = X_train
        val[soft, input_id, p] = X_test

In [8]:
# Training rows for nodejs, input id 0, with a training fraction of 0.1.
train["nodejs", 0, 0.1]

Unnamed: 0,--experimental-vm-modules,--experimental-wasm-modules,--jitless,--no-warnings,--node-memory-debug,--preserve-symlinks-main,ops
23,0,0,0,0,0,1,114.537842
39,0,1,1,0,0,1,113.147423
3,1,0,1,1,0,0,125.774115
24,1,1,1,1,0,1,116.343086
8,1,0,0,1,0,1,111.468678


In [9]:
# Validation rows for nodejs, input id 0, with a training fraction of 0.1.
val["nodejs", 0, 0.1]

Unnamed: 0,--experimental-vm-modules,--experimental-wasm-modules,--jitless,--no-warnings,--node-memory-debug,--preserve-symlinks-main,ops
6,0,1,1,1,0,0,113.887916
36,0,1,1,0,0,0,115.251092
37,0,1,1,0,1,0,115.647211
28,1,1,1,1,1,1,115.62557
43,0,1,0,0,0,0,109.957798
49,1,0,0,1,1,1,116.653152
5,0,0,1,0,1,1,120.98169
33,1,1,1,0,0,1,132.588184
20,1,1,1,0,0,0,116.514776
42,1,1,0,0,1,1,120.652824
