In [55]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import pandas as pd
import category_encoders as ce
import os
import pickle
import gc
from tqdm import tqdm
import pickle
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn import ensemble
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.decomposition import PCA


In [56]:
def encode_text_features(encode_decode, data_frame, encoder_isa=None, encoder_mem_type=None):
    """One-hot encode the 'isa' and 'mem-type' columns of ``data_frame``.

    Parameters
    ----------
    encode_decode : str
        ``'encode'`` creates and fits two new ``OneHotEncoder`` objects on
        ``data_frame`` (any encoders passed in are ignored). Any other value
        re-uses the encoders supplied by the caller without refitting them.
    data_frame : pandas.DataFrame
        Frame holding the 'isa' and 'mem-type' columns.
    encoder_isa, encoder_mem_type : fitted encoder or None
        Required whenever ``encode_decode`` is not ``'encode'``.

    Returns
    -------
    tuple
        ``(encoded_data_frame, encoder_isa, encoder_mem_type)``.

    Raises
    ------
    ValueError
        If an existing encoder is needed but was not supplied.
    """
    if encode_decode == 'encode':
        encoder_isa = ce.one_hot.OneHotEncoder(cols=['isa'])
        encoder_mem_type = ce.one_hot.OneHotEncoder(cols=['mem-type'])
        encoder_isa.fit(data_frame, verbose=1)
        isa_encoded = encoder_isa.transform(data_frame)
        # The mem-type encoder is fitted on the frame that already has 'isa' expanded.
        encoder_mem_type.fit(isa_encoded, verbose=1)
    else:
        # Fail with a clear message instead of an AttributeError on None.
        if encoder_isa is None or encoder_mem_type is None:
            raise ValueError(
                "encoder_isa and encoder_mem_type must be provided when "
                "encode_decode is not 'encode'"
            )
        isa_encoded = encoder_isa.transform(data_frame)

    encoded_data_frame = encoder_mem_type.transform(isa_encoded)
    return encoded_data_frame, encoder_isa, encoder_mem_type

In [57]:
def absolute_percentage_error(Y_test, Y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Samples whose true value is exactly zero contribute nothing to the sum but
    are still counted in the denominator ``len(Y_test)``.

    Parameters
    ----------
    Y_test : array-like
        True values, 1-D or column-shaped ``(n, 1)``.
    Y_pred : array-like
        Predicted values with the same number of elements as ``Y_test``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar when both inputs are 1-D; an array of shape ``(1,)`` when
        either input is a column vector (callers index it with ``[0]``).

    Raises
    ------
    ValueError
        If ``Y_test`` is empty or the inputs differ in size.
    """
    y_true = np.asarray(Y_test, dtype=float)
    y_pred = np.asarray(Y_pred, dtype=float)

    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("Y_test must contain at least one sample")
    if y_true.size != y_pred.size:
        raise ValueError("Y_test and Y_pred must contain the same number of elements")

    # Align a 1-D input with a column-shaped one so the subtraction is
    # element-wise instead of broadcasting (n, 1) against (n,) into (n, n).
    if y_true.ndim < y_pred.ndim:
        y_true = y_true.reshape(y_pred.shape)
    elif y_pred.shape != y_true.shape:
        y_pred = y_pred.reshape(y_true.shape)

    nonzero = y_true != 0
    ratios = np.zeros_like(y_true)
    # Divide by |y_true| so negative targets cannot yield negative "absolute" errors.
    np.divide(np.abs(y_true - y_pred), np.abs(y_true), out=ratios, where=nonzero)
    return ratios.sum(axis=0) / n_samples

In [58]:
def return_best_param(model, grid, X_train, Y_train):
    """Run a grid search over ``grid`` for ``model`` and return the refitted best estimator.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    grid : dict
        Parameter grid passed to ``GridSearchCV``.
    X_train, Y_train : array-like
        Training features and targets used for the search.

    Returns
    -------
    estimator
        ``best_estimator_`` of the search (available because ``refit=True``).
    """
    # Use a separate name so the ``grid`` argument is not shadowed by the search object.
    search = GridSearchCV(model, grid, refit=True, verbose=0)
    # Previously the *result* of fit() was wrapped in tqdm(), which tracks no
    # progress and only emitted a stray empty bar; fit directly instead.
    search.fit(X_train, Y_train)
    print('Found Best Parameters for this model', model)

    return search.best_estimator_

# Dataset 1: Qsort

In [97]:
def process_all_qsort(dataset_path, dataset_name,dataset_name_n,path_for_saving_data, val):
    """Train four regressors on simulated qsort data and score them on physical data.

    The training set is all simulated rows plus 10% of the physical rows; the
    remaining 90% of the physical rows form the test set. This function is
    nearly identical to the other ``process_all_*`` functions in this
    notebook; only the columns dropped from the physical frame differ.

    Parameters
    ----------
    dataset_path : str
        Prefix prepended to the dataset names to build the CSV paths.
    dataset_name : str
        File name (without ``.csv``) of the simulated dataset.
    dataset_name_n : str
        File name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Unused; kept so existing callers keep working.
    val : str
        Target column to predict (callers pass ``'runtime'`` or ``'power'``).

    Returns
    -------
    tuple of (list, list)
        ``(r2_scores, mape_scores)`` with one entry per model, in the order
        KNN, decision tree, random forest, extra trees.
    """
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path + dataset_name + '.csv')
    dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Each frame gets its own freshly fitted encoders; the encoders themselves are not needed later.
    encoded_data_frame, _, _ = encode_text_features('encode', df)
    encoded_data_frame_n, _, _ = encode_text_features('encode', dfn)

    total_data_n = encoded_data_frame_n.drop(columns = ['arch','mem-type_1','mem-type_2','isa_1', 'bus_speed','num-cpu'])
    total_data = encoded_data_frame.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4','isa_1',
                                                    'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    X_sim = total_data.drop(columns = ['runtime', 'power']).to_numpy()
    Y_sim = total_data[val].to_numpy()
    X_phy = total_data_n.drop(columns = ['runtime', 'power']).to_numpy()
    Y_phy = total_data_n[val].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Keep 10% of the physical data for training and hold out 90% for testing.
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size = 0.90, random_state = 0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.append(X_sim, X_train_phy, axis = 0)
    Y_train = np.append(Y_sim, Y_train_phy, axis = 0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Training data is scaled with statistics of the simulated rows only,
    # test data with statistics of the physical rows.
    # NOTE(review): scaler_X_phy is fitted on ALL physical rows (test rows
    # included) and scaler_Y_phy on the test targets themselves, and the
    # latter is used to map predictions back to original units. That leaks
    # test-set statistics into the evaluation; kept as-is so the reported
    # numbers stay comparable - confirm whether this is intended.
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(Y_sim.reshape(-1, 1))
    scaler_Y_phy = StandardScaler()

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(Y_train.reshape(-1, 1))
    # fit_transform makes the test targets the only data this scaler is fitted on.
    Y_test = scaler_Y_phy.fit_transform(Y_test.reshape(-1, 1))

    ################## Model evaluation ######################
    # Grid search via return_best_param is currently disabled, so the
    # estimators run with default hyper-parameters. The stochastic ones are
    # seeded (same seed as the train/test split) to make reruns reproducible.
    models = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor(random_state=0)),
        ('best_rf', RandomForestRegressor(random_state=0)),
        ('best_etr', ExtraTreesRegressor(random_state=0)),
    ]

    # Ground truth in original units; identical for every model, so compute it once.
    Y_test_fold = scaler_Y_phy.inverse_transform(Y_test)

    r2_scores = []
    mape_scores = []
    for k, (model_name, model) in enumerate(models, start=1):
        print('Running model number:', k, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        # Estimators expect a 1-D target; ravel avoids column-vector warnings.
        model.fit(X_train, Y_train.ravel())
        # predict() may return 1-D output; reshape to a column so the scaler
        # always receives the 2-D layout it was fitted on.
        Y_pred_fold = model.predict(X_test).reshape(-1, 1)
        Y_pred_fold = scaler_Y_phy.inverse_transform(Y_pred_fold)

        r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
        mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))

    return r2_scores, mape_scores
                       


In [98]:
# Run the qsort experiment for both targets and store a per-model summary as CSV.
dataset_name_n = 'qsort_physical'
dataset_name = 'qsort_simulated'
# NOTE(review): hard-coded, Windows-style absolute path; adjust for the local setup.
dataset_path = '\\ALL_CSV\\Dataset\\'
path_for_saving_data = dataset_name
r2_runtime, mape_runtime = process_all_qsort(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val = 'runtime')
r2_power, mape_power = process_all_qsort(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val = 'power')

best_models_name = [ 'best_knn', 'best_dt', 'best_rf', 'best_etr']

# One R2 per model: the mean of its runtime R2 and its power R2.
r2 = [np.mean(pair) for pair in zip(r2_runtime, r2_power)]

# Build all rows first and create the frame once; DataFrame.append was
# removed in pandas 2.0 and was quadratic when used in a loop.
# np.ravel(...)[0] accepts both scalar and shape-(1,) MAPE values.
records = [
    {'model_name': name,
     'r2': score,
     'mape_runtime': np.ravel(m_runtime)[0],
     'mape_power': np.ravel(m_power)[0]}
    for name, score, m_runtime, m_power in zip(best_models_name, r2, mape_runtime, mape_power)
]
df = pd.DataFrame(records, columns = ['model_name','r2', 'mape_runtime', 'mape_power'])
print(df)
df.to_csv('result_univariate_qsort' + '.csv')

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(2730, 17) (672, 17) (2730,) (672,)
(67, 17) (605, 17) (67,) (605,)
(2797, 17) (2797,) (605, 17) (605,)
(2797, 17) (605, 17) (2797,) (605,)
Running model number: 1 with Model Name:  best_knn
(2797, 17) (605, 17) (2797, 1) (605, 1)
Running model number: 2 with Model Name:  best_dt
(2797, 17) (605, 17) (2797, 1) (605, 1)
Running



Running model number: 4 with Model Name:  best_etr
(2797, 17) (605, 17) (2797, 1) (605, 1)




Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(2730, 17) (672, 17) (2730,) (672,)
(67, 17) (605, 17) (67,) (605,)
(2797, 17) (2797,) (605, 17) (605,)
(2797, 17) (605, 17) (2797,) (605,)
Running model number: 1 with Model Name:  best_knn
(2797, 17) (605, 17) (2797, 1) (605, 1)
Running model number: 2 with Model Name:  best_dt
(2797, 17) (605, 17) (2797, 1) (605, 1)
Running



Running model number: 4 with Model Name:  best_etr
(2797, 17) (605, 17) (2797, 1) (605, 1)




  model_name        r2  mape_runtime  mape_power
0   best_knn  0.202976      0.388767    1.007209
1    best_dt -0.187240      0.420365    0.902488
2    best_rf -0.226083      0.420755    0.992843
3   best_etr  0.355351      0.309951    0.809528


# Dataset 2: Dijkstra

In [99]:
def process_all_dijkstra(dataset_path, dataset_name,dataset_name_n,path_for_saving_data, val):
    """Train four regressors on simulated dijkstra data and score them on physical data.

    The training set is all simulated rows plus 10% of the physical rows; the
    remaining 90% of the physical rows form the test set. This function is
    nearly identical to the other ``process_all_*`` functions in this
    notebook; only the columns dropped from the physical frame differ.

    Parameters
    ----------
    dataset_path : str
        Prefix prepended to the dataset names to build the CSV paths.
    dataset_name : str
        File name (without ``.csv``) of the simulated dataset.
    dataset_name_n : str
        File name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Unused; kept so existing callers keep working.
    val : str
        Target column to predict (callers pass ``'runtime'`` or ``'power'``).

    Returns
    -------
    tuple of (list, list)
        ``(r2_scores, mape_scores)`` with one entry per model, in the order
        KNN, decision tree, random forest, extra trees.
    """
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path + dataset_name + '.csv')
    dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Each frame gets its own freshly fitted encoders; the encoders themselves are not needed later.
    encoded_data_frame, _, _ = encode_text_features('encode', df)
    encoded_data_frame_n, _, _ = encode_text_features('encode', dfn)

    total_data_n = encoded_data_frame_n.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4',
                                                        'isa_1','isa_2' ,'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4','isa_1',
                                                    'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    X_sim = total_data.drop(columns = ['runtime', 'power']).to_numpy()
    Y_sim = total_data[val].to_numpy()
    X_phy = total_data_n.drop(columns = ['runtime', 'power']).to_numpy()
    Y_phy = total_data_n[val].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Keep 10% of the physical data for training and hold out 90% for testing.
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size = 0.90, random_state = 0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.append(X_sim, X_train_phy, axis = 0)
    Y_train = np.append(Y_sim, Y_train_phy, axis = 0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Training data is scaled with statistics of the simulated rows only,
    # test data with statistics of the physical rows.
    # NOTE(review): scaler_X_phy is fitted on ALL physical rows (test rows
    # included) and scaler_Y_phy on the test targets themselves, and the
    # latter is used to map predictions back to original units. That leaks
    # test-set statistics into the evaluation; kept as-is so the reported
    # numbers stay comparable - confirm whether this is intended.
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(Y_sim.reshape(-1, 1))
    scaler_Y_phy = StandardScaler()

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(Y_train.reshape(-1, 1))
    # fit_transform makes the test targets the only data this scaler is fitted on.
    Y_test = scaler_Y_phy.fit_transform(Y_test.reshape(-1, 1))

    ################## Model evaluation ######################
    # Grid search via return_best_param is currently disabled, so the
    # estimators run with default hyper-parameters. The stochastic ones are
    # seeded (same seed as the train/test split) to make reruns reproducible.
    models = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor(random_state=0)),
        ('best_rf', RandomForestRegressor(random_state=0)),
        ('best_etr', ExtraTreesRegressor(random_state=0)),
    ]

    # Ground truth in original units; identical for every model, so compute it once.
    Y_test_fold = scaler_Y_phy.inverse_transform(Y_test)

    r2_scores = []
    mape_scores = []
    for k, (model_name, model) in enumerate(models, start=1):
        print('Running model number:', k, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        # Estimators expect a 1-D target; ravel avoids column-vector warnings.
        model.fit(X_train, Y_train.ravel())
        # predict() may return 1-D output; reshape to a column so the scaler
        # always receives the 2-D layout it was fitted on.
        Y_pred_fold = model.predict(X_test).reshape(-1, 1)
        Y_pred_fold = scaler_Y_phy.inverse_transform(Y_pred_fold)

        r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
        mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))

    return r2_scores, mape_scores
                       


In [100]:
# Run the dijkstra experiment for both targets and store a per-model summary as CSV.
dataset_name_n = 'dijkstra_physical'
dataset_name = 'dijkstra_simulated'
# NOTE(review): hard-coded, Windows-style absolute path; adjust for the local setup.
dataset_path = '\\ALL_CSV\\Dataset\\'
path_for_saving_data = dataset_name
r2_runtime, mape_runtime = process_all_dijkstra(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val = 'runtime')
r2_power, mape_power = process_all_dijkstra(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val = 'power')

best_models_name = [ 'best_knn', 'best_dt', 'best_rf', 'best_etr']

# One R2 per model: the mean of its runtime R2 and its power R2.
r2 = [np.mean(pair) for pair in zip(r2_runtime, r2_power)]

# Build all rows first and create the frame once; DataFrame.append was
# removed in pandas 2.0 and was quadratic when used in a loop.
# np.ravel(...)[0] accepts both scalar and shape-(1,) MAPE values.
records = [
    {'model_name': name,
     'r2': score,
     'mape_runtime': np.ravel(m_runtime)[0],
     'mape_power': np.ravel(m_power)[0]}
    for name, score, m_runtime, m_power in zip(best_models_name, r2, mape_runtime, mape_power)
]
df = pd.DataFrame(records, columns = ['model_name','r2', 'mape_runtime', 'mape_power'])
print(df)
df.to_csv('result_univariate_dijkstra' + '.csv')

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(362, 16) (52, 16) (362,) (52,)
(5, 16) (47, 16) (5,) (47,)
(367, 16) (367,) (47, 16) (47,)
(367, 16) (47, 16) (367,) (47,)
Running model number: 1 with Model Name:  best_knn
(367, 16) (47, 16) (367, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(367, 16) (47, 16) (367, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(367, 1



Running model number: 4 with Model Name:  best_etr
(367, 16) (47, 16) (367, 1) (47, 1)




Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(362, 16) (52, 16) (362,) (52,)
(5, 16) (47, 16) (5,) (47,)
(367, 16) (367,) (47, 16) (47,)
(367, 16) (47, 16) (367,) (47,)
Running model number: 1 with Model Name:  best_knn
(367, 16) (47, 16) (367, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(367, 16) (47, 16) (367, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(367, 1



Running model number: 4 with Model Name:  best_etr
(367, 16) (47, 16) (367, 1) (47, 1)




  model_name        r2  mape_runtime  mape_power
0   best_knn  0.026137      0.149727    0.568331
1    best_dt  0.446164      0.147622    0.177530
2    best_rf  0.493021      0.145920    0.167562
3   best_etr  0.234186      0.136657    0.378743


# Dataset 3: Matmul

In [101]:
def process_all_matmul(dataset_path, dataset_name,dataset_name_n,path_for_saving_data, val):
    """Train four regressors on simulated matmul data and score them on physical data.

    The training set is all simulated rows plus 10% of the physical rows; the
    remaining 90% of the physical rows form the test set. This function is
    nearly identical to the other ``process_all_*`` functions in this
    notebook; only the columns dropped from the physical frame differ.

    Parameters
    ----------
    dataset_path : str
        Prefix prepended to the dataset names to build the CSV paths.
    dataset_name : str
        File name (without ``.csv``) of the simulated dataset.
    dataset_name_n : str
        File name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Unused; kept so existing callers keep working.
    val : str
        Target column to predict (callers pass ``'runtime'`` or ``'power'``).

    Returns
    -------
    tuple of (list, list)
        ``(r2_scores, mape_scores)`` with one entry per model, in the order
        KNN, decision tree, random forest, extra trees.
    """
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path + dataset_name + '.csv')
    dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Each frame gets its own freshly fitted encoders; the encoders themselves are not needed later.
    encoded_data_frame, _, _ = encode_text_features('encode', df)
    encoded_data_frame_n, _, _ = encode_text_features('encode', dfn)

    total_data_n = encoded_data_frame_n.drop(columns = ['arch','mem-type_1','mem-type_2','isa_1', 'bus_speed','num-cpu'])
    total_data = encoded_data_frame.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4','isa_1',
                                                    'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    X_sim = total_data.drop(columns = ['runtime', 'power']).to_numpy()
    Y_sim = total_data[val].to_numpy()
    X_phy = total_data_n.drop(columns = ['runtime', 'power']).to_numpy()
    Y_phy = total_data_n[val].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Keep 10% of the physical data for training and hold out 90% for testing.
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size = 0.90, random_state = 0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.append(X_sim, X_train_phy, axis = 0)
    Y_train = np.append(Y_sim, Y_train_phy, axis = 0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Training data is scaled with statistics of the simulated rows only,
    # test data with statistics of the physical rows.
    # NOTE(review): scaler_X_phy is fitted on ALL physical rows (test rows
    # included) and scaler_Y_phy on the test targets themselves, and the
    # latter is used to map predictions back to original units. That leaks
    # test-set statistics into the evaluation; kept as-is so the reported
    # numbers stay comparable - confirm whether this is intended.
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(Y_sim.reshape(-1, 1))
    scaler_Y_phy = StandardScaler()

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(Y_train.reshape(-1, 1))
    # fit_transform makes the test targets the only data this scaler is fitted on.
    Y_test = scaler_Y_phy.fit_transform(Y_test.reshape(-1, 1))

    ################## Model evaluation ######################
    # Grid search via return_best_param is currently disabled, so the
    # estimators run with default hyper-parameters. The stochastic ones are
    # seeded (same seed as the train/test split) to make reruns reproducible.
    models = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor(random_state=0)),
        ('best_rf', RandomForestRegressor(random_state=0)),
        ('best_etr', ExtraTreesRegressor(random_state=0)),
    ]

    # Ground truth in original units; identical for every model, so compute it once.
    Y_test_fold = scaler_Y_phy.inverse_transform(Y_test)

    r2_scores = []
    mape_scores = []
    for k, (model_name, model) in enumerate(models, start=1):
        print('Running model number:', k, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        # Estimators expect a 1-D target; ravel avoids column-vector warnings.
        model.fit(X_train, Y_train.ravel())
        # predict() may return 1-D output; reshape to a column so the scaler
        # always receives the 2-D layout it was fitted on.
        Y_pred_fold = model.predict(X_test).reshape(-1, 1)
        Y_pred_fold = scaler_Y_phy.inverse_transform(Y_pred_fold)

        r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
        mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))

    return r2_scores, mape_scores
                       


In [102]:
# Run the matmul experiment for both targets and store a per-model summary as CSV.
dataset_name_n = 'matmul_physical'
dataset_name = 'matmul_simulated'
# NOTE(review): hard-coded, Windows-style absolute path; adjust for the local setup.
dataset_path = '\\ALL_CSV\\Dataset\\'
path_for_saving_data = dataset_name
r2_runtime, mape_runtime = process_all_matmul(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val = 'runtime')
r2_power, mape_power = process_all_matmul(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val = 'power')

best_models_name = [ 'best_knn', 'best_dt', 'best_rf', 'best_etr']

# One R2 per model: the mean of its runtime R2 and its power R2.
r2 = [np.mean(pair) for pair in zip(r2_runtime, r2_power)]

# Build all rows first and create the frame once; DataFrame.append was
# removed in pandas 2.0 and was quadratic when used in a loop.
# np.ravel(...)[0] accepts both scalar and shape-(1,) MAPE values.
records = [
    {'model_name': name,
     'r2': score,
     'mape_runtime': np.ravel(m_runtime)[0],
     'mape_power': np.ravel(m_power)[0]}
    for name, score, m_runtime, m_power in zip(best_models_name, r2, mape_runtime, mape_power)
]
df = pd.DataFrame(records, columns = ['model_name','r2', 'mape_runtime', 'mape_power'])
print(df)
df.to_csv('result_univariate_matmul' + '.csv')

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(1780, 17) (519, 17) (1780,) (519,)
(51, 17) (468, 17) (51,) (468,)
(1831, 17) (1831,) (468, 17) (468,)
(1831, 17) (468, 17) (1831,) (468,)
Running model number: 1 with Model Name:  best_knn
(1831, 17) (468, 17) (1831, 1) (468, 1)
Running model number: 2 with Model Name:  best_dt
(1831, 17) (468, 17) (1831, 1) (468, 1)
Running



Running model number: 4 with Model Name:  best_etr
(1831, 17) (468, 17) (1831, 1) (468, 1)




Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(1780, 17) (519, 17) (1780,) (519,)
(51, 17) (468, 17) (51,) (468,)
(1831, 17) (1831,) (468, 17) (468,)
(1831, 17) (468, 17) (1831,) (468,)
Running model number: 1 with Model Name:  best_knn
(1831, 17) (468, 17) (1831, 1) (468, 1)
Running model number: 2 with Model Name:  best_dt
(1831, 17) (468, 17) (1831, 1) (468, 1)
Running



Running model number: 4 with Model Name:  best_etr
(1831, 17) (468, 17) (1831, 1) (468, 1)




  model_name        r2  mape_runtime  mape_power
0   best_knn  0.043162      1.710582    0.882659
1    best_dt -0.248243      1.664266    0.729765
2    best_rf -0.233999      1.675748    0.733846
3   best_etr  0.099135      1.701964    0.634588


# Dataset 4: Tracking

In [103]:
def process_all_tracking(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val):
    """Fit four default regressors on simulated (+10% physical) data and score them on physical data.

    Parameters
    ----------
    dataset_path : str
        Prefix concatenated in front of the dataset names to build the CSV paths.
    dataset_name : str
        Base name (without ``.csv``) of the simulated dataset.
    dataset_name_n : str
        Base name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Not used by this function; kept so existing callers keep working.
    val : str
        Name of the target column selected from both frames
        (the callers in this notebook pass ``'runtime'`` or ``'power'``).

    Returns
    -------
    (r2_scores, mape_scores) : tuple of lists
        One entry per model, in the order KNN, decision tree, random forest,
        extra trees. ``r2_scores`` holds ``r2_score`` values; ``mape_scores``
        holds whatever ``absolute_percentage_error`` returns for each model.
    """

    ################## Data Preprocessing ######################

    df = pd.read_csv(dataset_path + dataset_name + '.csv')
    dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Only the encoded frames are used below; the returned encoders are discarded.
    encoded_data_frame, _, _ = encode_text_features('encode', df,
                                                    encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features('encode', dfn,
                                                      encoder_isa=None, encoder_mem_type=None)

    # Drop the architecture label and the one-hot columns so that both frames
    # end up with the same numeric feature columns (see the printed Index objects).
    total_data_n = encoded_data_frame_n.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                      'isa_1', 'isa_2', 'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                  'isa_1', 'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    X_sim = total_data.drop(columns=['runtime', 'power']).to_numpy()
    Y_sim = total_data[val].to_numpy()
    X_phy = total_data_n.drop(columns=['runtime', 'power']).to_numpy()
    Y_phy = total_data_n[val].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% (added to training) and 90% (held out for testing)
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size=0.90, random_state=0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.concatenate([X_sim, X_train_phy], axis=0)
    Y_train = np.concatenate([Y_sim, Y_train_phy], axis=0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Training rows are standardised with statistics of the simulated set,
    # test rows with statistics of the full physical set.
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(np.reshape(Y_sim, (len(Y_sim), 1)))
    scaler_Y_phy = StandardScaler()

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(np.reshape(Y_train, (len(Y_train), 1)))
    # NOTE(review): the physical target scaler is fitted on the held-out test
    # targets, and predictions of models trained on sim-scaled targets are mapped
    # back with it. Kept as in the original analysis -- confirm this is intended.
    Y_test = scaler_Y_phy.fit_transform(np.reshape(Y_test, (len(Y_test), 1)))

    ################## Models ######################
    # Default hyper-parameters are used (no grid search). The tree-based models
    # are seeded so that repeated runs produce the same table.
    best_models = [KNeighborsRegressor(),
                   DecisionTreeRegressor(random_state=0),
                   RandomForestRegressor(random_state=0),
                   ExtraTreesRegressor(random_state=0)]
    best_models_name = ['best_knn', 'best_dt', 'best_rf', 'best_etr']

    r2_scores = []
    mape_scores = []
    for k, (model_name, model) in enumerate(zip(best_models_name, best_models)):
        print('Running model number:', k+1, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        # Fit on a 1-D target: the forest estimators warn when handed a column vector.
        model.fit(X_train, Y_train.ravel())
        # Force predictions into an (n, 1) column so that every model is
        # inverse-scaled and scored with the same shape as Y_test.
        Y_pred_fold = np.reshape(model.predict(X_test), (-1, 1))
        Y_test_fold = scaler_Y_phy.inverse_transform(Y_test)
        Y_pred_fold = scaler_Y_phy.inverse_transform(Y_pred_fold)

        r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
        mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))

    return r2_scores, mape_scores
                       


In [104]:
# Evaluate the four regressors on the tracking benchmark, once per target,
# then tabulate and save the scores.
dataset_name_n = 'tracking_physical'
dataset_name = 'tracking_simulated'
dataset_path = '\\ALL_CSV\\Dataset\\'
path_for_saving_data = dataset_name
r2_runtime, mape_runtime = process_all_tracking(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val='runtime')
r2_power, mape_power = process_all_tracking(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val='power')

best_models_name = ['best_knn', 'best_dt', 'best_rf', 'best_etr']

# The reported r2 is the mean of the runtime and power R^2 of each model.
r2 = [np.mean([r2_runtime[i], r2_power[i]]) for i in range(len(best_models_name))]

# Build every row first and create the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copies the frame on every call.
rows = [{'model_name': best_models_name[k],
         'r2': r2[k], 'mape_runtime': mape_runtime[k][0], 'mape_power': mape_power[k][0]}
        for k in range(len(best_models_name))]
df = pd.DataFrame(rows, columns=['model_name', 'r2', 'mape_runtime', 'mape_power'])
print(df)
df.to_csv('result_univariate_tracking' + '.csv')

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(425, 16) (52, 16) (425,) (52,)
(5, 16) (47, 16) (5,) (47,)
(430, 16) (430,) (47, 16) (47,)
(430, 16) (47, 16) (430,) (47,)
Running model number: 1 with Model Name:  best_knn
(430, 16) (47, 16) (430, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(430, 16) (47, 16) (430, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(430, 1



Running model number: 4 with Model Name:  best_etr
(430, 16) (47, 16) (430, 1) (47, 1)




Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(425, 16) (52, 16) (425,) (52,)
(5, 16) (47, 16) (5,) (47,)
(430, 16) (430,) (47, 16) (47,)
(430, 16) (47, 16) (430,) (47,)
Running model number: 1 with Model Name:  best_knn
(430, 16) (47, 16) (430, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(430, 16) (47, 16) (430, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(430, 1



Running model number: 4 with Model Name:  best_etr
(430, 16) (47, 16) (430, 1) (47, 1)




  model_name        r2  mape_runtime  mape_power
0   best_knn -0.069951      0.161222    0.621600
1    best_dt  0.317745      0.146274    0.502543
2    best_rf  0.352084      0.152910    0.452282
3   best_etr  0.338626      0.152532    0.423415


# SVM 

In [105]:
def process_all_svm(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val):
    """Fit four default regressors on simulated (+10% physical) data and score them on physical data.

    Parameters
    ----------
    dataset_path : str
        Prefix concatenated in front of the dataset names to build the CSV paths.
    dataset_name : str
        Base name (without ``.csv``) of the simulated dataset.
    dataset_name_n : str
        Base name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Not used by this function; kept so existing callers keep working.
    val : str
        Name of the target column selected from both frames
        (the callers in this notebook pass ``'runtime'`` or ``'power'``).

    Returns
    -------
    (r2_scores, mape_scores) : tuple of lists
        One entry per model, in the order KNN, decision tree, random forest,
        extra trees. ``r2_scores`` holds ``r2_score`` values; ``mape_scores``
        holds whatever ``absolute_percentage_error`` returns for each model.
    """

    ################## Data Preprocessing ######################

    df = pd.read_csv(dataset_path + dataset_name + '.csv')
    dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Only the encoded frames are used below; the returned encoders are discarded.
    encoded_data_frame, _, _ = encode_text_features('encode', df,
                                                    encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features('encode', dfn,
                                                      encoder_isa=None, encoder_mem_type=None)

    # Drop the architecture label and the one-hot columns so that both frames
    # end up with the same numeric feature columns (see the printed Index objects).
    total_data_n = encoded_data_frame_n.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                      'isa_1', 'isa_2', 'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                  'isa_1', 'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    X_sim = total_data.drop(columns=['runtime', 'power']).to_numpy()
    Y_sim = total_data[val].to_numpy()
    X_phy = total_data_n.drop(columns=['runtime', 'power']).to_numpy()
    Y_phy = total_data_n[val].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% (added to training) and 90% (held out for testing)
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size=0.90, random_state=0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.concatenate([X_sim, X_train_phy], axis=0)
    Y_train = np.concatenate([Y_sim, Y_train_phy], axis=0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Training rows are standardised with statistics of the simulated set,
    # test rows with statistics of the full physical set.
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(np.reshape(Y_sim, (len(Y_sim), 1)))
    scaler_Y_phy = StandardScaler()

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(np.reshape(Y_train, (len(Y_train), 1)))
    # NOTE(review): the physical target scaler is fitted on the held-out test
    # targets, and predictions of models trained on sim-scaled targets are mapped
    # back with it. Kept as in the original analysis -- confirm this is intended.
    Y_test = scaler_Y_phy.fit_transform(np.reshape(Y_test, (len(Y_test), 1)))

    ################## Models ######################
    # Default hyper-parameters are used (no grid search). The tree-based models
    # are seeded so that repeated runs produce the same table.
    best_models = [KNeighborsRegressor(),
                   DecisionTreeRegressor(random_state=0),
                   RandomForestRegressor(random_state=0),
                   ExtraTreesRegressor(random_state=0)]
    best_models_name = ['best_knn', 'best_dt', 'best_rf', 'best_etr']

    r2_scores = []
    mape_scores = []
    for k, (model_name, model) in enumerate(zip(best_models_name, best_models)):
        print('Running model number:', k+1, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        # Fit on a 1-D target: the forest estimators warn when handed a column vector.
        model.fit(X_train, Y_train.ravel())
        # Force predictions into an (n, 1) column so that every model is
        # inverse-scaled and scored with the same shape as Y_test.
        Y_pred_fold = np.reshape(model.predict(X_test), (-1, 1))
        Y_test_fold = scaler_Y_phy.inverse_transform(Y_test)
        Y_pred_fold = scaler_Y_phy.inverse_transform(Y_pred_fold)

        r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
        mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))

    return r2_scores, mape_scores
                       


In [106]:
# Evaluate the four regressors on the svm benchmark, once per target,
# then tabulate and save the scores.
dataset_name_n = 'svm_physical'
dataset_name = 'svm_simulated'
dataset_path = '\\ALL_CSV\\Dataset\\'
path_for_saving_data = dataset_name
r2_runtime, mape_runtime = process_all_svm(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val='runtime')
r2_power, mape_power = process_all_svm(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val='power')

best_models_name = ['best_knn', 'best_dt', 'best_rf', 'best_etr']

# The reported r2 is the mean of the runtime and power R^2 of each model.
r2 = [np.mean([r2_runtime[i], r2_power[i]]) for i in range(len(best_models_name))]

# Build every row first and create the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copies the frame on every call.
rows = [{'model_name': best_models_name[k],
         'r2': r2[k], 'mape_runtime': mape_runtime[k][0], 'mape_power': mape_power[k][0]}
        for k in range(len(best_models_name))]
df = pd.DataFrame(rows, columns=['model_name', 'r2', 'mape_runtime', 'mape_power'])
print(df)
df.to_csv('result_univariate_svm' + '.csv')

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(390, 16) (52, 16) (390,) (52,)
(5, 16) (47, 16) (5,) (47,)
(395, 16) (395,) (47, 16) (47,)
(395, 16) (47, 16) (395,) (47,)
Running model number: 1 with Model Name:  best_knn
(395, 16) (47, 16) (395, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(395, 16) (47, 16) (395, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(395, 1



Running model number: 4 with Model Name:  best_etr
(395, 16) (47, 16) (395, 1) (47, 1)




Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(390, 16) (52, 16) (390,) (52,)
(5, 16) (47, 16) (5,) (47,)
(395, 16) (395,) (47, 16) (47,)
(395, 16) (47, 16) (395,) (47,)
Running model number: 1 with Model Name:  best_knn
(395, 16) (47, 16) (395, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(395, 16) (47, 16) (395, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(395, 1



Running model number: 4 with Model Name:  best_etr
(395, 16) (47, 16) (395, 1) (47, 1)




  model_name        r2  mape_runtime  mape_power
0   best_knn -0.522287      0.111705    0.579131
1    best_dt  0.051923      0.120993    0.463564
2    best_rf  0.173163      0.120499    0.284424
3   best_etr  0.027684      0.108803    0.372807


# Montecarlo

In [107]:
def process_all_montecarlo(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val):
    """Fit four default regressors on simulated (+10% physical) data and score them on physical data.

    Parameters
    ----------
    dataset_path : str
        Prefix concatenated in front of the dataset names to build the CSV paths.
    dataset_name : str
        Base name (without ``.csv``) of the simulated dataset.
    dataset_name_n : str
        Base name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Not used by this function; kept so existing callers keep working.
    val : str
        Name of the target column selected from both frames
        (the callers in this notebook pass ``'runtime'`` or ``'power'``).

    Returns
    -------
    (r2_scores, mape_scores) : tuple of lists
        One entry per model, in the order KNN, decision tree, random forest,
        extra trees. ``r2_scores`` holds ``r2_score`` values; ``mape_scores``
        holds whatever ``absolute_percentage_error`` returns for each model.
    """

    ################## Data Preprocessing ######################

    df = pd.read_csv(dataset_path + dataset_name + '.csv')
    dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Only the encoded frames are used below; the returned encoders are discarded.
    encoded_data_frame, _, _ = encode_text_features('encode', df,
                                                    encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features('encode', dfn,
                                                      encoder_isa=None, encoder_mem_type=None)

    # Drop the architecture label and the one-hot columns so that both frames
    # end up with the same numeric feature columns (see the printed Index objects).
    # The physical frame of this benchmark drops a shorter list of one-hot columns
    # than the simulated frame.
    total_data_n = encoded_data_frame_n.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'isa_1', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                  'isa_1', 'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    X_sim = total_data.drop(columns=['runtime', 'power']).to_numpy()
    Y_sim = total_data[val].to_numpy()
    X_phy = total_data_n.drop(columns=['runtime', 'power']).to_numpy()
    Y_phy = total_data_n[val].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% (added to training) and 90% (held out for testing)
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size=0.90, random_state=0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.concatenate([X_sim, X_train_phy], axis=0)
    Y_train = np.concatenate([Y_sim, Y_train_phy], axis=0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Training rows are standardised with statistics of the simulated set,
    # test rows with statistics of the full physical set.
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(np.reshape(Y_sim, (len(Y_sim), 1)))
    scaler_Y_phy = StandardScaler()

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(np.reshape(Y_train, (len(Y_train), 1)))
    # NOTE(review): the physical target scaler is fitted on the held-out test
    # targets, and predictions of models trained on sim-scaled targets are mapped
    # back with it. Kept as in the original analysis -- confirm this is intended.
    Y_test = scaler_Y_phy.fit_transform(np.reshape(Y_test, (len(Y_test), 1)))

    ################## Models ######################
    # Default hyper-parameters are used (no grid search). The tree-based models
    # are seeded so that repeated runs produce the same table.
    best_models = [KNeighborsRegressor(),
                   DecisionTreeRegressor(random_state=0),
                   RandomForestRegressor(random_state=0),
                   ExtraTreesRegressor(random_state=0)]
    best_models_name = ['best_knn', 'best_dt', 'best_rf', 'best_etr']

    r2_scores = []
    mape_scores = []
    for k, (model_name, model) in enumerate(zip(best_models_name, best_models)):
        print('Running model number:', k+1, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        # Fit on a 1-D target: the forest estimators warn when handed a column vector.
        model.fit(X_train, Y_train.ravel())
        # Force predictions into an (n, 1) column so that every model is
        # inverse-scaled and scored with the same shape as Y_test.
        Y_pred_fold = np.reshape(model.predict(X_test), (-1, 1))
        Y_test_fold = scaler_Y_phy.inverse_transform(Y_test)
        Y_pred_fold = scaler_Y_phy.inverse_transform(Y_pred_fold)

        r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
        mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))

    return r2_scores, mape_scores
                       


In [108]:
# Evaluate the four regressors on the montecarlo benchmark, once per target,
# then tabulate and save the scores.
dataset_name_n = 'montecarlocalcpi_physical'
dataset_name = 'montecarlocalcpi_simulated'
dataset_path = '\\ALL_CSV\\Dataset\\'
path_for_saving_data = dataset_name
r2_runtime, mape_runtime = process_all_montecarlo(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val='runtime')
r2_power, mape_power = process_all_montecarlo(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val='power')

best_models_name = ['best_knn', 'best_dt', 'best_rf', 'best_etr']

# The reported r2 is the mean of the runtime and power R^2 of each model.
r2 = [np.mean([r2_runtime[i], r2_power[i]]) for i in range(len(best_models_name))]

# Build every row first and create the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copies the frame on every call.
rows = [{'model_name': best_models_name[k],
         'r2': r2[k], 'mape_runtime': mape_runtime[k][0], 'mape_power': mape_power[k][0]}
        for k in range(len(best_models_name))]
df = pd.DataFrame(rows, columns=['model_name', 'r2', 'mape_runtime', 'mape_power'])
print(df)
df.to_csv('result_univariate_montecarlo' + '.csv')

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(1365, 17) (260, 17) (1365,) (260,)
(26, 17) (234, 17) (26,) (234,)
(1391, 17) (1391,) (234, 17) (234,)
(1391, 17) (234, 17) (1391,) (234,)
Running model number: 1 with Model Name:  best_knn
(1391, 17) (234, 17) (1391, 1) (234, 1)
Running model number: 2 with Model Name:  best_dt
(1391, 17) (234, 17) (1391, 1) (234, 1)
Running



Running model number: 4 with Model Name:  best_etr
(1391, 17) (234, 17) (1391, 1) (234, 1)




Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(1365, 17) (260, 17) (1365,) (260,)
(26, 17) (234, 17) (26,) (234,)
(1391, 17) (1391,) (234, 17) (234,)
(1391, 17) (234, 17) (1391,) (234,)
Running model number: 1 with Model Name:  best_knn
(1391, 17) (234, 17) (1391, 1) (234, 1)
Running model number: 2 with Model Name:  best_dt
(1391, 17) (234, 17) (1391, 1) (234, 1)
Running



Running model number: 4 with Model Name:  best_etr
(1391, 17) (234, 17) (1391, 1) (234, 1)




  model_name         r2  mape_runtime  mape_power
0   best_knn   0.010851      2.475787    0.743036
1    best_dt  -0.173435      1.925371    0.633345
2    best_rf  -0.191443      2.233268    0.660251
3   best_etr -19.602992      5.211828    0.595247


# MSER

In [109]:
def process_all_mser(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val):
    """Fit four default regressors on simulated (+10% physical) data and score them on physical data.

    Parameters
    ----------
    dataset_path : str
        Prefix concatenated in front of the dataset names to build the CSV paths.
    dataset_name : str
        Base name (without ``.csv``) of the simulated dataset.
    dataset_name_n : str
        Base name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Not used by this function; kept so existing callers keep working.
    val : str
        Name of the target column selected from both frames
        (the callers in this notebook pass ``'runtime'`` or ``'power'``).

    Returns
    -------
    (r2_scores, mape_scores) : tuple of lists
        One entry per model, in the order KNN, decision tree, random forest,
        extra trees. ``r2_scores`` holds ``r2_score`` values; ``mape_scores``
        holds whatever ``absolute_percentage_error`` returns for each model.
    """

    ################## Data Preprocessing ######################

    df = pd.read_csv(dataset_path + dataset_name + '.csv')
    dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Only the encoded frames are used below; the returned encoders are discarded.
    encoded_data_frame, _, _ = encode_text_features('encode', df,
                                                    encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features('encode', dfn,
                                                      encoder_isa=None, encoder_mem_type=None)

    # Drop the architecture label and the one-hot columns so that both frames
    # end up with the same numeric feature columns (see the printed Index objects).
    total_data_n = encoded_data_frame_n.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                      'isa_1', 'isa_2', 'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                  'isa_1', 'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    X_sim = total_data.drop(columns=['runtime', 'power']).to_numpy()
    Y_sim = total_data[val].to_numpy()
    X_phy = total_data_n.drop(columns=['runtime', 'power']).to_numpy()
    Y_phy = total_data_n[val].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% (added to training) and 90% (held out for testing)
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size=0.90, random_state=0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.concatenate([X_sim, X_train_phy], axis=0)
    Y_train = np.concatenate([Y_sim, Y_train_phy], axis=0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Training rows are standardised with statistics of the simulated set,
    # test rows with statistics of the full physical set.
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(np.reshape(Y_sim, (len(Y_sim), 1)))
    scaler_Y_phy = StandardScaler()

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(np.reshape(Y_train, (len(Y_train), 1)))
    # NOTE(review): the physical target scaler is fitted on the held-out test
    # targets, and predictions of models trained on sim-scaled targets are mapped
    # back with it. Kept as in the original analysis -- confirm this is intended.
    Y_test = scaler_Y_phy.fit_transform(np.reshape(Y_test, (len(Y_test), 1)))

    ################## Models ######################
    # Default hyper-parameters are used (no grid search). The tree-based models
    # are seeded so that repeated runs produce the same table.
    best_models = [KNeighborsRegressor(),
                   DecisionTreeRegressor(random_state=0),
                   RandomForestRegressor(random_state=0),
                   ExtraTreesRegressor(random_state=0)]
    best_models_name = ['best_knn', 'best_dt', 'best_rf', 'best_etr']

    r2_scores = []
    mape_scores = []
    for k, (model_name, model) in enumerate(zip(best_models_name, best_models)):
        print('Running model number:', k+1, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        # Fit on a 1-D target: the forest estimators warn when handed a column vector.
        model.fit(X_train, Y_train.ravel())
        # Force predictions into an (n, 1) column so that every model is
        # inverse-scaled and scored with the same shape as Y_test.
        Y_pred_fold = np.reshape(model.predict(X_test), (-1, 1))
        Y_test_fold = scaler_Y_phy.inverse_transform(Y_test)
        Y_pred_fold = scaler_Y_phy.inverse_transform(Y_pred_fold)

        r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
        mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))

    return r2_scores, mape_scores
                       


In [110]:
# Evaluate the four regressors on the mser benchmark, once per target,
# then tabulate and save the scores.
dataset_name_n = 'mser_physical'
dataset_name = 'mser_simulated'
dataset_path = '\\ALL_CSV\\Dataset\\'
path_for_saving_data = dataset_name
r2_runtime, mape_runtime = process_all_mser(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val='runtime')
r2_power, mape_power = process_all_mser(dataset_path, dataset_name, dataset_name_n, path_for_saving_data, val='power')

best_models_name = ['best_knn', 'best_dt', 'best_rf', 'best_etr']

# The reported r2 is the mean of the runtime and power R^2 of each model.
r2 = [np.mean([r2_runtime[i], r2_power[i]]) for i in range(len(best_models_name))]

# Build every row first and create the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copies the frame on every call.
rows = [{'model_name': best_models_name[k],
         'r2': r2[k], 'mape_runtime': mape_runtime[k][0], 'mape_power': mape_power[k][0]}
        for k in range(len(best_models_name))]
df = pd.DataFrame(rows, columns=['model_name', 'r2', 'mape_runtime', 'mape_power'])
print(df)
df.to_csv('result_univariate_mser' + '.csv')

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(430, 16) (52, 16) (430,) (52,)
(5, 16) (47, 16) (5,) (47,)
(435, 16) (435,) (47, 16) (47,)
(435, 16) (47, 16) (435,) (47,)
Running model number: 1 with Model Name:  best_knn
(435, 16) (47, 16) (435, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(435, 16) (47, 16) (435, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(435, 1



Running model number: 4 with Model Name:  best_etr
(435, 16) (47, 16) (435, 1) (47, 1)




Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(430, 16) (52, 16) (430,) (52,)
(5, 16) (47, 16) (5,) (47,)
(435, 16) (435,) (47, 16) (47,)
(435, 16) (47, 16) (435,) (47,)
Running model number: 1 with Model Name:  best_knn
(435, 16) (47, 16) (435, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(435, 16) (47, 16) (435, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(435, 1



Running model number: 4 with Model Name:  best_etr
(435, 16) (47, 16) (435, 1) (47, 1)




  model_name        r2  mape_runtime  mape_power
0   best_knn -0.127201      0.278283    0.717318
1    best_dt  0.269796      0.237772    0.657332
2    best_rf  0.162004      0.203831    0.708848
3   best_etr  0.388831      0.226810    0.519449


# Stitch

In [111]:
def process_all_stitch(dataset_path, dataset_name,dataset_name_n,path_for_saving_data, val):
    """Train on simulated data plus a small physical slice; score on held-out physical data.

    Two CSV files are read (``dataset_path + <name> + '.csv'``), one-hot
    encoded through ``encode_text_features`` and stripped of the encoded /
    non-shared columns so that both frames expose the same feature set.
    10% of the physical rows are appended to the simulated rows for training;
    the remaining 90% of the physical rows form the test set.

    Parameters
    ----------
    dataset_path : str
        Prefix prepended verbatim to the file stems (must already end with a
        path separator, since plain string concatenation is used).
    dataset_name : str
        File stem of the simulated dataset.
    dataset_name_n : str
        File stem of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing callers keep working.
    val : str
        Target column to predict, e.g. ``'runtime'`` or ``'power'``.

    Returns
    -------
    (list, list)
        ``r2_scores`` and ``mape_scores``, one entry per model in the order
        KNN, decision tree, random forest, extra trees. Each MAPE entry is
        whatever ``absolute_percentage_error`` returns.
    """
    ################## Data Preprocessing ######################

    df = pd.read_csv(dataset_path + dataset_name + '.csv')
    dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Fresh encoders are fitted per frame; only the encoded frames are used here.
    encoded_data_frame, _, _ = encode_text_features(
        'encode', df, encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features(
        'encode', dfn, encoder_isa=None, encoder_mem_type=None)

    # Drop the one-hot columns and the columns the two datasets do not share,
    # leaving identical feature columns in both frames.
    total_data_n = encoded_data_frame_n.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4',
                                                        'isa_1','isa_2' ,'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4','isa_1',
                                                    'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    target_columns = ['runtime', 'power']
    X_sim = total_data.drop(columns = target_columns).to_numpy()
    Y_sim = total_data[val].to_numpy()
    X_phy = total_data_n.drop(columns = target_columns).to_numpy()
    Y_phy = total_data_n[val].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% and 90%
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size = 0.90, random_state = 0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)

    # Training set = all simulated rows + the 10% physical slice.
    X_train = np.concatenate([X_sim, X_train_phy], axis = 0)
    Y_train = np.concatenate([Y_sim, Y_train_phy], axis = 0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # NOTE(review): each domain is standardised with its own statistics:
    # train features/targets with scalers fitted on the simulated data only,
    # test features with a scaler fitted on ALL physical rows, and test targets
    # (and the un-scaling of predictions) with a scaler fitted on the test
    # targets. This uses test-set statistics; it is kept unchanged because it
    # appears to be the intended "stitch" method -- confirm.
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(np.reshape(Y_sim, (len(Y_sim), 1)))
    scaler_Y_phy = StandardScaler()

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(np.reshape(Y_train, (len(Y_train), 1)))
    Y_test = scaler_Y_phy.fit_transform(np.reshape(Y_test, (len(Y_test), 1)))

    ################## Models ######################
    # Hyper-parameter search is disabled in this section: every model runs with
    # its default hyper-parameters. The stochastic estimators are seeded so
    # repeated runs report the same scores.
    seed = 0
    best_models = [
        KNeighborsRegressor(),
        DecisionTreeRegressor(random_state = seed),
        RandomForestRegressor(random_state = seed),
        ExtraTreesRegressor(random_state = seed),
    ]
    best_models_name = [ 'best_knn', 'best_dt', 'best_rf', 'best_etr']

    # Ground truth back in original units; identical for every model.
    Y_test_fold = scaler_Y_phy.inverse_transform(Y_test)

    r2_scores = []
    mape_scores = []
    for model_number, (model_name, model) in enumerate(zip(best_models_name, best_models), start = 1):
        print('Running model number:', model_number, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        # Fit on a 1-D target (what single-output regressors expect) and force
        # the predictions into an (n, 1) column so every model is scored with
        # the same array layout as Y_test_fold.
        model.fit(X_train, Y_train.ravel())
        Y_pred_fold = np.reshape(model.predict(X_test), (-1, 1))
        Y_pred_fold = scaler_Y_phy.inverse_transform(Y_pred_fold)

        r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
        mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))

    return r2_scores, mape_scores
                       


In [112]:
# File stems (without '.csv') of the physical and simulated "stitch" datasets.
dataset_name_n = 'stitch_physical'
dataset_name = 'stitch_simulated'
# NOTE(review): hard-coded, Windows-style rooted path -- consider a configurable data directory.
dataset_path = '\\ALL_CSV\\Dataset\\'
path_for_saving_data = dataset_name

# One univariate run per target; each returns per-model r2 and MAPE lists.
r2_runtime, mape_runtime = process_all_stitch(dataset_path, dataset_name,dataset_name_n,path_for_saving_data, val = 'runtime')
r2_power, mape_power = process_all_stitch(dataset_path, dataset_name,dataset_name_n,path_for_saving_data, val = 'power')

best_models_name = [ 'best_knn', 'best_dt', 'best_rf', 'best_etr']

# Reported r2 per model is the mean of its runtime r2 and its power r2.
r2 = [np.mean([r2_rt, r2_pw]) for r2_rt, r2_pw in zip(r2_runtime, r2_power)]

# Build all rows first and create the frame once: DataFrame.append was removed
# in pandas 2.0, and row-by-row growth is quadratic anyway.
records = [
    {'model_name': model_name, 'r2': r2_mean,
     'mape_runtime': mape_rt[0], 'mape_power': mape_pw[0]}
    for model_name, r2_mean, mape_rt, mape_pw
    in zip(best_models_name, r2, mape_runtime, mape_power)
]
df = pd.DataFrame(records, columns = ['model_name','r2', 'mape_runtime', 'mape_power'])
print(df)
df.to_csv('result_univariate_stitch' + '.csv')

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(425, 16) (52, 16) (425,) (52,)
(5, 16) (47, 16) (5,) (47,)
(430, 16) (430,) (47, 16) (47,)
(430, 16) (47, 16) (430,) (47,)
Running model number: 1 with Model Name:  best_knn
(430, 16) (47, 16) (430, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(430, 16) (47, 16) (430, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(430, 1



Running model number: 4 with Model Name:  best_etr
(430, 16) (47, 16) (430, 1) (47, 1)




Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(425, 16) (52, 16) (425,) (52,)
(5, 16) (47, 16) (5,) (47,)
(430, 16) (430,) (47, 16) (47,)
(430, 16) (47, 16) (430,) (47,)
Running model number: 1 with Model Name:  best_knn
(430, 16) (47, 16) (430, 1) (47, 1)
Running model number: 2 with Model Name:  best_dt
(430, 16) (47, 16) (430, 1) (47, 1)
Running model number: 3 with Model Name:  best_rf
(430, 1



Running model number: 4 with Model Name:  best_etr
(430, 16) (47, 16) (430, 1) (47, 1)




  model_name        r2  mape_runtime  mape_power
0   best_knn -0.055239      0.162916    0.590057
1    best_dt  0.289099      0.168052    0.469719
2    best_rf  0.445561      0.131533    0.422779
3   best_etr  0.262938      0.141010    0.438473
