In [97]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import pandas as pd
import category_encoders as ce
import os
import pickle
import gc
from tqdm import tqdm
import pickle
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn import ensemble
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.decomposition import PCA


In [98]:
def encode_text_features(encode_decode, data_frame, encoder_isa=None, encoder_mem_type=None):
    """One-hot encode the categorical ``isa`` and ``mem-type`` columns.

    Parameters
    ----------
    encode_decode : str
        ``'encode'`` creates and fits two new one-hot encoders on ``data_frame``
        and applies them. Any other value re-applies the already fitted
        encoders passed in ``encoder_isa`` / ``encoder_mem_type``.
    data_frame : pandas.DataFrame
        Frame to transform; the encoders target the columns ``'isa'`` and
        ``'mem-type'``.
    encoder_isa, encoder_mem_type : encoder objects, optional
        Replaced by newly fitted encoders in ``'encode'`` mode; required in
        every other mode.

    Returns
    -------
    tuple
        ``(encoded_data_frame, encoder_isa, encoder_mem_type)``.

    Raises
    ------
    ValueError
        If an encoder is missing when ``encode_decode`` is not ``'encode'``.
    """
    if encode_decode == 'encode':
        encoder_isa = ce.one_hot.OneHotEncoder(cols=['isa'])
        encoder_mem_type = ce.one_hot.OneHotEncoder(cols=['mem-type'])
        encoder_isa.fit(data_frame, verbose=1)
        isa_encoded = encoder_isa.transform(data_frame)
        # The mem-type encoder is fitted on the frame that already has 'isa' expanded.
        encoder_mem_type.fit(isa_encoded, verbose=1)
    else:
        # Fail early with an actionable message instead of an AttributeError on None.
        if encoder_isa is None or encoder_mem_type is None:
            raise ValueError(
                "encoder_isa and encoder_mem_type must both be provided "
                "when encode_decode is not 'encode'"
            )
        isa_encoded = encoder_isa.transform(data_frame)

    encoded_data_frame = encoder_mem_type.transform(isa_encoded)
    return encoded_data_frame, encoder_isa, encoder_mem_type

In [99]:
def absolute_percentage_error(Y_test, Y_pred):
    """Return the mean absolute percentage error as a fraction (0.1 == 10%).

    Parameters
    ----------
    Y_test : sequence of numbers
        True values, indexable by position.
    Y_pred : sequence of numbers
        Predicted values, aligned with ``Y_test``.

    Returns
    -------
    number
        ``sum(|y - y_hat| / |y|) / len(Y_test)``. Samples whose true value is 0
        are skipped in the sum (their percentage error is undefined) but still
        count in the denominator.

    Raises
    ------
    ValueError
        If ``Y_test`` is empty.
    """
    n_samples = len(Y_test)
    if n_samples == 0:
        raise ValueError('absolute_percentage_error requires at least one sample')

    error = 0
    for i in range(n_samples):
        actual = Y_test[i]
        if actual != 0:
            # Divide by the magnitude of the target so negative targets cannot
            # yield negative "absolute" errors that cancel out positive ones.
            error = error + abs(actual - Y_pred[i]) / abs(actual)

    return error / n_samples

In [100]:
def return_best_param(model, grid, X_train, Y_train):
    """Grid-search ``model`` over ``grid`` and return the best refitted estimator.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    grid : dict or list of dicts
        Parameter grid handed to ``GridSearchCV``.
    X_train, Y_train : array-like
        Training data used for the search.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search (available because
        ``refit=True``).
    """
    # A distinct name keeps the caller's parameter grid from being shadowed.
    search = GridSearchCV(model, grid, refit = True, verbose = 0)
    # Fit directly: wrapping the fitted search in tqdm() never iterated anything
    # and only printed an empty progress bar.
    search.fit(X_train, Y_train)
    print('Found Best Parameters for this model', model)

    return search.best_estimator_

# Dataset 1: Qsort

In [101]:
def process_all_qsort(dataset_path, dataset_name,dataset_name_n,path_for_saving_data):
    """Train on simulated + 10% physical qsort data and score on the remaining physical data.

    Both CSV files are loaded, their ``isa`` / ``mem-type`` columns are one-hot
    encoded, a fixed list of columns is dropped and missing values become 0.
    10% of the physical rows are appended to the simulated rows for training;
    the other 90% form the test set. KNN, decision tree, random forest and
    extra-trees regressors (default hyperparameters, fixed seeds for the tree
    models) jointly predict ``runtime`` and ``power``. R2 and the per-target
    absolute percentage error are printed and written to
    ``result_multivariate_qsort.csv`` in the current working directory.

    Parameters
    ----------
    dataset_path : str
        Directory prefix. It is concatenated directly with the file names, so it
        must end with a path separator.
    dataset_name : str
        Base name (without ``.csv``) of the simulated dataset; also stored in
        the ``dataset_name`` column of the results.
    dataset_name_n : str
        Base name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    None
    """
    ################## Data Preprocessing ######################

    sim_raw = pd.read_csv(dataset_path + dataset_name + '.csv')
    phy_raw = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Each domain is encoded with its own freshly fitted encoders; the encoder
    # objects themselves are not needed afterwards.
    sim_encoded, _, _ = encode_text_features('encode', sim_raw)
    phy_encoded, _, _ = encode_text_features('encode', phy_raw)

    total_data_n = phy_encoded.drop(columns = ['arch','mem-type_1','mem-type_2','isa_1', 'bus_speed','num-cpu'])
    total_data = sim_encoded.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4','isa_1',
                                             'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    targets = ['runtime', 'power']
    X_sim = total_data.drop(columns = targets).to_numpy()
    Y_sim = total_data[targets].to_numpy()
    X_phy = total_data_n.drop(columns = targets).to_numpy()
    Y_phy = total_data_n[targets].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Physical data: 10% joins the training set, 90% is held out for testing.
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size = 0.90, random_state = 0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.append(X_sim, X_train_phy, axis = 0)
    Y_train = np.append(Y_sim, Y_train_phy, axis = 0)
    print(X_train.shape, Y_train.shape, X_test_phy.shape, Y_test_phy.shape)
    print(X_train.shape, X_test_phy.shape, Y_train.shape, Y_test_phy.shape)

    # Per-domain standardisation: training data uses simulated statistics, test
    # features use physical feature statistics (no labels involved).
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(Y_sim)
    # The physical target scaler maps predictions back to physical units, so it
    # must only see labelled *training* targets. Fitting it on the test targets
    # would leak their mean/variance into every rescaled prediction.
    scaler_Y_phy = StandardScaler().fit(Y_train_phy)

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test_phy)
    Y_train = scaler_Y_sim.transform(Y_train)

    ################## Models ######################
    # Hyperparameter search via return_best_param is disabled; defaults are used.
    # Seeds are fixed so repeated runs give the same scores.
    candidates = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor(random_state = 0)),
        ('best_rf', RandomForestRegressor(random_state = 0)),
        ('best_etr', ExtraTreesRegressor(random_state = 0)),
    ]

    # Collect one record per model and build the frame once
    # (DataFrame.append was removed in pandas 2.0).
    records = []
    for number, (model_name, model) in enumerate(candidates, start = 1):
        print('Running model number:', number, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test_phy.shape)
        model.fit(X_train, Y_train)
        # Predictions are standardised scores; map them to physical units.
        Y_pred = scaler_Y_phy.inverse_transform(model.predict(X_test))

        records.append({'model_name': model_name,
                        'dataset_name': dataset_name,
                        'r2': r2_score(Y_test_phy, Y_pred),
                        'mape_runtime': absolute_percentage_error(Y_test_phy[:,0], Y_pred[:,0]),
                        'mape_power': absolute_percentage_error(Y_test_phy[:,1], Y_pred[:,1])})

    results = pd.DataFrame(records, columns = ['model_name', 'dataset_name', 'r2', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv('result_multivariate_qsort' + '.csv')

In [102]:
# Run the qsort experiment on the simulated and physical qsort datasets.
dataset_name_n = 'qsort_physical'
dataset_name = 'qsort_simulated'
# Dataset directory: set the DATASET_PATH environment variable instead of editing
# this machine-specific default. The value must end with a path separator because
# the file name is appended by plain string concatenation.
dataset_path = os.environ.get('DATASET_PATH', 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\')
path_for_saving_data = dataset_name
process_all_qsort(dataset_path, dataset_name, dataset_name_n, path_for_saving_data)

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(2730, 17) (672, 17) (2730, 2) (672, 2)
(67, 17) (605, 17) (67, 2) (605, 2)
(2797, 17) (2797, 2) (605, 17) (605, 2)
(2797, 17) (605, 17) (2797, 2) (605, 2)
Running model number: 1 with Model Name:  best_knn
(2797, 17) (605, 17) (2797, 2) (605, 2)
Running model number: 2 with Model Name:  best_dt
(2797, 17) (605, 17) (2797, 2) 

# Dijkstra

In [103]:
def process_all_dijkstra(dataset_path, dataset_name,dataset_name_n,path_for_saving_data):
    """Train on simulated + 10% physical dijkstra data and score on the remaining physical data.

    Both CSV files are loaded, their ``isa`` / ``mem-type`` columns are one-hot
    encoded, a fixed list of columns is dropped and missing values become 0.
    10% of the physical rows are appended to the simulated rows for training;
    the other 90% form the test set. KNN, decision tree, random forest and
    extra-trees regressors (default hyperparameters, fixed seeds for the tree
    models) jointly predict ``runtime`` and ``power``. R2 and the per-target
    absolute percentage error are printed and written to
    ``result_multivariate_dijkstra.csv`` in the current working directory.

    Parameters
    ----------
    dataset_path : str
        Directory prefix. It is concatenated directly with the file names, so it
        must end with a path separator.
    dataset_name : str
        Base name (without ``.csv``) of the simulated dataset; also stored in
        the ``dataset_name`` column of the results.
    dataset_name_n : str
        Base name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    None
    """
    ################## Data Preprocessing ######################

    sim_raw = pd.read_csv(dataset_path + dataset_name + '.csv')
    phy_raw = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Each domain is encoded with its own freshly fitted encoders; the encoder
    # objects themselves are not needed afterwards.
    sim_encoded, _, _ = encode_text_features('encode', sim_raw)
    phy_encoded, _, _ = encode_text_features('encode', phy_raw)

    total_data_n = phy_encoded.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4',
                                               'isa_1','isa_2' ,'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = sim_encoded.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4','isa_1',
                                             'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    targets = ['runtime', 'power']
    X_sim = total_data.drop(columns = targets).to_numpy()
    Y_sim = total_data[targets].to_numpy()
    X_phy = total_data_n.drop(columns = targets).to_numpy()
    Y_phy = total_data_n[targets].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Physical data: 10% joins the training set, 90% is held out for testing.
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size = 0.90, random_state = 0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.append(X_sim, X_train_phy, axis = 0)
    Y_train = np.append(Y_sim, Y_train_phy, axis = 0)
    print(X_train.shape, Y_train.shape, X_test_phy.shape, Y_test_phy.shape)
    print(X_train.shape, X_test_phy.shape, Y_train.shape, Y_test_phy.shape)

    # Per-domain standardisation: training data uses simulated statistics, test
    # features use physical feature statistics (no labels involved).
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(Y_sim)
    # The physical target scaler maps predictions back to physical units, so it
    # must only see labelled *training* targets. Fitting it on the test targets
    # would leak their mean/variance into every rescaled prediction.
    scaler_Y_phy = StandardScaler().fit(Y_train_phy)

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test_phy)
    Y_train = scaler_Y_sim.transform(Y_train)

    ################## Models ######################
    # Hyperparameter search via return_best_param is disabled; defaults are used.
    # Seeds are fixed so repeated runs give the same scores.
    candidates = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor(random_state = 0)),
        ('best_rf', RandomForestRegressor(random_state = 0)),
        ('best_etr', ExtraTreesRegressor(random_state = 0)),
    ]

    # Collect one record per model and build the frame once
    # (DataFrame.append was removed in pandas 2.0).
    records = []
    for number, (model_name, model) in enumerate(candidates, start = 1):
        print('Running model number:', number, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test_phy.shape)
        model.fit(X_train, Y_train)
        # Predictions are standardised scores; map them to physical units.
        Y_pred = scaler_Y_phy.inverse_transform(model.predict(X_test))

        records.append({'model_name': model_name,
                        'dataset_name': dataset_name,
                        'r2': r2_score(Y_test_phy, Y_pred),
                        'mape_runtime': absolute_percentage_error(Y_test_phy[:,0], Y_pred[:,0]),
                        'mape_power': absolute_percentage_error(Y_test_phy[:,1], Y_pred[:,1])})

    results = pd.DataFrame(records, columns = ['model_name', 'dataset_name', 'r2', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv('result_multivariate_dijkstra' + '.csv')

In [104]:
# Run the dijkstra experiment on the simulated and physical dijkstra datasets.
dataset_name_n = 'dijkstra_physical'
dataset_name = 'dijkstra_simulated'
# Dataset directory: set the DATASET_PATH environment variable instead of editing
# this machine-specific default. The value must end with a path separator because
# the file name is appended by plain string concatenation.
dataset_path = os.environ.get('DATASET_PATH', 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\')
path_for_saving_data = dataset_name
process_all_dijkstra(dataset_path, dataset_name, dataset_name_n, path_for_saving_data)

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(362, 16) (52, 16) (362, 2) (52, 2)
(5, 16) (47, 16) (5, 2) (47, 2)
(367, 16) (367, 2) (47, 16) (47, 2)
(367, 16) (47, 16) (367, 2) (47, 2)
Running model number: 1 with Model Name:  best_knn
(367, 16) (47, 16) (367, 2) (47, 2)
Running model number: 2 with Model Name:  best_dt
(367, 16) (47, 16) (367, 2) (47, 2)
Running model number: 3 with Model Name: 

# Matmul

In [105]:
def process_all_matmul(dataset_path, dataset_name,dataset_name_n,path_for_saving_data):
    """Train on simulated + 10% physical matmul data and score on the remaining physical data.

    Both CSV files are loaded, their ``isa`` / ``mem-type`` columns are one-hot
    encoded, a fixed list of columns is dropped and missing values become 0.
    10% of the physical rows are appended to the simulated rows for training;
    the other 90% form the test set. KNN, decision tree, random forest and
    extra-trees regressors (default hyperparameters, fixed seeds for the tree
    models) jointly predict ``runtime`` and ``power``. R2 and the per-target
    absolute percentage error are printed and written to
    ``result_multivariate_matmul.csv`` in the current working directory.

    Parameters
    ----------
    dataset_path : str
        Directory prefix. It is concatenated directly with the file names, so it
        must end with a path separator.
    dataset_name : str
        Base name (without ``.csv``) of the simulated dataset; also stored in
        the ``dataset_name`` column of the results.
    dataset_name_n : str
        Base name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    None
    """
    ################## Data Preprocessing ######################

    sim_raw = pd.read_csv(dataset_path + dataset_name + '.csv')
    phy_raw = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Each domain is encoded with its own freshly fitted encoders; the encoder
    # objects themselves are not needed afterwards.
    sim_encoded, _, _ = encode_text_features('encode', sim_raw)
    phy_encoded, _, _ = encode_text_features('encode', phy_raw)

    total_data_n = phy_encoded.drop(columns = ['arch','mem-type_1','mem-type_2','isa_1',
                                               'bus_speed', 'num-cpu'])
    total_data = sim_encoded.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4','isa_1',
                                             'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    targets = ['runtime', 'power']
    X_sim = total_data.drop(columns = targets).to_numpy()
    Y_sim = total_data[targets].to_numpy()
    X_phy = total_data_n.drop(columns = targets).to_numpy()
    Y_phy = total_data_n[targets].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Physical data: 10% joins the training set, 90% is held out for testing.
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size = 0.90, random_state = 0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.append(X_sim, X_train_phy, axis = 0)
    Y_train = np.append(Y_sim, Y_train_phy, axis = 0)
    print(X_train.shape, Y_train.shape, X_test_phy.shape, Y_test_phy.shape)
    print(X_train.shape, X_test_phy.shape, Y_train.shape, Y_test_phy.shape)

    # Per-domain standardisation: training data uses simulated statistics, test
    # features use physical feature statistics (no labels involved).
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(Y_sim)
    # The physical target scaler maps predictions back to physical units, so it
    # must only see labelled *training* targets. Fitting it on the test targets
    # would leak their mean/variance into every rescaled prediction.
    scaler_Y_phy = StandardScaler().fit(Y_train_phy)

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test_phy)
    Y_train = scaler_Y_sim.transform(Y_train)

    ################## Models ######################
    # Hyperparameter search via return_best_param is disabled; defaults are used.
    # Seeds are fixed so repeated runs give the same scores.
    candidates = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor(random_state = 0)),
        ('best_rf', RandomForestRegressor(random_state = 0)),
        ('best_etr', ExtraTreesRegressor(random_state = 0)),
    ]

    # Collect one record per model and build the frame once
    # (DataFrame.append was removed in pandas 2.0).
    records = []
    for number, (model_name, model) in enumerate(candidates, start = 1):
        print('Running model number:', number, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test_phy.shape)
        model.fit(X_train, Y_train)
        # Predictions are standardised scores; map them to physical units.
        Y_pred = scaler_Y_phy.inverse_transform(model.predict(X_test))

        records.append({'model_name': model_name,
                        'dataset_name': dataset_name,
                        'r2': r2_score(Y_test_phy, Y_pred),
                        'mape_runtime': absolute_percentage_error(Y_test_phy[:,0], Y_pred[:,0]),
                        'mape_power': absolute_percentage_error(Y_test_phy[:,1], Y_pred[:,1])})

    results = pd.DataFrame(records, columns = ['model_name', 'dataset_name', 'r2', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv('result_multivariate_matmul' + '.csv')

In [106]:
# Run the matmul experiment on the simulated and physical matmul datasets.
dataset_name_n = 'matmul_physical'
dataset_name = 'matmul_simulated'
# Dataset directory: set the DATASET_PATH environment variable instead of editing
# this machine-specific default. The value must end with a path separator because
# the file name is appended by plain string concatenation.
dataset_path = os.environ.get('DATASET_PATH', 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\')
path_for_saving_data = dataset_name
process_all_matmul(dataset_path, dataset_name, dataset_name_n, path_for_saving_data)

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(1780, 17) (519, 17) (1780, 2) (519, 2)
(51, 17) (468, 17) (51, 2) (468, 2)
(1831, 17) (1831, 2) (468, 17) (468, 2)
(1831, 17) (468, 17) (1831, 2) (468, 2)
Running model number: 1 with Model Name:  best_knn
(1831, 17) (468, 17) (1831, 2) (468, 2)
Running model number: 2 with Model Name:  best_dt
(1831, 17) (468, 17) (1831, 2) 

# Tracking

In [107]:
def process_all_tracking(dataset_path, dataset_name,dataset_name_n,path_for_saving_data):
    """Train on simulated + 10% physical tracking data and score on the remaining physical data.

    Both CSV files are loaded, their ``isa`` / ``mem-type`` columns are one-hot
    encoded, a fixed list of columns is dropped and missing values become 0.
    10% of the physical rows are appended to the simulated rows for training;
    the other 90% form the test set. KNN, decision tree, random forest and
    extra-trees regressors (default hyperparameters, fixed seeds for the tree
    models) jointly predict ``runtime`` and ``power``. R2 and the per-target
    absolute percentage error are printed and written to
    ``result_multivariate_tracking.csv`` in the current working directory.

    Parameters
    ----------
    dataset_path : str
        Directory prefix. It is concatenated directly with the file names, so it
        must end with a path separator.
    dataset_name : str
        Base name (without ``.csv``) of the simulated dataset; also stored in
        the ``dataset_name`` column of the results.
    dataset_name_n : str
        Base name (without ``.csv``) of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing call sites keep working.

    Returns
    -------
    None
    """
    ################## Data Preprocessing ######################

    sim_raw = pd.read_csv(dataset_path + dataset_name + '.csv')
    phy_raw = pd.read_csv(dataset_path + dataset_name_n + '.csv')
    # Each domain is encoded with its own freshly fitted encoders; the encoder
    # objects themselves are not needed afterwards.
    sim_encoded, _, _ = encode_text_features('encode', sim_raw)
    phy_encoded, _, _ = encode_text_features('encode', phy_raw)

    total_data_n = phy_encoded.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4',
                                               'isa_1','isa_2' ,'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = sim_encoded.drop(columns = ['arch','mem-type_1','mem-type_2','mem-type_3','mem-type_4','isa_1',
                                             'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    targets = ['runtime', 'power']
    X_sim = total_data.drop(columns = targets).to_numpy()
    Y_sim = total_data[targets].to_numpy()
    X_phy = total_data_n.drop(columns = targets).to_numpy()
    Y_phy = total_data_n[targets].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Physical data: 10% joins the training set, 90% is held out for testing.
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size = 0.90, random_state = 0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.append(X_sim, X_train_phy, axis = 0)
    Y_train = np.append(Y_sim, Y_train_phy, axis = 0)
    print(X_train.shape, Y_train.shape, X_test_phy.shape, Y_test_phy.shape)
    print(X_train.shape, X_test_phy.shape, Y_train.shape, Y_test_phy.shape)

    # Per-domain standardisation: training data uses simulated statistics, test
    # features use physical feature statistics (no labels involved).
    scaler_X_sim = StandardScaler().fit(X_sim)
    scaler_X_phy = StandardScaler().fit(X_phy)
    scaler_Y_sim = StandardScaler().fit(Y_sim)
    # The physical target scaler maps predictions back to physical units, so it
    # must only see labelled *training* targets. Fitting it on the test targets
    # would leak their mean/variance into every rescaled prediction.
    scaler_Y_phy = StandardScaler().fit(Y_train_phy)

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test_phy)
    Y_train = scaler_Y_sim.transform(Y_train)

    ################## Models ######################
    # Hyperparameter search via return_best_param is disabled; defaults are used.
    # Seeds are fixed so repeated runs give the same scores.
    candidates = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor(random_state = 0)),
        ('best_rf', RandomForestRegressor(random_state = 0)),
        ('best_etr', ExtraTreesRegressor(random_state = 0)),
    ]

    # Collect one record per model and build the frame once
    # (DataFrame.append was removed in pandas 2.0).
    records = []
    for number, (model_name, model) in enumerate(candidates, start = 1):
        print('Running model number:', number, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test_phy.shape)
        model.fit(X_train, Y_train)
        # Predictions are standardised scores; map them to physical units.
        Y_pred = scaler_Y_phy.inverse_transform(model.predict(X_test))

        records.append({'model_name': model_name,
                        'dataset_name': dataset_name,
                        'r2': r2_score(Y_test_phy, Y_pred),
                        'mape_runtime': absolute_percentage_error(Y_test_phy[:,0], Y_pred[:,0]),
                        'mape_power': absolute_percentage_error(Y_test_phy[:,1], Y_pred[:,1])})

    results = pd.DataFrame(records, columns = ['model_name', 'dataset_name', 'r2', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv('result_multivariate_tracking' + '.csv')

In [108]:
# Run the tracking experiment on the simulated and physical tracking datasets.
dataset_name_n = 'tracking_physical'
dataset_name = 'tracking_simulated'
# Dataset directory: set the DATASET_PATH environment variable instead of editing
# this machine-specific default. The value must end with a path separator because
# the file name is appended by plain string concatenation.
dataset_path = os.environ.get('DATASET_PATH', 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\')
path_for_saving_data = dataset_name
process_all_tracking(dataset_path, dataset_name, dataset_name_n, path_for_saving_data)

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(425, 16) (52, 16) (425, 2) (52, 2)
(5, 16) (47, 16) (5, 2) (47, 2)
(430, 16) (430, 2) (47, 16) (47, 2)
(430, 16) (47, 16) (430, 2) (47, 2)
Running model number: 1 with Model Name:  best_knn
(430, 16) (47, 16) (430, 2) (47, 2)
Running model number: 2 with Model Name:  best_dt
(430, 16) (47, 16) (430, 2) (47, 2)
Running model number: 3 with Model Name: 

# SVM

In [109]:
def process_all_svm(dataset_path, dataset_name, dataset_name_n, path_for_saving_data):
    """Fit the baseline regressors for the SVM dataset and write their scores to CSV.

    The models are trained on every simulated row plus 10% of the physical
    rows and evaluated on the remaining 90% of the physical rows. Both targets
    ('runtime', 'power') are predicted jointly.

    Parameters
    ----------
    dataset_path : str
        Directory prefix, concatenated directly in front of the file names
        (so it must already end with a path separator).
    dataset_name : str
        Base name (without '.csv') of the simulated dataset; also recorded in
        the 'dataset_name' column of the results.
    dataset_name_n : str
        Base name (without '.csv') of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        The per-model scores are printed and written to
        'result_multivariate_svm.csv' (relative path).
    """

    ################## Data Preprocessing ######################

    sim_frame = pd.read_csv(dataset_path + dataset_name + '.csv')
    phy_frame = pd.read_csv(dataset_path + dataset_name_n + '.csv')

    # 'encode' mode fits fresh one-hot encoders per table; the fitted encoders
    # are not needed afterwards, only the encoded frames.
    encoded_data_frame, _, _ = encode_text_features('encode', sim_frame,
                                                    encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features('encode', phy_frame,
                                                      encoder_isa=None, encoder_mem_type=None)

    # DataFrame.drop raises KeyError when a listed column is missing, so these
    # lists must match the one-hot columns actually produced for this dataset.
    total_data_n = encoded_data_frame_n.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                      'isa_1', 'isa_2', 'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                  'isa_1', 'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    target_columns = ['runtime', 'power']
    X_sim = total_data.drop(columns=target_columns).to_numpy()
    Y_sim = total_data[target_columns].to_numpy()
    X_phy = total_data_n.drop(columns=target_columns).to_numpy()
    Y_phy = total_data_n[target_columns].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% (joins the training set) and 90% (test set)
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size=0.90, random_state=0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.concatenate((X_sim, X_train_phy), axis=0)
    Y_train = np.concatenate((Y_sim, Y_train_phy), axis=0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Separate scalers per domain: training data uses simulated statistics,
    # test data uses physical statistics.
    scaler_X_sim = StandardScaler()
    scaler_X_phy = StandardScaler()
    scaler_X_sim.fit(X_sim)
    scaler_X_phy.fit(X_phy)

    scaler_Y_sim = StandardScaler()
    scaler_Y_phy = StandardScaler()
    scaler_Y_sim.fit(Y_sim)
    scaler_Y_phy.fit(Y_phy)

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(Y_train)
    # NOTE(review): fit_transform re-fits scaler_Y_phy on the held-out targets,
    # discarding the fit on Y_phy above, and the predictions below are mapped
    # back with those test-set statistics. Behaviour is kept as-is here; confirm
    # whether a plain transform (or scaler_Y_sim for predictions) was intended.
    Y_test = scaler_Y_phy.fit_transform(Y_test)

    ################## Models ######################
    # Search spaces for the grid search, which is currently disabled; the
    # estimators below therefore run with their default hyper-parameters.
    param_grid_knn = {'n_neighbors': [6, 7, 13, 15],
                      'weights': ['uniform', 'distance'],
                      'p': [1, 2, 4, 5, 7, 10]}
    param_grid_rf = {'n_estimators': [50, 200],
                     'max_depth': [5, 9, 15, 20]}
    param_grid_etr = {'n_estimators': [50, 200],
                      'max_depth': [5, 9, 15, 20]}

    candidates = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor()),
        ('best_rf', RandomForestRegressor()),
        ('best_etr', ExtraTreesRegressor()),
    ]

    # Ground truth in original units; identical for every model, so computed once.
    Y_test_actual = scaler_Y_phy.inverse_transform(Y_test)

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for position, (model_name, model) in enumerate(candidates, start=1):
        print('Running model number:', position, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        model.fit(X_train, Y_train)
        Y_pred = scaler_Y_phy.inverse_transform(model.predict(X_test))

        records.append({
            'model_name': model_name,
            'dataset_name': dataset_name,
            'r2': r2_score(Y_test_actual, Y_pred),
            # Column 0 is 'runtime', column 1 is 'power' (order of target_columns).
            'mape_runtime': absolute_percentage_error(Y_test_actual[:, 0], Y_pred[:, 0]),
            'mape_power': absolute_percentage_error(Y_test_actual[:, 1], Y_pred[:, 1]),
        })

    results = pd.DataFrame(records,
                           columns=['model_name', 'dataset_name', 'r2', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv('result_multivariate_svm.csv')

In [110]:
# Driver cell for the SVM benchmark: pick the CSV base names and run the pipeline.
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\'
dataset_name = 'svm_simulated'
dataset_name_n = 'svm_physical'
path_for_saving_data = dataset_name
process_all_svm(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_name_n=dataset_name_n,
    path_for_saving_data=path_for_saving_data,
)

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(390, 16) (52, 16) (390, 2) (52, 2)
(5, 16) (47, 16) (5, 2) (47, 2)
(395, 16) (395, 2) (47, 16) (47, 2)
(395, 16) (47, 16) (395, 2) (47, 2)
Running model number: 1 with Model Name:  best_knn
(395, 16) (47, 16) (395, 2) (47, 2)
Running model number: 2 with Model Name:  best_dt
(395, 16) (47, 16) (395, 2) (47, 2)
Running model number: 3 with Model Name: 

# Montecarlo

In [111]:
def process_all_montecarlo(dataset_path, dataset_name, dataset_name_n, path_for_saving_data):
    """Fit the baseline regressors for the Montecarlo dataset and write their scores to CSV.

    The models are trained on every simulated row plus 10% of the physical
    rows and evaluated on the remaining 90% of the physical rows. Both targets
    ('runtime', 'power') are predicted jointly.

    Parameters
    ----------
    dataset_path : str
        Directory prefix, concatenated directly in front of the file names
        (so it must already end with a path separator).
    dataset_name : str
        Base name (without '.csv') of the simulated dataset; also recorded in
        the 'dataset_name' column of the results.
    dataset_name_n : str
        Base name (without '.csv') of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        The per-model scores are printed and written to
        'result_multivariate_montecarlo.csv' (relative path).
    """

    ################## Data Preprocessing ######################

    sim_frame = pd.read_csv(dataset_path + dataset_name + '.csv')
    phy_frame = pd.read_csv(dataset_path + dataset_name_n + '.csv')

    # 'encode' mode fits fresh one-hot encoders per table; the fitted encoders
    # are not needed afterwards, only the encoded frames.
    encoded_data_frame, _, _ = encode_text_features('encode', sim_frame,
                                                    encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features('encode', phy_frame,
                                                      encoder_isa=None, encoder_mem_type=None)

    # DataFrame.drop raises KeyError when a listed column is missing, so these
    # lists must match the one-hot columns actually produced for this dataset
    # (the physical table here has fewer 'isa'/'mem-type' columns than the simulated one).
    total_data_n = encoded_data_frame_n.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'isa_1',
                                                      'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                  'isa_1', 'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    target_columns = ['runtime', 'power']
    X_sim = total_data.drop(columns=target_columns).to_numpy()
    Y_sim = total_data[target_columns].to_numpy()
    X_phy = total_data_n.drop(columns=target_columns).to_numpy()
    Y_phy = total_data_n[target_columns].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% (joins the training set) and 90% (test set)
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size=0.90, random_state=0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.concatenate((X_sim, X_train_phy), axis=0)
    Y_train = np.concatenate((Y_sim, Y_train_phy), axis=0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Separate scalers per domain: training data uses simulated statistics,
    # test data uses physical statistics.
    scaler_X_sim = StandardScaler()
    scaler_X_phy = StandardScaler()
    scaler_X_sim.fit(X_sim)
    scaler_X_phy.fit(X_phy)

    scaler_Y_sim = StandardScaler()
    scaler_Y_phy = StandardScaler()
    scaler_Y_sim.fit(Y_sim)
    scaler_Y_phy.fit(Y_phy)

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(Y_train)
    # NOTE(review): fit_transform re-fits scaler_Y_phy on the held-out targets,
    # discarding the fit on Y_phy above, and the predictions below are mapped
    # back with those test-set statistics. Behaviour is kept as-is here; confirm
    # whether a plain transform (or scaler_Y_sim for predictions) was intended.
    Y_test = scaler_Y_phy.fit_transform(Y_test)

    ################## Models ######################
    # Search spaces for the grid search, which is currently disabled; the
    # estimators below therefore run with their default hyper-parameters.
    param_grid_knn = {'n_neighbors': [6, 7, 13, 15],
                      'weights': ['uniform', 'distance'],
                      'p': [1, 2, 4, 5, 7, 10]}
    param_grid_rf = {'n_estimators': [50, 200],
                     'max_depth': [5, 9, 15, 20]}
    param_grid_etr = {'n_estimators': [50, 200],
                      'max_depth': [5, 9, 15, 20]}

    candidates = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor()),
        ('best_rf', RandomForestRegressor()),
        ('best_etr', ExtraTreesRegressor()),
    ]

    # Ground truth in original units; identical for every model, so computed once.
    Y_test_actual = scaler_Y_phy.inverse_transform(Y_test)

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for position, (model_name, model) in enumerate(candidates, start=1):
        print('Running model number:', position, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        model.fit(X_train, Y_train)
        Y_pred = scaler_Y_phy.inverse_transform(model.predict(X_test))

        records.append({
            'model_name': model_name,
            'dataset_name': dataset_name,
            'r2': r2_score(Y_test_actual, Y_pred),
            # Column 0 is 'runtime', column 1 is 'power' (order of target_columns).
            'mape_runtime': absolute_percentage_error(Y_test_actual[:, 0], Y_pred[:, 0]),
            'mape_power': absolute_percentage_error(Y_test_actual[:, 1], Y_pred[:, 1]),
        })

    results = pd.DataFrame(records,
                           columns=['model_name', 'dataset_name', 'r2', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv('result_multivariate_montecarlo.csv')

In [112]:
# Driver cell for the Montecarlo benchmark: pick the CSV base names and run the pipeline.
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\'
dataset_name = 'montecarlocalcpi_simulated'
dataset_name_n = 'montecarlocalcpi_physical'
path_for_saving_data = dataset_name
process_all_montecarlo(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_name_n=dataset_name_n,
    path_for_saving_data=path_for_saving_data,
)

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
(1365, 17) (260, 17) (1365, 2) (260, 2)
(26, 17) (234, 17) (26, 2) (234, 2)
(1391, 17) (1391, 2) (234, 17) (234, 2)
(1391, 17) (234, 17) (1391, 2) (234, 2)
Running model number: 1 with Model Name:  best_knn
(1391, 17) (234, 17) (1391, 2) (234, 2)
Running model number: 2 with Model Name:  best_dt
(1391, 17) (234, 17) (1391, 2) 

# MSER

In [113]:
def process_all_mser(dataset_path, dataset_name, dataset_name_n, path_for_saving_data):
    """Fit the baseline regressors for the MSER dataset and write their scores to CSV.

    The models are trained on every simulated row plus 10% of the physical
    rows and evaluated on the remaining 90% of the physical rows. Both targets
    ('runtime', 'power') are predicted jointly.

    Parameters
    ----------
    dataset_path : str
        Directory prefix, concatenated directly in front of the file names
        (so it must already end with a path separator).
    dataset_name : str
        Base name (without '.csv') of the simulated dataset; also recorded in
        the 'dataset_name' column of the results.
    dataset_name_n : str
        Base name (without '.csv') of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        The per-model scores are printed and written to
        'result_multivariate_mser.csv' (relative path).
    """

    ################## Data Preprocessing ######################

    sim_frame = pd.read_csv(dataset_path + dataset_name + '.csv')
    phy_frame = pd.read_csv(dataset_path + dataset_name_n + '.csv')

    # 'encode' mode fits fresh one-hot encoders per table; the fitted encoders
    # are not needed afterwards, only the encoded frames.
    encoded_data_frame, _, _ = encode_text_features('encode', sim_frame,
                                                    encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features('encode', phy_frame,
                                                      encoder_isa=None, encoder_mem_type=None)

    # DataFrame.drop raises KeyError when a listed column is missing, so these
    # lists must match the one-hot columns actually produced for this dataset.
    total_data_n = encoded_data_frame_n.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                      'isa_1', 'isa_2', 'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                  'isa_1', 'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    target_columns = ['runtime', 'power']
    X_sim = total_data.drop(columns=target_columns).to_numpy()
    Y_sim = total_data[target_columns].to_numpy()
    X_phy = total_data_n.drop(columns=target_columns).to_numpy()
    Y_phy = total_data_n[target_columns].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% (joins the training set) and 90% (test set)
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size=0.90, random_state=0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.concatenate((X_sim, X_train_phy), axis=0)
    Y_train = np.concatenate((Y_sim, Y_train_phy), axis=0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Separate scalers per domain: training data uses simulated statistics,
    # test data uses physical statistics.
    scaler_X_sim = StandardScaler()
    scaler_X_phy = StandardScaler()
    scaler_X_sim.fit(X_sim)
    scaler_X_phy.fit(X_phy)

    scaler_Y_sim = StandardScaler()
    scaler_Y_phy = StandardScaler()
    scaler_Y_sim.fit(Y_sim)
    scaler_Y_phy.fit(Y_phy)

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(Y_train)
    # NOTE(review): fit_transform re-fits scaler_Y_phy on the held-out targets,
    # discarding the fit on Y_phy above, and the predictions below are mapped
    # back with those test-set statistics. Behaviour is kept as-is here; confirm
    # whether a plain transform (or scaler_Y_sim for predictions) was intended.
    Y_test = scaler_Y_phy.fit_transform(Y_test)

    ################## Models ######################
    # Search spaces for the grid search, which is currently disabled; the
    # estimators below therefore run with their default hyper-parameters.
    param_grid_knn = {'n_neighbors': [6, 7, 13, 15],
                      'weights': ['uniform', 'distance'],
                      'p': [1, 2, 4, 5, 7, 10]}
    param_grid_rf = {'n_estimators': [50, 200],
                     'max_depth': [5, 9, 15, 20]}
    param_grid_etr = {'n_estimators': [50, 200],
                      'max_depth': [5, 9, 15, 20]}

    candidates = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor()),
        ('best_rf', RandomForestRegressor()),
        ('best_etr', ExtraTreesRegressor()),
    ]

    # Ground truth in original units; identical for every model, so computed once.
    Y_test_actual = scaler_Y_phy.inverse_transform(Y_test)

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for position, (model_name, model) in enumerate(candidates, start=1):
        print('Running model number:', position, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        model.fit(X_train, Y_train)
        Y_pred = scaler_Y_phy.inverse_transform(model.predict(X_test))

        records.append({
            'model_name': model_name,
            'dataset_name': dataset_name,
            'r2': r2_score(Y_test_actual, Y_pred),
            # Column 0 is 'runtime', column 1 is 'power' (order of target_columns).
            'mape_runtime': absolute_percentage_error(Y_test_actual[:, 0], Y_pred[:, 0]),
            'mape_power': absolute_percentage_error(Y_test_actual[:, 1], Y_pred[:, 1]),
        })

    results = pd.DataFrame(records,
                           columns=['model_name', 'dataset_name', 'r2', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv('result_multivariate_mser.csv')

In [114]:
# Driver cell for the MSER benchmark: pick the CSV base names and run the pipeline.
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\'
dataset_name = 'mser_simulated'
dataset_name_n = 'mser_physical'
path_for_saving_data = dataset_name
process_all_mser(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_name_n=dataset_name_n,
    path_for_saving_data=path_for_saving_data,
)

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(430, 16) (52, 16) (430, 2) (52, 2)
(5, 16) (47, 16) (5, 2) (47, 2)
(435, 16) (435, 2) (47, 16) (47, 2)
(435, 16) (47, 16) (435, 2) (47, 2)
Running model number: 1 with Model Name:  best_knn
(435, 16) (47, 16) (435, 2) (47, 2)
Running model number: 2 with Model Name:  best_dt
(435, 16) (47, 16) (435, 2) (47, 2)
Running model number: 3 with Model Name: 

# Stitch

In [115]:
def process_all_stitch(dataset_path, dataset_name, dataset_name_n, path_for_saving_data):
    """Fit the baseline regressors for the Stitch dataset and write their scores to CSV.

    The models are trained on every simulated row plus 10% of the physical
    rows and evaluated on the remaining 90% of the physical rows. Both targets
    ('runtime', 'power') are predicted jointly.

    Parameters
    ----------
    dataset_path : str
        Directory prefix, concatenated directly in front of the file names
        (so it must already end with a path separator).
    dataset_name : str
        Base name (without '.csv') of the simulated dataset; also recorded in
        the 'dataset_name' column of the results.
    dataset_name_n : str
        Base name (without '.csv') of the physical dataset.
    path_for_saving_data : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        The per-model scores are printed and written to
        'result_multivariate_stitch.csv' (relative path).
    """

    ################## Data Preprocessing ######################

    sim_frame = pd.read_csv(dataset_path + dataset_name + '.csv')
    phy_frame = pd.read_csv(dataset_path + dataset_name_n + '.csv')

    # 'encode' mode fits fresh one-hot encoders per table; the fitted encoders
    # are not needed afterwards, only the encoded frames.
    encoded_data_frame, _, _ = encode_text_features('encode', sim_frame,
                                                    encoder_isa=None, encoder_mem_type=None)
    encoded_data_frame_n, _, _ = encode_text_features('encode', phy_frame,
                                                      encoder_isa=None, encoder_mem_type=None)

    # DataFrame.drop raises KeyError when a listed column is missing, so these
    # lists must match the one-hot columns actually produced for this dataset.
    total_data_n = encoded_data_frame_n.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                      'isa_1', 'isa_2', 'isa_3', 'isa_4', 'bus_speed', 'num-cpu'])
    total_data = encoded_data_frame.drop(columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4',
                                                  'isa_1', 'isa_2'])
    print(total_data.columns, total_data_n.columns, len(total_data.columns), len(total_data_n.columns))
    total_data = total_data.fillna(0)
    total_data_n = total_data_n.fillna(0)

    target_columns = ['runtime', 'power']
    X_sim = total_data.drop(columns=target_columns).to_numpy()
    Y_sim = total_data[target_columns].to_numpy()
    X_phy = total_data_n.drop(columns=target_columns).to_numpy()
    Y_phy = total_data_n[target_columns].to_numpy()
    print(X_sim.shape, X_phy.shape, Y_sim.shape, Y_phy.shape)

    # Separating Physical data to 10% (joins the training set) and 90% (test set)
    X_train_phy, X_test_phy, Y_train_phy, Y_test_phy = train_test_split(X_phy, Y_phy, test_size=0.90, random_state=0)
    print(X_train_phy.shape, X_test_phy.shape, Y_train_phy.shape, Y_test_phy.shape)
    X_train = np.concatenate((X_sim, X_train_phy), axis=0)
    Y_train = np.concatenate((Y_sim, Y_train_phy), axis=0)
    X_test = X_test_phy
    Y_test = Y_test_phy
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    # Separate scalers per domain: training data uses simulated statistics,
    # test data uses physical statistics.
    scaler_X_sim = StandardScaler()
    scaler_X_phy = StandardScaler()
    scaler_X_sim.fit(X_sim)
    scaler_X_phy.fit(X_phy)

    scaler_Y_sim = StandardScaler()
    scaler_Y_phy = StandardScaler()
    scaler_Y_sim.fit(Y_sim)
    scaler_Y_phy.fit(Y_phy)

    X_train = scaler_X_sim.transform(X_train)
    X_test = scaler_X_phy.transform(X_test)
    Y_train = scaler_Y_sim.transform(Y_train)
    # NOTE(review): fit_transform re-fits scaler_Y_phy on the held-out targets,
    # discarding the fit on Y_phy above, and the predictions below are mapped
    # back with those test-set statistics. Behaviour is kept as-is here; confirm
    # whether a plain transform (or scaler_Y_sim for predictions) was intended.
    Y_test = scaler_Y_phy.fit_transform(Y_test)

    ################## Models ######################
    # Search spaces for the grid search, which is currently disabled; the
    # estimators below therefore run with their default hyper-parameters.
    param_grid_knn = {'n_neighbors': [6, 7, 13, 15],
                      'weights': ['uniform', 'distance'],
                      'p': [1, 2, 4, 5, 7, 10]}
    param_grid_rf = {'n_estimators': [50, 200],
                     'max_depth': [5, 9, 15, 20]}
    param_grid_etr = {'n_estimators': [50, 200],
                      'max_depth': [5, 9, 15, 20]}

    candidates = [
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor()),
        ('best_rf', RandomForestRegressor()),
        ('best_etr', ExtraTreesRegressor()),
    ]

    # Ground truth in original units; identical for every model, so computed once.
    Y_test_actual = scaler_Y_phy.inverse_transform(Y_test)

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for position, (model_name, model) in enumerate(candidates, start=1):
        print('Running model number:', position, 'with Model Name: ', model_name)
        print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
        model.fit(X_train, Y_train)
        Y_pred = scaler_Y_phy.inverse_transform(model.predict(X_test))

        records.append({
            'model_name': model_name,
            'dataset_name': dataset_name,
            'r2': r2_score(Y_test_actual, Y_pred),
            # Column 0 is 'runtime', column 1 is 'power' (order of target_columns).
            'mape_runtime': absolute_percentage_error(Y_test_actual[:, 0], Y_pred[:, 0]),
            'mape_power': absolute_percentage_error(Y_test_actual[:, 1], Y_pred[:, 1]),
        })

    results = pd.DataFrame(records,
                           columns=['model_name', 'dataset_name', 'r2', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv('result_multivariate_stitch.csv')

In [116]:
# Driver cell for the Stitch benchmark: pick the CSV base names and run the pipeline.
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\'
dataset_name = 'stitch_simulated'
dataset_name_n = 'stitch_physical'
path_for_saving_data = dataset_name
process_all_stitch(
    dataset_path=dataset_path,
    dataset_name=dataset_name,
    dataset_name_n=dataset_name_n,
    path_for_saving_data=path_for_saving_data,
)

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'runtime', 'power'],
      dtype='object') 18 18
(425, 16) (52, 16) (425, 2) (52, 2)
(5, 16) (47, 16) (5, 2) (47, 2)
(430, 16) (430, 2) (47, 16) (47, 2)
(430, 16) (47, 16) (430, 2) (47, 2)
Running model number: 1 with Model Name:  best_knn
(430, 16) (47, 16) (430, 2) (47, 2)
Running model number: 2 with Model Name:  best_dt
(430, 16) (47, 16) (430, 2) (47, 2)
Running model number: 3 with Model Name: 

# Dataset Experimentation

In [117]:
# Load the matmul simulated / physical tables and one-hot encode their
# 'isa' and 'mem-type' columns so the cells below can inspect the result.
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\Dataset\\'
dataset_name = 'matmul_simulated'
dataset_name_n = 'matmul_physical'

df = pd.read_csv(dataset_path + dataset_name + '.csv')
dfn = pd.read_csv(dataset_path + dataset_name_n + '.csv')

# 'encode' mode fits fresh encoders, so the optional encoder arguments stay at
# their None defaults.
encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df)
encoded_data_frame_n, encoder_isa_n, encoder_mem_type_n = encode_text_features('encode', dfn)


In [118]:
# Row counts as (simulated, physical).
encoded_data_frame.shape[0], encoded_data_frame_n.shape[0]

(1780, 519)

In [119]:
# Column labels of the encoded physical table, followed by how many there are.
encoded_data_frame_n.columns, encoded_data_frame_n.shape[1]


(Index(['arch', 'cpu-clock', 'isa_1', 'l1d_assoc', 'l1d_cache_lines',
        'l1d_shared_by_threads', 'l1d_size', 'l2_assoc', 'l2_cache_lines',
        'l2_shared_by_threads', 'l2_size', 'l3_assoc', 'l3_cache_lines',
        'l3_shared_by_threads', 'l3_size', 'mem-size', 'mem-type_1',
        'mem-type_2', 'mem_clock', 'num-cpus', 'PS', 'num-cpu', 'bus_speed',
        'runtime', 'power'],
       dtype='object'), 25)

In [120]:
# Column labels of the encoded simulated table, followed by how many there are.
encoded_data_frame.columns, encoded_data_frame.shape[1]

(Index(['arch', 'cpu-clock', 'isa_1', 'isa_2', 'l1d_assoc', 'l1d_cache_lines',
        'l1d_shared_by_threads', 'l1d_size', 'l2_assoc', 'l2_cache_lines',
        'l2_shared_by_threads', 'l2_size', 'l3_assoc', 'l3_cache_lines',
        'l3_shared_by_threads', 'l3_size', 'mem-size', 'mem-type_1',
        'mem-type_2', 'mem-type_3', 'mem-type_4', 'mem_clock', 'num-cpus', 'PS',
        'runtime', 'power'],
       dtype='object'), 26)

In [121]:
    # Strip 'arch', the one-hot encoded columns and the physical-only
    # 'bus_speed' / 'num-cpu' columns, then print both remaining column sets
    # with their sizes for a side-by-side comparison.
    total_data = encoded_data_frame.drop(
        columns=['arch', 'mem-type_1', 'mem-type_2', 'mem-type_3', 'mem-type_4', 'isa_1', 'isa_2']
    )
    total_data_n = encoded_data_frame_n.drop(
        columns=['arch', 'mem-type_1', 'mem-type_2', 'isa_1', 'bus_speed', 'num-cpu']
    )
    print(
        total_data.columns, '\n', total_data_n.columns,
        len(total_data.columns), len(total_data_n.columns),
    )

Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 
 Index(['cpu-clock', 'l1d_assoc', 'l1d_cache_lines', 'l1d_shared_by_threads',
       'l1d_size', 'l2_assoc', 'l2_cache_lines', 'l2_shared_by_threads',
       'l2_size', 'l3_assoc', 'l3_cache_lines', 'l3_shared_by_threads',
       'l3_size', 'mem-size', 'mem_clock', 'num-cpus', 'PS', 'runtime',
       'power'],
      dtype='object') 19 19
