In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import pandas as pd
import category_encoders as ce
import os
import pickle
import gc
from tqdm import tqdm
import pickle
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import ensemble
import xgboost as xgb

Using TensorFlow backend.


In [2]:
def encode_text_features(encode_decode, data_frame, encoder_isa=None, encoder_mem_type=None):
    # Implement Categorical OneHot encoding for ISA and mem-type
    if encode_decode == 'encode':
        encoder_isa = ce.one_hot.OneHotEncoder(cols=['isa'])
        encoder_mem_type = ce.one_hot.OneHotEncoder(cols=['mem-type'])
        encoder_isa.fit(data_frame, verbose=1)
        df_new1 = encoder_isa.transform(data_frame)
        encoder_mem_type.fit(df_new1, verbose=1)
        df_new = encoder_mem_type.transform(df_new1)
        encoded_data_frame = df_new
    else:
        df_new1 = encoder_isa.transform(data_frame)
        df_new = encoder_mem_type.transform(df_new1)
        encoded_data_frame = df_new
        
    return encoded_data_frame, encoder_isa, encoder_mem_type

In [3]:
def absolute_percentage_error(Y_test, Y_pred):
    error = 0
    for i in range(len(Y_test)):
        error = error + (abs(Y_test[i] - Y_pred[i]))/Y_test[i]
    error = error/len(Y_test)
    return error
def process_all(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'PS').columns
    X = total_data.drop(columns = ['runtime','PS','power']).to_numpy()
    Y = total_data[['runtime','power']].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    
    # 1. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
    
    # 2. RR
    best_rr = linear_model.Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='svd', tol=0.001)
    
    # 3. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=1,
                    weights='distance')
    
    
    # 4. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='mae', max_depth=9, max_features='auto',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
    
    # 5. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='friedman_mse', max_depth=9,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 6. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=12,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=None, verbose=0,
                    warm_start='True')
    
    # 7. GPR
    best_gpr = GaussianProcessRegressor(alpha=1e-10, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    
    best_models = [best_lr, best_rr, best_knn, best_dt, best_rf, best_etr, best_gpr,]
    best_models_name = ['best_lr', 'best_rr', 'best_knn', 'best_dt', 'best_rf', 'best_etr','best_gpr']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape_runtime','mape_power', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_runtime_scores = []
        mape_power_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0, test_size = 0.2)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            # pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            # pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_runtime_scores.append(absolute_percentage_error(Y_test_fold[:,0], Y_pred_fold[:,0]))
            mape_power_scores.append(absolute_percentage_error(Y_test_fold[:,1], Y_pred_fold[:,1]))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        # print(mape_scores)
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape_runtime': mape_runtime_scores,'mape_power': mape_power_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(r'Results.csv')


In [4]:
dataset_name = 'dijkstra_power'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\dijkstra_power.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\' + dataset_name
process_all(dataset_path, dataset_name, path_for_saving_data)

Data X and Y shape (362, 22) (362, 2)
Train Test Split: (289, 22) (73, 22) (289, 2) (73, 2)
Running model number: 1 with Model Name:  best_lr
Running model number: 2 with Model Name:  best_rr
Running model number: 3 with Model Name:  best_knn
Running model number: 4 with Model Name:  best_dt




Running model number: 5 with Model Name:  best_rf


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 6 with Model Name:  best_etr


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 7 with Model Name:  best_gpr
  model_name    dataset_name  \
0    best_lr  dijkstra_power   
1    best_rr  dijkstra_power   
2   best_knn  dijkstra_power   
3    best_dt  dijkstra_power   
4    best_rf  dijkstra_power   

                                                  r2  \
0  [0.8994006586931498, 0.9064534001111062, 0.925...   
1  [0.8995170101812557, 0.9065001788415745, 0.925...   
2  [0.9089178424756674, 0.9409361337571749, 0.942...   
3  [0.9941950865389134, 0.9969189614746297, 0.994...   
4  [0.9942461222945362, 0.9972443248564092, 0.995...   

                                                 mse  \
0  [46468828.11279223, 65733743.00228015, 6655376...   
1  [46392680.65397789, 65581063.687536694, 666729...   
2  [24551196.745616864, 26448272.24932045, 230522...   
3  [35.12623227849589, 89.54097023807054, 3585.95...   
4  [226051.57817162774, 102084.94593104126, 19074...   

                                        mape_runtime  \
0  [0.17207047267026776, 0

In [16]:
df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])

In [17]:
df

Unnamed: 0,model_name,dataset_name,r2,mse,mape,mae
