In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import pandas as pd
import category_encoders as ce
import os
import pickle
import gc
from tqdm import tqdm
import pickle
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import ensemble
import xgboost as xgb

In [8]:
def encode_text_features(encode_decode, data_frame, encoder_isa=None, encoder_mem_type=None):
    """One-hot encode the ``isa`` and ``mem-type`` columns of ``data_frame``.

    Parameters
    ----------
    encode_decode : str
        ``'encode'`` fits two new ``category_encoders`` one-hot encoders on
        ``data_frame`` and applies them. Any other value skips fitting and
        re-applies the encoders supplied by the caller.
    data_frame : DataFrame-like
        Input accepted by the encoders; when fitting it has to contain the
        ``isa`` and ``mem-type`` columns named in the encoder configuration.
    encoder_isa, encoder_mem_type : fitted encoders, optional
        Ignored (and replaced) when ``encode_decode == 'encode'``; required
        otherwise.

    Returns
    -------
    tuple
        ``(encoded_data_frame, encoder_isa, encoder_mem_type)`` so the caller
        can reuse the encoders on other data later.

    Raises
    ------
    ValueError
        If re-applying is requested but one of the encoders is missing.
    """
    if encode_decode == 'encode':
        encoder_isa = ce.one_hot.OneHotEncoder(cols=['isa'])
        encoder_mem_type = ce.one_hot.OneHotEncoder(cols=['mem-type'])
        encoder_isa.fit(data_frame, verbose=1)
        isa_encoded = encoder_isa.transform(data_frame)
        # The mem-type encoder is fitted on the *isa-encoded* frame because
        # that is the frame it is always asked to transform (see below).
        encoder_mem_type.fit(isa_encoded, verbose=1)
    else:
        if encoder_isa is None or encoder_mem_type is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                "encoder_isa and encoder_mem_type must both be provided "
                "when encode_decode is not 'encode'"
            )
        isa_encoded = encoder_isa.transform(data_frame)

    # Both paths finish the same way: isa encoding first, then mem-type.
    encoded_data_frame = encoder_mem_type.transform(isa_encoded)
    return encoded_data_frame, encoder_isa, encoder_mem_type

In [36]:
def absolute_percentage_error(Y_test, Y_pred):
    """Mean absolute percentage error, returned as a fraction (0.1 == 10 %).

    Computes ``mean(|Y_test - Y_pred| / |Y_test|)`` along the first axis, so a
    1-D input yields a scalar and a 2-D input yields one value per column.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth values. Entries equal to zero make the corresponding
        ratio ``inf``/``nan`` (numpy emits a RuntimeWarning).
    Y_pred : array-like
        Predicted values, same shape as ``Y_test``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # np.asarray makes indexing positional, so pandas objects with a
    # non-default index are handled the same way as plain arrays.
    y_true = np.asarray(Y_test, dtype=float)
    y_pred = np.asarray(Y_pred, dtype=float)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            'Y_test and Y_pred must have the same shape, got %s and %s'
            % (y_true.shape, y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError('absolute_percentage_error is undefined for empty input')

    # The denominator is taken in absolute value so that negative ground-truth
    # values contribute a positive error instead of cancelling other terms.
    return np.mean(np.abs(y_true - y_pred) / np.abs(y_true), axis=0)

def process_all(dataset_path, dataset_name, path_for_saving_data):
    """Cross-validate a set of default regressors and write scores to ``Results.csv``.

    The CSV at ``dataset_path`` is one-hot encoded with
    ``encode_text_features``, the ``arch`` column is dropped, missing values
    are replaced by 0 and both the features and the two targets
    (``runtime``, ``power``) are standardised. Every model is then fitted and
    scored on 10 random 80/20 shuffle splits. Scores are computed after
    mapping targets and predictions back to their original scale.

    Parameters
    ----------
    dataset_path : str
        Path of the input CSV. It must provide the ``arch``, ``runtime`` and
        ``power`` columns used here, plus whatever ``encode_text_features``
        requires.
    dataset_name : str
        Currently unused; kept so existing callers keep working.
    path_for_saving_data : str
        Currently unused (fold/model persistence is not performed); kept so
        existing callers keep working.

    Returns
    -------
    None
        Progress is printed, and one row per model (holding the per-split
        lists of R2, MSE and per-target MAPE) is written to ``Results.csv``
        in the current working directory.
    """
    ################## Data Preprocessing ######################
    raw_frame = pd.read_csv(dataset_path)
    encoded_frame, _encoder_isa, _encoder_mem_type = encode_text_features('encode', raw_frame)
    total_data = encoded_frame.drop(columns=['arch']).fillna(0)
    X = total_data.drop(columns=['runtime', 'power']).to_numpy()
    Y = total_data[['runtime', 'power']].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)

    # NOTE(review): both scalers are fitted on the full dataset *before* it is
    # split, so test-fold statistics leak into training. Fitting them inside
    # each fold would remove the leak but change the reported numbers, so the
    # original protocol is kept here - confirm which behaviour is wanted.
    scaler_x = StandardScaler()
    scaler_y = StandardScaler()
    X = scaler_x.fit_transform(X)
    Y = scaler_y.fit_transform(Y)
    ################## Data Preprocessing ######################

    # Candidate models with library-default hyper-parameters; replace with
    # tuned instances (e.g. from a grid search) when available.
    models = [
        ('best_lr', LinearRegression()),
        ('best_rr', linear_model.Ridge()),
        ('best_knn', KNeighborsRegressor()),
        ('best_dt', DecisionTreeRegressor()),
        ('best_rf', RandomForestRegressor()),
        ('best_etr', ExtraTreesRegressor()),
        ('best_gpr', GaussianProcessRegressor()),
    ]

    # A fixed seed makes every split() call yield the same partitions, so all
    # models are compared on identical train/test folds.
    cv = ShuffleSplit(n_splits=10, random_state=0, test_size=0.2)

    # Rows are collected in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0.
    records = []

    for model_number, (model_name, model) in enumerate(models, start=1):
        print('Running model number:', model_number, 'with Model Name: ', model_name)
        r2_scores = []
        mse_scores = []
        mape_runtime_scores = []
        mape_power_scores = []

        for train_index, test_index in cv.split(X):
            # Calling fit() again retrains the estimator from scratch (none of
            # these models enable warm_start), so one instance serves all folds.
            model.fit(X[train_index], Y[train_index])

            # Undo the target scaling so that errors are in original units.
            Y_pred_fold = scaler_y.inverse_transform(model.predict(X[test_index]))
            Y_test_fold = scaler_y.inverse_transform(Y[test_index])

            # R2 and MSE are computed over both targets together using
            # scikit-learn's default multi-output handling.
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            # Column 0 is runtime and column 1 is power (see the selection of Y).
            mape_runtime_scores.append(absolute_percentage_error(Y_test_fold[:, 0], Y_pred_fold[:, 0]))
            mape_power_scores.append(absolute_percentage_error(Y_test_fold[:, 1], Y_pred_fold[:, 1]))

        records.append({'model_name': model_name, 'r2': r2_scores, 'mse': mse_scores,
                        'mape_runtime': mape_runtime_scores, 'mape_power': mape_power_scores})

    results = pd.DataFrame(records, columns=['model_name', 'r2', 'mse', 'mape_runtime', 'mape_power'])
    print(results.head())
    results.to_csv(r'Results.csv')


In [37]:
# Dataset to evaluate; the input CSV name and the output folder are both
# derived from it so the three values cannot drift apart.
dataset_name = 'dijkstra_physical'
# NOTE(review): machine-specific absolute path - point this at your own
# project folder (ideally a relative path) before running elsewhere.
project_dir = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_2_Aditya\\'
dataset_path = project_dir + dataset_name + '.csv'
path_for_saving_data = project_dir + dataset_name
process_all(dataset_path, dataset_name, path_for_saving_data)

Data X and Y shape (52, 26) (52, 2)
Running model number: 1 with Model Name:  best_lr
Running model number: 2 with Model Name:  best_rr
Running model number: 3 with Model Name:  best_knn
Running model number: 4 with Model Name:  best_dt
Running model number: 5 with Model Name:  best_rf
Running model number: 6 with Model Name:  best_etr
Running model number: 7 with Model Name:  best_gpr
  model_name                                                 r2  \
0    best_lr  [0.8840606390621188, 0.6841725606781158, 0.697...   
1    best_rr  [0.8848617900146047, 0.6827940834804698, 0.694...   
2   best_knn  [0.9055686878382078, 0.6608085076290745, 0.690...   
3    best_dt  [0.8679872964380833, 0.6326062089192492, 0.590...   
4    best_rf  [0.8880572317133231, 0.6605596951919783, 0.627...   

                                                 mse  \
0  [2374162.5220905053, 15300324.395901332, 15549...   
1  [2340665.9669511695, 15310075.424625797, 15707...   
2  [1879536.2004195394, 15343420.4532270

In [16]:
# Empty results table: one column per reported field, no rows yet.
result_columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae']
df = pd.DataFrame(columns=result_columns)

In [17]:
# Render the current `df` as this cell's output.
df

Unnamed: 0,model_name,dataset_name,r2,mse,mape,mae
