In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import pandas as pd
import category_encoders as ce
import os
import pickle
import gc
from tqdm import tqdm
import pickle
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import ensemble
import xgboost as xgb

In [6]:
def encode_text_features(encode_decode, data_frame, encoder_isa=None, encoder_mem_type=None):
    # Implement Categorical OneHot encoding for ISA and mem-type
    if encode_decode == 'encode':
        encoder_isa = ce.one_hot.OneHotEncoder(cols=['isa'])
        encoder_mem_type = ce.one_hot.OneHotEncoder(cols=['mem-type'])
        encoder_isa.fit(data_frame, verbose=1)
        df_new1 = encoder_isa.transform(data_frame)
        encoder_mem_type.fit(df_new1, verbose=1)
        df_new = encoder_mem_type.transform(df_new1)
        encoded_data_frame = df_new
    else:
        df_new1 = encoder_isa.transform(data_frame)
        df_new = encoder_mem_type.transform(df_new1)
        encoded_data_frame = df_new
        
    return encoded_data_frame, encoder_isa, encoder_mem_type

In [7]:
def absolute_percentage_error(Y_test, Y_pred):
    error = 0
    for i in range(len(Y_test)):
        if(Y_test[i]!= 0 ):
            error = error + (abs(Y_test[i] - Y_pred[i]))/Y_test[i]
        
    error = error/ len(Y_test)
    return error

# Dataset 1 :dijkstra_physical

In [13]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [14]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

Data X and Y shape (52, 20) (52,)
Train Test Split: (41, 20) (11, 20) (41,) (11,)
Running model number: 1 with Model Name:  best_svr
Running model number: 2 with Model Name:  best_lr
Running model number: 3 with Model Name:  best_rr
Running model number: 4 with Model Name:  best_knn
Running model number: 5 with Model Name:  best_gpr
Running model number: 6 with Model Name:  best_dt
Running model number: 7 with Model Name:  best_rf


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 8 with Model Name:  best_etr


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 9 with Model Name:  best_gbr
Running model number: 10 with Model Name:  best_xgb
  model_name       dataset_name  \
0   best_svr  dijkstra_physical   
1    best_lr  dijkstra_physical   
2    best_rr  dijkstra_physical   
3   best_knn  dijkstra_physical   
4   best_gpr  dijkstra_physical   

                                                  r2  \
0  [0.706312387430097, 0.11420960680031889, 0.413...   
1  [0.7358363002601873, 0.2558236703072164, 0.263...   
2  [0.746938061499899, 0.26838277330657956, 0.539...   
3  [0.771401290306746, 0.26355119334827026, 0.391...   
4  [0.3832584181529476, 0.5257698801961834, 0.382...   

                                                 mse  \
0  [6867490.713539857, 66689948.15289214, 1126236...   
1  [6177113.630850524, 56028018.8460269, 14148130...   
2  [5917513.84197788, 55082461.14489979, 8836948....   
3  [5345474.064119581, 55446224.19696031, 1168005...   
4  [14421674.271264665, 35704137.621107526, 11854...   

            

# Dataset 2 : dijkstra_simulated

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

# Dataset 3 : qsort_physical

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

# Dataset 4 : qsort_simulated

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

# Dataset 5 : mantevominiFE_physical

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

# Dataset 6 : npbEP_physical

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

# Dataset 7 : npbMG_physical

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

# Dataset 8 : sha_physical

In [15]:
def process_all_sha_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn =  KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=13, p=7,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=15,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='random')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='False')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
                          learning_rate=0.1, loss='lad', max_depth=5,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [16]:
dataset_name = 'sha_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\sha_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_sha_physical(dataset_path, dataset_name, path_for_saving_data)

Data X and Y shape (52, 20) (52,)
Train Test Split: (41, 20) (11, 20) (41,) (11,)
Running model number: 1 with Model Name:  best_svr
Running model number: 2 with Model Name:  best_lr
Running model number: 3 with Model Name:  best_rr
Running model number: 4 with Model Name:  best_knn
Running model number: 5 with Model Name:  best_gpr
Running model number: 6 with Model Name:  best_dt
Running model number: 7 with Model Name:  best_rf


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 8 with Model Name:  best_etr


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 9 with Model Name:  best_gbr
Running model number: 10 with Model Name:  best_xgb
  model_name  dataset_name                                                 r2  \
0   best_svr  sha_physical  [0.5439399158414611, 0.12855304675271628, 0.22...   
1    best_lr  sha_physical  [0.8294903913494944, 0.3970132969485334, 0.396...   
2    best_rr  sha_physical  [0.8029926849948819, 0.1595873851196461, 0.484...   
3   best_knn  sha_physical  [0.8139163449771692, 0.30931401585212936, 0.37...   
4   best_gpr  sha_physical  [0.5302629303915782, -0.7435340173000458, 0.09...   

                                                 mse  \
0  [15007066.046998067, 35576553.14683912, 211766...   
1  [5610771.579335157, 24616746.21502805, 1659621...   
2  [6482702.369093542, 34309585.852761306, 141749...   
3  [6123249.541443429, 28197042.32282849, 1711098...   
4  [15457119.518239526, 71179238.61992726, 248968...   

                                                mape  \
0  [0.13582241805

# Dataset 9 : sha_simulated

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

# Dataset 10 : stitch_physical

In [19]:
def process_all_stitch_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr =  LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=True)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=None, solver='sparse_cg', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=13, p=4,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='mae', max_depth=9, max_features='sqrt',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='random')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='False')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=4,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
                          learning_rate=0.1, loss='lad', max_depth=5,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [20]:
dataset_name = 'stitch_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\stitch_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_stitch_physical(dataset_path, dataset_name, path_for_saving_data)

Data X and Y shape (52, 20) (52,)
Train Test Split: (41, 20) (11, 20) (41,) (11,)
Running model number: 1 with Model Name:  best_svr
Running model number: 2 with Model Name:  best_lr
Running model number: 3 with Model Name:  best_rr
Running model number: 4 with Model Name:  best_knn
Running model number: 5 with Model Name:  best_gpr
Running model number: 6 with Model Name:  best_dt
Running model number: 7 with Model Name:  best_rf


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 8 with Model Name:  best_etr


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 9 with Model Name:  best_gbr
Running model number: 10 with Model Name:  best_xgb
  model_name     dataset_name  \
0   best_svr  stitch_physical   
1    best_lr  stitch_physical   
2    best_rr  stitch_physical   
3   best_knn  stitch_physical   
4   best_gpr  stitch_physical   

                                                  r2  \
0  [0.6949980747330784, 0.6246862331067944, 0.643...   
1  [0.8226182587873474, 0.5269882673816201, 0.669...   
2  [0.8226075574918322, 0.7962183000617079, 0.704...   
3  [0.8041550111102402, 0.7326424008143987, 0.644...   
4  [0.4469629764218257, -1.850601378988788, -8.81...   

                                                 mse  \
0  [7607569.041533611, 5397689.546858382, 3536098...   
1  [4424378.114340558, 6802762.674629249, 3282864...   
2  [4424645.033457197, 2930748.745784586, 2933279...   
3  [4884901.212060691, 3845084.954765081, 3526902...   
4  [13794232.071529493, 40996794.21033895, 973984...   

                        

# Dataset 11 : stitch_simulated

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)

# Dataset 12 : svm_physical

In [21]:
def process_all_svm_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='sparse_cg', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='mse', max_depth=5, max_features='log2',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='random')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=4,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=5,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [22]:
dataset_name = 'svm_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\svm_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_svm_physical(dataset_path, dataset_name, path_for_saving_data)

Data X and Y shape (52, 20) (52,)
Train Test Split: (41, 20) (11, 20) (41,) (11,)
Running model number: 1 with Model Name:  best_svr
Running model number: 2 with Model Name:  best_lr
Running model number: 3 with Model Name:  best_rr
Running model number: 4 with Model Name:  best_knn
Running model number: 5 with Model Name:  best_gpr
Running model number: 6 with Model Name:  best_dt
Running model number: 7 with Model Name:  best_rf


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 8 with Model Name:  best_etr


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 9 with Model Name:  best_gbr
Running model number: 10 with Model Name:  best_xgb
  model_name  dataset_name                                                 r2  \
0   best_svr  svm_physical  [0.3860817843050499, 0.606153679853983, 0.3428...   
1    best_lr  svm_physical  [0.99922632636046, 0.9303403948208646, 0.82499...   
2    best_rr  svm_physical  [0.9992812875795799, 0.9049200948394939, 0.874...   
3   best_knn  svm_physical  [0.9993418975996776, 0.966790584149034, 0.8341...   
4   best_gpr  svm_physical  [0.9955208050804074, 0.9170310605827265, 0.345...   

                                                 mse  \
0  [11919165.678956317, 8708334.46592358, 2411410...   
1  [15020.802535854325, 1540243.2614812588, 64217...   
2  [13953.735522920819, 2102311.416338758, 459216...   
3  [12776.997558675437, 734293.2658122513, 608654...   
4  [86963.15729046887, 1834525.8994947465, 240030...   

                                                mape  \
0  [0.06763686746

# Dataset 13 : svm_simulated

In [None]:
def process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr = SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='lsqr', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='True')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=3,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=42, verbose=0,
                    warm_start='True')
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=4, min_child_weight=1, missing=None, n_estimators=10,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(path_for_saving_data + '.csv')
        # print('MSE for 10 folds\n', mse_scores)
        # print('\nR2 scores for 10 folds\n', r2_scores)
        # print('\nMAPE for 10 folds\n', mape_scores)
        # print('\nMAE scores for 10 folds\n', mae_scores)
        # print('\nMean MSE = ', np.mean(mse_scores), '\nMedian MSE = ', np.median(mse_scores))
        # print('\nMean R2 score =',np.mean(r2_scores), '\nMedian R2 scores = ', np.median(r2_scores))
        # print('\nMean Absolute Percentage Error =',np.mean(mape_scores), 
        #       '\nMedian Absolute Percentage Error =', np.median(mape_scores))    
        # print('\nMean MAE =',np.mean(mae_scores), 
        #      '\nMedian MAE =', np.median(mae_scores)) 


In [None]:
dataset_name = 'dijkstra_physical'
dataset_path = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\\Performance\\PhysicalSystems\\dijkstra_physical.csv'
path_for_saving_data = 'C:\\Users\\Rajat\\Desktop\\PROJECT_MODE\\Paper_3_Rajat\\results_with_normalization_without_PCA\\' + dataset_name
process_all_dijkstra_physical(dataset_path, dataset_name, path_for_saving_data)