In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import SGD
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import pandas as pd
import category_encoders as ce
import os
import pickle
import gc
from tqdm import tqdm
import pickle
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import ensemble
import xgboost as xgb

Using TensorFlow backend.


In [2]:
def encode_text_features(encode_decode, data_frame, encoder_isa=None, encoder_mem_type=None):
    # Implement Categorical OneHot encoding for ISA and mem-type
    if encode_decode == 'encode':
        encoder_isa = ce.one_hot.OneHotEncoder(cols=['isa'])
        encoder_mem_type = ce.one_hot.OneHotEncoder(cols=['mem-type'])
        encoder_isa.fit(data_frame, verbose=1)
        df_new1 = encoder_isa.transform(data_frame)
        encoder_mem_type.fit(df_new1, verbose=1)
        df_new = encoder_mem_type.transform(df_new1)
        encoded_data_frame = df_new
    else:
        df_new1 = encoder_isa.transform(data_frame)
        df_new = encoder_mem_type.transform(df_new1)
        encoded_data_frame = df_new
        
    return encoded_data_frame, encoder_isa, encoder_mem_type

In [3]:
def absolute_percentage_error(Y_test, Y_pred):
    error = 0
    for i in range(len(Y_test)):
        if(Y_test[i]!= 0 ):
            error = error + (abs(Y_test[i] - Y_pred[i]))/Y_test[i]
        
    error = error/ len(Y_test)
    return error

In [4]:
def process_all(dataset_path, dataset_name, path_for_saving_data):
    
    ################## Data Preprocessing ######################
    df = pd.read_csv(dataset_path)
    encoded_data_frame, encoder_isa, encoder_mem_type = encode_text_features('encode', df, 
                                                                             encoder_isa = None, encoder_mem_type=None)
    # total_data = encoded_data_frame.drop(columns = ['arch', 'arch1'])
    
    total_data = encoded_data_frame.drop(columns = ['arch', 'sys'])
    total_data = total_data.fillna(0)
    X_columns = total_data.drop(columns = 'runtime').columns
    X = total_data.drop(columns = ['runtime']).to_numpy()
    Y = total_data['runtime'].to_numpy()
    # X_columns = total_data.drop(columns = 'PS').columns
    # X = total_data.drop(columns = ['runtime','PS']).to_numpy()
    # Y = total_data['runtime'].to_numpy()
    print('Data X and Y shape', X.shape, Y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    print('Train Test Split:', X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    ################## Data Preprocessing ######################
    
    # Put best models here using grid search
    
    # 1. SVR 
    best_svr =SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    
    # 2. LR
    best_lr = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
    
    # 3. RR
    best_rr = linear_model.Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='svd', tol=0.001)
    
    # 4. KNN
    best_knn = KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=1,
                    weights='distance')
    
    # 5. GPR
    best_gpr = GaussianProcessRegressor(alpha=0.01, copy_X_train=True, kernel=None,
                         n_restarts_optimizer=0, normalize_y=True,
                         optimizer='fmin_l_bfgs_b', random_state=None)
    # 6. Decision Tree
    best_dt = DecisionTreeRegressor(criterion='mse', max_depth=7, max_features='auto',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')
    
    # 7. Random Forest 
    best_rf = RandomForestRegressor(bootstrap=True, criterion='friedman_mse', max_depth=7,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start='False')
    
    # 8. Extra Trees Regressor
    best_etr = ExtraTreesRegressor(bootstrap=False, criterion='friedman_mse', max_depth=15,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
                    oob_score=False, random_state=None, verbose=0,
                    warm_start='True')
    
    # 9. GBR
    best_gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='mae', init=None,
                          learning_rate=0.1, loss='lad', max_depth=None,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    # 10. XGB
    best_xgb = xgb.XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, validate_parameters=False, verbosity=1)
    
    best_models = [best_svr, best_lr, best_rr, best_knn, best_gpr, best_dt, best_rf, best_etr, best_gbr, best_xgb]
    best_models_name = ['best_svr', 'best_lr', 'best_rr', 'best_knn', 'best_gpr', 'best_dt', 'best_rf', 'best_etr'
                        , 'best_gbr', 'best_xgb']
    k = 0
    
    df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])
    
    for model in best_models:
        
        print('Running model number:', k+1, 'with Model Name: ', best_models_name[k])
        r2_scores = []
        mse_scores = []
        mape_scores = []
        mae_scores = []

        # cv = KFold(n_splits = 10, random_state = 42, shuffle = True)
        cv = ShuffleSplit(n_splits=10, random_state=0)
        # print(cv)
        
        fold = 1
        for train_index, test_index in cv.split(X):
            model_orig = model
            # print("Train Index: ", train_index, "\n")
            # print("Test Index: ", test_index)

            X_train_fold, X_test_fold, Y_train_fold, Y_test_fold = X[train_index], X[test_index], Y[train_index], Y[test_index]
            # print(X_train_fold.shape, X_test_fold.shape, Y_train_fold.shape, Y_test_fold.shape)
            model_orig.fit(X_train_fold, Y_train_fold)
            Y_pred_fold = model_orig.predict(X_test_fold)
            
            # save the folds to disk
            data = [X_train_fold, X_test_fold, Y_train_fold, Y_test_fold]
            filename = path_for_saving_data + '/folds_data/' + best_models_name[k] +'_'+ str(fold) + '.pickle'
            # pickle.dump(data, open(filename, 'wb'))
            
            
            # save the model to disk
            # filename = path_for_saving_data + '/models_data/' + best_models_name[k] + '_' + str(fold) + '.sav'
            fold = fold + 1
            # pickle.dump(model_orig, open(filename, 'wb'))

            # some time later...
            '''
            # load the model from disk
            loaded_model = pickle.load(open(filename, 'rb'))
            result = loaded_model.score(X_test, Y_test)
            print(result)
            '''
            # scores.append(best_svr.score(X_test, y_test))
            '''
            plt.figure()
            plt.plot(Y_test_fold, 'b')
            plt.plot(Y_pred_fold, 'r')
            '''
            # print('Accuracy =',accuracy_score(Y_test, Y_pred))
            r2_scores.append(r2_score(Y_test_fold, Y_pred_fold))
            mse_scores.append(mean_squared_error(Y_test_fold, Y_pred_fold))
            mape_scores.append(absolute_percentage_error(Y_test_fold, Y_pred_fold))
            mae_scores.append(mean_absolute_error(Y_test_fold, Y_pred_fold))
        
        df = df.append({'model_name': best_models_name[k], 'dataset_name': dataset_name
                        , 'r2': r2_scores, 'mse': mse_scores, 'mape': mape_scores, 'mae': mae_scores }, ignore_index=True)
        k = k + 1  
    print(df.head())
    df.to_csv(r'matmul_physical.csv')


In [12]:
dataset_name = 'matmul_lab_omp_physical'
dataset_path = 'Dataset_CSV\\PhysicalSystems\\matmul_lab_omp.csv'
path_for_saving_data = 'data\\' + dataset_name
process_all(dataset_path, dataset_name, path_for_saving_data)

Data X and Y shape (280, 22) (280,)
Train Test Split: (224, 22) (56, 22) (224,) (56,)
Running model number: 1 with Model Name:  best_svr
Running model number: 2 with Model Name:  best_lr
Running model number: 3 with Model Name:  best_rr
Running model number: 4 with Model Name:  best_knn
Running model number: 5 with Model Name:  best_gpr
Running model number: 6 with Model Name:  best_dt
Running model number: 7 with Model Name:  best_rf


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 8 with Model Name:  best_etr


  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "
  warn("Warm-start fitting without increasing n_estimators does not "


Running model number: 9 with Model Name:  best_gbr
Running model number: 10 with Model Name:  best_xgb
  model_name             dataset_name  \
0   best_svr  matmul_lab_omp_physical   
1    best_lr  matmul_lab_omp_physical   
2    best_rr  matmul_lab_omp_physical   
3   best_knn  matmul_lab_omp_physical   
4   best_gpr  matmul_lab_omp_physical   

                                                  r2  \
0  [-0.16035470465073454, -0.0014391098331045793,...   
1  [0.8252934084878512, 0.790003726222055, 0.8144...   
2  [0.8148061646037538, 0.7898954488185312, 0.816...   
3  [0.9995495815244082, 0.980537626918769, 0.9963...   
4  [0.9773629612348737, 0.9631802440745174, 0.820...   

                                                 mse  \
0  [427435451420.4876, 133267957454.77885, 178343...   
1  [64356002961.70421, 27945557752.547997, 309776...   
2  [68219149123.64022, 27959966924.57296, 3056331...   
3  [165918941.57111415, 2589983437.12658, 6092943...   
4  [8338719914.362123, 4899842255

In [44]:
df = pd.DataFrame(columns = ['model_name', 'dataset_name', 'r2', 'mse', 'mape', 'mae' ])

In [45]:
df

Unnamed: 0,model_name,dataset_name,r2,mse,mape,mae
