# Imports

In [1]:
import json
import os
import sys
import time
from os import path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# joblib is no longer vendored under sklearn.externals in recent scikit-learn
# releases; prefer the standalone package and keep the old location as a fallback.
try:
  import joblib
except ImportError:
  from sklearn.externals import joblib

# Functions

In [2]:
def supervised_learning_and_evaluation(model=None, model_name='', parameters={}, X=None, y=None, num_folds=3, path_results='', path_dir_model='', serialize_model=False):
  """Train and evaluate ``model`` with stratified k-fold cross-validation.

  One evaluation row per fold is added to the results loaded from
  ``path_results`` (via ``get_dataframe``) and the whole table is written back
  to ``path_results`` as CSV.

  Parameters:
    model: estimator exposing ``fit``; it is re-fitted on every fold.
    model_name: label stored with each row and used in the model file names.
    parameters: description of the hyper-parameters, stored as-is in each row.
      The shared default dict is never mutated here.
    X, y: features and labels. They are indexed with integer index arrays
      (``X[train_index]``), so they must support that kind of indexing.
    num_folds: number of stratified folds.
    path_results: CSV file that accumulates the per-fold results.
    path_dir_model: directory that receives one ``<model_name>_<fold>.mdl``
      file per fold when ``serialize_model`` is true. It is created (including
      parents) when missing; an empty string leaves the file names relative.
    serialize_model: when true, dump the fitted model of each fold with joblib.

  Raises:
    Exception: if ``path_dir_model`` exists but is not a directory.
  """
  #Splitting data (fixed random_state keeps the folds identical between runs)
  kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=0)

  #Verifying and creating the model directory in case serialize_model is set.
  #An empty path needs no directory: os.path.join('', name) is just name.
  if serialize_model and path_dir_model:
    if not os.path.exists(path_dir_model):
      os.makedirs(path_dir_model)
    elif not os.path.isdir(path_dir_model):
      raise Exception('Invalid directory to save the classification model')

  #DataFrame holding the results saved so far
  results = get_dataframe(path_results)

  #Performing k-Fold Cross-Validation; rows are collected in a list and added
  #in a single step because DataFrame.append no longer exists in pandas 2.x
  evaluations = []
  for count_fold, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    #Train test splitting
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #Model training
    start_time_model_building = time.time()
    model.fit(X_train, y_train)
    # NOTE(review): time.time() differences are in seconds, so this stores
    # seconds / 1000. Kept unchanged so new rows stay comparable with results
    # that were already saved - confirm the intended unit.
    elapsed_time_model_building = (time.time() - start_time_model_building) / 1000

    #Model Evaluation
    evaluations.append(
      supervised_evaluation(model, model_name, parameters, count_fold, X_test, y_test, elapsed_time_model_building)
    )

    #Saving the classification model
    if serialize_model:
      path_model = os.path.join(path_dir_model, f'{model_name}_{count_fold}.mdl')
      with open(path_model, 'wb') as file_model:
        joblib.dump(model, file_model)

  #Adding the evaluation results to the stored ones
  if evaluations:
    new_rows = pd.DataFrame(evaluations)
    if results.empty:
      #Keep the stored column order first, then any extra evaluation keys;
      #dict.fromkeys drops repeated labels while preserving order
      ordered_columns = list(dict.fromkeys(list(results.columns) + list(new_rows.columns)))
      results = new_rows.reindex(columns=ordered_columns)
    else:
      results = pd.concat([results, new_rows], ignore_index=True)

  #saving DataFrame
  results.to_csv(path_results, index=False)

In [3]:
def supervised_evaluation(model, alg_name, parameters, fold_number, X_test, y_test, model_building_time):
  """Evaluate a fitted ``model`` on one test fold and return the metrics as a dict.

  Parameters:
    model: fitted estimator exposing ``predict``.
    alg_name: algorithm label stored under ``'Algorithm'``.
    parameters: hyper-parameter description stored under ``'Parameters'``.
    fold_number: fold identifier stored under ``'Fold_Number'``.
    X_test, y_test: test features and true labels of the fold.
    model_building_time: training time, stored under ``'Building_Time'``.

  Returns:
    dict with the keys Algorithm, Parameters, Fold_Number, Accuracy, Macro_F1,
    Micro_F1, Macro_ROC_AUC, Micro_ROC_AUC, Building_Time, Confusion_Matrix
    (nested list), Classification_Time and Memory.
  """
  #Predicting only once: every metric below reuses the same predictions, and
  #timing this single call is what Classification_Time reports
  start_time_classification = time.time()
  y_pred = model.predict(X_test)
  # NOTE(review): the difference is in seconds, so this stores seconds / 1000;
  # kept to stay comparable with previously saved results - confirm the unit.
  elapsed_time_classification = (time.time() - start_time_classification) / 1000

  evaluation = {}
  evaluation['Algorithm'] = alg_name
  evaluation['Parameters'] = parameters
  evaluation['Fold_Number'] = fold_number
  evaluation['Accuracy'] = accuracy_score(y_test, y_pred)
  evaluation['Macro_F1'] = f1_score(y_test, y_pred, average='macro')
  evaluation['Micro_F1'] = f1_score(y_test, y_pred, average='micro')
  # NOTE(review): ROC AUC is computed from the hard labels returned by
  # predict(), not from scores/probabilities - confirm this is intended.
  evaluation['Macro_ROC_AUC'] = roc_auc_score(y_test, y_pred, average='macro')
  evaluation['Micro_ROC_AUC'] = roc_auc_score(y_test, y_pred, average='micro')
  evaluation['Building_Time'] = model_building_time
  evaluation['Confusion_Matrix'] = confusion_matrix(y_test, y_pred).tolist()
  evaluation['Classification_Time'] = elapsed_time_classification
  #sys.getsizeof only counts the model object itself, not what it references
  evaluation['Memory'] = sys.getsizeof(model) / 1024

  return evaluation

In [4]:
def get_dataframe(path_results):
  """Load the per-fold results table, or create an empty one.

  Parameters:
    path_results: path of the CSV file holding the per-fold results.

  Returns:
    The DataFrame read from ``path_results`` when that path exists; otherwise
    an empty DataFrame whose columns are the per-fold evaluation fields.
  """
  if path.exists(path_results):
    return pd.read_csv(path_results)

  #Every label appears once: the ROC AUC columns replace a repeated
  #'Macro_F1'/'Micro_F1' pair that left the ROC AUC metrics without a column
  return pd.DataFrame(columns=['Algorithm', 'Parameters', 'Fold_Number',
                               'Accuracy',
                               'Macro_F1', 'Micro_F1',
                               'Macro_ROC_AUC', 'Micro_ROC_AUC',
                               'Confusion_Matrix', 'Memory',
                               'Building_Time', 'Classification_Time'])

In [5]:
def get_dataframe_summary(path_results):
  """Load the summary table from ``path_results``, or create an empty one.

  Returns the DataFrame read from the CSV when the path exists; otherwise an
  empty DataFrame with the summary columns (average and standard deviation of
  each metric plus the confusion matrix).
  """
  #An existing file wins: its content is returned as-is
  if path.exists(path_results):
    return pd.read_csv(path_results)

  #No file yet: start from an empty table with the expected layout
  identification = ['Algorithm', 'Parameters']
  quality = ['Avg_Accuracy', 'Std_Accuracy',
             'Avg_Macro_F1', 'Std_Macro_F1',
             'Avg_Micro_F1', 'Std_Micro_F1']
  matrix = ['Confusion_Matrix']
  cost = ['Avg_Memory', 'Std_Memory',
          'Avg_Building_Time', 'Std_Building_Time',
          'Avg_Classification_Time', 'Std_Classification_Time']

  return pd.DataFrame(columns=identification + quality + matrix + cost)

In [6]:
def result_summary(df_results, alg, param):
  """Summarize the rows of ``df_results`` for one algorithm/parameter pair.

  Rows are selected where ``'Parameters' == param`` and ``'Algorithm' == alg``.
  The returned dict holds the mean and standard deviation of each numeric
  metric over the selected rows, plus the combined confusion matrix computed
  by ``summarize_confusion_matrices``. Selection details are printed.
  """
  #Selecting the rows of this algorithm / parameter combination
  row_mask = (df_results['Parameters'] == param) & (df_results['Algorithm'] == alg)
  selected = df_results[row_mask]

  summary = {'Algorithm': alg, 'Parameters': param}

  #Quality metrics: (source column, average key, deviation key)
  quality_metrics = (
    ('Accuracy', 'Avg_Accuracy', 'Std_Accuracy'),
    ('Macro_F1', 'Avg_Macro_F1', 'Std_Macro_F1'),
    ('Micro_F1', 'Avg_Micro_F1', 'Std_Micro_F1'),
  )
  for column, avg_key, std_key in quality_metrics:
    summary[avg_key] = selected[column].mean()
    summary[std_key] = selected[column].std()

  #Reporting what was selected
  print('Alg', alg)
  print('Param', param)
  print('Shape df_results: ', df_results.shape)
  print('Shape df_selection: ', selected.shape)

  summary['Confusion_Matrix'] = summarize_confusion_matrices(selected)

  #Cost metrics, same layout as above
  cost_metrics = (
    ('Memory', 'Avg_Memory', 'Std_Memory'),
    ('Building_Time', 'Avg_Building_Time', 'Std_Building_Time'),
    ('Classification_Time', 'Avg_Classification_Time', 'Std_Classification_Time'),
  )
  for column, avg_key, std_key in cost_metrics:
    summary[avg_key] = selected[column].mean()
    summary[std_key] = selected[column].std()

  return summary

In [7]:
def summarize_confusion_matrices(df_results):
  """Add up, element by element, the confusion matrices stored in ``df_results``.

  Parameters:
    df_results: DataFrame with a ``'Confusion_Matrix'`` column whose cells are
      JSON strings of nested lists (as read back from CSV) or already-parsed
      nested lists.

  Returns:
    The summed matrix as a nested list of floats, or ``[]`` when
    ``df_results`` has no rows.
  """
  #Converting the confusion matrices represented by strings into lists (once)
  conv_matrices = [
    json.loads(matrix) if isinstance(matrix, str) else matrix
    for matrix in df_results['Confusion_Matrix']
  ]

  #Nothing selected: there is no matrix shape to take, so return an empty sum
  if not conv_matrices:
    return []

  #float64 keeps the counts exact; float16 cannot represent every integer
  #above 2048 and overflows past 65504, which corrupts large fold counts
  total = np.zeros(np.shape(conv_matrices[0]), dtype=np.float64)
  for matrix in conv_matrices:
    total += np.asarray(matrix, dtype=np.float64)

  return total.tolist()


In [8]:
def summarize_all_resuls(path_from, path_to):
  """Summarize every algorithm/parameter combination found in a results file.

  Reads the per-fold results from ``path_from``, builds one summary row per
  (algorithm, parameters) pair with ``result_summary`` and writes the summary
  table to ``path_to`` as CSV. When ``path_to`` already exists its rows are
  kept and the new summaries are added after them.

  Parameters:
    path_from: CSV file with the per-fold results.
    path_to: CSV file that receives the summarized results.
  """
  df_results = pd.read_csv(path_from)
  df_summary = get_dataframe_summary(path_to)

  #Summaries are collected in a list and added in a single step because
  #DataFrame.append no longer exists in pandas 2.x
  summaries = []

  #Getting the algorithms in the result file
  for alg in df_results['Algorithm'].unique().tolist():

    #Selecting all the parameters of an algorithm
    df_params_alg = df_results[df_results['Algorithm'] == alg]
    parameters_list = df_params_alg['Parameters'].unique().tolist()

    #Summarizing the results of each parameter
    for param in parameters_list:
      summaries.append(result_summary(df_params_alg, alg, param))

  if summaries:
    new_rows = pd.DataFrame(summaries)
    if df_summary.empty:
      #Keep the summary column order first, then any extra summary keys
      ordered_columns = list(dict.fromkeys(list(df_summary.columns) + list(new_rows.columns)))
      df_summary = new_rows.reindex(columns=ordered_columns)
    else:
      df_summary = pd.concat([df_summary, new_rows], ignore_index=True)

  #Saving DataFrame with the summarized results
  df_summary.to_csv(path_to, index=False)

# Test Area