In [None]:
!pip install -U memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.60.0.tar.gz (38 kB)
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25l[?25hdone
  Created wheel for memory-profiler: filename=memory_profiler-0.60.0-py3-none-any.whl size=31284 sha256=88e8ffa95070217c3a00f3cd53bccb998fd0758eb7f06775001bcafdac038ff2
  Stored in directory: /root/.cache/pip/wheels/67/2b/fb/326e30d638c538e69a5eb0aa47f4223d979f502bbdb403950f
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.60.0


In [None]:
#Import all necessary libraries
import time
import math
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm #SVM Classifier
from sklearn import neighbors #KNN Classifier
from memory_profiler import profile #For memory usage profiling
from sklearn.metrics import confusion_matrix #For specificity and sensitivity metrics
from sklearn.neural_network import MLPClassifier #Multi-Layer Perceptron Classifier
from sklearn.naive_bayes import GaussianNB #Gaussian Naive Bayes Classifier
from sklearn.linear_model import LogisticRegression #Logistic Regression Classifier
from sklearn.preprocessing import StandardScaler #Used to standardize data (0 mean and unit variance)
from sklearn.preprocessing import MinMaxScaler #Used to normalize data between 0 and 1
from sklearn.model_selection import train_test_split #To split data into various validation/training/test sets

#Ignore those pesky warnings
import warnings
warnings.filterwarnings('ignore')

#Load in data from files
dir_path = 'drive/MyDrive/Georgia Tech/Classes/Spring 2022/ECE 6254: Statistical Machine Learning/Final Project/skin_cancer_dataset_3/'
x_train = np.load(dir_path + 'X_train_resize.npy')
y_train = np.load(dir_path + 'y_train_resize.npy')
x_test = np.load(dir_path + 'X_test_resize.npy')
y_test = np.load(dir_path + 'y_test_resize.npy')

#Flatten every image into a single feature row. The row count is taken from the
#loaded array itself and -1 lets numpy infer the feature count, so this cell keeps
#working if the dataset is re-exported with a different size or resolution.
#NOTE(review): assumes the first axis of the image arrays indexes samples - confirm.
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)

#Every sample needs exactly one label
assert len(x_train) == len(y_train), "x_train / y_train sample counts differ"
assert len(x_test) == len(y_test), "x_test / y_test sample counts differ"

#Split the training set into separate training set and validation set
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.20, random_state=0)

#Check their shape
print("x_train = " + str(x_train.shape))
print("x_val = " + str(x_val.shape))
print("x_test = " + str(x_test.shape))
print("y_train = " + str(y_train.shape))
print("y_val = " + str(y_val.shape))
print("y_test = " + str(y_test.shape))

x_train = (2109, 1875)
x_val = (528, 1875)
x_test = (660, 1875)
y_train = (2109,)
y_val = (528,)
y_test = (660,)


In [None]:
#Prepare scaled copies of the data for the algorithms that are sensitive to feature scale

#Standardization (0 mean, unit variance): statistics are learned from the training
#split only and then applied unchanged to the validation and test splits
standard = StandardScaler()
standard.fit(x_train)
x_train_standard = standard.transform(x_train)
x_val_standard = standard.transform(x_val)
x_test_standard = standard.transform(x_test)

#Min-max normalization to [0, 1], again fitted on the training split only
normal = MinMaxScaler(feature_range=(0, 1))
normal.fit(x_train)
x_train_normal = normal.transform(x_train)
x_val_normal = normal.transform(x_val)
x_test_normal = normal.transform(x_test)

#Validation rows followed by training rows, for algorithms with nothing to tune.
#The combined set reuses the scalers fitted above rather than refitting them.
x_val_train = np.concatenate([x_val, x_train], axis=0)
y_val_train = np.concatenate([y_val, y_train], axis=0)
x_val_train_standard = standard.transform(x_val_train)
x_val_train_normal = normal.transform(x_val_train)

In [None]:
#Train the KNN Classifier
def KNN_Classifier():
  """Tune, fit and evaluate a k-nearest-neighbours classifier.

  Sweeps k from 1 to 100 using the unscaled training split, keeps the first k
  that reaches the highest validation accuracy, refits with that k and prints
  training/test accuracy, sensitivity, specificity, the chosen k and the mean
  wall-clock time of scoring the test set. Reads the module-level train,
  validation and test arrays and returns nothing.
  """
  print("Training the KNN Classifier!\n")

  largest_k = 100
  best_k = 1
  best_val_score = 0.0

  #Strict '>' means ties are resolved in favour of the smallest k
  for candidate_k in range(1, largest_k + 1):
    candidate = neighbors.KNeighborsClassifier(n_neighbors=candidate_k, weights='uniform')
    candidate.fit(x_train, y_train)
    val_score = candidate.score(x_val, y_val)
    if val_score > best_val_score:
      best_k, best_val_score = candidate_k, val_score

  #Refit with the selected k
  final_model = neighbors.KNeighborsClassifier(n_neighbors=best_k, weights='uniform')
  final_model.fit(x_train, y_train)
  score_on_train = final_model.score(x_train, y_train)

  #Time repeated scoring passes over the test set and report the mean
  repeats = 20
  total_seconds = 0.0
  score_on_test = 0.0
  for _ in range(repeats):
    tic = time.time()
    score_on_test = final_model.score(x_test, y_test)
    total_seconds += time.time() - tic
  mean_seconds = str(round(total_seconds / repeats, 4))

  #Unpacking into four counts assumes a binary problem
  predictions = final_model.predict(x_test)
  true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predictions).ravel()
  specificity = true_neg / (true_neg + false_pos)
  sensitivity = true_pos / (true_pos + false_neg)

  print("KNN Classifier Accuracy on Training Set = " + str(score_on_train))
  print("KNN Classifier Accuracy on Test Set = " + str(score_on_test))
  print("KNN Classifier Sensitivity on Test Set = " + str(sensitivity))
  print("KNN Classifier Specificity on Test Set = " + str(specificity))
  print("Optimal value of k = " + str(best_k))
  print("Average Runtime of KNN Classifer on Test Set = " + str(mean_seconds) + " seconds")

KNN_Classifier()

Training the KNN Classifier!

KNN Classifier Accuracy on Training Set = 0.7942152678994784
KNN Classifier Accuracy on Test Set = 0.7651515151515151
KNN Classifier Sensitivity on Test Set = 0.6
KNN Classifier Specificity on Test Set = 0.9027777777777778
Optimal value of k = 9
Average Runtime of KNN Classifer on Test Set = 0.2588 seconds


In [8]:
#Train the Logistic Regression Classifier
def LogReg_Classifier():
  """Tune, fit and evaluate a logistic regression classifier.

  Sweeps solver, penalty, C and (for the elastic-net penalty only) the L1 ratio
  on the min-max-normalized training split, keeping the first combination that
  reaches the highest validation accuracy. The winner is refit and the function
  prints training/test accuracy, sensitivity, specificity, the chosen parameters
  and the mean wall-clock time of scoring the test set. Reads the module-level
  normalized arrays and labels; returns nothing.
  """
  print("Training the Logistic Regression Classifier!\n")

  C_arr = [0.00001, 0.0005, .001, 0.005, 0.01, 0.05, 0.1]
  l1_ratios = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

  #(solver, penalty, l1_ratio candidates), listed in evaluation order.
  #The L1 ratio only has an effect with the elastic-net penalty, so every other
  #combination is fitted once with l1_ratio=None instead of eleven identical times.
  search_space = [
      ('newton-cg', 'l2', [None]),
      ('lbfgs', 'l2', [None]),
      ('sag', 'l2', [None]),
      ('saga', 'elasticnet', l1_ratios),
      ('saga', 'l1', [None]),
      ('saga', 'l2', [None]),
  ]

  def make_clf(sol, pen, CC, ratio):
    #A fixed seed makes the stochastic solvers (sag/saga) repeatable, so the
    #final refit is the same model that was scored on the validation set
    return LogisticRegression(penalty=pen, C=CC, solver=sol, l1_ratio=ratio,
                              max_iter=1000, random_state=0)

  #Fallback values, only kept if no candidate scores above 0 on the validation set.
  #l1_ratio is None (not the string 'none') so it is always a valid estimator argument.
  C = 1
  solver = 'lbfgs'
  penalty = 'l2'
  l1_ratio = None
  max_val_accuracy = 0.0

  #Perform the parameter sweep; strict '>' keeps the earliest best combination
  for sol, pen, ratios in search_space:
    for CC in C_arr:
      for ratio in ratios:
        LogReg_clf = make_clf(sol, pen, CC, ratio)
        LogReg_clf.fit(x_train_normal, y_train)
        val_accuracy = LogReg_clf.score(x_val_normal, y_val)

        if val_accuracy > max_val_accuracy:
          C = CC
          solver = sol
          penalty = pen
          l1_ratio = ratio
          max_val_accuracy = val_accuracy

  #Refit the selected configuration
  LogReg_clf = make_clf(solver, penalty, C, l1_ratio)
  LogReg_clf.fit(x_train_normal, y_train)
  train_accuracy = LogReg_clf.score(x_train_normal, y_train)

  #Find the average runtime on the testing set
  num_runs = 20
  avg_time_LogReg = 0.0
  for _ in range(num_runs):
    start_time = time.time()
    test_accuracy = LogReg_clf.score(x_test_normal, y_test)
    avg_time_LogReg += (time.time() - start_time)
  avg_time_LogReg /= num_runs
  avg_time_LogReg = str(round(avg_time_LogReg, 4))

  #Calculate sensitivity and specificity metrics (four-way unpacking assumes a binary problem)
  y_pred_LogReg = LogReg_clf.predict(x_test_normal)
  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_LogReg).ravel()
  specificity_LogReg = tn / (tn+fp)
  sensitivity_LogReg = tp / (tp+fn)

  #Shown as 'none' when the winning penalty does not use an L1 ratio
  l1_ratio_text = 'none' if l1_ratio is None else str(l1_ratio)

  print("Logistic Regression Classifier Accuracy on Test Set = " + str(test_accuracy))
  print("Logistic Regression Classifier Accuracy on Training Set = " + str(train_accuracy))
  print("Logistic Regression Classifier Sensitivity on Test Set = " + str(sensitivity_LogReg))
  print("Logistic Regression Classifier Specificity on Test Set = " + str(specificity_LogReg))
  print("Parameters:")
  print("   C = " + str(C))
  print("   Penalty = " + str(penalty))
  print("   Solver = " + str(solver))
  print("   L1 Ratio = " + l1_ratio_text)
  print("Average Runtime of Logistic Regression Classifer on Test Set = " + str(avg_time_LogReg) + " seconds")

LogReg_Classifier()

Training the Logistic Regression Classifier!

Logistic Regression Classifier Accuracy on Test Set = 0.7803030303030303
Logistic Regression Classifier Accuracy on Training Set = 0.8330962541488858
Logistic Regression Classifier Sensitivity on Test Set = 0.7566666666666667
Logistic Regression Classifier Specificity on Test Set = 0.8
Parameters:
   C = 0.1
   Penalty = elasticnet
   Solver = saga
   L1 Ratio = 0
Average Runtime of Logistic Regression Classifer on Test Set = 0.002 seconds


In [None]:
#Train the Support Vector Machine Classifier
def SVM_Classifier():
  """Tune, fit and evaluate a support vector machine classifier.

  Sweeps the linear, polynomial (degree 1-10), RBF and sigmoid kernels over a
  grid of C values on the standardized training split, keeping the first
  configuration that reaches the highest validation accuracy. The winner is
  refit and the function prints training/test accuracy, sensitivity,
  specificity, the number of support vectors, the chosen parameters and the
  mean wall-clock time of scoring the test set. Reads the module-level
  standardized arrays and labels; returns nothing.
  """
  print("Training the SVM Classifier!\n")

  C_arr = [0.00001, 0.0005, .001, 0.005, 0.01, 0.05, 0.1]

  def make_svc(C_val, kernel_name, poly_degree):
    #The degree is only passed for the polynomial kernel; the other kernels
    #are built with the library default
    if poly_degree is None:
      return svm.SVC(C=C_val, kernel=kernel_name)
    return svm.SVC(C=C_val, kernel=kernel_name, degree=poly_degree)

  #(progress message, kernel, degree candidates), listed in evaluation order
  sweeps = [
      ("Training SVM Classifier with a linear kernel...", 'linear', [None]),
      ("Training SVM Classifier with a polynomial kernel...", 'poly', [1,2,3,4,5,6,7,8,9,10]),
      ("Training SVM Classifier with a RBF kernel...", 'rbf', [None]),
      ("Training SVM Classifier with a sigmoid kernel...", 'sigmoid', [None]),
  ]

  #Fallback values, only kept if no candidate scores above 0 on the validation set
  C = 1
  kernel = 'rbf'
  degree = None
  max_val_accuracy = 0.0

  for message, kern, degrees in sweeps:
    print(message)
    for deg in degrees:
      if deg is not None:
        print("   Training degree = " + str(deg) + "...")
      for C_val in C_arr:
        SVM_clf = make_svc(C_val, kern, deg)
        SVM_clf.fit(x_train_standard, y_train)

        #Compute the classifier accuracy on the validation set
        val_accuracy = SVM_clf.score(x_val_standard, y_val)

        if val_accuracy > max_val_accuracy:
          C = C_val
          kernel = kern
          #Recorded together with the kernel so it can never go stale when a
          #non-polynomial kernel wins after a polynomial one
          degree = deg
          max_val_accuracy = val_accuracy

  #Refit with the selected parameters. The selected degree is used here, not the
  #last value the degree loop happened to visit.
  SVM_clf = make_svc(C, kernel, degree)
  SVM_clf.fit(x_train_standard, y_train)
  train_accuracy = SVM_clf.score(x_train_standard, y_train)
  num_SVs = len(SVM_clf.support_vectors_)

  #Find the average runtime on the testing set
  num_runs = 20
  avg_time_SVM = 0.0
  for _ in range(num_runs):
    start_time = time.time()
    test_accuracy = SVM_clf.score(x_test_standard, y_test)
    avg_time_SVM += (time.time() - start_time)
  avg_time_SVM /= num_runs
  avg_time_SVM = str(round(avg_time_SVM, 4))

  #Calculate sensitivity and specificity metrics (four-way unpacking assumes a binary problem)
  y_pred_SVM = SVM_clf.predict(x_test_standard)
  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_SVM).ravel()
  specificity_SVM = tn / (tn+fp)
  sensitivity_SVM = tp / (tp+fn)

  #Shown as 'none' when the winning kernel is not polynomial
  degree_text = 'none' if degree is None else str(degree)

  print("\n")
  print("SVM Classifier Accuracy on Test Set = " + str(test_accuracy))
  print("SVM Classifier Accuracy on Training Set = " + str(train_accuracy))
  print("SVM Classifier Sensitivity on Test Set = " + str(sensitivity_SVM))
  print("SVM Classifier Specificity on Test Set = " + str(specificity_SVM))
  print("Number of support vectors = " + str(num_SVs))
  print("Parameters:")
  print("   C = " + str(C))
  print("   Kernel = " + str(kernel))
  print("   Degree = " + degree_text)
  print("Average Runtime of SVM Classifer on Test Set = " + str(avg_time_SVM) + " seconds")

SVM_Classifier()

In [None]:
#Train the Gaussian Naive Bayes Classifier
def NB_Classifier():
  """Fit and evaluate a Gaussian naive Bayes classifier.

  There is nothing to tune, so the model is fitted on the combined
  validation+training set (standardized). Prints test/training accuracy,
  sensitivity, specificity and the mean wall-clock time of scoring the test
  set. Reads the module-level standardized arrays and labels; returns nothing.
  """
  print("Training the Gaussian Naive Bayes Classifier!\n")

  model = GaussianNB()
  model.fit(x_val_train_standard, y_val_train)
  score_on_train = model.score(x_val_train_standard, y_val_train)

  #Time repeated scoring passes over the test set and report the mean
  repeats = 20
  total_seconds = 0.0
  for _ in range(repeats):
    tic = time.time()
    score_on_test = model.score(x_test_standard, y_test)
    total_seconds += (time.time() - tic)
  mean_seconds = str(round(total_seconds / repeats, 4))

  #Unpacking into four counts assumes a binary problem
  predictions = model.predict(x_test_standard)
  true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predictions).ravel()
  specificity = true_neg / (true_neg + false_pos)
  sensitivity = true_pos / (true_pos + false_neg)

  print("Gaussian Naive Bayes Classifier Accuracy on Test Set = " + str(score_on_test))
  print("Gaussian Naive Bayes Classifier Accuracy on Training Set = " + str(score_on_train))
  print("Gaussian Naive Bayes Classifier Sensitivity on Test Set = " + str(sensitivity))
  print("Gaussian Naive Bayes Classifier Specificity on Test Set = " + str(specificity))
  print("Average Runtime of Naive Bayes Classifer on Test Set = " + str(mean_seconds) + " seconds")

NB_Classifier()

In [None]:
#Train the K-Means Classifier
def KM_Classifier():
  """Cluster with K-Means (k=2) and evaluate the clusters as a classifier.

  K-Means is unsupervised and its cluster ids are arbitrary, so a raw cluster
  id cannot be compared with a class label directly. After fitting on the
  combined validation+training set, each cluster is assigned the most common
  training label among its members, and test predictions are translated
  through that mapping before any metric is computed. Prints test accuracy,
  sensitivity, specificity and the mean wall-clock time of assigning the test
  set to clusters. Reads the module-level arrays and labels; returns nothing.
  """
  print("Training the K-Means Classifier!\n")

  from collections import Counter
  from sklearn.cluster import KMeans

  #Labels are not passed to fit(): clustering does not use them
  KM_clf = KMeans(n_clusters=2, random_state=0, max_iter=1000)
  KM_clf.fit(x_val_train)

  #Map every cluster id to the majority class of the training points assigned to it
  overall_majority = Counter(y_val_train.tolist()).most_common(1)[0][0]
  cluster_to_label = {}
  for cluster_id in range(KM_clf.n_clusters):
    member_labels = y_val_train[KM_clf.labels_ == cluster_id]
    if len(member_labels) > 0:
      cluster_to_label[cluster_id] = Counter(member_labels.tolist()).most_common(1)[0][0]
    else:
      #An empty cluster has no members to vote; fall back to the overall majority class
      cluster_to_label[cluster_id] = overall_majority

  #Find the average runtime on the testing set
  num_runs = 20
  avg_time_KM = 0.0
  for _ in range(num_runs):
    start_time = time.time()
    test_clusters = KM_clf.predict(x_test)
    avg_time_KM += (time.time() - start_time)
  avg_time_KM /= num_runs
  avg_time_KM = str(round(avg_time_KM, 4))

  #Translate cluster ids into class labels before comparing with the ground truth
  y_pred_KM = np.array([cluster_to_label[int(c)] for c in test_clusters])
  KM_accuracy = np.count_nonzero(y_pred_KM == y_test) / len(y_test)

  #Calculate sensitivity and specificity metrics (four-way unpacking assumes a binary problem)
  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_KM).ravel()
  specificity_KM = tn / (tn+fp)
  sensitivity_KM = tp / (tp+fn)

  print("K-Means Clustering Classifier Accuracy on Test Set = " + str(KM_accuracy))
  print("K-Means Clustering Classifier Sensitivity on Test Set = " + str(sensitivity_KM))
  print("K-Means Clustering Classifier Specificity on Test Set = " + str(specificity_KM))
  print("Average Runtime of K-Means Clustering Classifer on Test Set = " + str(avg_time_KM) + " seconds")

KM_Classifier()

In [None]:
#Train the Multi-Layer Perceptron Classifier
def MLP_Classifier():
  """Tune, fit and evaluate a multi-layer perceptron classifier.

  Parameters are tuned one at a time against validation accuracy, each stage
  reusing the winners of the previous stages:
    1) hidden layer layout (up to 3 layers, grown greedily one layer at a time)
    2) activation function
    3) solver used for weight optimization
    4) alpha, the L2 regularization strength
  The final model is refit with the selected values and the function prints
  test/training accuracy, sensitivity, specificity, the chosen parameters and
  the mean wall-clock time of scoring the test set. Reads the module-level
  unscaled arrays and labels; returns nothing.
  """
  print("Training the Multi-Layer Perceptron Classifier!\n")

  def validation_score(**settings):
    #Every candidate shares the same seed and iteration cap
    model = MLPClassifier(random_state=1, max_iter=1000, **settings)
    model.fit(x_train, y_train)
    return model.score(x_val, y_val)

  #Stage 1: hidden layer layout.
  #Each pass appends one more layer to the best prefix found so far, while the
  #overall winner is whichever layout (of any depth) scored highest.
  neuron_options = [5, 10, 15, 20, 25, 30, 35, 40] #Values chosen from observing empirical results
  depth_limit = 3
  hidden_layer_sizes_opt = ()
  overall_best = 0.0
  prefix = ()

  print("Parameter Sweep for Hidden Layer Configuration:")
  for _ in range(depth_limit):
    layer_choice = 10
    layer_best = 0.0
    for width in neuron_options:
      layout = prefix + (width,)
      score = validation_score(hidden_layer_sizes=layout)
      print("   Testing Layer Configuration: " + str(layout) + ",  Validation Accuracy: " + str(score))

      if score > layer_best:
        layer_choice, layer_best = width, score

      if score > overall_best:
        hidden_layer_sizes_opt, overall_best = layout, score

    prefix = prefix + (layer_choice,)
  print("\n")

  #Stage 2: activation function (strict '>' keeps the earliest best candidate)
  print("Parameter Sweep for Activation Function:")
  activation_opt = ''
  stage_best = 0.0
  for act in ['identity', 'logistic', 'tanh', 'relu']:
    score = validation_score(hidden_layer_sizes=hidden_layer_sizes_opt, activation=act)
    print("   Testing Activation Function: " + act + ",  Validation Accuracy: " + str(score))
    if score > stage_best:
      activation_opt, stage_best = act, score
  print("\n")

  #Stage 3: solver
  print("Parameter Sweep for Solver:")
  solver_opt = ''
  stage_best = 0.0
  for sol in ['lbfgs', 'sgd', 'adam']:
    score = validation_score(hidden_layer_sizes=hidden_layer_sizes_opt, activation=activation_opt, solver=sol)
    print("   Testing Solver: " + sol + ",  Validation Accuracy: " + str(score))
    if score > stage_best:
      solver_opt, stage_best = sol, score
  print("\n")

  #Stage 4: alpha
  print("Parameter Sweep for Alpha:")
  alpha_opt = 0.0
  stage_best = 0.0
  for alpha in [0.000001, 0.000005, 0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]:
    score = validation_score(hidden_layer_sizes=hidden_layer_sizes_opt, activation=activation_opt, solver=solver_opt, alpha=alpha)
    print("   Testing Alpha: " + str(alpha) + ",  Validation Accuracy: " + str(score))
    if score > stage_best:
      alpha_opt, stage_best = alpha, score
  print("\n")

  #Refit on the training split with the selected parameters
  final_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes_opt, activation=activation_opt, solver=solver_opt, alpha=alpha_opt, random_state=1, max_iter=1000)
  final_model.fit(x_train, y_train)
  score_on_train = final_model.score(x_train, y_train)

  #Time repeated scoring passes over the test set and report the mean
  repeats = 20
  total_seconds = 0.0
  for _ in range(repeats):
    tic = time.time()
    score_on_test = final_model.score(x_test, y_test)
    total_seconds += (time.time() - tic)
  mean_seconds = str(round(total_seconds / repeats, 4))

  #Unpacking into four counts assumes a binary problem
  predictions = final_model.predict(x_test)
  true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predictions).ravel()
  specificity = true_neg / (true_neg + false_pos)
  sensitivity = true_pos / (true_pos + false_neg)

  print("Multi-Layer Perceptron Classifier Accuracy on Test Set = " + str(score_on_test))
  print("Multi-Layer Perceptron Classifier Accuracy on Training Set = " + str(score_on_train))
  print("Multi-Layer Perceptron Classifier Sensitivity on Test Set = " + str(sensitivity))
  print("Multi-Layer Perceptron Classifier Specificity on Test Set = " + str(specificity))
  print("Optimal Parameters:")
  print("   Hidden Layer Configuration: " + str(hidden_layer_sizes_opt))
  print("      Number of Layers: " + str(len(hidden_layer_sizes_opt)))
  for position, size in enumerate(hidden_layer_sizes_opt, start=1):
    print("      Layer " + str(position) + " Size: " + str(size) + " Neurons")
  print("   Activation Function: " + str(activation_opt))
  print("   Solver for Weight Optimization: " + str(solver_opt))
  print("   Alpha: " + str(alpha_opt))
  print("Average Runtime of Multi-Layer Perceptron Classifer on Test Set = " + str(mean_seconds) + " seconds")

MLP_Classifier()

In [None]:
#Train the Decision Tree Classifier
def DT_Classifier():
  """Tune, fit and evaluate a decision tree classifier.

  Tunes the maximum depth first (with the leaf limit at its starting value of
  4), then the maximum number of leaf nodes (with the selected depth), both
  against validation accuracy. The tree is refit with the selected values and
  the function prints test/training accuracy, sensitivity, specificity, the
  chosen parameters and the mean wall-clock time of scoring the test set.
  Reads the module-level unscaled arrays and labels; returns nothing.
  """
  print("Training the Decision Tree Classifier!\n")

  optimal_depth = 4
  optimal_leaves = 4

  def make_tree(leaves, depth):
    #A fixed seed makes the tree repeatable, so the final refit is the same
    #model that was scored on the validation set
    return DecisionTreeClassifier(max_leaf_nodes=leaves, max_depth=depth, random_state=0)

  #Parameter sweep for max depth
  print("Parameter sweep for optimal depth of tree...")
  max_depth = [3,4,5,6,7,8,9,10,11,12,13,14]
  acc = 0.0
  for i in max_depth:
    clf = make_tree(optimal_leaves, i)
    clf = clf.fit(x_train, y_train)
    y_pred_val = clf.predict(x_val)
    acc_val = accuracy_score(y_val, y_pred_val)
    if acc_val > acc:
      optimal_depth = i
      #Remember the best score so far; without this every candidate "wins"
      #and the last depth in the list is always selected
      acc = acc_val
    print('  Accuracy in validation set:', acc_val, 'max depth:', i)
  print("\n")

  #Parameter sweep for max leaf nodes
  print("Parameter sweep for optimal leaf nodes in the tree...")
  max_leaves = [3,4,5,6,7,8,9,10]
  acc = 0.0
  for i in max_leaves:
    clf = make_tree(i, optimal_depth)
    clf = clf.fit(x_train, y_train)
    y_pred_val = clf.predict(x_val)
    acc_val = accuracy_score(y_val, y_pred_val)
    if acc_val > acc:
      optimal_leaves = i
      acc = acc_val
    print('  Accuracy in validation set:', acc_val, 'max leaf nodes:', i)
  print("\n")

  #Refit with the selected parameters
  clf = make_tree(optimal_leaves, optimal_depth)
  clf = clf.fit(x_train, y_train)

  # Compute the accuracy on the training set
  train_accuracy = clf.score(x_train, y_train)

  #Find the average runtime on the testing set
  num_runs = 50
  avg_time_DT = 0.0
  for _ in range(num_runs):
    start_time = time.time()
    test_accuracy = clf.score(x_test, y_test)
    avg_time_DT += (time.time() - start_time)
  avg_time_DT /= num_runs
  avg_time_DT = str(round(avg_time_DT, 4))

  #Calculate sensitivity and specificity metrics (four-way unpacking assumes a binary problem)
  y_pred_DT = clf.predict(x_test)
  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_DT).ravel()
  specificity_DT = tn / (tn+fp)
  sensitivity_DT = tp / (tp+fn)

  print("Decision Tree Classifier Accuracy on Test Set = " + str(test_accuracy))
  print("Decision Tree Classifier Accuracy on Training Set = " + str(train_accuracy))
  print("Decision Tree Classifier Sensitivity on Test Set = " + str(sensitivity_DT))
  print("Decision Tree Classifier Specificity on Test Set = " + str(specificity_DT))
  print("Max Depth of Decision Tree Classifier = " + str(optimal_depth))
  print("Max Leaves in Decision Tree Classifier = " + str(optimal_leaves))
  print("Average Runtime of Decision Tree Classifer on Test Set = " + str(avg_time_DT) + " seconds")

DT_Classifier()

In [None]:
#Train the Random Forests Classifier
def RF_Classifier():
  """Tune, fit and evaluate a random forest classifier.

  Runs a 3-fold cross-validated grid search over the number of trees, split
  criterion, feature subsampling rule and depth on the combined
  validation+training set, then evaluates the forest that the search refits
  with the best parameters. Prints the best parameters, test/training
  accuracy, sensitivity, specificity and the mean wall-clock time of scoring
  the test set. Reads the module-level unscaled arrays and labels; returns
  nothing.
  """
  print("Training the Random Forests Classifier!\n")

  #'auto' is left out of max_features: for classifiers it is the same setting
  #as 'sqrt', and newer scikit-learn releases no longer accept it
  params_grid = {
      'n_estimators': [50,80,120,160,200],
      'criterion': ['gini', 'entropy'],
      'max_features': ['sqrt', 'log2'],
      'max_depth': [5, 7, 9],
      'max_leaf_nodes': [10]
  }

  #Scorer with the (estimator, X, y) signature GridSearchCV expects.
  #Note that the AUC is computed from hard class predictions, not probabilities.
  def my_roc_auc_score(model, X, y): return metrics.roc_auc_score(y, model.predict(X))

  num_combinations = 1
  for candidates in params_grid.values():
    num_combinations *= len(candidates)

  print('Number of combinations = ', num_combinations)

  model_rf = GridSearchCV(estimator=RandomForestClassifier(),
                          param_grid=params_grid,
                          cv=3,
                          scoring=my_roc_auc_score,
                          return_train_score=True,
                          verbose=4)

  model_rf.fit(x_val_train, y_val_train)

  best_params = model_rf.best_params_
  print('Random Forest Classifier Training Complete! Best parameters are:', best_params)
  print("\n")

  #With the default refit=True the search has already trained a forest with the
  #best parameters on the full data passed to fit(), so it is reused here
  #instead of training an identical configuration a second time
  clf = model_rf.best_estimator_

  # Compute the accuracy on the training set
  train_accuracy = clf.score(x_val_train, y_val_train)

  #Find the average runtime on the testing set
  num_runs = 50
  avg_time_RF = 0.0
  for _ in range(num_runs):
    start_time = time.time()
    test_accuracy = clf.score(x_test, y_test)
    avg_time_RF += (time.time() - start_time)
  avg_time_RF /= num_runs
  avg_time_RF = str(round(avg_time_RF, 4))

  #Calculate sensitivity and specificity metrics (four-way unpacking assumes a binary problem)
  y_pred_RF = clf.predict(x_test)
  tn, fp, fn, tp = confusion_matrix(y_test, y_pred_RF).ravel()
  specificity_RF = tn / (tn+fp)
  sensitivity_RF = tp / (tp+fn)

  print("Random Forest Classifier Accuracy on Test Set = " + str(test_accuracy))
  print("Random Forest Classifier Accuracy on Training Set = " + str(train_accuracy))
  print("Random Forest Classifier Sensitivity on Test Set = " + str(sensitivity_RF))
  print("Random Forest Classifier Specificity on Test Set = " + str(specificity_RF))
  print("Average Runtime of Random Forest Classifer on Test Set = " + str(avg_time_RF) + " seconds")

RF_Classifier()