# CAI4104 Final Project: Model Evaluation

In [1]:
import os
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import utils

# Report the interpreter and library versions this notebook ran with
separator = '------------'
print(separator)
print('### Python version: ' + __import__('sys').version)
for library_label, library in (
    ('NumPy', np),
    ('SciPy', sp),
    ('Scikit-learn', sklearn),
    ('Tensorflow', tf),
):
    print(f'### {library_label} version: {library.__version__}')
print(separator)


2024-04-21 16:47:46.950156: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-21 16:47:46.953422: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-21 16:47:47.080707: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-21 16:47:47.472316: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


------------
### Python version: 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:23:14) [GCC 10.4.0]
### NumPy version: 1.26.3
### SciPy version: 1.11.4
### Scikit-learn version: 1.3.0
### Tensorflow version: 2.16.1
------------


# Loading the model predictions and training histories

In [None]:
# Directories holding the saved test-set predictions and the saved histories
predicted_path = '../predicted/'
histories_path = '../histories/'

# Test-set predictions, one array per model
test_t_dummy_pred = np.load(f'{predicted_path}test_t_dummy_pred.npy')
test_t_nb_pred = np.load(f'{predicted_path}test_t_nb_pred.npy')
test_t_lr_pred = np.load(f'{predicted_path}test_t_lr_pred.npy')
test_t_cnn_pred = np.load(f'{predicted_path}test_t_cnn_pred.npy')

# Saved histories, one file per model
dummy_hist = np.load(f'{histories_path}dummy_hist.npy')
nb_hist = np.load(f'{histories_path}nb_hist.npy')
lr_hist = np.load(f'{histories_path}lr_hist.npy')
cnn_hist = np.load(f'{histories_path}cnn_hist.npy')

# Loading the data

In [2]:
# Location of the archive that bundles every split
data_path = '../data/'
data = np.load(data_path + 'data.npz')

# Pull the image (x) and label (t) arrays for each split out of the archive
train_x, train_t = data['train_x'], data['train_t']
val_x, val_t = data['val_x'], data['val_t']
test_x, test_t = data['test_x'], data['test_t']

# Every split must have exactly one label row per image
assert len(train_x) == len(train_t), "Training image quantity mismatches label quantity"
assert len(val_x) == len(val_t), "Validation image quantity mismatches label quantity"
assert len(test_x) == len(test_t), "Test image quantity mismatches label quantity"

# Total image count across the three splits, and the per-image shape
num_images = sum(len(split) for split in (train_x, val_x, test_x))
image_shape = train_x.shape[1:]

print(f'{num_images} images with shape {image_shape}')

35110 images with shape (48, 48, 1)


In [3]:
# Flatten each image into a single feature vector. Using -1 lets NumPy infer
# the feature count, so this also works if the images have more than one channel.
train_x_flat = train_x.reshape(train_x.shape[0], -1)
val_x_flat = val_x.reshape(val_x.shape[0], -1)
test_x_flat = test_x.reshape(test_x.shape[0], -1)

# Convert one-hot label rows to integer class indices in a single vectorized
# call instead of a Python-level loop over every sample.
train_t_num = np.argmax(train_t, axis=1)
val_t_num = np.argmax(val_t, axis=1)
test_t_num = np.argmax(test_t, axis=1)

# Compute metrics for each model on the test set
## Metrics:
- Accuracy
- F1 Score
- Precision
- Recall
- Receiver Operating Characteristic (ROC) curve and Area Under the Curve (AUC), computed per class

In [6]:
from keras.metrics import Accuracy, F1Score, Precision, Recall
from sklearn.metrics import roc_curve, auc

def compute_metrics (name, test_t_pred):
    """Compute and print test-set classification metrics for one model.

    The predictions are scored against the module-level ``test_t`` labels.

    Args:
        name: Display name of the model, used in the printed summary.
        test_t_pred: Per-class predictions for the test set. Assumed to have
            the same (samples, classes) layout as ``test_t`` -- TODO confirm
            against the script that saved the predictions.

    Returns:
        dict mapping 'accuracy', 'f1score', 'precision' and 'recall' to
        Python floats.
    """
    # Imported locally because the notebook's metrics import line does not
    # bring this class into scope.
    from keras.metrics import CategoricalAccuracy

    scorers = {
        # Plain Accuracy() compares labels and predictions element by element,
        # which is not classification accuracy for one-hot rows;
        # CategoricalAccuracy compares the argmax of each row instead.
        'accuracy': CategoricalAccuracy(),
        # The default average=None yields one score per class; 'macro'
        # collapses that to the single number the summary and plots expect.
        'f1score': F1Score(average='macro'),
        # NOTE(review): Precision/Recall threshold predictions at 0.5 by
        # default and pool all class entries together -- confirm this is the
        # intended averaging.
        'precision': Precision(),
        'recall': Recall(),
    }

    metrics = {}
    for key, scorer in scorers.items():
        scorer.update_state(test_t, test_t_pred)
        # Store plain floats so the values can be formatted and plotted directly.
        metrics[key] = float(scorer.result())

    print('{}:\n Accuracy: {:.2f}, F1-Score: {:.2f}, Precision: {:.2f}, Recall: {:.2f}'.format(
        name, metrics['accuracy'], metrics['f1score'], metrics['precision'], metrics['recall']
    ))

    return metrics

def compute_roc (test_t_pred):
    """Compute a one-vs-rest ROC curve and its AUC for every class.

    Column ``i`` of the module-level ``test_t`` labels is scored against
    column ``i`` of the predictions.

    Args:
        test_t_pred: Per-class prediction scores for the test set, indexed as
            ``test_t_pred[:, i]`` for class ``i``.

    Returns:
        Tuple ``(fpr, tpr, roc)`` of dicts keyed by class index: false
        positive rates, true positive rates, and the area under each curve.
    """
    fpr = {}
    tpr = {}
    roc = {}

    # Iterate over every label column. The previous range(max class index)
    # stopped one short and silently dropped the last class.
    num_classes = test_t.shape[1]
    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(test_t[:, i], test_t_pred[:, i])
        roc[i] = auc(fpr[i], tpr[i])

    return fpr, tpr, roc

In [None]:
# Models to score, in the order their summaries should be printed
_model_predictions = (
    ('Dummy Classifier', test_t_dummy_pred),
    ('Naive Bayes', test_t_nb_pred),
    ('Logistic Regression', test_t_lr_pred),
    ('CNN', test_t_cnn_pred),
)

# For each model: the metrics summary first, then the per-class ROC data
_model_results = [
    (compute_metrics(model_label, predictions), compute_roc(predictions))
    for model_label, predictions in _model_predictions
]

# Unpack into the per-model names used by the plotting cells
dummy_metrics, (dummy_fpr, dummy_tpr, dummy_roc) = _model_results[0]
nb_metrics, (nb_fpr, nb_tpr, nb_roc) = _model_results[1]
lr_metrics, (lr_fpr, lr_tpr, lr_roc) = _model_results[2]
cnn_metrics, (cnn_fpr, cnn_tpr, cnn_roc) = _model_results[3]

# Evaluate and create graphs for each model history and ROC

In [4]:
def evaluate_history (name, history, metrics):
    """Plot train/validation curves per metric with the test score overlaid.

    One figure is shown for every key in ``metrics``.

    Args:
        name: Display name of the model, used in each figure title.
        history: Per-epoch curves. May be an object exposing a ``.history``
            mapping, the mapping itself, or a 0-d object array wrapping the
            mapping (what ``np.load`` returns for a saved dict). The mapping
            must contain ``metric`` and ``'val_' + metric`` for every metric.
        metrics: Mapping of metric name to its scalar test-set value.

    Raises:
        KeyError: If a metric (or its ``val_`` counterpart) is missing from
            the history mapping.
    """
    # Arrays loaded from disk have no .history attribute, so fall back to the
    # object itself and unwrap a 0-d object array into the value it holds.
    curves = getattr(history, 'history', history)
    if isinstance(curves, np.ndarray) and curves.ndim == 0:
        curves = curves.item()

    for metric, test_value in metrics.items():
        plt.plot(curves[metric], label='train')
        plt.plot(curves['val_' + metric], label='val')
        # axhline does not use the colour cycle, so without an explicit
        # colour it would be drawn in the same colour as the train curve.
        plt.axhline(test_value, color='tab:green', linestyle='--', label='test')
        plt.title('{} {}'.format(name, metric))
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(loc='upper left')
        plt.show()

def evaluate_roc (name, fpr, tpr, roc):
    """Plot the per-class ROC curves of one model on a single figure.

    Args:
        name: Display name of the model, used in the figure title.
        fpr: Dict of class index -> false positive rates.
        tpr: Dict of class index -> true positive rates.
        roc: Dict of class index -> area under that class's ROC curve.
    """
    colors = ['red', 'green', 'purple', 'yellow', 'grey', 'blue', 'orange']
    classes = ['angry', 'disgusted', 'fearful', 'happy', 'sad', 'surprised']
    lw = 2  # line width for every curve; this name was previously undefined

    # Plot only the classes that were actually computed, instead of assuming
    # one class per colour entry.
    for i in sorted(fpr):
        # NOTE(review): only six class names are listed; any further class is
        # labelled by its index -- confirm the missing name(s) and the order.
        class_name = classes[i] if i < len(classes) else 'class {}'.format(i)
        plt.plot(
            fpr[i], tpr[i],
            color=colors[i % len(colors)],
            lw=lw,
            # Auto-numbered fields only: mixing '{}' with '{1:...}' raises ValueError.
            label='ROC of class {} (area = {:0.2f})'.format(class_name, roc[i]),
        )

    # Diagonal reference line: the ROC of a random classifier
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.title('receiver operating characteristic on test set for each class of model {}'.format(name))
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# One row per model: title, history, test metrics, then the ROC inputs
_plot_jobs = (
    ('Dummy Classifier', dummy_hist, dummy_metrics, dummy_fpr, dummy_tpr, dummy_roc),
    ('Naive Bayes', nb_hist, nb_metrics, nb_fpr, nb_tpr, nb_roc),
    ('Logistic Regression', lr_hist, lr_metrics, lr_fpr, lr_tpr, lr_roc),
    ('CNN', cnn_hist, cnn_metrics, cnn_fpr, cnn_tpr, cnn_roc),
)

# For each model, show the history figures first and the ROC figure second
for _title, _hist, _scores, _fpr, _tpr, _auc in _plot_jobs:
    evaluate_history(_title, _hist, _scores)
    evaluate_roc(_title, _fpr, _tpr, _auc)