# CAI4104 Final Project: Model Evaluation

In [1]:
import os
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import utils

# Report the interpreter and library versions this notebook ran against,
# framed by separator rules, in a single print call (one item per line).
print(
    '------------',
    f"### Python version: {__import__('sys').version}",
    '### NumPy version: ' + np.__version__,
    '### SciPy version: ' + sp.__version__,
    '### Scikit-learn version: ' + sklearn.__version__,
    '### Tensorflow version: ' + tf.__version__,
    '------------',
    sep='\n',
)


2024-04-20 15:01:39.467094: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-20 15:01:39.469751: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-20 15:01:39.472950: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-20 15:01:39.511335: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


------------
### Python version: 3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:23:14) [GCC 10.4.0]
### NumPy version: 1.26.3
### SciPy version: 1.11.4
### Scikit-learn version: 1.3.0
### Tensorflow version: 2.16.1
------------


# Loading the models and model histories

In [None]:
# Directories (relative to this notebook) for saved models and training histories.
model_path = '../model/'
history_path = '../history/'

# TODO: the four assignments below have no right-hand side yet, so this cell is
# a SyntaxError until each trained model is actually loaded here.
dummy_model = 
nb_model = 
lr_model = 
cnn_model = 

# One saved training history per model.
# NOTE(review): np.load returns an ndarray, not a Keras History object; if these
# files hold a pickled dict they also need allow_pickle=True and .item() to
# unwrap it — confirm how the histories were saved.
dummy_hist = np.load(history_path + 'dummy_hist.npy')
nb_hist = np.load(history_path + 'nb_hist.npy')
lr_hist = np.load(history_path + 'lr_hist.npy')
cnn_hist = np.load(history_path + 'cnn_hist.npy')

# Loading the data

In [2]:
# Relative path to the directory holding the .npz archive
data_path = '../data/'

# np.load on an .npz returns a lazy NpzFile that keeps the archive open.
# Reading every split inside the context manager materialises the arrays in
# memory and then closes the underlying file handle.
with np.load(data_path + 'data.npz') as data:
    train_x = data['train_x']
    train_t = data['train_t']

    val_x = data['val_x']
    val_t = data['val_t']

    test_x = data['test_x']
    test_t = data['test_t']

# Every split must have exactly one label row per image.
assert train_x.shape[0] == train_t.shape[0], "Training image quantity mismatches label quantity"
assert val_x.shape[0] == val_t.shape[0], "Validation image quantity mismatches label quantity"
assert test_x.shape[0] == test_t.shape[0], "Test image quantity mismatches label quantity"

# Total image count across the three splits, and the per-image shape
# (taken from the training split).
num_images = train_x.shape[0] + val_x.shape[0] + test_x.shape[0]
image_shape = train_x.shape[1:]

print(f'{num_images} images with shape {image_shape}')

35110 images with shape (48, 48, 1)


In [3]:
# Flatten every image into a single feature vector (one row per image).
# Using -1 lets NumPy infer the feature count instead of hard-coding the
# product of two axes.
train_x_flat = train_x.reshape(train_x.shape[0], -1)
val_x_flat = val_x.reshape(val_x.shape[0], -1)
test_x_flat = test_x.reshape(test_x.shape[0], -1)

# Convert each label row to the index of its largest entry with one vectorised
# call instead of a Python-level loop over rows.
# NOTE(review): assumes the label arrays are 2-D (samples, classes) — confirm.
train_t_num = np.argmax(train_t, axis=1)
val_t_num = np.argmax(val_t, axis=1)
test_t_num = np.argmax(test_t, axis=1)

# Evaluate each model on test set
## Metrics:
- Accuracy
- F1 Score
- Precision
- Recall
- Area Under Curve (AUC), computed per class

In [5]:
def evaluate_metrics(name, model, inputs=None):
    """Score a model on the test split and print a summary of the results.

    Args:
        name: Display name printed at the top of the summary.
        model: Object exposing ``predict(inputs)``.
        inputs: Array handed to ``model.predict``. Defaults to the notebook's
            ``test_x``; pass e.g. ``test_x_flat`` for a model that expects
            flattened images.

    Returns:
        dict with the keys 'accuracy', 'f1score', 'precision', 'recall' and
        'auc'; 'auc' is itself a dict keyed by emotion name.
    """
    # NOTE(review): Accuracy, F1Score, Precision, Recall and MulticlassAUC are
    # not imported by this notebook's import cell — confirm where they are
    # meant to come from before running this function.

    # Class index -> emotion name; the position in the tuple is the class index.
    emotions = ('angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised')

    # Predict from the test images (the labels are only the ground truth).
    if inputs is None:
        inputs = test_x
    test_t_pred = model.predict(inputs)

    metrics = {}
    for key, metric_cls in (
        ('accuracy', Accuracy),
        ('f1score', F1Score),
        ('precision', Precision),
        ('recall', Recall),
    ):
        metric = metric_cls()
        metric.update_state(test_t, test_t_pred)
        metrics[key] = metric.result()

    # The per-class container must exist before it is filled, and every class
    # (including the last one) needs an entry for the summary below.
    metrics['auc'] = {}
    for i, emotion in enumerate(emotions):
        mc_auc = MulticlassAUC(t_label=i)
        mc_auc.update_state(test_t, test_t_pred)
        metrics['auc'][emotion] = mc_auc.result()

    summary = '{}:\n Accuracy: {:.2f}, F1-Score: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, AUC:'.format(
        name, metrics['accuracy'], metrics['f1score'], metrics['precision'], metrics['recall']
    )
    auc_lines = ''.join(
        '\n\t{}: {:.2f}'.format(emotion.capitalize(), metrics['auc'][emotion])
        for emotion in emotions
    )
    print(summary + auc_lines)

    return metrics

In [None]:
# Score every trained model on the test split, in a fixed order, and unpack
# the resulting metric dicts into one variable per model.
dummy_metrics, nb_metrics, lr_metrics, cnn_metrics = [
    evaluate_metrics(label, trained_model)
    for label, trained_model in (
        ('Dummy Classifier', dummy_model),
        ('Naive Bayes', nb_model),
        ('Logistic Regression', lr_model),
        ('CNN', cnn_model),
    )
]

# Evaluate each model history

In [None]:
def _history_dict(history):
    """Return the per-epoch metric mapping from a History-like object, a dict, or an np.load result."""
    if isinstance(history, np.ndarray):
        # np.save wraps a Python object in a 0-d object array; .item() unwraps it.
        # NOTE(review): assumes the saved history is such a 0-d array — confirm.
        history = history.item()
    # Objects with a `.history` attribute expose the mapping there; a plain
    # dict is used as-is.
    return getattr(history, 'history', history)


def _plot_metric_curves(train_curve, val_curve, test_value, title, ylabel):
    """Draw train/val curves over epochs plus a horizontal line at the test score."""
    fig, ax = plt.subplots()
    ax.plot(train_curve, label='train')
    ax.plot(val_curve, label='val')
    # Distinct colour and dash style so the test line is not confused with the
    # train curve.
    ax.axhline(test_value, color='C2', linestyle='--', label='test')
    ax.set(title=title, ylabel=ylabel, xlabel='epoch')
    ax.legend(loc='upper left')
    plt.show()


def evaluate_history(name, history, metrics):
    """Plot each metric's training history against its test-set score.

    Args:
        name: Model display name used in the figure titles.
        history: Per-epoch metric curves. Accepts an object with a ``.history``
            mapping, a plain dict, or the array returned by ``np.load`` for a
            saved dict.
        metrics: Test-set scores keyed by metric name; the 'auc' entry is a
            dict keyed by emotion name.

    One figure is shown per metric, and one per emotion for 'auc'.
    """
    curves = _history_dict(history)

    for metric in metrics.keys():
        if metric == 'auc':
            # TODO: ROC/AUC is not an epoch-based quantity, so this per-epoch
            # plot is likely the wrong view and should be reworked; use as a basis:
            # https://github.com/Tony607/ROC-Keras/blob/master/ROC-Keras.ipynb
            for emotion in metrics['auc'].keys():
                _plot_metric_curves(
                    curves['MulticlassAUC_' + emotion],
                    curves['val_MulticlassAUC_' + emotion],
                    metrics[metric][emotion],
                    '{} {} {}'.format(name, metric, emotion),
                    metric,
                )
        else:
            _plot_metric_curves(
                curves[metric],
                curves['val_' + metric],
                metrics[metric],
                '{} {}'.format(name, metric),
                metric,
            )

# Print associated graphs for comparisons

In [1]:
# TODO: research which comparison plots to produce across the models

In [None]:
# Plot history-versus-test figures for every model, in a fixed order.
for model_label, model_hist, model_scores in (
    ('Dummy Classifier', dummy_hist, dummy_metrics),
    ('Naive Bayes', nb_hist, nb_metrics),
    ('Logistic Regression', lr_hist, lr_metrics),
    ('CNN', cnn_hist, cnn_metrics),
):
    evaluate_history(model_label, model_hist, model_scores)