In [None]:
from sklearn.decomposition import TruncatedSVD
import numpy as np
import os

# Code to calculate the entropy of each classifier matrix using singular values.
def compute_svd_entropy(singular_values_mat):
    """Compute the Shannon entropy (in bits) of each class's singular values.

    Each column of ``singular_values_mat`` holds the singular values of one
    class. A column is normalized to sum to 1 and treated as a discrete
    probability distribution, whose base-2 entropy is then computed.

    Args:
        singular_values_mat: 2-D array-like of shape (k, n_classes); column j
            holds the k singular values of class j.

    Returns:
        A tuple ``(class_entropies, avg_entropy)`` where ``class_entropies``
        maps each column index (int) to its entropy and ``avg_entropy`` is the
        mean of those entropies.
    """
    singular_values_mat = np.asarray(singular_values_mat, dtype=float)
    # Take the class count from the matrix itself instead of assuming 6.
    n_classes = singular_values_mat.shape[1]
    class_entropies = {}

    for j in range(n_classes):
        singular_values = singular_values_mat[:, j]
        total_sum = np.sum(singular_values)

        if total_sum == 0:
            # An all-zero spectrum carries no information; report 0 rather
            # than dividing by zero and propagating NaN into the average.
            class_entropies[j] = np.float64(0.0)
            continue

        # Normalize the singular values so they sum to 1.
        normalized_singular_values = singular_values / total_sum

        # Shannon entropy; the small offset avoids log2(0) for zero entries.
        class_entropies[j] = -np.sum(
            normalized_singular_values * np.log2(normalized_singular_values + 1e-12)
        )

    # Compute average entropy to return as well.
    avg_entropy = np.mean(list(class_entropies.values()))

    return class_entropies, avg_entropy


def Emotion_SVD_Classification(X_train, X_test, y_train, y_test, k):
    """Classify emotions by nearest rank-k SVD subspace.

    For every emotion present in the training labels, a rank-k truncated SVD
    is fitted to that emotion's training rows. Each test row is projected onto
    every class subspace and reconstructed; the class with the smallest
    Euclidean reconstruction error is the prediction. The function also prints
    the average and per-class entropies of the singular values.

    Args:
        X_train: 2-D array (n_train_samples, n_features) of training rows.
        X_test: 2-D array (n_test_samples, n_features) of test rows.
        y_train: Iterable of emotion names for the training rows. Every name
            must be a key of the emotion map below, otherwise KeyError is raised.
        y_test: Iterable of emotion names for the test rows (same constraint).
        k: Number of singular components kept per class.

    Returns:
        A tuple ``(rate, test_predict, test_confusion)``:
        ``rate`` is the fraction of test rows classified correctly,
        ``test_predict`` is a float array holding the predicted integer label
        of each test row, and ``test_confusion`` is a square float matrix with
        true labels on the rows and predicted labels on the columns.
    """
    emotion_map = {'Angry': 0, 'Disgust': 1, 'Fear': 2, 'Happy': 3, 'Neutral': 4, 'Sad': 5}

    def convert_emotions_to_integers(y_data, emotion_map):
        # Integer labels are easier to work with than the emotion strings.
        return np.array([emotion_map[emotion] for emotion in y_data])

    def Rank_k_SVD(X_train, y_train, k):
        """Fit one rank-k truncated SVD per class found in ``y_train``.

        Returns the per-class bases (n_features, k, n_classes), the singular
        values (k, n_classes), and the class label belonging to each slice.
        """
        unique_emotions = np.unique(y_train)  # Classes actually present in the data.
        n_classes = len(unique_emotions)

        train_u = np.zeros((X_train.shape[1], k, n_classes))
        singular_values_mat = np.zeros((k, n_classes))

        for j, emotion in enumerate(unique_emotions):
            emotion_data = X_train[np.flatnonzero(y_train == emotion)]

            # No random_state is passed, so with scikit-learn's default
            # randomized solver results may differ slightly between runs.
            svd = TruncatedSVD(n_components=k)
            svd.fit(emotion_data)

            # Samples are rows, so the feature-space basis is formed by the
            # right singular vectors: components_ has shape (k, n_features)
            # and its transpose gives one basis vector per column.
            train_u[:, :, j] = svd.components_.T
            singular_values_mat[:, j] = svd.singular_values_

        return train_u, singular_values_mat, unique_emotions

    def svd_rank_k_error(X_test, train_u):
        """Return the (n_classes, n_samples) reconstruction errors of X_test."""
        n_classes = train_u.shape[2]
        test_svdres = np.zeros((n_classes, X_test.shape[0]))

        for j in range(n_classes):
            basis = train_u[:, :, j]
            # Expansion coefficients of every test row in this class basis.
            coefficients = X_test @ basis
            # Rank-k reconstruction, then turned in place into the residual
            # so no second (n_samples, n_features) buffer is needed.
            residual = coefficients @ basis.T
            np.subtract(X_test, residual, out=residual)
            # Row-wise Euclidean norm without materializing residual**2.
            test_svdres[j] = np.sqrt(np.einsum('ij,ij->i', residual, residual))

        # The hope is that rows of a class are reconstructed best by that
        # class's own basis.
        return test_svdres

    def compute_confusion_matrix(test_predict, y_test_int, n_classes=len(emotion_map)):
        # Rows are true labels, columns are predicted labels; a perfect
        # classifier only populates the diagonal.
        test_confusion = np.zeros((n_classes, n_classes))
        true_labels = np.asarray(y_test_int).astype(int)
        predicted_labels = np.asarray(test_predict).astype(int)
        # np.add.at accumulates correctly when an index pair repeats.
        np.add.at(test_confusion, (true_labels, predicted_labels), 1)
        return test_confusion

    # Convert emotions to integer labels for the training and test data.
    y_train_int = convert_emotions_to_integers(y_train, emotion_map)
    y_test_int = convert_emotions_to_integers(y_test, emotion_map)

    # Compute the necessary values for our calculations.
    train_u, singular_values_mat, class_labels = Rank_k_SVD(X_train, y_train_int, k)
    test_svdres = svd_rank_k_error(X_test, train_u)

    # Prediction step. The row with the smallest residual identifies the class
    # slice; map that slice index back to its real label so predictions stay
    # correct even when some emotion is missing from the training data.
    # These results would likely differ if a measure other than Euclidean
    # distance was used.
    test_predict = class_labels[np.argmin(test_svdres, axis=0)].astype(float)

    # Confusion matrix calculation using the integer labels.
    test_confusion = compute_confusion_matrix(test_predict, y_test_int)

    num_correct_matched = np.trace(test_confusion)
    rate = num_correct_matched / X_test.shape[0]

    # Computations to get the entropy.
    class_entropies, avg_entropy = compute_svd_entropy(singular_values_mat)

    print("Average entropy:", avg_entropy)
    print("Per-class entropies:", class_entropies)

    return rate, test_predict, test_confusion



def generate_train_and_test_data(folder, train_percent=0.5):
    """Load every ``.npz`` file in ``folder`` and split the files into train/test sets.

    The split is made at file level: the file list is shuffled with NumPy's
    global random state and the first ``int(n_files * train_percent)`` files
    become the training set. Each file must contain the arrays ``coords`` and
    ``emotions``; the arrays of a split are concatenated along axis 0 and the
    coordinates are flattened to one row per sample.

    Note: a better setup would hold out a separate validation set and pick the
    rank k on it, but generally higher k works better than lower k.

    Args:
        folder: Directory containing the ``.npz`` files.
        train_percent: Fraction of the files assigned to the training set.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` arrays are 2-D
        (n_samples, n_flattened_features) and the ``y`` arrays hold the
        concatenated ``emotions`` entries.

    Raises:
        ValueError: If either split ends up without files, because
            ``np.concatenate`` cannot concatenate an empty list.
    """

    def load_split(file_names):
        # Load and stack the 'coords' and 'emotions' arrays of the given files.
        coords_list = []
        emotions_list = []
        for file_name in file_names:
            # NOTE(security): allow_pickle=True can execute arbitrary code when
            # loading untrusted files; only use this on data you produced.
            # The context manager closes the underlying file handle.
            with np.load(os.path.join(folder, file_name), allow_pickle=True) as data:
                coords_list.append(data['coords'])
                emotions_list.append(data['emotions'])

        X = np.concatenate(coords_list, axis=0)
        y = np.concatenate(emotions_list, axis=0)
        # Flatten every sample to a single feature row.
        return X.reshape(X.shape[0], -1), y

    # os.listdir returns names in arbitrary order; sort first so that a seeded
    # shuffle yields the same split on every machine.
    npz_files = sorted(f for f in os.listdir(folder) if f.endswith(".npz"))

    # Randomly shuffle the files and split into training and testing sets.
    np.random.shuffle(npz_files)
    train_size = int(len(npz_files) * train_percent)
    train_files = npz_files[:train_size]
    test_files = npz_files[train_size:]

    X_train, y_train = load_split(train_files)
    X_test, y_test = load_split(test_files)

    return X_train, X_test, y_train, y_test

In [None]:
# Pulls the data from the folder and separates it into training and testing data.
# train_percent=0.7 assigns 70% of the .npz files (not of the individual samples) to training.
folder = "video_frames_npz"
X_train, X_test, y_train, y_test = generate_train_and_test_data(folder, train_percent=0.7)

In [None]:
k = 5  # Use rank 5 for SVD.
# The call also prints the average and per-class singular-value entropies.
rate, test_predict, test_confusion = Emotion_SVD_Classification(X_train, X_test, y_train, y_test, k)

# rate is the share of test samples on the confusion-matrix diagonal;
# confusion-matrix rows are true labels, columns are predicted labels.
print("\nAccuracy Rate:", rate)
print("Confusion Matrix:\n", test_confusion)

Average entropy: 0.6203845325485418
Per-class entropies: {0: np.float64(0.6389778136414924), 1: np.float64(0.6152517837837952), 2: np.float64(0.6116633801958407), 3: np.float64(0.6350117373512845), 4: np.float64(0.60463989789449), 5: np.float64(0.6167625824243479)}

Accuracy Rate: 0.2984742548518926
Confusion Matrix:
 [[ 7200.  3917.  5164.  3291.  4164.  4007.]
 [ 4608. 10588.  4550.  5289.  2954.  4154.]
 [ 5729.  3559.  7877.  2872.  5003.  4996.]
 [ 2638.  3165.  2405. 13736.  2873.  3137.]
 [ 4095.  2982.  5069.  1295.  5524.  5387.]
 [ 4540.  5565.  6053.  1944.  6302.  6857.]]


In [None]:
k = 25  # Use rank 25 for SVD.
# The call also prints the average and per-class singular-value entropies.
rate, test_predict, test_confusion = Emotion_SVD_Classification(X_train, X_test, y_train, y_test, k)

# rate is the share of test samples on the confusion-matrix diagonal;
# confusion-matrix rows are true labels, columns are predicted labels.
print("\nAccuracy Rate:", rate)
print("Confusion Matrix:\n", test_confusion)

Average entropy: 0.9092454603026966
Per-class entropies: {0: np.float64(0.9401354466223886), 1: np.float64(0.9219908474864223), 2: np.float64(0.8909344278311391), 3: np.float64(0.943929383029909), 4: np.float64(0.8707473664034893), 5: np.float64(0.8877352904428307)}

Accuracy Rate: 0.373464600061099
Confusion Matrix:
 [[ 8285.  3253.  5621.  2285.  4724.  3575.]
 [ 3425. 13276.  5465.  3666.  2491.  3820.]
 [ 4588.  3224. 11072.  2167.  4007.  4978.]
 [ 2778.  2693.  3709. 14739.  2265.  1770.]
 [ 3176.  1653.  3980.  1483.  8868.  5192.]
 [ 4242.  3446.  6678.  2016.  6327.  8552.]]


In [None]:
k = 50  # Use rank 50 for SVD.
# The call also prints the average and per-class singular-value entropies.
rate, test_predict, test_confusion = Emotion_SVD_Classification(X_train, X_test, y_train, y_test, k)

# rate is the share of test samples on the confusion-matrix diagonal;
# confusion-matrix rows are true labels, columns are predicted labels.
print("\nAccuracy Rate:", rate)
print("Confusion Matrix:\n", test_confusion)

Average entropy: 0.9838690442852428
Per-class entropies: {0: np.float64(1.0165242460159976), 1: np.float64(1.0003959298225205), 2: np.float64(0.9637729112723595), 3: np.float64(1.0208975849866897), 4: np.float64(0.9411158876307879), 5: np.float64(0.9605077059831006)}

Accuracy Rate: 0.43922669448783497
Confusion Matrix:
 [[ 9282.  1969.  4663.  2206.  5419.  4204.]
 [ 2859. 17382.  4432.  1714.  1764.  3992.]
 [ 3822.  2404. 11493.  2276.  4518.  5523.]
 [ 2421.  1986.  2674. 15954.  2885.  2034.]
 [ 2782.   914.  3489.  1010. 11897.  4260.]
 [ 3421.  3140.  6123.  1335.  7049. 10193.]]


In [None]:
k = 100  # Use rank 100 for SVD.
# The call also prints the average and per-class singular-value entropies.
rate, test_predict, test_confusion = Emotion_SVD_Classification(X_train, X_test, y_train, y_test, k)

# rate is the share of test samples on the confusion-matrix diagonal;
# confusion-matrix rows are true labels, columns are predicted labels.
print("\nAccuracy Rate:", rate)
print("Confusion Matrix:\n", test_confusion)