# CAI4104 Final Project: Baseline Models

In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import sklearn

# Report the interpreter and library versions in use so the results below
# can be tied to a concrete software environment.
print('------------')
print(f'### Python version: {sys.version}')
print(f'### NumPy version: {np.__version__}')
print(f'### SciPy version: {sp.__version__}')
print(f'### Scikit-learn version: {sklearn.__version__}')
print('------------')


------------
### Python version: 3.11.5 (main, Sep 11 2023, 13:54:46) [GCC 11.2.0]
### NumPy version: 1.24.3
### SciPy version: 1.11.1
### Scikit-learn version: 1.3.0
------------


## Loading the data

In [2]:
# Relative path to the directory containing the .npz data archive
data_path = '../data/'
data = np.load(os.path.join(data_path, 'data.npz'))

# Pull each split out of the archive: *_x holds the images, *_t the targets
train_x = data['train_x']
train_t = data['train_t']

val_x = data['val_x']
val_t = data['val_t']

test_x = data['test_x']
test_t = data['test_t']

# Every split must have exactly one target row per image
assert train_x.shape[0] == train_t.shape[0], "Training image quantity mismatches label quantity"
assert val_x.shape[0] == val_t.shape[0], "Validation image quantity mismatches label quantity"
assert test_x.shape[0] == test_t.shape[0], "Test image quantity mismatches label quantity"

# Summarise the dataset: total image count across splits and per-image shape
num_images = train_x.shape[0] + val_x.shape[0] + test_x.shape[0]
image_shape = train_x.shape[1:]

print(f'{num_images} images with shape {image_shape}')

35887 images with shape (48, 48, 1)


In [3]:
# Flatten every image into one feature vector per sample. Letting NumPy infer
# the feature count (-1) avoids hard-coding height * width, so the same code
# keeps working if the images ever carry more than one channel.
train_x_flat = train_x.reshape(train_x.shape[0], -1)
val_x_flat = val_x.reshape(val_x.shape[0], -1)
test_x_flat = test_x.reshape(test_x.shape[0], -1)

# Convert each target row to the index of its largest entry (the class id)
# with one vectorised call per split instead of a Python-level loop.
# NOTE(review): assumes the *_t arrays are 2-D (n_samples, n_classes),
# presumably one-hot encoded -- confirm against the data export.
train_t_num = np.argmax(train_t, axis=1)
val_t_num = np.argmax(val_t, axis=1)
test_t_num = np.argmax(test_t, axis=1)

## Baseline Models

In [4]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def evaluate(model):
    """Fit ``model`` on the training split and print its accuracies.

    The model is fitted in place on the notebook-level ``train_x_flat`` /
    ``train_t_num`` arrays, then used to predict both the training and the
    validation split. Standard and balanced accuracy are printed for each.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(train_y, val_y)``: the predictions for the training and
        validation splits, in that order.
    """
    model.fit(train_x_flat, train_t_num)

    # Predict the training split first, then validation
    train_y, val_y = (
        model.predict(features) for features in (train_x_flat, val_x_flat)
    )

    # Score each split with both metrics before anything is printed
    scorers = (accuracy_score, balanced_accuracy_score)
    train_std, train_bal = (score(train_t_num, train_y) for score in scorers)
    val_std, val_bal = (score(val_t_num, val_y) for score in scorers)

    print('Model Accuracies:')
    print(f'Training:   Standard: {train_std}   Balanced: {train_bal}')
    print(f'Validation:   Standard: {val_std}   Balanced: {val_bal}')

    return train_y, val_y



### Dummy Classifier

In [5]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent training label,
# giving a floor that every real model should beat.
model_dm = DummyClassifier(strategy='most_frequent')

# Keep the predictions in named variables rather than letting the returned
# tuple be echoed as raw cell output.
train_preds_dm, val_preds_dm = evaluate(model_dm)


Model Accuracies:
Training:   Standard: 0.2519904458598726   Balanced: 0.14285714285714285
Validation:   Standard: 0.245216422069478   Balanced: 0.14285714285714285


(array([3, 3, 3, ..., 3, 3, 3]), array([3, 3, 3, ..., 3, 3, 3]))

### Naive Bayes Classifier

In [30]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# var_smoothing value selected by the grid search below; stored as a constant
# so the notebook can be re-run without repeating the search.
NB_VAR_SMOOTHING = 0.0012328467394420659

# The search fits many candidate models and is slow, so it is opt-in.
# Set to True to re-tune var_smoothing on the training split.
RUN_NB_GRID_SEARCH = False

if RUN_NB_GRID_SEARCH:
    params_NB = {'var_smoothing': np.logspace(0, -9, num=100)}
    # Search over a fresh, unconfigured estimator rather than an already
    # tuned model instance.
    gs_NB = GridSearchCV(estimator=GaussianNB(),
                         param_grid=params_NB,
                         verbose=1,
                         scoring='accuracy')
    gs_NB.fit(train_x_flat, train_t_num)
    print(gs_NB.best_params_)
    NB_VAR_SMOOTHING = gs_NB.best_params_['var_smoothing']

# Create and train the model with the tuned hyperparameter
model_nb = GaussianNB(var_smoothing=NB_VAR_SMOOTHING)
train_preds_nb, val_preds_nb = evaluate(model_nb)



Model Accuracies:
Training:   Standard: 0.214171974522293   Balanced: 0.23205352196206208
Validation:   Standard: 0.21344974921047744   Balanced: 0.228924214403077


### Logistic Regression Classifier

In [11]:
from sklearn.linear_model import LogisticRegression

seed = 42

# L2-regularised logistic regression. The explicit multi_class argument is
# omitted: it is deprecated in newer scikit-learn releases, and with the
# lbfgs solver on multiclass labels the default already fits a multinomial
# (softmax) model, so the fitted model is unchanged.
model_lr = LogisticRegression(
    penalty='l2',
    C=0.5,
    solver='lbfgs',
    max_iter=5000,
    random_state=seed,
)

# Fit the model and report training / validation accuracy
train_preds_lr, val_preds_lr = evaluate(model_lr)


Model Accuracies:
Training:   Standard: 0.4967356687898089   Balanced: 0.4453942526704614
Validation:   Standard: 0.3457180011146201   Balanced: 0.2857314189551788
