# CAI4104 Final Project: Baseline Models

In [2]:
import os
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import sklearn

# Let's check our software versions
print('------------')
print('### Python version: ' + __import__('sys').version)
print(f'### NumPy version: {np.__version__}')
print(f'### SciPy version: {sp.__version__}')
print(f'### Scikit-learn version: {sklearn.__version__}')
print('------------')


------------
### Python version: 3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]
### NumPy version: 1.26.3
### SciPy version: 1.11.4
### Scikit-learn version: 1.2.2
------------


## Loading the data

In [3]:
# Relative path to .npy files
data_path = '../data/'

# Relative path to .npy files
data_path = '../data/'
data = np.load(data_path + 'data.npz')

# Load numpy arrays
train_x = data['train_x']
train_t = data['train_t']

val_x = data['val_x']
val_t = data['val_t']

test_x = data['test_x']
test_t = data['test_t']

assert train_x.shape[0] == train_t.shape[0], "Training image quantity mismatches label quantity"
assert val_x.shape[0] == val_t.shape[0], "Validation image quantity mismatches label quantity"
assert test_x.shape[0] == test_t.shape[0], "Test image quantity mismatches label quantity"

num_images = train_x.shape[0] + val_x.shape[0] + test_x.shape[0]
image_shape = train_x.shape[1:]

print(f'{num_images} images with shape {image_shape}')

35887 images with shape (48, 48, 1)


In [7]:
train_x_flat = train_x.reshape(train_x.shape[0], train_x.shape[1]*train_x.shape[2])
val_x_flat = val_x.reshape(val_x.shape[0], val_x.shape[1]*val_x.shape[2])
test_x_flat = test_x.reshape(test_x.shape[0], test_x.shape[1]*test_x.shape[2])

train_t_num = np.array([np.argmax(a) for a in train_t])
val_t_num = np.array([np.argmax(a) for a in val_t])
test_t_num = np.array([np.argmax(a) for a in test_t])

## Baseline Models

In [8]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

def evaluate(model):

    model.fit(train_x_flat, train_t_num)
    train_y = model.predict(train_x_flat)
    val_y = model.predict(val_x_flat)

    train_acc = accuracy_score(train_t_num, train_y)
    train_bal_acc = balanced_accuracy_score(train_t_num, train_y)

    val_acc = accuracy_score(val_t_num, val_y)
    val_bal_acc = balanced_accuracy_score(val_t_num, val_y)

    print('Model Accuracies:')
    print(f'Training:   Standard: {train_acc}   Balanced: {train_bal_acc}')
    print(f'Validation:   Standard: {val_acc}   Balanced: {val_bal_acc}')

    return train_y, val_y



### Dummy Classifier

In [9]:
from sklearn.dummy import DummyClassifier

model_dm = DummyClassifier(strategy='most_frequent')
evaluate(model_dm)


Model Accuracies:
Training:   Standard: 0.2519904458598726   Balanced: 0.14285714285714285
Validation:   Standard: 0.245216422069478   Balanced: 0.14285714285714285


(array([3, 3, 3, ..., 3, 3, 3], dtype=int64),
 array([3, 3, 3, ..., 3, 3, 3], dtype=int64))

### Naive Bayes Classifier

In [10]:
from sklearn.naive_bayes import GaussianNB

# Create and train model
model_nb = GaussianNB()
evaluate(model_nb)


Model Accuracies:
Training:   Standard: 0.21926751592356689   Balanced: 0.2313865646325879
Validation:   Standard: 0.21716514954486346   Balanced: 0.22750685490921052


(array([6, 5, 1, ..., 5, 3, 3], dtype=int64),
 array([2, 1, 6, ..., 6, 6, 6], dtype=int64))

### Logistic Regression Classifier

In [11]:
from sklearn.linear_model import LogisticRegression

seed = 42

# Create and train model
model_lr = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=seed)
evaluate(model_lr)


Model Accuracies:
Training:   Standard: 0.4984076433121019   Balanced: 0.45535518601018093
Validation:   Standard: 0.33921605052944453   Balanced: 0.2982973365701582


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(array([4, 3, 3, ..., 3, 0, 3], dtype=int64),
 array([6, 4, 4, ..., 5, 0, 4], dtype=int64))