## Imports

In [1]:
import cv2
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from keras.datasets import mnist
import warnings

## Function definitions

In [2]:
def apply_rotation(image, degrees):
    """
    Rotate an image about its centre point with an OpenCV affine warp.

    Parameters:
    - image (numpy array): Image to rotate; its first two axes are read as (rows, columns).
    - degrees (float): Rotation angle in degrees, handed to cv2.getRotationMatrix2D.

    Returns:
    - numpy array: The rotated image, warped onto a canvas of the input's width and height.
    """
    height, width = image.shape[:2]
    # OpenCV takes points and sizes as (x, y) / (width, height), i.e. columns first.
    output_size = (width, height)
    pivot = (width / 2, height / 2)
    # Scale factor 1: rotate only, no zoom.
    affine = cv2.getRotationMatrix2D(pivot, degrees, 1)
    return cv2.warpAffine(image, affine, output_size)

In [3]:
def augment_images(data, oversampling_factor, rotation_choices):
    """
    Augments a dataset by appending randomly chosen, rotated copies of its images.

    Parameters:
    - data (tuple): A tuple containing images and their labels (images, labels).
      `images` must be a numpy array whose first axis indexes the samples.
    - oversampling_factor (float): The factor by which to oversample the data.
      The number of added samples is int(len(images) * (oversampling_factor - 1));
      a factor of 1 or less therefore adds nothing.
    - rotation_choices (list): A list of angles from which one will be randomly selected for each image rotation.

    Returns:
    - tuple: (images, labels) as new numpy arrays holding the original samples
      followed by the rotated samples. When no samples are added, copies of the
      inputs are returned.
    """
    images, labels = data
    total_images = images.shape[0]
    extra_samples = int(total_images * (oversampling_factor - 1))

    # With nothing to add, np.array([]) would have shape (0,) and could not be
    # concatenated with the multi-dimensional image array (ValueError), and it
    # would silently promote the labels to float. Return copies instead, so the
    # caller still gets fresh arrays just like the concatenate path.
    if extra_samples <= 0:
        return np.array(images), np.array(labels)

    augmented_imgs = []
    augmented_lbls = []
    for _ in range(extra_samples):
        # Sample with replacement; the index is drawn before the angle so the
        # random stream is consumed in a fixed order for a given seed.
        index = random.randint(0, total_images - 1)
        degrees = random.choice(rotation_choices)
        augmented_imgs.append(apply_rotation(images[index], degrees))
        augmented_lbls.append(labels[index])

    augmented_imgs = np.array(augmented_imgs)
    augmented_lbls = np.array(augmented_lbls)
    return np.concatenate([images, augmented_imgs]), np.concatenate([labels, augmented_lbls])

In [4]:
def model_training(data):
    """
    Fit a RandomForestClassifier, choosing hyper-parameters with a small grid search.

    Each sample is flattened to a 1-D feature vector, 80% of the samples are
    kept by a random train/test split (the held-out 20% is discarded), and a
    2-fold GridSearchCV is run over the number of trees and the split criterion.

    Parameters:
    - data (tuple): A tuple containing features and targets for training (features, targets).

    Returns:
    - RandomForestClassifier: The best estimator found by GridSearchCV.
    """
    samples, labels = data
    flat_samples = samples.reshape(samples.shape[0], -1)

    # Only the training portion of the split is used; the rest is dropped.
    split = train_test_split(flat_samples, labels, test_size=0.2)
    fit_samples, fit_labels = split[0], split[2]

    search_space = {
        'n_estimators': [10, 25],
        'criterion': ['gini', 'entropy'],
    }
    search = GridSearchCV(RandomForestClassifier(), search_space, cv=2, verbose=3)
    search.fit(fit_samples, fit_labels)
    return search.best_estimator_

In [5]:
def performance_evaluation(model, data, min_accuracy):
    """
    Check whether a model's accuracy on a labelled dataset is above a threshold.

    Parameters:
    - model (RandomForestClassifier): The fitted model to evaluate; only its predict() method is used.
    - data (tuple): A tuple containing features and the true labels (features, true_labels).
      Each sample is flattened to a 1-D vector before prediction.
    - min_accuracy (float): The accuracy the model must strictly exceed.

    Returns:
    - bool: True if the accuracy is strictly greater than min_accuracy, False otherwise.
    """
    samples, expected = data
    flat_samples = samples.reshape(samples.shape[0], -1)
    predictions = model.predict(flat_samples)
    accuracy = accuracy_score(expected, predictions)
    return accuracy > min_accuracy

## Function calls and Training

In [6]:
warnings.filterwarnings('ignore')
# Seed Python's and NumPy's global RNGs so repeated runs draw the same samples.
random.seed(42)
np.random.seed(42)

# Accuracy the model must strictly exceed to count as satisfactory.
ACCURACY_THRESHOLD = 0.95
# Each augmentation step doubles the dataset it is applied to.
OVERSAMPLING_FACTOR = 2
# Upper bound on retraining rounds per angle. Every round doubles the training
# set, so an unbounded loop would exhaust memory if the threshold is never met.
MAX_RETRAIN_ROUNDS = 3

(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

initial_model = model_training((train_images, train_labels))
initial_performance = performance_evaluation(initial_model, (test_images, test_labels), ACCURACY_THRESHOLD)
print(f'Initial model performance satisfactory: {initial_performance}')

angles_to_test = [-30, -20, -10, 10, 20, 30]
for angle in angles_to_test:
    print(f'Augmenting with rotation by {angle} degrees.')
    # The test set grows cumulatively: rotated copies from earlier angles stay in it.
    test_images, test_labels = augment_images((test_images, test_labels), OVERSAMPLING_FACTOR, [angle])
    performance_ok = performance_evaluation(initial_model, (test_images, test_labels), ACCURACY_THRESHOLD)

    retrain_rounds = 0
    while not performance_ok and retrain_rounds < MAX_RETRAIN_ROUNDS:
        print('Performance not satisfactory, augmenting training data and retraining model.')
        train_images, train_labels = augment_images((train_images, train_labels), OVERSAMPLING_FACTOR, [angle])
        initial_model = model_training((train_images, train_labels))
        performance_ok = performance_evaluation(initial_model, (test_images, test_labels), ACCURACY_THRESHOLD)
        retrain_rounds += 1

    if not performance_ok:
        # Give up on this angle rather than doubling the training set forever.
        print(f'Performance still not satisfactory after {MAX_RETRAIN_ROUNDS} retraining rounds '
              f'for rotation by {angle} degrees; moving on.')


Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END ...criterion=gini, n_estimators=10;, score=0.934 total time=   0.8s
[CV 2/2] END ...criterion=gini, n_estimators=10;, score=0.928 total time=   0.8s
[CV 1/2] END ...criterion=gini, n_estimators=25;, score=0.952 total time=   2.0s
[CV 2/2] END ...criterion=gini, n_estimators=25;, score=0.951 total time=   2.0s
[CV 1/2] END criterion=entropy, n_estimators=10;, score=0.932 total time=   0.9s
[CV 2/2] END criterion=entropy, n_estimators=10;, score=0.932 total time=   0.9s
[CV 1/2] END criterion=entropy, n_estimators=25;, score=0.951 total time=   2.2s
[CV 2/2] END criterion=entropy, n_estimators=25;, score=0.951 total time=   2.2s
Initial model performance satisfactory: True
Augmenting with rotation by -30 degrees.
Performance not satisfactory, augmenting training data and retraining model.
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END ...criterion=gini, n_estimators=10;, score=0.937 total tim