# AI Apprentice Lab 6 Starter Code
#### Deep Neural Network Classification

1. Load and pre-process dataset using the provided pipeline
2. Separate training and testing data
3. Construct a Convolutional Neural Network model in Keras with specific input and output shapes
4. Train the CNN model on the training data
5. Evaluate results

Resources:

https://keras.io/api/layers/convolution_layers/convolution2d/

https://keras.io/api/layers/pooling_layers/max_pooling2d/

https://keras.io/api/layers/regularization_layers/dropout/

In [None]:
#      IMPORT REQUIRED LIBRARIES
import pandas
import numpy as np
import sklearn
import pickle

#### The pre-processing pipeline for the CIFAR-10 dataset (https://en.wikipedia.org/wiki/CIFAR-10) is provided below

In [None]:
############### FUNCTIONS NEEDED FOR PREPROCESSING PIPELINE
# This function performs a min-max normalization on a numpy array
def normalize(x):
    """
        Min-max normalize an array so that its values lie in [0, 1].

        The minimum and maximum are taken over the whole array (one global
        pair of values, not one per row or per channel).

        argument
            - x: a non-empty numpy array (or array-like) of numbers
        return
            - numpy array with the same shape as x; the smallest value maps
              to 0 and the largest to 1. If every element of x is equal the
              result is all zeros.
        raises
            - ValueError if x is empty (np.min / np.max have no value to return)
    """
    x = np.asarray(x)
    min_val = np.min(x)
    max_val = np.max(x)

    value_range = max_val - min_val
    if value_range == 0:
        # Constant input: (x - min_val) is already all zeros, so dividing by 1
        # yields zeros instead of the NaNs a 0/0 division would produce.
        value_range = 1

    return (x - min_val) / value_range

def one_hot_encode(x, n_classes=10):
    """
        argument
            - x: a list of integer class labels, each a valid column index
                 for a row of length n_classes
            - n_classes: number of classes, i.e. number of columns of the
                         result (defaults to 10)
        return
            - one hot encoding matrix (number of labels, number of class)
        raises
            - IndexError if a label is out of range for n_classes
    """
    # An explicit integer dtype keeps an empty label list usable as an index array.
    labels = np.asarray(x, dtype=np.intp)
    encoded = np.zeros((len(labels), n_classes))

    # Row i gets a 1 in column labels[i]; all rows are set in one indexing operation.
    encoded[np.arange(len(labels)), labels] = 1

    return encoded


In [None]:
##############################------------    DATA PREPROCESSING PIPELINE              ---------################
##############################                       DO NOT MODIFY                              #################


def load_cfar10_batch(cifar10_dataset_folder_path, batch_id):
    """
        Read one training batch file and return its images and labels.

        The file '<cifar10_dataset_folder_path>/data_batch_<batch_id>' is
        unpickled; its 'data' entry is reshaped to (n, 3, 32, 32) and then
        transposed to (n, 32, 32, 3).

        argument
            - cifar10_dataset_folder_path: folder containing the batch files
            - batch_id: number appended to 'data_batch_' to form the file name
        return
            - (features, labels): the reshaped image array and the batch's
              'labels' entry, unchanged
    """
    batch_path = cifar10_dataset_folder_path + '/data_batch_' + str(batch_id)

    with open(batch_path, mode='rb') as batch_file:
        # the batch files must be unpickled with the 'latin1' encoding
        raw_batch = pickle.load(batch_file, encoding='latin1')

    flat_pixels = raw_batch['data']
    channels_first = flat_pixels.reshape((len(flat_pixels), 3, 32, 32))
    # move the channel axis to the end: (n, 3, 32, 32) -> (n, 32, 32, 3)
    features = channels_first.transpose(0, 2, 3, 1)

    return features, raw_batch['labels']


def _preprocess_and_save(normalize, one_hot_encode, features, labels, filename):
    """
        Transform features and labels with the given callables and pickle
        the resulting (features, labels) tuple to a file.

        argument
            - normalize: callable applied to features
            - one_hot_encode: callable applied to labels
            - features: input passed to normalize
            - labels: input passed to one_hot_encode
            - filename: path of the pickle file to write (overwritten if present)
    """
    features = normalize(features)
    labels = one_hot_encode(labels)

    # The context manager closes (and therefore flushes) the file even if
    # pickling raises, instead of leaving the handle open.
    with open(filename, 'wb') as out_file:
        pickle.dump((features, labels), out_file)


def preprocess_and_save_data(cifar10_dataset_folder_path, normalize, one_hot_encode):
    """
        Preprocess the training batches, a validation split and the test
        batch, and pickle each result into the current working directory.

        Files written:
            - 'preprocess_batch_<i>.p' for i = 1..5: all but the last 10% of batch i
            - 'preprocess_validation.p': the last 10% of every batch, stacked
            - 'preprocess_training.p': the contents of the 'test_batch' file

        argument
            - cifar10_dataset_folder_path: folder with 'data_batch_1..5' and 'test_batch'
            - normalize: callable applied to every feature array
            - one_hot_encode: callable applied to every label list/array
    """
    held_out_features = []
    held_out_labels = []

    for batch_number in range(1, 6):
        batch_features, batch_labels = load_cfar10_batch(cifar10_dataset_folder_path, batch_number)

        # number of trailing samples (10% of the batch) reserved for validation
        holdout = int(len(batch_features) * 0.1)

        # everything before the held-out tail is normalized, one-hot encoded
        # and saved to its own per-batch file
        batch_filename = 'preprocess_batch_' + str(batch_number) + '.p'
        _preprocess_and_save(normalize, one_hot_encode,
                             batch_features[:-holdout], batch_labels[:-holdout],
                             batch_filename)

        # the held-out tail is only collected here; it is preprocessed once,
        # together with the tails of the other batches, after the loop
        held_out_features.extend(batch_features[-holdout:])
        held_out_labels.extend(batch_labels[-holdout:])

    # preprocess the stacked validation samples from all batches
    _preprocess_and_save(normalize, one_hot_encode,
                         np.array(held_out_features), np.array(held_out_labels),
                         'preprocess_validation.p')

    # read the test batch
    with open(cifar10_dataset_folder_path + '/test_batch', mode='rb') as test_file:
        test_batch = pickle.load(test_file, encoding='latin1')

    # same layout change as for the training batches: (n, 3, 32, 32) -> (n, 32, 32, 3)
    raw_pixels = test_batch['data']
    test_features = raw_pixels.reshape((len(raw_pixels), 3, 32, 32)).transpose(0, 2, 3, 1)
    test_labels = test_batch['labels']

    # NOTE: the test data is stored under the name 'preprocess_training.p';
    # the file name is kept as-is so existing readers of that file keep working.
    _preprocess_and_save(normalize, one_hot_encode,
                         np.array(test_features), np.array(test_labels),
                         'preprocess_training.p')
    

def load_preprocess_training_batch(batch_id):
    """
    Load one preprocessed training batch from 'preprocess_batch_<batch_id>.p'.

    argument
        - batch_id: number of the batch file to read
    return
        - (features, labels) exactly as stored in the pickle file
    """
    filename = 'preprocess_batch_' + str(batch_id) + '.p'

    # Close the file as soon as it has been read instead of leaking the handle.
    with open(filename, mode='rb') as batch_file:
        features, labels = pickle.load(batch_file)

    return features, labels

###############################################################################################################

In [None]:
#TODO: Process and save the supplied data using preprocess_and_save_data function defined above


#### Let's create a Convolutional Neural Network model for classifying images

In [None]:
#################---------  CREATE THE EMPTY SEQUENTIAL MODEL  --------#####################################
import keras
CNNmodel = keras.Sequential()
############################################################################################################
#TODO: Stack several Convolutional, Dropout and MaxPooling layers

#TODO: Flatten the image using Flatten layer and add several Dense layers
#      The final layer should have the same amount of dimensions as labels and softmax activation

##############################################################################################################
#################---------     COMPILE THE MODEL AND PRINT ITS LAYERS     --------############################
# categorical cross-entropy loss, Adam optimizer, accuracy reported as metric
CNNmodel.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
CNNmodel.summary()

In [None]:
#            LOAD THE PRE_PROCESSED DATA INTO PYTHON
# The preprocessing pipeline writes five training batch files (1 to 5), so all
# five are loaded here; range(1, 5) would silently drop the last batch.
batch_features = []
batch_labels = []
for i in range(1, 6):
    feat, lab = load_preprocess_training_batch(i)
    batch_features.append(feat)
    batch_labels.append(lab)

# Join the batches once at the end instead of re-copying the growing array
# on every loop iteration.
features = np.concatenate(batch_features)
labels = np.concatenate(batch_labels)
#TODO: Separate training and testing data



#### Now the data is loaded and we are ready for training

In [None]:
#TODO: Train the model on training data, try to have as many epochs as possible


#### Let's evaluate the results

In [None]:
#TODO: Obtain model accuracy on testing and training data
#      To use metrics.accuracy_score, first convert the one-hot labels and the model's
#      predicted probabilities to class indices with np.argmax(..., axis=1)


*Created by Nicholas Stepanov: https://github.com/renowator*