# AI Apprentice Lab 6 Solution
#### Convolutional Neural Network Classification

1. Load and pre-process dataset using the provided pipeline
2. Separate training and testing data
3. Construct a Convolutional Neural Network model in Keras with specific input and output shapes
4. Train the CNNmodel on training data
5. Evaluate results

In [1]:
#      IMPORT REQUIRED LIBRARIES
import pandas
import numpy as np
import sklearn
import pickle

#### Below is provided the pre-processing pipeline for CIFAR-10 dataset: https://en.wikipedia.org/wiki/CIFAR-10

In [2]:
############### FUNCTIONS NEEDED FOR PREPROCESSING PIPELINE
# This function performs a min-max normalization on a numpy array
def normalize(x):
    min_val = np.min(x)
    max_val = np.max(x)
    x = (x-min_val) / (max_val-min_val)
    return x

def one_hot_encode(x):
    """
        argument
            - x: a list of labels
        return
            - one hot encoding matrix (number of labels, number of class)
    """
    encoded = np.zeros((len(x), 10))
    
    for idx, val in enumerate(x):
        encoded[idx][val] = 1
    
    return encoded


In [3]:
##############################------------    DATA PREPROCESSING PIPELINE              ---------################
##############################                       DO NOT MODIFY                              #################


def load_cfar10_batch(cifar10_dataset_folder_path, batch_id):
    with open(cifar10_dataset_folder_path + '/data_batch_' + str(batch_id), mode='rb') as file:
        # note the encoding type is 'latin1'
        batch = pickle.load(file, encoding='latin1')
        
    features = batch['data'].reshape((len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
    labels = batch['labels']
        
    return features, labels


def _preprocess_and_save(normalize, one_hot_encode, features, labels, filename):
    features = normalize(features)
    labels = one_hot_encode(labels)

    pickle.dump((features, labels), open(filename, 'wb'))


def preprocess_and_save_data(cifar10_dataset_folder_path, normalize, one_hot_encode):
    n_batches = 5
    valid_features = []
    valid_labels = []

    for batch_i in range(1, n_batches + 1):
        features, labels = load_cfar10_batch(cifar10_dataset_folder_path, batch_i)
        
        # find index to be the point as validation data in the whole dataset of the batch (10%)
        index_of_validation = int(len(features) * 0.1)

        # preprocess the 90% of the whole dataset of the batch
        # - normalize the features
        # - one_hot_encode the lables
        # - save in a new file named, "preprocess_batch_" + batch_number
        # - each file for each batch
        _preprocess_and_save(normalize, one_hot_encode,
                             features[:-index_of_validation], labels[:-index_of_validation], 
                             'preprocess_batch_' + str(batch_i) + '.p')

        # unlike the training dataset, validation dataset will be added through all batch dataset
        # - take 10% of the whold dataset of the batch
        # - add them into a list of
        #   - valid_features
        #   - valid_labels
        valid_features.extend(features[-index_of_validation:])
        valid_labels.extend(labels[-index_of_validation:])

    # preprocess the all stacked validation dataset
    _preprocess_and_save(normalize, one_hot_encode,
                         np.array(valid_features), np.array(valid_labels),
                         'preprocess_validation.p')

    # load the test dataset
    with open(cifar10_dataset_folder_path + '/test_batch', mode='rb') as file:
        batch = pickle.load(file, encoding='latin1')

    # preprocess the testing data
    test_features = batch['data'].reshape((len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
    test_labels = batch['labels']

    # Preprocess and Save all testing data
    _preprocess_and_save(normalize, one_hot_encode,
                         np.array(test_features), np.array(test_labels),
                         'preprocess_training.p')
    

def load_preprocess_training_batch(batch_id):
    """
    Load the Preprocessed Training data and return them in batches of <batch_size> or less
    """
    filename = 'preprocess_batch_' + str(batch_id) + '.p'
    features, labels = pickle.load(open(filename, mode='rb'))

    # Return the training data in batches of size <batch_size> or less
    return features, labels

###############################################################################################################

In [4]:
#TODO: Process and save the supplied data using preprocess_and_save_data function defined above
preprocess_and_save_data("Data/cifar-10-batches-py",  normalize, one_hot_encode)

#### Let's create a Convolutional Neural Network model for classiying images

In [5]:
#################---------  INSTANTIATE SEQUENTIAL THE MODEL  --------######################################
import keras
CNNmodel = keras.Sequential()
############################################################################################################
#TODO: Stack several Convolutional, Dropout and MaxPooling layers
CNNmodel.add(keras.layers.Conv2D(128, kernel_size=(2, 2), strides=(2,2), activation='relu', input_shape=(32,32,3)))
CNNmodel.add(keras.layers.Dropout(rate=0.05))
CNNmodel.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
CNNmodel.add(keras.layers.Conv2D(128, kernel_size=(2, 2), strides=(2,2), activation='relu'))
CNNmodel.add(keras.layers.Dropout(rate=0.05))
CNNmodel.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
CNNmodel.add(keras.layers.Conv2D(128, kernel_size=(2, 2), strides=(2,2), activation='relu'))
CNNmodel.add(keras.layers.Dropout(rate=0.05))
#TODO: Flatten the image using Flatten layer and add several Dense layers
#      The final layer should have the same amount of dimensions as labels and softmax activation
CNNmodel.add(keras.layers.Flatten())
CNNmodel.add(keras.layers.Dense(128, activation="relu"))
CNNmodel.add(keras.layers.Dropout(rate=0.2))
CNNmodel.add(keras.layers.Dense(10, activation='softmax')) #Output layer
##############################################################################################################
#################---------            COMPILE THE MODEL         --------######################################
CNNmodel.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
CNNmodel.summary()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 16, 16, 128)       1664      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16, 16, 128)       0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 8, 128)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4, 4, 128)         65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 4, 4, 128)         0         
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 2, 2, 128)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 1, 1, 128)        

In [6]:
#            LOAD THE PRE_PROCESSED DATA INTO PYTHON
for i in range(1,5):
    feat, lab = load_preprocess_training_batch(i)
    if i == 1:
        features = feat
        labels = lab
    else:
        features = np.concatenate((features, feat))
        labels = np.concatenate((labels, lab))
#TODO: Separate training and testing data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state=25)


#### Now the data is loaded and we are ready for training

In [7]:
#TODO: Train the model on training data, try to have as many epochs as possible
CNNmodel.fit(x_train, y_train, batch_size=16, epochs=50, validation_data=(x_test, y_test))

Train on 25200 samples, validate on 10800 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7f3d4c55aef0>

#### Let's evaluate the results

In [8]:
#TODO: Obtain model accuracy on testing and training data
from sklearn import metrics
test_pred = CNNmodel.predict(x_test)
train_pred = CNNmodel.predict(x_train)
print("Train Accuracy:", metrics.accuracy_score(np.argmax(np.array(y_train), axis=1), np.argmax(np.array(train_pred), axis=1)))
print("\n\nTest Accuracy:", metrics.accuracy_score(np.argmax(np.array(y_test), axis=1), np.argmax(test_pred,axis=1)))

Train Accuracy: 0.888968253968254


Test Accuracy: 0.5821296296296297


*Created by Nicholas Stepanov: https://github.com/renowator*