In [None]:
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Conv2D, BatchNormalization, Activation
from keras.layers import AveragePooling2D, Input, Flatten, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras.callbacks import ReduceLROnPlateau
from keras.regularizers import l2
from keras.models import Model
from keras.datasets import cifar10
import keras.backend as K

from time import sleep
from math import ceil
from collections import Counter
import cv2
import matplotlib.pyplot as plt
from datetime import datetime
from random import shuffle
import numpy as np
import os
import pandas as pd

# Data Cleaning
### Functions for data cleaning/organizing, which are called in the next cell

In [None]:
def join_image_paths(imagePath):
    """Return the full paths of every entry in ``imagePath`` in random order.

    Each name reported by ``os.listdir`` is joined onto ``imagePath``; the
    resulting list is shuffled before it is returned.
    """
    full_paths = [os.path.join(imagePath, entry) for entry in os.listdir(imagePath)]
    shuffle(full_paths)
    return full_paths

def remove_some_eth(df, label, reduce, poss_labels):
    """Down-sample the rows of one ethnicity to even out class sizes.

    # Arguments
        df (DataFrame): frame with an 'ethnicity' column
        label: ethnicity value whose rows are to be thinned out
        reduce (float): fraction of the ``label`` rows to keep
        poss_labels: accepted but not used by this function

    # Returns
        new_df (DataFrame): untouched rows plus the sampled ``label`` rows,
        shuffled and re-indexed from 0. The per-label counts are printed.
    """
    is_target = df['ethnicity'] == label

    # Rows of every other ethnicity are kept in full
    untouched = df[~is_target].reset_index(drop=True)

    # Only a random fraction of the targeted ethnicity survives
    thinned = df[is_target].sample(frac=reduce).reset_index(drop=True)

    # Recombine and shuffle so the kept rows are not grouped at the end
    combined = pd.concat([untouched, thinned])
    new_df = combined.sample(frac=1).reset_index(drop=True)

    # Report the resulting class balance
    print('Ethnicity %s reduced... new counts...' % label)
    for eth in sorted(set(new_df['ethnicity'].tolist())):
        count = new_df[new_df['ethnicity'] == eth].shape[0]
        print('%s: %s' % (eth, count))

    return new_df

def replace_image_names(paths, DF):
    """Build a (path, ethnicity) frame by matching image paths to label rows.

    ``DF`` is first passed through ``format_subs`` (which rewrites its 'image'
    column in place). For every path a subject key is parsed from the file
    name using one of two naming formats, and the ethnicity of the first
    ``DF`` row whose 'image' equals that key is recorded.

    # Arguments
        paths (list): image file paths
        DF (DataFrame): label frame with 'image' and 'ethnicity' columns

    # Returns
        new_df (DataFrame): one row per path with columns 'path', 'ethnicity'

    # Raises
        ValueError / IndexError: a path matches neither naming format
        IndexError: no row of ``DF`` matches the parsed subject key
    """
    print('Replacing filenames in DF with full paths')
    df = format_subs(DF)
    poss_labels = sorted(set(DF['ethnicity'].tolist()))

    # Rows are collected in a list and the frame is built once at the end;
    # growing a DataFrame row by row with .loc copies it on every insert.
    rows = []
    for onePath in paths:
        # The first format is recognised by an integer just before the first
        # underscore. Only the ValueError from int() selects the fallback, so
        # unrelated errors (and KeyboardInterrupt) are no longer swallowed.
        try:
            int(onePath.split('_')[0].split('.')[-1])
        except ValueError:
            # Second format: the token after the first underscore must be an int
            int(onePath.split('_')[1])
            fileSub = '.'.join(onePath.split('/')[-1].split('_')[:2])
        else:
            fileSub = '-'.join(onePath.split('/')[-1].split('-')[2:]).split('_')[0]

        # Ethnicity of the first label row for this subject
        fileEth = df[df['image'] == fileSub]['ethnicity'].iloc[0]
        rows.append([onePath, fileEth])

    new_df = pd.DataFrame(rows, columns=['path', 'ethnicity'])

    for oneLabel in poss_labels:
        print('%s: %s' % (oneLabel, new_df[new_df['ethnicity'] == oneLabel].shape[0]))

    print('\n')

    return new_df

def format_subs(DF):
    """Cut every 'image' value at its first underscore and return ``DF``.

    The 'image' column of the passed frame is overwritten in place; the same
    frame object is returned.
    """
    def _before_underscore(name):
        return name.split('_')[0]

    DF['image'] = DF['image'].apply(_before_underscore)
    return DF

def split_sets(DF, tr_path, val_path, ts_path, labels_path):
    """Shuffle ``DF`` and split it 60/20/20 into train/validation/test sets.

    The three subsets and the full (unshuffled) ``DF`` are also written to
    CSV so later runs can load them instead of rebuilding them.

    # Arguments
        DF (DataFrame): full labelled data
        tr_path, val_path, ts_path (str): CSV destinations for the subsets
        labels_path (str): CSV destination for ``DF`` itself

    # Returns
        train, validate, test (DataFrame): subsets, each re-indexed from 0
    """
    shuffled = DF.sample(frac=1).reset_index(drop=True)

    # Positional slicing gives the same cut points that np.split used, without
    # depending on DataFrame.swapaxes, which newer pandas versions deprecate.
    n_rows = len(shuffled)
    train_end = int(0.6 * n_rows)
    val_end = int(0.8 * n_rows)

    train = shuffled.iloc[:train_end].reset_index(drop=True)
    validate = shuffled.iloc[train_end:val_end].reset_index(drop=True)
    test = shuffled.iloc[val_end:].reset_index(drop=True)

    # Writes data to files for later quicker loading
    train.to_csv(tr_path, index=False)
    validate.to_csv(val_path, index=False)
    test.to_csv(ts_path, index=False)
    DF.to_csv(labels_path, index=False)

    return train, validate, test

def getOneHot(arr, key):
    """One-hot encode ``arr`` against the ordered label list ``key``.

    # Arguments
        arr (ndarray): 1-D array of labels
        key (list): labels; a label's position in this list is its hot column

    # Returns
        encoded (ndarray): float array of shape (len(arr), len(key))

    # Raises
        ValueError: a label in ``arr`` is not present in ``key``
    """
    print('Processing one-hot encoding...')
    n_rows, n_classes = arr.shape[0], len(key)
    encoded = np.zeros((n_rows, n_classes))
    for row, label in enumerate(arr):
        encoded[row, key.index(label)] = 1

    return encoded

def conv_oneHot(DF):
    """Parse the 'ethnicity' column from strings back into numeric arrays.

    Every value is converted with ``str_to_oneHot``; the column is replaced
    in place and the same frame is returned.
    """
    DF['ethnicity'] = DF['ethnicity'].apply(str_to_oneHot)

    return DF
    
def str_to_oneHot(string):
    """Convert a textual vector such as ``'[0. 1. 0.]'`` into a float array.

    Square brackets and commas are stripped, and the remaining
    whitespace-separated tokens are parsed as floats.

    # Arguments
        string (str): textual representation of a numeric vector

    # Returns
        out_list (ndarray): 1-D float64 array (empty when no numbers remain)
    """
    cleaned = string.replace('[', '').replace(']', '').replace(',', '')
    # The builtin float is used as the dtype: the np.float alias was removed
    # in NumPy 1.24 and raises AttributeError there.
    out_list = np.array(cleaned.split(), dtype=float)

    return out_list

def findImages(cropPaths, ethData):
    """Unimplemented placeholder: ignores both arguments and returns ``None``."""
    return None

def load_sets(imagePath='images/cropped', ethPath='ResNetData/imageData/cssff_ethLabels.csv',
              tr_path='ResNetData/imageData/eth_train.csv',
              val_path='ResNetData/imageData/eth_validation.csv',
              test_path='ResNetData/imageData/eth_testing.csv',
              labels_path='ResNetData/imageData/eth_labels.csv', proc_choice=1):
    """Build or reload the training, validation and testing sets.

    # Arguments
        imagePath (str): directory holding the cropped images
        ethPath (str): CSV with the raw 'image' / 'ethnicity' labels
        tr_path, val_path, test_path (str): CSVs of the three subsets
        labels_path (str): CSV of the full processed label frame
        proc_choice (int): 1 rebuilds everything from ``ethPath`` and the
            image directory and writes the CSVs; 2 loads the CSVs written by
            an earlier ``proc_choice=1`` run

    # Returns
        training_set, validation_set, testing_set (DataFrame): the subsets
        ethData (DataFrame): the full label frame
        poss_labels (list): sorted distinct ethnicity labels from ``ethPath``

    # Raises
        ValueError: ``proc_choice`` is neither 1 nor 2
    """
    # Reject an unknown mode up front; previously it only surfaced as an
    # UnboundLocalError after the files had already been read.
    if proc_choice not in (1, 2):
        raise ValueError('proc_choice must be 1 (rebuild sets) or 2 (load saved sets), got %r'
                         % (proc_choice,))

    ethData = pd.read_csv(ethPath, index_col=False)
    poss_labels = sorted(set(ethData['ethnicity'].tolist()))

    cropPaths = join_image_paths(imagePath)

    if proc_choice == 1:
        print('Processing paths with first choice... This may take some time...')
        # The local poss_labels is used here. The code used to read the global
        # `classes`, which is only assigned from this function's return value
        # and therefore does not exist on a fresh run.
        ethData = remove_some_eth(replace_image_names(cropPaths, ethData), label='Caucasian', reduce=0.6,
                                  poss_labels=poss_labels)
        ethData['ethnicity'] = list(getOneHot(np.array(ethData['ethnicity'].tolist()), poss_labels))

        training_set, validation_set, testing_set = split_sets(ethData, tr_path, val_path, test_path, labels_path)

    else:
        print('Processing paths with second choice... Loading paths from files...')
        training_set = conv_oneHot(pd.read_csv(tr_path))
        validation_set = conv_oneHot(pd.read_csv(val_path))
        testing_set = conv_oneHot(pd.read_csv(test_path))
        ethData = pd.read_csv(labels_path)

    print('\nTraining, validation and testing sets loaded...')
    print('Sizes:\nTraining: %s\nValidation: %s\nTesting: %s' % (training_set.shape[0], validation_set.shape[0],
                                                                testing_set.shape[0]))

    return training_set, validation_set, testing_set, ethData, poss_labels
        

# Data prep and architecture param declaration

In [None]:
# Collect the (shuffled) full paths of everything in the cropped-image folder
# and report how many entries were found.
image_path = 'images/cropped'
imagePaths = join_image_paths(image_path)
print(len(imagePaths))

# Get training, validation and testing sets. Also gets full data and de-duplicated class list.
# proc_choice=2 reloads the CSVs written by an earlier proc_choice=1 run.
# `classes` is read as a global by several functions defined further down.
tr_set, val_set, ts_set, eth_data, classes = load_sets(proc_choice=2)
num_classes = len(classes)

# Input image dimensions (height, width, channels).
input_shape = [224, 224, 3]

# Data loading and model functions

In [None]:
def load_images(image_paths, size=224):
    """Load, resize and scale a list of images into a single array.

    # Arguments
        image_paths (list): paths of the image files to read
        size (int): side length every image is resized to

    # Returns
        all_images (ndarray): float array of shape
        (len(image_paths), size, size, 3)
    """
    # NOTE(review): assumes every file decodes to a 3-channel image with
    # values in 0-255 — confirm for the formats actually used.
    all_images = np.zeros((len(image_paths), size, size, 3))

    for i, _path in enumerate(image_paths):
        # Read image
        image = plt.imread(_path)
        # Resize
        img = cv2.resize(image, (size, size))
        # Normalize the resized image. The original divided the un-resized
        # `image`, which discarded the resize and broke the assignment below
        # for any image that was not already size x size.
        img = np.divide(img, [255., 255., 255.])
        # Append
        all_images[i] = img

    return all_images

def printO(string, filename, header=True, custom_header=False, new_line=True):
    """Print a message and mirror it to a log file, or reset that file.

    Used so progress survives if the notebook session is closed.

    # Arguments
        string (str): message to log; the sentinel 'CLEARFILE' truncates
            ``filename`` instead of logging
        filename (str): path of the log file
        header (bool): with 'CLEARFILE', write a 'Training began at ...' line
            (only when ``custom_header`` is falsy)
        custom_header: with 'CLEARFILE', text written as the header when truthy
        new_line (bool): append '\\n' to the message written to the file
    """
    if string == 'CLEARFILE':
        # Pick the header for the freshly truncated log
        if custom_header:
            fill = custom_header
        elif header:
            fill = 'Training began at %s\n\n' % datetime.now()
        else:
            fill = ''

        # The context manager closes the handle even if the write fails
        with open(filename, 'w') as f:
            f.write(fill)
        print('User output file %s cleared...' % filename.split('/')[-1])

    else:
        print(string)
        if new_line:
            string += '\n'
        with open(filename, 'a') as f:
            f.write(string)
        
def getMaxBatch(DF, max_set):
    """Split off the first ``max_set`` rows of ``DF``.

    # Arguments
        DF (DataFrame): rows still waiting to be processed
        max_set (int): maximum number of rows to take (must be >= 1)

    # Returns
        remaining (DataFrame): rows after the first ``max_set``, re-indexed
        current_batch (DataFrame): the first ``max_set`` rows, re-indexed

    # Raises
        ValueError: ``max_set`` is smaller than 1 (callers loop until the
            remaining frame is empty, which would never happen)
    """
    if max_set < 1:
        raise ValueError('max_set must be at least 1, got %r' % (max_set,))

    DF = DF.reset_index(drop=True)

    # Positional slicing is end-exclusive. The label-based .loc[:max_set] used
    # before is end-inclusive, so it returned max_set + 1 rows while only
    # max_set were dropped and one row was repeated in the next chunk.
    current_batch = DF.iloc[:max_set]
    remaining = DF.iloc[max_set:]

    return remaining.reset_index(drop=True), current_batch.reset_index(drop=True)

def getBatch(batchDF, image_max):
    """Take the next chunk of rows from ``batchDF`` and load its images.

    # Returns
        remaining (DataFrame): rows left after the chunk was removed
        images: output of ``load_images`` for the chunk's 'path' values
        labels (list): the chunk's 'ethnicity' values
    """
    remaining, chunk = getMaxBatch(batchDF, image_max)

    images = load_images(chunk['path'].tolist())
    labels = chunk['ethnicity'].tolist()

    return remaining, images, labels

def mostCommonIncorrectPred(incorrect_dict):
    """Summarise mispredictions per actual class.

    Reads the module-level ``classes`` list for row and column order.

    # Arguments
        incorrect_dict (dict): actual label -> list of wrongly predicted labels

    # Returns
        outStr (str): one line per class naming its most frequent misprediction
        outCSV (str): confusion counts as CSV text (header row, then one row
            per actual class; every count is followed by a comma)
    """
    summary_parts = ['\n\nActual Ethnicity: Most Common Misprediction\n']
    csv_parts = ['ACT_LABEL,' + ','.join(classes) + '\n']

    for actual in classes:
        counts = [incorrect_dict[actual].count(predicted) for predicted in classes]
        cells = ''.join(str(n) + ',' for n in counts)
        csv_parts.append(actual + ',' + cells + '\n')
        summary_parts.append('%s: %s\n' % (actual, classes[np.argmax(counts)]))

    return ''.join(summary_parts), ''.join(csv_parts)

def postBatchProcessing(c, wholeDF, tempDF, image_max, ticker, file_to_write, stage='training'):
    """Log progress after one chunk of images and restart the chunk timer.

    # Arguments
        c (int): number of chunks finished before this one
        wholeDF (DataFrame): the complete set being iterated
        tempDF (DataFrame): rows of the set not yet processed
        image_max (int): chunk size in rows
        ticker (datetime): time the chunk just finished was started
        file_to_write (str): progress log path
        stage (str): label used in the log lines

    # Returns
        c (int): incremented chunk counter
        ticker (datetime): current time
    """
    c += 1
    total_chunks = ceil(wholeDF.shape[0] / image_max)
    printO('%s - At iteration %s of %s for batches' % (stage, c, total_chunks),
           filename=file_to_write)

    # Time taken by the last chunk, scaled by the number of chunks still to go
    estimate = (datetime.now() - ticker) * ceil(tempDF.shape[0] / image_max)
    printO('Estimated time for this epoch\'s %s completion: %s\n' % (stage, estimate),
           filename=file_to_write)

    return c, datetime.now()

# Learning rate scheduler
# lr to be reduced based on number of epochs
def lr_schedule(epoch):
    """Return the learning rate for ``epoch`` and print it.

    The base rate of 1e-3 is scaled by the factor of the first threshold the
    epoch exceeds: >180 -> 0.5e-3, >150 -> 1e-3, >100 -> 1e-2, >50 -> 1e-1.
    """
    lr = 1e-3
    # Ordered from the highest threshold down so the first match wins
    steps = ((180, 0.5e-3), (150, 1e-3), (100, 1e-2), (50, 1e-1))
    for threshold, factor in steps:
        if epoch > threshold:
            lr *= factor
            break
    print('Learning rate: ', lr)
    return lr

# A function to build layers for the Resnet:
    # 1. Conv
    # 2. Batch normalization
    # 3. Activation
def resnet_layer(inputs, num_filters=16, kernel_size=3, strides=1, activation='relu', batch_normalization=True):
    """Apply Conv2D, then optional BatchNormalization, then optional Activation.

    # Arguments
        inputs (tensor): input tensor from input image or previous layer
        num_filters (int): Conv2D number of filters
        kernel_size (int): Conv2D square kernel dimensions
        strides (int): Conv2D square stride dimensions
        activation (string): activation name, or None to skip the activation
        batch_normalization (bool): whether to include batch normalization

    # Returns
        x (tensor): tensor as input to the next layer
    """
    # 'same'-padded convolution with He-normal init and L2 weight decay
    x = Conv2D(num_filters,
               kernel_size=kernel_size,
               strides=strides,
               padding='same',
               kernel_initializer='he_normal',
               kernel_regularizer=l2(1e-4))(inputs)

    if batch_normalization:
        x = BatchNormalization()(x)

    if activation is None:
        return x
    return Activation(activation)(x)


def resnet_create(input_shape, depth, num_classes=10, stack_size=6):
    """Build a ResNet-style classifier from stacks of residual blocks.

    The first stack keeps the feature-map size. The first block of each later
    stack uses strides=2, which halves the feature map, and each stack uses
    twice as many filters as the one before it. With a 224x224 input and the
    default six stacks this gives:
    stack 0: 224x224, 16
    stack 1: 112x112, 32
    stack 2:  56x56,  64
    stack 3:  28x28,  128
    stack 4:  14x14,  256
    stack 5:  7x7,  512
    AveragePooling2D(pool_size=7) 7x7, 512 -> 1x1, 512
    Flatten 1x1, 512 -> 512

    # Arguments
        input_shape (tensor): shape of input image tensor
        depth (int): must be 6n+2; n residual blocks are built per stack
        num_classes (int): size of the softmax output layer
        stack_size (int): number of stacks

    # Returns
        model (Model): Keras model instance

    # Raises
        ValueError: ``depth`` is not of the form 6n+2
    """
    if (depth - 2) % 6 != 0:
        raise ValueError('depth should be 6n+2 (eg 20, 32, 44)')

    blocks_per_stack = int((depth - 2) / 6)
    filters = 16

    inputs = Input(shape=input_shape)
    x = resnet_layer(inputs=inputs)

    for stack_idx in range(stack_size):
        for block_idx in range(blocks_per_stack):
            # Only the first block of every stack after the first downsamples
            downsample = stack_idx > 0 and block_idx == 0
            strides = 2 if downsample else 1

            y = resnet_layer(inputs=x, num_filters=filters, strides=strides)
            y = resnet_layer(inputs=y, num_filters=filters, activation=None)
            if downsample:
                # 1x1 projection so the shortcut matches the changed dimensions
                x = resnet_layer(inputs=x, num_filters=filters, kernel_size=1, strides=strides,
                                 activation=None, batch_normalization=False)

            # Residual connection followed by the block's activation
            x = keras.layers.add([x, y])
            x = Activation('relu')(x)

        # Each stack doubles the filter count
        filters *= 2

    # Classifier head
    pooled = AveragePooling2D(pool_size=7)(x)
    flat = Flatten()(pooled)
    outputs = Dense(num_classes, activation='softmax', kernel_initializer='he_normal')(flat)

    return Model(inputs=inputs, outputs=outputs)

def getAcc(model, X_set, Y_set, incorrect_dict):
    """Compute accuracy on one batch and record every misprediction.

    Reads the module-level ``classes`` list to map class indices to labels.

    # Arguments
        model: object whose ``predict(X_set)`` yields one score vector per sample
        X_set: batch inputs passed to ``model.predict``
        Y_set: one-hot targets, indexable in the same order as the predictions
        incorrect_dict (dict): actual label -> list of wrong predictions; any
            falsy value starts a fresh dict with one empty list per class

    # Returns
        accuracy (float): fraction of samples whose argmax matches the target
        incorrect_dict (dict): the updated misprediction record
    """
    preds = model.predict(X_set)
    if not incorrect_dict:
        incorrect_dict = {key: [] for key in classes}

    n_correct = 0
    for idx, prediction in enumerate(preds):
        predicted_idx = np.argmax(prediction)
        actual_idx = np.argmax(Y_set[idx])
        if predicted_idx == actual_idx:
            n_correct += 1
            continue
        # File the wrong guess under the label the sample really has
        incorrect_dict[classes[actual_idx]].append(classes[predicted_idx])

    return n_correct / len(preds), incorrect_dict

def midTrainTest(model, X_batch, Y_batch, incorrect_dict, acc, batches, file_to_write, batchTime,
                             stage='training'):
    """Score the current batch, log progress and restart the batch timer.

    Reads the module-level ``batch_size``.

    # Arguments
        model: model handed to ``getAcc``
        X_batch, Y_batch: inputs and targets of the batch to score
        incorrect_dict: misprediction record handed to ``getAcc``
        acc (list): accuracies so far; this batch's accuracy is appended
        batches (int): number of batches processed so far
        file_to_write (str): progress log path
        batchTime (datetime): start of the interval being timed
        stage (str): label used in the log lines

    # Returns
        acc (list), incorrect_dict (dict), batchTime (datetime): the updated
        accuracy list and misprediction record, and the current time
    """
    batch_acc, incorrect_dict = getAcc(model, X_batch, Y_batch, incorrect_dict)
    acc.append(batch_acc)

    printO("%s - Batch : %s" % (stage, batches), filename=file_to_write)

    n_batches = len(X_batch) // batch_size
    printO("%s - of batch %s" % (stage, n_batches), filename=file_to_write)

    # Elapsed time divided by 10, scaled by the batches still outstanding
    per_batch = (datetime.now() - batchTime) / 10
    printO("Estimated time for %s batch completion: %s" % (stage, per_batch * (n_batches - batches)),
           filename=file_to_write)

    return acc, incorrect_dict, datetime.now()

def updateEpochMetrics(epoch_metrics, metrics_file, acc, allAcc, loss, epoch, stage, file_to_write):
    """Record one stage's mean accuracy and loss for an epoch and log them.

    The 'train' stage writes 'EPOCH,TR_ACC,TR_LOSS,' without a newline and the
    'validation' stage completes the same CSV row with 'VAL_ACC,VAL_LOSS'.

    # Arguments
        epoch_metrics (dict): 'train' / 'validation' -> list of [acc, loss]
        metrics_file (str): CSV file the metrics row is appended to
        acc (list): per-batch accuracies (fractions) of this stage
        allAcc: accepted but not used by this function
        loss: ignored; it is overwritten with the placeholder [0, 0]
        epoch (int): zero-based epoch index
        stage (str): 'train' or 'validation'
        file_to_write (str): progress log path

    # Returns
        epoch_metrics (dict): the same dict with this stage's entry appended

    # Raises
        ValueError: ``stage`` is neither 'train' nor 'validation'
    """
    # Fail with a clear message; an unknown stage previously surfaced as a
    # KeyError or UnboundLocalError further down.
    if stage not in ('train', 'validation'):
        raise ValueError("stage must be 'train' or 'validation', got %r" % (stage,))

    # Loss is not tracked; zeros keep the CSV columns populated
    loss = [0, 0]

    mean_acc = np.mean(acc) * 100
    mean_loss = np.mean(loss)

    if stage == 'train':
        # The training half of the row is left open for the validation half
        new_line = False
        print('DEBUG: ' + str(mean_acc))
        print('DEBUG: ' + str(len(acc)))
        print('DEBUG: ' + str(acc))
        met = '%s,%s,%s,' % (epoch+1, mean_acc, mean_loss)
    else:
        new_line = True
        met = '%s,%s' % (mean_acc, mean_loss)

    epoch_metrics[stage].append([mean_acc, mean_loss])
    printO(met, filename=metrics_file, new_line=new_line)
    printO('%s mean of accuracy and loss for this epoch: %s' % (stage, epoch_metrics[stage][-1]),
          filename=file_to_write)

    return epoch_metrics
def completeEpoch(epoch_metrics, metrics_file, acc, allAcc, loss, incorrect_dict, incorrectPreds_file,
                      epoch_tick, epochs, epoch, file_to_write, models_base):
    """Close out an epoch: log mispredictions, save the model if best, log timing.

    The module-level ``model`` is saved when this epoch's accuracy equals the
    best accuracy seen so far.

    # Arguments
        epoch_metrics, metrics_file: accepted but not used by this function
        acc (list): per-batch accuracies (fractions) of the epoch
        allAcc (list): per-epoch accuracies so far; this epoch's is appended
        loss: ignored; it is overwritten with the placeholder [0, 0]
        incorrect_dict (dict): actual label -> list of wrong predictions
        incorrectPreds_file (str): file the confusion CSV is appended to
        epoch_tick (datetime): start time of this epoch
        epochs (int): total number of epochs
        epoch (int): zero-based index of this epoch
        file_to_write (str): progress log path
        models_base (str): directory for the saved model and its log

    # Returns
        allAcc (list): the updated accuracy history
        epoch_tick (datetime): current time, i.e. the start of the next epoch
    """
    # Loss is not tracked; zeros keep the log format intact
    loss = [0, 0]

    epoch_acc = np.mean(acc) * 100
    allAcc.append(epoch_acc)
    s, outCSV = mostCommonIncorrectPred(incorrect_dict)
    printO(outCSV, filename=incorrectPreds_file)

    # A new best (or tied best) accuracy: save the model with a summary log
    if max(allAcc) == allAcc[-1]:
        model_path = os.path.join(models_base, 'ResNet-Ethnicity_Acc-%.2f_.h5' % allAcc[-1])
        log_path = os.path.join(models_base, 'OUTLOG_ResNet-Ethnicity_Acc-%.2f_.txt' % allAcc[-1])
        printO('CLEARFILE', filename=log_path)
        outStr = 'LOG INFO FOR ETHNICITY TRAINING BEST MODEL FOUND \n\nEPOCH: %s\nACCURACY: %s\nLOSS: %s' % (
            epoch+1, allAcc[-1], np.mean(loss))
        outStr += s
        printO(outStr, filename=log_path)

        model.save(model_path)

    # The elapsed time is sampled once so the logged epoch duration and the
    # remaining-time estimate are derived from the same value.
    now = datetime.now()
    elapsed = now - epoch_tick
    printO('Accuracy of epoch: %.2f' % epoch_acc, filename=file_to_write)
    printO('Time of epoch: %s' % (elapsed,), filename=file_to_write)
    printO('As of %s, estimated time remaining for training/validation: %s' % (now,
                            elapsed * (epochs - (epoch + 1))),
          filename=file_to_write)

    return allAcc, datetime.now()

In [None]:
# Augmentation generator that randomly flips images horizontally and vertically.
# NOTE(review): the validation loop further down also draws its batches from
# this generator, so validation images are flipped too — confirm that is intended.
imageGen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True)

In [None]:
# Creating model with depth of 8 (one residual block per stack) and stack size of 6
depth = 8
stack_size = 6

# Pass the declared settings; previously depth was repeated as a literal and
# stack_size silently fell back to the function default.
model = resnet_create(input_shape=input_shape, depth=depth, num_classes=len(classes), stack_size=stack_size)

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr_schedule(0)), metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
import traceback  # used only to log the full stack trace if training fails

# Training configuration: images are loaded in chunks of image_max rows and
# each chunk is fed to the model in mini-batches of batch_size.
epochs=50
image_max=3200
batch_size=128
allAcc = []

base_models = 'ResNetData/models'
base_logs = 'ResNetData/logs'
progress_log = os.path.join(base_logs, 'ResNet-printout.txt')
metrics_path = os.path.join(base_logs, 'ResNet-ethModelMetrics.csv')
incorrect_preds_path = os.path.join(base_logs, 'ResNet-incorrectPredMetrics.csv')

# Start every run with fresh log files
printO('CLEARFILE', filename=progress_log)
printO('CLEARFILE', filename=metrics_path, custom_header='EPOCH,TR_ACC,TR_LOSS,VAL_ACC,VAL_LOSS\n')
printO('CLEARFILE', filename=incorrect_preds_path, header=False)

ticker = datetime.now()
epoch_tick = datetime.now()
epoch_metrics = {'train': [], 'validation': []} # Metrics list for loss and accuracy to be stored in

try:
    for i in range(epochs):
        loss = [0, 0]
        printO('\n' + '='*30 + '\n' + 'Epoch %s of %s' % (i+1, epochs) + '\n' + '='*30 + '\n',
                           filename=progress_log)

        # ---- Training pass ----
        c = 0
        acc = []
        # Start each epoch with an empty misprediction record. This name was
        # previously unassigned on the first epoch, so the first midTrainTest
        # call raised NameError and training aborted immediately.
        incorrect_dict = False
        temp_train = tr_set
        while temp_train.shape[0] > 0:
            temp_train, X_tr, Y_tr = getBatch(temp_train, image_max)

            batchTime = datetime.now()
            batches = 0

            for X_batch, Y_batch in imageGen.flow(X_tr, Y_tr, batch_size=batch_size):
                model.fit(X_batch, Y_batch, verbose=0)
                batches += 1
                acc, incorrect_dict, batchTime = midTrainTest(model, X_batch, Y_batch, incorrect_dict, acc,
                                                      batches, progress_log, batchTime, 'training')
                # flow() loops forever, so stop once the chunk has been covered
                if batches >= len(X_tr) / batch_size:
                    break
                elif batches % 10 == 0:
                    # NOTE(review): the batch was already scored just above, so
                    # every 10th batch is counted twice in `acc` — confirm intent.
                    acc, incorrect_dict, batchTime = midTrainTest(model, X_batch, Y_batch, incorrect_dict, acc,
                                                      batches, progress_log, batchTime, 'training')

            c, ticker = postBatchProcessing(c, tr_set, temp_train, image_max, ticker, progress_log)

        epoch_metrics = updateEpochMetrics(epoch_metrics, metrics_path, acc, allAcc, loss, i, 'train', progress_log)

        # ---- Validation pass ----
        # Reset the record so the saved confusion data covers validation only
        incorrect_dict = False
        temp_val = val_set
        ticker = datetime.now()
        c = 0
        acc = []
        while temp_val.shape[0] > 0:
            temp_val, X_val, Y_val = getBatch(temp_val, image_max)
            batchTime = datetime.now()
            batches = 0

            for X_batch, Y_batch in imageGen.flow(X_val, Y_val, batch_size=batch_size):
                a, incorrect_dict = getAcc(model, X_batch, Y_batch, incorrect_dict)
                acc.append(a)
                batches += 1

                if batches >= len(X_val) / batch_size:
                    break
                elif batches % 10 == 0:
                    # Logged under 'validation' (it was mislabelled 'training').
                    # NOTE(review): this re-scores a batch already counted above.
                    acc, incorrect_dict, batchTime = midTrainTest(model, X_batch, Y_batch, incorrect_dict, acc,
                                                      batches, progress_log, batchTime, 'validation')

            c, ticker = postBatchProcessing(c, val_set, temp_val, image_max, ticker, progress_log, 'validation')

        epoch_metrics = updateEpochMetrics(epoch_metrics, metrics_path, acc, allAcc, loss, i, 'validation',
                                          progress_log)

        allAcc, epoch_tick = completeEpoch(epoch_metrics, metrics_path, acc, allAcc, loss,
                        incorrect_dict, incorrect_preds_path, epoch_tick, epochs, i, progress_log,
                                          base_models)

except Exception:
    # Best-effort run: the failure is logged instead of re-raised. The full
    # traceback is recorded so the failing line can be found from the log.
    printO('MODEL TRAINING FAILURE\n\nERROR:\n%s' % traceback.format_exc(), filename=progress_log)
    
# Disabled legacy snippet kept as a bare string so it never executes. It sketches
# a callback-based model.fit() workflow (checkpointing plus learning-rate
# callbacks) and uses names such as model_type, x_train and pandas that are not
# defined in the visible part of this notebook.
"""
# Prepare model model saving directory.
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'cifar10_%s_model.{epoch:03d}.h5' % model_type
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
filepath = os.path.join(save_dir, model_name)

# Prepare callbacks for model saving and for learning rate adjustment.
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_acc', verbose=1, save_best_only=True)

lr_scheduler = LearningRateScheduler(lr_schedule)

lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=5, min_lr=0.5e-6)

callbacks = [checkpoint, lr_reducer, lr_scheduler]

history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test), shuffle=True, callbacks=callbacks)

pandas.DataFrame(history.history).to_csv("history.csv")
"""