### Ensemble classifier

In [None]:
from keras.engine import Model
from keras.layers import Flatten, Dense, Input, Dropout
from keras_vggface.vggface import VGGFace
from keras.preprocessing.image import ImageDataGenerator
from keras_vggface import utils
from keras.optimizers import Adam
from keras.preprocessing import image
from PIL import Image
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.utils import class_weight
import keras
import random, string
import numpy as np
import os
import shutil
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from skimage.io import imread_collection
import scipy.misc
%matplotlib inline

In [None]:
# Constants
# Side length in pixels; every image is loaded with target_size=(IMG_SIZE, IMG_SIZE).
IMG_SIZE = 224

# Alternative training set (judging by its name: augmented and oversampled), currently disabled.
#IMG_DIR = '../project/all_females_AUG_2_combined_oversampled_train_s'
#RATING_PATH = '../project/all_females_AUG_2_combined_oversampled_train_s/ratings.txt'

# Training images and their ratings file (read with np.genfromtxt in prepareData).
# The ratings file sits inside the image directory; prepareData drops it from the file listing.
IMG_DIR = '../project/all_females_combined_train_s'
RATING_PATH = '../project/all_females_combined_train_s/train_ratings.txt'

# Validation images and their ratings file (read in prepareValData).
VAL_IMG_DIR = '../project/all_females_combined_val_s'
VAL_RATING_PATH = '../project/all_females_combined_val_s/val_ratings.txt'


# Run/version tag; not referenced anywhere in the visible part of this notebook.
VERSION = 'v6-val-oversampled'

In [None]:
def getMean(train_data):
    """Compute the mean pixel value of each colour channel over a set of images.

    Every image is loaded from IMG_DIR and resized to IMG_SIZE x IMG_SIZE.
    The channel sums are accumulated image by image, so only one decoded
    image is held in memory at a time (instead of a float64 array holding
    all of them).

    Args:
        train_data: 1-D numpy array of image file names relative to IMG_DIR.

    Returns:
        Array of shape (3,) with the mean of each channel, in the channel
        order produced by ``image.img_to_array``. The means are also printed.
    """
    n_images = train_data.shape[0]
    channel_sums = np.zeros(3)
    for i in range(n_images):
        _img = image.load_img(os.path.join(IMG_DIR, train_data[i]), target_size=(IMG_SIZE, IMG_SIZE))
        # Sum over height and width; accumulate in float64 to match the
        # precision of averaging a float64 array.
        channel_sums += image.img_to_array(_img).sum(axis=(0, 1), dtype=np.float64)

    mean = channel_sums / (n_images * IMG_SIZE * IMG_SIZE)
    print("Pixel means: ", mean)
    return mean

In [None]:
def preprocess(x, mean):
    """Subtract a per-channel mean from a channels-last image array, in place.

    The channel is taken from the last axis, so both a single image
    (H, W, 3) and a batch (N, H, W, 3) are handled.

    Args:
        x: Float array whose last axis holds the colour channels. It is
            modified in place.
        mean: Sequence with the mean of channel 0, 1 and 2.

    Returns:
        The same array ``x`` (after centering), so the call can be used
        directly on the right-hand side of an assignment.
    """
    for channel in range(3):
        x[..., channel] -= mean[channel]
    return x

In [None]:
def prepareData(percentage):
    """Load the training images and ratings and split them randomly into train and test parts.

    Ratings are read from RATING_PATH, converted to integers and encoded
    cumulatively: a rating r becomes a row whose first r entries are 1.
    Images are read from IMG_DIR, resized to IMG_SIZE x IMG_SIZE, passed
    through ``utils.preprocess_input(..., version=1)`` and divided by 255.

    Images are loaded in exactly the order of the (shuffled) split indices,
    so row k of each returned image array belongs to row k of the matching
    label array and, for the training part, to entry k of the returned
    file names.

    Args:
        percentage: Share of all samples, in percent, held out as the test split.

    Returns:
        Tuple ``(data_train, labels_train, data_test, labels_test,
        ratings_rounded, train_image_names, mean)``. ``ratings_rounded``
        covers all samples (train and test); ``mean`` is the result of
        ``getMean`` on the training images.
    """
    def _loadBatch(names):
        # Load and preprocess the named images into one array, keeping the order of `names`.
        batch = np.zeros((len(names), IMG_SIZE, IMG_SIZE, 3))
        for pos, name in enumerate(names):
            _img = image.load_img(os.path.join(IMG_DIR, name), target_size=(IMG_SIZE, IMG_SIZE))
            _x = np.expand_dims(image.img_to_array(_img), axis=0)
            batch[pos, :, :, :] = utils.preprocess_input(_x, version=1) / 255.0
        return batch

    # Load ratings
    ratings = np.genfromtxt(RATING_PATH)

    # Convert ratings to the desired format
    # 1. Convert to integers.
    # NOTE(review): this rounds to one decimal and then truncates on the int
    # cast (e.g. 3.9 -> 3); confirm that is intended rather than rounding to
    # the nearest integer.
    ratings_rounded = np.round(ratings, 1).astype(np.int8)

    # 2. Then encode as suggested by Cheng (2007): one column per distinct
    #    rating value, the first r columns set to 1 for rating r.
    ratings_prepared = np.zeros((len(ratings_rounded), len(np.unique(ratings_rounded)))).astype(np.int8)
    for i, r in enumerate(ratings_rounded):
        for j in range(r):
            ratings_prepared[i, j] = 1

    # 3. Finally, make sure assignment is correct
    assert np.all(np.sum(ratings_prepared, axis=1).astype(np.int8) == ratings_rounded)

    # List the image files in sorted order.
    # Presumably line i of the ratings file belongs to the i-th sorted file name -- verify against the data.
    images = sorted(os.listdir(IMG_DIR))
    # The ratings file lives in the same directory; it is not an image
    images.remove('train_ratings.txt')
    # convert to np.array so it can be indexed with index arrays
    images = np.array(images)

    ### Make sure lengths of ratings and images correspond
    print(len(ratings))
    print(len(images))
    assert len(ratings) == len(images)

    # Random split: the first test_len shuffled indices form the test part
    perm_ratings = np.random.permutation(len(ratings))
    test_len = int(len(ratings) * percentage/100)

    test_ind = perm_ratings[:test_len]
    train_ind = perm_ratings[test_len:]

    mean = getMean(images[train_ind])

    # Load each split in the order of its index array so that images,
    # labels and file names stay aligned row by row.
    data_test = _loadBatch(images[test_ind])
    data_train = _loadBatch(images[train_ind])

    return data_train, ratings_prepared[train_ind], data_test, ratings_prepared[test_ind], ratings_rounded, images[train_ind], mean

In [None]:
def prepareValData(mean):
    """Load the validation images together with their ratings and file names.

    Ratings are read from VAL_RATING_PATH; images are read from VAL_IMG_DIR
    in sorted file-name order, resized to IMG_SIZE x IMG_SIZE, passed through
    ``utils.preprocess_input(..., version=1)`` and divided by 255.

    Args:
        mean: Accepted for symmetry with the training pipeline; not used here.

    Returns:
        Tuple ``(data, ratings, images)``: the preprocessed image array, the
        ratings exactly as read from file, and the array of file names.
    """
    # Ratings as stored on disk
    ratings = np.genfromtxt(VAL_RATING_PATH)

    # Integer version of the ratings, used only for the sanity check below
    ratings_rounded = np.round(ratings, 1).astype(np.int8)

    # Cumulative encoding as suggested by Cheng (2007): first r columns are 1
    n_levels = len(np.unique(ratings_rounded))
    ratings_prepared = np.zeros((len(ratings_rounded), n_levels))
    for encoded_row, level in zip(ratings_prepared, ratings_rounded):
        for col in range(level):
            encoded_row[col] = 1

    # The number of ones per row has to reproduce the integer rating
    assert np.all(np.sum(ratings_prepared, axis=1).astype(np.int8) == ratings_rounded)

    # The raw ratings (not the encoding) are returned, because the sklearn
    # ensemble cannot handle multi-outputs.

    # Sorted listing of the validation directory, without the ratings file
    images = sorted(os.listdir(VAL_IMG_DIR))
    images.remove('val_ratings.txt')
    images = np.array(images)

    # One rating per image is required
    assert len(ratings) == len(images)

    data = np.zeros((len(images), IMG_SIZE, IMG_SIZE, 3))
    for idx, _im in enumerate(images):
        img_path = os.path.join(VAL_IMG_DIR, _im)
        pixels = image.img_to_array(image.load_img(img_path, target_size=(IMG_SIZE, IMG_SIZE)))
        # preprocess_input is fed a batch of one image
        single_batch = np.expand_dims(pixels, axis=0)
        data[idx, :, :, :] = utils.preprocess_input(single_batch, version=1) / 255.0

    return data, ratings, images

In [None]:
# Module-level state so it can be used by both the kNN and the Keras model

# Load images and labels, holding out 5 % of the samples as test split
x_train, y_train, x_test, y_test, ratings, images, mean = prepareData(5)

# VGG-Face base without the classification top and with global max pooling;
# used as the trunk of the network and as feature extractor for the kNN
vgg_base = VGGFace(include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3), pooling='max')


# Freeze vgg layers
for layer in vgg_base.layers:
    layer.trainable = False

# Compute class weights since we have unbalanced classes.
# The function is imported directly because the result is stored under the
# name `class_weight`; calling it through the imported `class_weight` module
# would break as soon as this cell is run a second time.
# `classes` and `y` are passed by keyword, which newer scikit-learn requires.
from sklearn.utils.class_weight import compute_class_weight
class_weight = compute_class_weight(class_weight='balanced',
                                    classes=np.unique(ratings),
                                    y=ratings)



In [None]:
def getNN():
    """Build and compile the rating network on top of the shared VGG-Face base.

    Three fully connected ReLU layers (256, 128, 64 units), each followed by
    20 % dropout, are stacked on the output of the module-level ``vgg_base``.
    The final sigmoid layer has one unit per distinct value in the
    module-level ``ratings``.

    Returns:
        A compiled Keras ``Model`` (Adam with lr 0.0005 and clipnorm 1.0,
        binary cross-entropy loss, accuracy metric).
    """
    # vgg_base is created with include_top=False and pooling='max', so its
    # output is the pooled feature tensor. Using the model output avoids
    # looking the pooling layer up by its auto-generated name, which changes
    # when another such layer has been created in the session.
    last_layer = vgg_base.output
    X = Dense(256, activation='relu', name='fc6')(last_layer)
    X = Dropout(0.2)(X)
    X = Dense(128, activation='relu', name='fc7')(X)
    X = Dropout(0.2)(X)
    X = Dense(64, activation='relu', name='fc8')(X)
    X = Dropout(0.2)(X)
    output = Dense(len(np.unique(ratings)), activation='sigmoid')(X)
    model = Model(inputs=vgg_base.input, outputs=output)

    model.compile(optimizer=Adam(lr= 0.0005,clipnorm=1.0),
              loss='binary_crossentropy', 
              metrics=['accuracy'])
    
    return model

In [None]:
def getKNN(x_train, y_train, vgg=None, img_in = True):
    """Fit a 5-nearest-neighbour classifier on raw pixels or on network features.

    Args:
        x_train: Array of training images, first axis indexing the samples.
        y_train: Training targets.
        vgg: Model whose ``predict`` output is used as features; only
            needed when ``img_in`` is False.
        img_in: If True, each image is flattened and used directly;
            otherwise the features come from ``vgg.predict(x_train)``.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    if img_in:
        # one row of raw pixel values per image
        samples = x_train.reshape((x_train.shape[0],-1))
    else:
        # get features from vgg-face as input for kNN
        samples = vgg.predict(x_train)

    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(samples, y_train)
    return classifier
        

In [None]:
def recommend(images, pred_classes):
    """Show every image from IMG_DIR whose predicted class is at least 4.

    Args:
        images: Sequence of file names relative to IMG_DIR.
        pred_classes: Numpy array of predicted classes; the first-axis index
            of each entry >= 4 selects the file name to display.
    """
    for hit in np.where(pred_classes >= 4)[0]:
        img_path = os.path.join(IMG_DIR, images[hit])
        picture = mpimg.imread(img_path)
        plt.imshow(picture)
        plt.show()
        # start from an empty figure for the next picture
        plt.clf()

def recommendVal(images, pred_classes):
    """Show every image from VAL_IMG_DIR whose predicted class is at least 4.

    Args:
        images: Sequence of file names relative to VAL_IMG_DIR.
        pred_classes: Numpy array of predicted classes; the first-axis index
            of each entry >= 4 selects the file name to display.
    """
    for hit in np.where(pred_classes >= 4)[0]:
        img_path = os.path.join(VAL_IMG_DIR, images[hit])
        picture = mpimg.imread(img_path)
        plt.imshow(picture)
        plt.show()
        # start from an empty figure for the next picture
        plt.clf()

In [None]:
# Shapes of the training and test image arrays, one per line
print(x_train.shape, x_test.shape, sep='\n')

In [None]:
#Validation set:
# Returns the preprocessed validation images, the ratings as read from file,
# and the image file names. prepareValData does not use the `mean` argument.
x_val, ratings_val, images_val = prepareValData(mean)

In [None]:
# Number of axes and shape of the training label array, one per line
print(len(y_train.shape), y_train.shape, sep='\n')

In [None]:
#wrap keras model as sklearn classifier
# getNN is used as build function; training runs for a single epoch with
# batch size 32, and the module-level class_weight array is handed to the wrapper.
# NOTE(review): the comment above says "classifier" but the wrapper used is
# KerasRegressor -- confirm which one is intended, since this estimator is
# later placed in a soft-voting VotingClassifier.
net = KerasRegressor(build_fn=getNN, epochs=1, batch_size=32, verbose=0, class_weight=class_weight)
net.fit(x_train,y_train)

In [None]:
#get a KNN classifier
# With img_in=True the kNN is fitted on the flattened pixel values; vgg_base
# is passed but not used in that mode.
knn = getKNN(x_train, y_train, vgg=vgg_base, img_in = True) #returns fitted classifier

# (name, estimator) pairs that make up the ensemble
estimators = []
estimators.append(('net', net))
estimators.append(('knn', knn))

ensemble = VotingClassifier(estimators, voting='soft') #soft voting for averaging the predictions

# NOTE(review): this fit receives the un-flattened image array x_train and the
# multi-column encoded y_train, whereas knn above was fitted on reshaped
# (flattened) input -- confirm that VotingClassifier and both estimators
# accept these shapes.
ensemble.fit(x_train,y_train)