## OVERVIEW
---
* Feature Selection & Data Sampling
* Image Processing
* Data Augmentation
* Transfer Learning with Keras Xception
* Bottleneck Feature Extraction
* Deep Learning

In [None]:
import datetime as dt
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Global plot appearance: the matplotlib 'fivethirtyeight' style sheet is applied
# first, then seaborn's 'darkgrid' axes style is applied on top of it.
plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')


import os
from keras.applications import xception
from keras.preprocessing import image
from mpl_toolkits.axes_grid1 import ImageGrid
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import cv2
from scipy.stats import uniform

from tqdm import tqdm
from IPython.core.display import display, HTML
from PIL import Image
from io import BytesIO
import base64

import keras
from keras.models import Model, Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Masking,GlobalAveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img



# Display the value of every top-level expression in a cell, not only the last one.
# NOTE(review): with this setting plotting calls echo their return value unless the
# line ends in ';' - presumably why later cells use trailing semicolons.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### DATA UTILITIES

In [None]:
# Make sure the Keras cache layout (~/.keras/models) exists so the pretrained
# Xception weight files can be copied into it by the shell commands below.
cache_dir = os.path.expanduser(os.path.join('~', '.keras'))
models_dir = os.path.join(cache_dir, 'models')
# makedirs also creates the intermediate ~/.keras directory; exist_ok=True removes
# the check-then-create race and makes the cell safe to re-run.
os.makedirs(models_dir, exist_ok=True)

# Copy every file matching xception* from ../input/keras-pretrained-models into
# the Keras cache directory (~/.keras/models).
!cp ../input/keras-pretrained-models/xception* ~/.keras/models/
# List the cache directory to confirm which files are now present.
!ls ~/.keras/models

In [None]:
# Dataset locations. Each path is derived from the previous one so the dataset
# root only has to be changed in one place.
base_folder = '../input/celeba-dataset'
data_folder = base_folder + '/img_align_celeba'
# The image files sit one level deeper, in a sub-directory with the same name.
image_folder = data_folder + '/img_align_celeba'

In [None]:
# Read the per-image attribute table. The path is built from base_folder so it
# stays in sync with the other dataset paths.
df = pd.read_csv(os.path.join(base_folder, 'list_attr_celeba.csv'))

In [None]:
# List the available attribute columns, then preview the first rows of the table.
print(df.columns)
df.head()

### FEATURE SELECTION

In [None]:
# Keep only the image file name and the 'Male' attribute, and replace every -1
# with 0 (presumably the attribute is encoded as -1/1 - confirm against the CSV).
# replace() returns a new frame, so nothing is modified in place on a column
# slice of the original table (which is what triggers SettingWithCopyWarning).
df = df[['image_id', 'Male']].replace(to_replace=-1, value=0)

In [None]:
#add a class feature
def to_labels(x):
    """Return the class name for a gender flag: 'female' for 0, 'male' for anything else."""
    return 'female' if x == 0 else 'male'

# Add the readable class name and give the columns their final names
# (filename, label, class). assign/rename build a new frame instead of mutating
# df in place, and the rename is keyed by column name rather than by position.
df = (
    df.assign(**{'class': df['Male'].apply(to_labels)})
      .rename(columns={'image_id': 'filename', 'Male': 'label'})
)

In [None]:
# Preview the reduced frame; its columns at this point are filename, label and class.
df.head()

### COUNTPLOT PER GENDER

In [None]:
# Number of images per gender label.
plt.figure(figsize=(8,5))
# Pass the column by keyword: seaborn 0.11 deprecated positional data variables,
# and from 0.12 the first positional argument is `data`, not `x`.
sns.countplot(x='label', data=df);

### DATA SAMPLING

In [None]:
# Build a small working set: at most SAMPLES rows (the first ones) for each of
# the two label values, 0 then 1.
SAMPLES = 300
train_data = pd.concat(
    [df.loc[df['label'] == label_value].head(SAMPLES) for label_value in (0, 1)]
)
print('TRAIN DATA SHAPE: ', train_data.shape)

### SHOW SAMPLE IMAGES

In [None]:
# Helper for the plotting cells: load one image as an array divided by 255.
def read_img(filename, size):
    """Load `filename` from `image_folder`, resized to `size`, as an array divided by 255.

    NOTE(review): a later cell defines another `read_img`; once that cell has run
    it replaces this definition.
    """
    path = os.path.join(image_folder, filename)
    pixels = image.img_to_array(image.load_img(path, target_size=size))
    return pixels / 255

In [None]:
# Show a grid of randomly chosen images from the sampled training data.
nb_rows = 3
nb_cols = 5
fig, axs = plt.subplots(nb_rows, nb_cols, figsize=(10, 5));
plt.suptitle('SAMPLE IMAGES');
for ax in axs.flat:
    ax.xaxis.set_ticklabels([]);
    ax.yaxis.set_ticklabels([]);
    # Draw the row position from the whole frame. The previous hard-coded bound
    # of 500 could never pick the last rows and would raise an IndexError as soon
    # as train_data held fewer than 500 rows.
    random_row = np.random.randint(len(train_data))
    ax.imshow(read_img(train_data['filename'].iloc[random_row], (255,255)));
plt.show();

### DATA AUGMENTATION

In [None]:
# Image generator used for augmentation: random rotation, width/height shift,
# shear, zoom and horizontal flip.
_augmentation_options = dict(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)
datagen = ImageDataGenerator(**_augmentation_options)

In [None]:
# Pick one image from the sampled data to demonstrate the augmentation on.
# NOTE(review): position 546 is hard-coded, so train_data must hold at least 547 rows.
img = read_img(train_data['filename'].iloc[546], (255,255))
plt.title('ORIG IMAGE')
plt.imshow(img);


###### SHOW SAMPLE AUGMENTED IMAGE

In [None]:
# Reshape the image to 4 dimensions by adding a leading batch axis, as needed by
# datagen.flow. Only the last three axes are reused, so re-running this cell
# yields the same shape instead of stacking another leading axis each time.
img = img.reshape((1,) + img.shape[-3:])

In [None]:
# Plot ten random augmentations of the selected image.
plt.figure(figsize=(20,10))
plt.suptitle('Data Augmentation', fontsize=28)

# The generator is not left to finish on its own; pairing it with range(10)
# bounds the loop to ten batches.
for i, batch in zip(range(10), datagen.flow(img, batch_size=32)):
    plt.subplot(3, 5, i+1)
    plt.grid(False)
    plt.imshow(batch.reshape(255, 255, 3));

plt.show();

In [None]:
# Hold out 20% of the sampled rows for validation; the fixed random_state makes
# the split repeatable. X keeps every column except the two target columns.
X = train_data.drop(columns=['label', 'class'])
y = train_data['label']

train_x, train_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

### IMAGE PROCESSING

In [None]:
# Load one image as a single-item batch for the augmentation / Xception pipeline.
def read_img(filename, size):
    """Load `filename` from `image_folder` as a batch containing one image.

    Parameters
    ----------
    filename : file name relative to `image_folder`.
    size : target size passed to `image.load_img`.

    Returns
    -------
    Array with a leading batch axis of length 1 holding the unscaled pixel values.

    The pixel values are deliberately NOT divided by 255. Every image returned
    here is later passed to `xception.preprocess_input`, which expects values in
    the 0-255 range and maps them to [-1, 1]; dividing by 255 first squeezed all
    images into roughly [-1, -0.99] before they reached the network.

    Note: this replaces the `read_img` defined earlier in the notebook, which
    returns a scaled image without the batch axis for plotting.
    """
    img = image.load_img(os.path.join(image_folder, filename), target_size=size)
    #convert image to array, keeping the raw pixel scale
    img = image.img_to_array(img)
    # leading batch axis, as consumed by datagen.flow
    return img.reshape((1,) + img.shape)


# Build the augmented training set: five randomly transformed variants of every
# training image, each stored with the label of its source image.
array_img = []
label_img = []

# zip pairs each file name with its label; total= lets tqdm show real progress
# (wrapping enumerate() gave it no length to work with).
for file, label in tqdm(zip(train_x['filename'], y_train), total=len(train_x)):
    img = read_img(file, (255,255))
    # The generator is stopped explicitly: pairing it with range(5) takes exactly
    # five batches, each holding one augmented image.
    for _, batch in zip(range(5), datagen.flow(img, batch_size=32)):
        array_img.append(batch.reshape(255,255,3))
        label_img.append(label)

### XCEPTION INPUT PREPROCESSING

In [None]:
#preprocess train
# Side length in pixels of the square images fed to Xception.
INPUT_SIZE = 255

# float32 halves the memory of this buffer compared with the default float64
# ('float'); NOTE(review): Keras computes in float32 by default, so the extra
# precision should not be needed - confirm.
X_train = np.zeros((len(array_img), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32')
for i, augmented in enumerate(tqdm(array_img)):
    # A copy is passed so the arrays kept in array_img are never modified by the
    # preprocessing call. The length-1 batch axis is dropped again when the
    # result is assigned into row i.
    X_train[i] = xception.preprocess_input(np.expand_dims(augmented.copy(), axis=0))

In [None]:
#preprocess validation
# Validation images are only loaded and preprocessed; no augmentation is applied.
X_val = np.zeros((len(train_val), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32')
# tqdm wraps the column itself so the progress bar knows the total.
for i, file in enumerate(tqdm(train_val['filename'])):
    # read_img already returns a batch of one image, so no extra axis is added;
    # [0] selects that single image for row i.
    X_val[i] = xception.preprocess_input(read_img(file, (INPUT_SIZE, INPUT_SIZE)))[0]

### BOTTLENECK FEATURE EXTRACTION

In [None]:
# Xception without its classification top (ImageNet weights, average pooling),
# used only to turn the train and validation images into feature vectors.
xception_bf = xception.Xception(include_top=False, weights='imagenet', pooling='avg')
bf_train_x, bf_train_val = (
    xception_bf.predict(images, batch_size=32, verbose=1)
    for images in (X_train, X_val)
)

In [None]:
# Report the shape and total element count of both bottleneck feature matrices.
for shape_label, size_label, features in (
    ('Train Shape: ', 'Train Size: ', bf_train_x),
    ('Validation Shape: ', 'Validation Size: ', bf_train_val),
):
    print(shape_label, features.shape)
    print(size_label, features.size)

### MODELLING

In [None]:
# Small fully connected classifier on top of the bottleneck features:
# 512 -> dropout -> 64 -> one sigmoid unit, trained with binary cross-entropy.
model = Sequential([
    Dense(512, activation='relu', input_dim=bf_train_x.shape[1]),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
#set callbacks
# Both callbacks monitor 'val_loss': training stops after 2 epochs without
# improvement and the best weights so far are written to best_model.h5.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True),
]

#fit the data
# validation_data is required for 'val_loss' to exist. Without it neither
# callback ever sees its metric, so training always ran the full 100 epochs and
# no checkpoint was saved.
history = model.fit(
    bf_train_x,
    np.array(label_img),
    validation_data=(bf_train_val, np.array(y_val)),
    batch_size=32,
    epochs=100,
    callbacks=callbacks,
)

### LOSS AND ACCURACY

In [None]:
# Training curves per epoch: loss on the left panel, accuracy on the right.
fig, ax = plt.subplots(1,2,figsize=(14,5))
for panel, title, metric, colour in zip(
    ax,
    ('TRAINING LOSS', 'TRAINING ACCURACY'),
    ('loss', 'accuracy'),
    ('salmon', 'steelblue'),
):
    panel.set_title(title);
    panel.plot(history.history[metric], color=colour, lw=2);

In [None]:
#predict the validation data
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single sigmoid
# output, thresholding the predicted probability at 0.5 gives the same 0/1 labels.
predictions = (model.predict(bf_train_val) > 0.5).astype('int32')

### CLASSIFICATION REPORT

In [None]:
# Text report of the validation predictions against the true labels in y_val.
print(classification_report(y_val, predictions))

### CONFUSION MATRIX

In [None]:
# confusion_matrix puts the true labels on the rows and the predictions on the
# columns, so on the heatmap the y axis is the truth and the x axis the prediction.
con_mat = confusion_matrix(y_val, predictions)
plt.figure(figsize=(5,5))

# fmt='d' prints the cell counts as plain integers.
sns.heatmap(con_mat, annot=True, fmt='d', square=True);
plt.xlabel('PREDICTIONS');
plt.ylabel('Y_TRUE');