In [None]:
from collections import defaultdict
from glob import glob
from random import choice, sample
import os.path
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL import Image
from tensorflow.keras.utils import load_img

In [None]:
from keras_vggface.vggface import VGGFace
from keras_vggface.utils import preprocess_input

from keras.layers import Input, Dense, GlobalMaxPool2D, GlobalAvgPool2D, Concatenate, Multiply, Dropout, Subtract
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

In [None]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Exploratory Data Analysis

Understand the images, the directory structure, and how the references/keys between people, relationships, and images work.

`F*` top-level directories contain families.

`MID*` directories inside them contain members: mothers/fathers/children.

`train_relationships.csv` contains positive examples of `parent-child` relationships (e.g. `mother-child` and `father-child`).

In [None]:
# Table of related pairs; later cells read its `p1` and `p2` columns
train_relationships = pd.read_csv('data/train_relationships.csv')
train_relationships

In [None]:
# Collect every file below data/train, using forward slashes on all platforms
train_images = []
for root, _dirnames, files in os.walk(os.path.expanduser("data/train")):
    for file in files:
        train_images.append(os.path.join(root, file).replace('\\', '/'))
train_images

In [None]:
# One row per image file, with the family and member taken from its path.
# Paths look like `data/train/<family>/<member>/<file>`, so the family and the
# member are the 3rd- and 2nd-to-last components. Indexing from the end keeps
# this correct regardless of how many directories precede the family folder.
path_parts = [file.split('/') for file in train_images]

all_images_df = pd.DataFrame({
    'files': train_images,
    'familyId': [parts[-3] for parts in path_parts],
    'kinId': [parts[-2] for parts in path_parts],
    # same `<family>/<member>` key format that the later cells build
    'uniqueId': [parts[-3] + '/' + parts[-2] for parts in path_parts]
})
all_images_df

In [None]:
# Distinct values of the `familyId` column
families = all_images_df.familyId.unique()
pd.DataFrame(families)

In [None]:
# Gather the set of (width, height) sizes over all images.
# Each file is opened in a `with` block so its handle is closed right away
# instead of being left to the garbage collector.
distinct_image_sizes = set()
for path in all_images_df.files:
    with Image.open(path) as img:
        distinct_image_sizes.add(img.size)

# all images are expected to have the same dimension, so no resizing is done
# NOTE(review): this only checks that the size is uniform, not that it equals
# the 224x224 input the model declares - confirm from the displayed value.
assert len(distinct_image_sizes) == 1, f"expected one image size, found {distinct_image_sizes}"

distinct_image_sizes

# Preprocess

In [None]:
# Select F09* families for validation, the rest for training
val_families = "F09"

all_images = glob("data/train/*/*/*.jpg")
all_images = [p.replace('\\', '/') for p in all_images]

# Decide by the family directory (3rd-to-last path component) only, so that a
# member directory or file name that happens to contain "F09" cannot move an
# image into the wrong split.
train_images = [p for p in all_images if not p.split('/')[-3].startswith(val_families)]
validation_images = [p for p in all_images if p.split('/')[-3].startswith(val_families)]

# every image must land in exactly one of the two splits
assert len(all_images) == len(train_images) + len(validation_images)

# the author reports a split of about 90.7% + 9.3%; the test set is provided by Kaggle
print(f'all        images: {len(all_images)}')
print(f'train      images: {len(train_images)}')
print(f'validation images: {len(validation_images)}')

In [None]:
def _group_images_by_person(image_paths):
    """Map each `<family>/<member>` key to the list of image paths of that person.

    The key is built from the 3rd- and 2nd-to-last components of every path.
    A `defaultdict(list)` is returned, so unknown keys read as an empty list.
    """
    grouped = defaultdict(list)
    for image_path in image_paths:
        parts = image_path.split("/")
        grouped[parts[-3] + "/" + parts[-2]].append(image_path)
    return grouped


# training paths - to images not belonging to `F09` families
train_person_to_images_map = _group_images_by_person(train_images)

# validation paths - to images belonging to `F09` families
val_person_to_images_map = _group_images_by_person(validation_images)

In [None]:
# Manual sanity check of the training map:
# it must not contain paths starting with `data/train/F09`
# (those families are reserved for validation)
train_person_to_images_map

In [None]:
# Manual sanity check of the validation images:
# the list must only contain paths starting with `data/train/F09`
validation_images

In [None]:
# people found in train.zip, as `<family>/<member>` keys (one entry per image)
ppl = [x.split("/")[-3] + "/" + x.split("/")[-2] for x in all_images]

# set copy for constant-time membership tests; scanning the list `ppl` for
# every relationship row is quadratic
known_people = set(ppl)

# filter out people not found in train.zip but present in train_relationships.csv
# to make all data cross-referencing;
# similar to an INNER JOIN
relationships = [
    (p1, p2)
    for p1, p2 in zip(train_relationships.p1.values, train_relationships.p2.values)
    if p1 in known_people and p2 in known_people
]

# A pair goes to validation when its first person's key starts with the
# validation family prefix; everything else is used for training.
train = [pair for pair in relationships if not pair[0].startswith(val_families)]
val = [pair for pair in relationships if pair[0].startswith(val_families)]

print(f'train relationships:      {len(train)}')
print(f'validation relationships: {len(val)}')

In [None]:
def normalize_image(path, version=1):
    """Load one image file and pass it through keras_vggface's `preprocess_input`.

    Args:
        path: Path of the image file, handed to `load_img`.
        version: Forwarded to `preprocess_input`. The default of 1 is assumed
            to match the library default, i.e. the previous one-argument
            call - confirm against keras_vggface.

    Returns:
        The result of `preprocess_input` on the float64 pixel array.
        Presumably the channels are reordered from RGB to BGR and zero-centred
        with fixed per-channel means, without scaling - confirm against the
        keras_vggface implementation.
    """
    img = load_img(path)
    # `np.float` was removed in NumPy 1.24 and raises AttributeError there;
    # np.float64 is the dtype that alias used to refer to.
    img = np.array(img).astype(np.float64)
    # NOTE(review): the keras_vggface documentation appears to recommend
    # version=2 for the resnet50/senet50 backbones - confirm, and keep in mind
    # that weights saved earlier were trained with the value used at that time.
    return preprocess_input(img, version=version)

def generator(list_tuples, person_to_images_map, batch_size=16):
    """Endlessly yield balanced batches of image pairs for training/validation.

    Each batch starts with `batch_size // 2` pairs sampled from `list_tuples`
    (label 1) and is filled up to `batch_size` with random pairs of distinct
    people that are not listed in `list_tuples` in either order (label 0).
    For every person one of their images is picked at random.

    Args:
        list_tuples: Sequence of `(person_1, person_2)` key pairs that are related.
        person_to_images_map: Mapping from person key to a list of image paths.
        batch_size: Number of pairs per batch.

    Yields:
        `([X1, X2], labels)`: two arrays with the preprocessed images of the
        first and second person of each pair, and a float32 label array of
        shape `(batch, 1)`.

    Raises:
        ValueError: If fewer than two people are available (no negative pair
            could ever be drawn), or - raised by `sample` - if `list_tuples`
            holds fewer than `batch_size // 2` pairs.
    """
    ppl = list(person_to_images_map.keys())
    if len(ppl) < 2:
        # the negative sampling loop below would never terminate
        raise ValueError("need at least two people to draw negative pairs")

    # Built once: set lookups replace a full scan of `list_tuples` for every
    # negative candidate.
    known_pairs = {tuple(pair) for pair in list_tuples}

    while True:
        batch_tuples = sample(list_tuples, batch_size // 2)
        labels = [1] * len(batch_tuples)

        # rejection sampling of unrelated pairs
        while len(batch_tuples) < batch_size:
            p1 = choice(ppl)
            p2 = choice(ppl)

            if p1 != p2 and (p1, p2) not in known_pairs and (p2, p1) not in known_pairs:
                batch_tuples.append((p1, p2))
                labels.append(0)

        # Diagnostic: name any first person without images before the
        # `choice` call below fails on the empty list.
        for x in batch_tuples:
            if not len(person_to_images_map[x[0]]):
                print(x[0])

        X1 = [choice(person_to_images_map[x[0]]) for x in batch_tuples]
        X1 = np.array([normalize_image(x) for x in X1])

        X2 = [choice(person_to_images_map[x[1]]) for x in batch_tuples]
        X2 = np.array([normalize_image(x) for x in X2])
        labels = np.asarray(labels).astype('float32').reshape((-1,1))

        yield [X1, X2], labels

In [None]:
def build_model(freeze_base=False):
    """Build and compile the two-input kinship classifier.

    Both inputs are passed through the same VGGFace ResNet50 backbone (loaded
    without its top layers). Each feature map is reduced by global max and
    global average pooling, the two pooled vectors are combined element-wise,
    and a small dense head outputs a single sigmoid probability.

    Args:
        freeze_base: When True, every backbone layer except the last 3 is set
            to non-trainable. The default (False) changes no `trainable`
            flags, which matches the previous behaviour: the old loop only
            assigned `trainable = True`, so it never froze anything
            (presumably True is already the default of these layers - confirm).

    Returns:
        The compiled Keras `Model` (binary cross-entropy loss, accuracy
        metric, Adam optimizer with learning rate 0.00001).
    """
    # images are 3-channel 224x224
    input_1 = Input(shape=(224, 224, 3))
    input_2 = Input(shape=(224, 224, 3))

    # VGGFace pre-trained on faces
    # only load feature extraction layers
    base_model = VGGFace(model='resnet50', include_top=False)

    # senet50 expands resnet50 with an added Squeeze-and-Excitation block
    # base_model = VGGFace(model='senet50', include_top=False)

    # transfer learning: optionally keep all but the final 3 backbone layers
    # fixed; otherwise the whole backbone is fine-tuned
    if freeze_base:
        for layer in base_model.layers[:-3]:
            layer.trainable = False

    # the same backbone instance (shared weights) processes both images
    x1 = base_model(input_1)
    x2 = base_model(input_2)

    x1 = Concatenate(axis=-1)([GlobalMaxPool2D()(x1), GlobalAvgPool2D()(x1)])
    x2 = Concatenate(axis=-1)([GlobalMaxPool2D()(x2), GlobalAvgPool2D()(x2)])

    # Pair features, all element-wise, concatenated (not summed) below:
    #   x3 = (x1 - x2)^2
    #   x4 = x1^2 - x2^2
    #   x5 = x1 * x2
    x3 = Subtract()([x1, x2])
    x3 = Multiply()([x3, x3])

    x1_ = Multiply()([x1, x1])
    x2_ = Multiply()([x2, x2])
    x4 = Subtract()([x1_, x2_])

    x5 = Multiply()([x1, x2])

    x = Concatenate(axis=-1)([x4, x3, x5])

    # hidden layers activation: ReLu
    x = Dense(100, activation="relu")(x)

    x = Dropout(0.01)(x)

    # output layer activation: Sigmoid
    out = Dense(1, activation="sigmoid")(x)

    model = Model([input_1, input_2], out)
    # loss function: Binary Cross-Entropy
    model.compile(loss="binary_crossentropy", metrics=['accuracy'], optimizer=Adam(0.00001))

    model.summary()

    return model

In [None]:
file_path = "faces.h5"

# Write the model to `file_path` only when validation accuracy reaches a new maximum
checkpoint = ModelCheckpoint(file_path, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# Multiply the learning rate by 0.1 after 20 epochs without a better validation accuracy
reduce_on_plateau = ReduceLROnPlateau(monitor="val_accuracy", mode="max", factor=0.1, patience=20, verbose=1)

callbacks_list = [checkpoint, reduce_on_plateau]

model = build_model()

# Resume from previously saved weights when a checkpoint file is present
if os.path.isfile(file_path):
    model.load_weights(file_path)

# Endless batch streams for the two splits
train_batches = generator(train, train_person_to_images_map, batch_size=16)
val_batches = generator(val, val_person_to_images_map, batch_size=16)

# 30 epochs of 50 training batches, each followed by 50 validation batches
hist = model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=30,
    steps_per_epoch=50,
    validation_steps=50,
    callbacks=callbacks_list,
    verbose=2,
    workers=1,
    use_multiprocessing=False)

# Testing

In [None]:
def batcher(seq, size=32):
    """Return a lazy iterator over consecutive slices of `seq`.

    Every slice holds `size` items except possibly the last one, which holds
    whatever remains. `seq` must support `len()` and slicing.
    """
    # computed when the function is called, not at first iteration
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

test_path = "data/test/"

submission = pd.read_csv('data/sample_submission.csv')

predictions = []

# Each `img_pair` entry is split on "-" into the two image file names of a pair
img_pairs = submission.img_pair.values
predict_batch_size = 32

# `batcher` returns a generator without a length, so the number of batches
# (ceiling division) is passed to tqdm for it to show a total and an ETA.
n_batches = (len(img_pairs) + predict_batch_size - 1) // predict_batch_size

for batch in tqdm(batcher(img_pairs, predict_batch_size), total=n_batches):
    # split every pair once instead of once per side
    name_pairs = [x.split("-") for x in batch]

    X1 = np.array([normalize_image(test_path + names[0]) for names in name_pairs])
    X2 = np.array([normalize_image(test_path + names[1]) for names in name_pairs])

    # flatten the model output to a plain list with one score per pair
    # https://numpy.org/doc/stable/reference/generated/numpy.ravel.html
    pred = model.predict([X1, X2]).ravel().tolist()
    predictions += pred

submission['is_related'] = predictions

submission.to_csv("face.csv", index=False)

# Plot

In [None]:
# Training curves from the `hist` object returned by `model.fit`,
# one value per epoch; titles and axis labels let the saved figure stand alone.
fig, axes = plt.subplots(1, 2, figsize=(20, 6))

# loss to the left
axes[0].plot(hist.history['loss'], label='loss')
axes[0].plot(hist.history['val_loss'], label='val_loss')
axes[0].set(title='Loss', xlabel='epoch', ylabel='binary cross-entropy')
axes[0].legend(prop={'size':15})

# accuracy to the right
axes[1].plot(hist.history['accuracy'], label='acc')
axes[1].plot(hist.history['val_accuracy'], label='val_acc')
axes[1].set(title='Accuracy', xlabel='epoch', ylabel='accuracy')
axes[1].legend(prop={'size':15})

plt.savefig('faces.png')