In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only Kaggle input tree and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# imports
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
from tensorflow.python.keras import Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, Activation, Dense, MaxPooling2D, Flatten, BatchNormalization, Dropout
import math
import matplotlib.pyplot as plt
import pandas as pd



In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# where the trained model and the figures are written
OUTPUT_PATH = "output"

# fixed seed for the train/test split; for a random one use instead:
# SEED = np.random.randint(100)
SEED = 42

DATASET_PATH = "/kaggle/input/age-gender-and-ethnicity-face-data-csv/age_gender.csv"

# height, width, channels of one input image
INPUT_SHAPE = (48, 48, 1)

# --- compile configuration ---
METRICS = ["accuracy", "mean_absolute_error"]
LOSS = "categorical_crossentropy"
OPTIMIZER = "adam"
INIT_LR = 1e-3

# --- train configuration ---
EPOCHS = 50
BS = 16
# stop when there has been no improvement for (roughly) 25% of the epochs
EARLY_STOPPING_PATIENCE = int(round(EPOCHS / 4))
# flag for persisting the trained model so it can be downloaded and reused
MODEL_SAVE = True
# one loss per named model output
LOSSES = {
    "age_out": "mean_squared_error",
    "eth_out": "categorical_crossentropy",
    "gen_out": "categorical_crossentropy",
}
# age is the most difficult output to predict, so it gets the largest weight
LOSS_WEIGHT = {
    "age_out": 8.0,
    "eth_out": 3.0,
    "gen_out": 1.0,
}


In [None]:
# help functions

def save_plot(H, path):
    """Plot the training curves stored in a Keras ``History`` and save them to ``path``.

    The total training/validation loss and the per-output training/validation
    accuracies (outputs ``age_out``, ``eth_out`` and ``gen_out``) are drawn on
    one shared figure.

    Args:
        H: object exposing a ``history`` dict that contains the keys ``loss``,
            ``val_loss`` and ``[val_]<output>_accuracy`` for the three outputs.
        path: destination of the image file; missing parent folders are created.

    Raises:
        KeyError: if one of the expected keys is missing from ``H.history``.
    """
    # savefig() does not create missing folders, so make sure the target exists
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # (history key, legend label) in drawing order
    curves = (
        ("loss", "train_loss"),
        ("val_loss", "val_loss"),
        ("age_out_accuracy", "age_train_acc"),
        ("eth_out_accuracy", "eth_train_acc"),
        ("gen_out_accuracy", "gen_train_acc"),
        ("val_age_out_accuracy", "age_val_acc"),
        ("val_eth_out_accuracy", "eth_val_acc"),
        ("val_gen_out_accuracy", "gen_val_acc"),
    )

    plt.style.use("ggplot")
    plt.figure()
    for history_key, legend_label in curves:
        plt.plot(H.history[history_key], label=legend_label)
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.savefig(path)

    
    
class Style:
    """ANSI escape sequences for styling terminal output.

    Concatenate a constant in front of the text and append ``RESET`` afterwards
    to go back to the terminal defaults.
    """

    # foreground colours
    BLACK = "\x1b[30m"
    RED = "\x1b[31m"
    GREEN = "\x1b[32m"
    YELLOW = "\x1b[33m"
    BLUE = "\x1b[34m"
    MAGENTA = "\x1b[35m"
    CYAN = "\x1b[36m"
    WHITE = "\x1b[37m"
    # text attributes
    UNDERLINE = "\x1b[4m"
    RESET = "\x1b[0m"

def load_et_dataset(datasetpath, verbose=0, image_shape=(48, 48)):
    """Load the age/ethnicity/gender face CSV into arrays ready for training.

    The first line of the file is treated as a header and skipped. In every
    other line the first three comma-separated fields are the integer labels
    and the fifth field holds the space-separated pixel values of one image.

    Args:
        datasetpath: path of the CSV file.
        verbose: when > 0, print a progress message every ``verbose`` rows.
        image_shape: (height, width) each pixel list is reshaped to.

    Returns:
        Tuple ``(data, labels)``: ``data`` is a float32 array of shape
        ``(n, height, width, 1)`` scaled to [0, 1]; ``labels`` is an int array
        of shape ``(n, 3)`` holding the first three columns of each row.

    Raises:
        ValueError: if a row does not hold ``height * width`` pixel values or a
            field cannot be parsed as an integer.
    """
    print(f'[INFO] loading ethnicity Dataset in {datasetpath}')
    # The context manager closes the handle; a plain list of lines avoids the
    # fixed-width unicode array NumPy would allocate for rows of this length.
    with open(datasetpath) as csv_file:
        next(csv_file, None)  # skip the header line
        rows = [line for line in csv_file if line.strip()]  # ignore blank lines

    data = []
    labels = []
    total_rows = len(rows)
    if verbose > 0:
        print(f'[INFO] loading data and inform every {verbose} loaded datapoints')
    for (index, line) in enumerate(rows):
        fields = line.rstrip('\r\n').split(',')
        labels.append([int(value) for value in fields[0:3]])
        # field 3 is skipped; the pixel values live in field 4
        pixels = np.array([int(value) for value in fields[4].split()], dtype='uint8')
        data.append(pixels.reshape(image_shape))
        if verbose > 0 and index > 0 and (index + 1) % verbose == 0:
            print(f'[INFO] processed {index + 1}/{total_rows}')
    print(f'[INFO] loaded {len(data)} datapoints from {datasetpath}')

    # preprocess ready for training: add the channel axis and scale to [0, 1]
    # (the explicit reshapes keep well-formed shapes even when no row was read)
    labels = np.array(labels, dtype='int').reshape(-1, 3)
    data = np.array(data, dtype="float32").reshape((-1,) + tuple(image_shape) + (1,))
    data /= 255.0
    return (data, labels)

In [None]:
# defining the model
class EthGenNet:
    """Multi-output CNN predicting age, ethnicity and gender from one image input.

    Each branch is built with the Keras functional API on top of the same input
    tensor. The last activation of every branch is named (``age_out``,
    ``eth_out``, ``gen_out``) so losses, loss weights and targets can be
    addressed per output by name.
    """

    @staticmethod
    def ethnicity_model(inputs, num_classes):
        """Build the ethnicity branch and return its softmax output tensor.

        Args:
            inputs: input tensor shared by all branches.
            num_classes: number of units of the final softmax layer.
        """
        print('[INFO] building ethnicity branch model')
        # first block: CONV => RELU => BN => POOL => DROPOUT
        x = Conv2D(32, (3, 3), padding="same")(inputs)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=-1)(x)
        x = MaxPooling2D(pool_size=(3, 3))(x)
        x = Dropout(0.25)(x)
        # second block: (CONV => RELU => BN) * 2 => POOL => DROPOUT
        x = Conv2D(64, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=-1)(x)
        x = Conv2D(64, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=-1)(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)
        x = Dropout(0.25)(x)
        # third block: (CONV => RELU => BN) * 2 => POOL => DROPOUT
        x = Conv2D(128, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=-1)(x)
        x = Conv2D(128, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=-1)(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)
        x = Dropout(0.25)(x)
        # FC block
        x = Flatten()(x)
        x = Dense(256)(x)
        x = Activation("relu")(x)
        x = BatchNormalization()(x)
        x = Dropout(0.5)(x)
        # softmax classifier
        x = Dense(num_classes)(x)
        x = Activation('softmax', name="eth_out")(x)
        return x

    @staticmethod
    def gender_model(inputs, num_classes):
        """Build the gender branch (LeNet-style) and return its softmax output tensor.

        Args:
            inputs: input tensor shared by all branches.
            num_classes: number of units of the final softmax layer.
        """
        print('[INFO] building gender branch model')
        # first block: CONV => RELU => POOL
        x = Conv2D(20, (5, 5), padding='same')(inputs)
        x = Activation('relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        # second block: CONV => RELU => POOL
        x = Conv2D(50, (5, 5), padding='same')(x)
        x = Activation('relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        # FC block
        x = Flatten()(x)
        x = Dense(500)(x)
        x = Activation('relu')(x)
        # softmax classifier
        x = Dense(num_classes)(x)
        x = Activation('softmax', name='gen_out')(x)
        return x

    @staticmethod
    def age_model(inputs, num_classes=1):
        """Build the age branch and return its linear (regression) output tensor.

        Args:
            inputs: input tensor shared by all branches.
            num_classes: number of units of the final dense layer (1 by default).
        """
        print('[INFO] building age branch model')
        # first block: CONV => RELU => POOL
        x = Conv2D(16, (3, 3), padding='same')(inputs)
        x = Activation('relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        # second block: CONV => RELU => POOL
        x = Conv2D(32, (3, 3), padding='same')(x)
        x = Activation('relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        # third block: CONV => RELU => POOL
        x = Conv2D(64, (3, 3), padding='same')(x)
        x = Activation('relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        # FC block
        x = Flatten()(x)
        x = Dense(500)(x)
        x = Activation('relu')(x)
        # regression head: linear activation, no squashing of the output
        x = Dense(num_classes)(x)
        x = Activation('linear', name='age_out')(x)
        return x

    @staticmethod
    def build(age_number, eth_number, gen_number, input_shape=None):
        """Assemble the three branches into one model sharing a single input.

        Args:
            age_number: units of the age output layer.
            eth_number: number of ethnicity classes.
            gen_number: number of gender classes.
            input_shape: shape of one input image; defaults to the notebook-level
                ``INPUT_SHAPE`` when omitted.

        Returns:
            The uncompiled Keras ``Model`` named ``ethgennet`` whose outputs are
            ordered ``[age, ethnicity, gender]``.
        """
        if input_shape is None:
            input_shape = INPUT_SHAPE
        inputs = Input(shape=input_shape)
        age_branch = EthGenNet.age_model(inputs=inputs, num_classes=age_number)
        ethnicity_branch = EthGenNet.ethnicity_model(inputs=inputs, num_classes=eth_number)
        gender_branch = EthGenNet.gender_model(inputs=inputs, num_classes=gen_number)
        model = Model(
            inputs=inputs,
            outputs=[age_branch, ethnicity_branch, gender_branch],
            name='ethgennet'
        )
        # summary() prints the table itself and returns None, so it must not be
        # interpolated into an f-string (that would print a literal "None")
        print('Model Structure:')
        model.summary()
        return model


In [None]:
# Parse the CSV into the image array and the raw label matrix, logging progress every 5000 rows.
(data, labels) = load_et_dataset(datasetpath=DATASET_PATH, verbose=5000)

In [None]:
# Split the label matrix column-wise: column 0 = age, 1 = ethnicity, 2 = gender
age_label = labels[:, 0].astype('uint8')
eth_label = labels[:, 1].astype('uint8')
gen_label = labels[:, 2].astype('uint8')

# Lookup helpers used later to turn model outputs back into readable values
age_mapping = age_label.max()  # largest age present in the dataset
ethnicity_mapping = np.array(["White", "Black", "Asian", "Indian", "Hispanic"])
gender_mapping = np.array(["Male", "Female"])


In [None]:
# Exploratory plots of the label distributions
df = pd.DataFrame(labels, columns=['age', 'ethnicity', 'gender'])
# one histogram per label (author's note: a lot of people in their 30's)
df.plot.hist(y='ethnicity', title='Ethnicity')
df.plot.hist(y='age', title='age')
df.plot.hist(y='gender', title='gender')
# age against the two categorical labels; the very low alpha makes dense regions stand out
df.plot.scatter(x='ethnicity', y='age', alpha=0.01, title='ethnicity vs age')
df.plot.scatter(x='gender', y='age', alpha=0.01, title='gender vs age')


In [None]:

# Encode the targets for training.
# NOTE: despite the `_ohe` suffix the age is NOT one-hot encoded: it is min-max
# scaled (MinMaxScaler default range) because the age output is a regression head.
age_column = age_label.reshape(-1, 1)  # the scaler expects a 2-D (samples, features) array
age_label_ohe = MinMaxScaler().fit_transform(age_column)
# ethnicity: binarized labels, used as-is for the softmax output
eth_label_ohe = LabelBinarizer().fit_transform(eth_label)
# gender: binarize first, then expand with to_categorical for the softmax output
gen_label_ohe = to_categorical(LabelBinarizer().fit_transform(gen_label))

In [None]:
# init the model, with the respective outputs
model = EthGenNet.build(age_number=1, eth_number=5, gen_number=2)
# Hand the configured Adam instance to compile(): passing the plain optimizer
# name instead would silently ignore INIT_LR and the learning-rate decay.
opt = Adam(learning_rate=INIT_LR, decay=INIT_LR / EPOCHS)
model.compile(loss=LOSSES, optimizer=opt, metrics=METRICS, loss_weights=LOSS_WEIGHT)

In [None]:
print(f'[INFO] splitting the data using seed {SEED}')

# 80/20 hold-out split of the images and all three targets, stratified on the
# encoded ethnicity labels and made repeatable through SEED
split_parts = train_test_split(
    data,
    age_label_ohe,
    eth_label_ohe,
    gen_label_ohe,
    test_size=0.2,
    stratify=eth_label_ohe,
    random_state=SEED,
)
(
    train_X, test_X,
    train_age_y, test_age_y,
    train_eth_y, test_eth_y,
    train_gen_y, test_gen_y,
) = split_parts


In [None]:
# preparing a better train with callbacks
# Keras logs per-output values as <output>_loss / <output>_<metric> and their
# validation counterparts with a "val_" prefix (e.g. val_eth_out_loss,
# val_gen_out_accuracy, val_age_out_mean_absolute_error).

# One early-stopping callback per output; each restores the weights of its own
# best epoch when it is the one that stops the training.
es_age = EarlyStopping(
    # validation MAE of the age output ("mean_absolute_error" is listed in METRICS);
    # a key that is not logged would make this callback a no-op
    monitor="val_age_out_mean_absolute_error",
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True)

es_eth = EarlyStopping(
    monitor="val_eth_out_loss",
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True)

es_gen = EarlyStopping(
    monitor="val_gen_out_loss",
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True)
# training the model; targets are passed by output name
H = model.fit(
    train_X,
    y={'age_out': train_age_y,'eth_out': train_eth_y, 'gen_out': train_gen_y},
    batch_size=BS,
    validation_data=(test_X, {'age_out': test_age_y, 'eth_out': test_eth_y, 'gen_out': test_gen_y}),
    steps_per_epoch=len(train_X) // BS,
    epochs=EPOCHS,
    # class_weight=class_weights,
    callbacks=[es_age, es_eth, es_gen],
    verbose=1)
print('[INFO] finished training')



In [None]:
# after training: persist the model and the training curves
# make sure the target folder exists before anything is written into it
os.makedirs(OUTPUT_PATH, exist_ok=True)
time = datetime.now().strftime('%y-%m-%d %H:%M:%S')
# honour the MODEL_SAVE switch from the config cell
if MODEL_SAVE:
    model.save(f"{OUTPUT_PATH}/best_at{time}.hdf5")
save_plot(H, f"{OUTPUT_PATH}/best_at{time}plot.png")

In [None]:
# Draw 9 random test indices (with replacement) and predict on the matching images
# (make a function in helper)
nine_random_numbers = np.random.randint(0, len(test_X), size=9, dtype='int64')
# fancy indexing gathers the selected images into one batch
nine_random_examples = test_X[nine_random_numbers]
# single-image batch built from the first test sample
test = test_X[0][np.newaxis, ...]
prediction = model.predict(nine_random_examples)


In [None]:
# The ages were min-max scaled: scaled = (age - min) / (max - min).
# Undo it with age = scaled * (max - min) + min; multiplying by the maximum
# alone ignores the offset of the minimum age.
age_min = float(age_label.min())
age_span = float(age_label.max()) - age_min

# prediction holds one array per model output, in the order age, ethnicity, gender
pred_ages = [
    int(np.round(scaled_age * age_span + age_min))
    for scaled_age in np.ravel(prediction[0])
]

# argmax picks exactly one class per sample (also on ties); indexing with a
# one-element list keeps every entry a 1-element array, as later cells expect
pred_ethnicity = [ethnicity_mapping[[np.argmax(pred)]] for pred in prediction[1]]
pred_genders = [gender_mapping[[np.argmax(pred)]] for pred in prediction[2]]

real_ages = []
real_ethnicity = []
real_genders = []

for i in nine_random_numbers:
    real_ages.append(int(np.round(float(np.ravel(test_age_y[i])[0]) * age_span + age_min)))
    real_ethnicity.append(ethnicity_mapping[[np.argmax(test_eth_y[i])]])
    real_genders.append(gender_mapping[[np.argmax(test_gen_y[i])]])

# bring the sampled images back from [0, 1] to integer grey levels for display
pictures = [(test_X[i] * 255).astype(int) for i in nine_random_numbers]


In [None]:
# Drop the length-1 axes so every entry can be formatted directly in the plot titles
pred_ethnicity = np.asarray(pred_ethnicity).squeeze()
pred_genders = np.asarray(pred_genders).squeeze()
real_ages = np.asarray(real_ages).squeeze()


In [None]:
# Grid of the sampled faces: prediction in the title, ground truth below the image
cols = 3
rows = math.ceil(len(pictures) / cols)
# the 'seaborn' style sheet is named 'seaborn-v0_8' in newer Matplotlib releases
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
fig, axs = plt.subplots(nrows=rows, ncols=cols)
for i, ax in enumerate(axs.flat):
    if i == len(pictures):
        break
    # replicate the single grey channel three times to get an RGB image
    image = np.stack((pictures[i], pictures[i], pictures[i]), axis=2)
    image = np.squeeze(image)
    ax.set(xticks=[], yticks=[])
    ax.set_title(f'Predicted:\n {pred_genders[i]} {pred_ethnicity[i]}, {pred_ages[i]}')
    ax.set_xlabel(f'Real:\n {real_genders[i][0]} {real_ethnicity[i][0]}, {real_ages[i]}')
    ax.imshow(image)

plt.tight_layout()
# write next to the other artifacts; savefig() does not create missing folders
os.makedirs(OUTPUT_PATH, exist_ok=True)
plt.savefig(os.path.join(OUTPUT_PATH, 'comparision.png'))
plt.show()
