**Overview:**

This notebook builds a single multi-task model that predicts age, gender and race from the UTKFace dataset, which is available on Kaggle: https://www.kaggle.com/datasets/jangedoo/utkface-new.
The dataset contains approximately 20,000 images, each with three labels (age, gender and race).

**Very important notice:**
- The purpose of this notebook is to experiment with transfer learning, image data generators, augmentation, and a single model with three branches (age, race and gender).
- The model is a prototype trained for only 20 epochs and still needs to be tuned.



# **Import Libraries**

In [None]:
#importing required libraries
import numpy as np 
import pandas as pd
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from keras.utils import to_categorical
from PIL import Image
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# **Get data from kaggle**

In [None]:
#get the dataset from kaggle
# NOTE: this cell only runs inside Google Colab / IPython -- the lines starting
# with "!" are shell commands, not Python.
# Install the Kaggle command-line client.
! pip install -q kaggle
from google.colab import files
# Upload dialog; presumably used to provide the kaggle.json API token that is
# copied below -- verify before running.
files.upload()
# Put the token where the Kaggle CLI is pointed at and make it readable and
# writable by the owner only (mode 600).
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
# Download the UTKFace archive and unpack it into ./utkface-new
! kaggle datasets download -d jangedoo/utkface-new
! unzip utkface-new.zip -d utkface-new

# **Prepare the dataframe, and scale age column**

In [None]:
# Folder holding the extracted images, and the square image side length (pixels).
dataset_folder_name = 'utkface-new/UTKFace'
IM_HEIGHT = 198
IM_WIDTH = IM_HEIGHT

# Lookup tables for the categorical labels: "<task>_id" maps id -> label text.
dataset_dict = {
    'race_id': dict(enumerate(['white', 'black', 'asian', 'indian', 'others'])),
    'gender_id': dict(enumerate(['male', 'female'])),
}

# "<task>_alias" is the inverse table (label text -> id).
dataset_dict['gender_alias'] = {
    label: idx for idx, label in dataset_dict['gender_id'].items()
}
dataset_dict['race_alias'] = {
    label: idx for idx, label in dataset_dict['race_id'].items()
}

In [None]:
def parse_dataset(dataset_path, ext='jpg'):
    """
    Build a DataFrame describing every image found directly inside a folder.

    File names are expected to follow ``<age>_<gender>_<race>_<suffix>.<ext>``,
    where ``gender`` and ``race`` are integer ids that are looked up in
    ``dataset_dict``. Files whose names do not follow that pattern (wrong
    number of fields, non-numeric fields, unknown ids) are dropped.

    Args:
        dataset_path: Folder that contains the image files.
        ext: File extension (without the dot) of the images to list.

    Returns:
        A DataFrame with the columns ``age`` (number), ``gender`` (label
        text), ``race`` (label text) and ``file`` (path). It is empty, but
        still has these four columns, when nothing matches.
    """
    label_columns = ['age', 'gender', 'race']

    def parse_info_from_file(path):
        """
        Parse (age, gender, race) from one file name; all None if malformed.
        """
        stem = os.path.splitext(os.path.basename(path))[0]
        try:
            age, gender, race, _ = stem.split('_')
            return (
                int(age),
                dataset_dict['gender_id'][int(gender)],
                dataset_dict['race_id'][int(race)],
            )
        except (ValueError, KeyError):
            # ValueError: wrong number of "_" fields or a non-integer field.
            # KeyError: an id that is not present in dataset_dict.
            return None, None, None

    files = glob.glob(os.path.join(dataset_path, "*.%s" % ext))
    records = [parse_info_from_file(path) for path in files]

    # Naming the columns up front keeps this working when `files` is empty;
    # renaming after the fact fails with a column-count mismatch in that case.
    df = pd.DataFrame(records, columns=label_columns)
    df['file'] = files

    # Rows for malformed file names hold None and are removed here.
    return df.dropna()

In [None]:
from sklearn import preprocessing
# Parse the image folder, then turn the textual gender / race labels back into
# their integer ids using the "<task>_alias" lookup tables.
df = parse_dataset(dataset_folder_name)
for label_column, alias_table in (('gender', 'gender_alias'), ('race', 'race_alias')):
    df[label_column] = df[label_column].map(
        lambda label, table=dataset_dict[alias_table]: table[label]
    )

In [None]:
#scaling age column with a min-max scaler, then shuffling the rows
min_max_scaler = preprocessing.MinMaxScaler()
# The scaler works on 2-D input, hence the single-column reshape.
x = df['age'].values.reshape(-1, 1)
x_scaled = min_max_scaler.fit_transform(x)
df['age'] = x_scaled
df = shuffle(df)
df.head()

Unnamed: 0,age,gender,race,file
9684,0.095652,1,4,utkface-new/UTKFace/12_1_4_20170103200721583.j...
22913,0.66087,1,0,utkface-new/UTKFace/77_1_0_20170110160644117.j...
11557,0.382609,0,0,utkface-new/UTKFace/45_0_0_20170104172836234.j...
5999,0.269565,0,1,utkface-new/UTKFace/32_0_1_20170116002309943.j...
3913,0.704348,1,1,utkface-new/UTKFace/82_1_1_20170112224554283.j...


# **Wrapping the built-in image generator to fit the multi-output task**

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Label columns handed to the generator as targets.
columns = ["age", "gender", "race"]

# One generator configuration for pixel rescaling, random augmentation and the
# 80/20 training/validation split.
dataGen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=180,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
)
def generate_data_generator(generator, data,columns, data_type, toshuffle=True, tobreak=False, batchSize=32, target_size=(198, 198)):
    """
    Adapt ``flow_from_dataframe`` batches to a model with three outputs.

    The underlying iterator returns one label array per batch with one column
    per entry of ``columns``. This wrapper splits that array into a list of
    three 1-D arrays (columns 0, 1 and 2), one per model output.

    Args:
        generator: Object exposing ``flow_from_dataframe`` (an
            ``ImageDataGenerator`` in this notebook).
        data: DataFrame with a ``file`` column and the label columns.
        columns: Names of the three label columns, in output order.
        data_type: Subset passed on as ``subset`` ('training' / 'validation').
        toshuffle: Whether the underlying iterator shuffles the rows.
        tobreak: If True, stop after ``len(iterator)`` batches instead of
            looping forever.
        batchSize: Number of images per batch.
        target_size: (height, width) the images are resized to.

    Yields:
        Tuples ``(X, [y0, y1, y2])`` where ``X`` is the image batch and
        ``y0..y2`` are the label columns of that batch.
    """
    flow = generator.flow_from_dataframe(dataframe=data,
                                         directory="", x_col="file",
                                         y_col=columns,
                                         subset=data_type,
                                         class_mode="raw",
                                         target_size=target_size,
                                         batch_size=batchSize,
                                         shuffle=toshuffle)
    batches_done = 0
    while True:
        X, Y = next(flow)
        yield X, [Y[:, 0], Y[:, 1], Y[:, 2]]
        batches_done += 1
        # The counter counts batches, so it is compared with the iterator's
        # length (batches) rather than with the number of rows in `data`.
        if tobreak and batches_done >= len(flow):
            break
             

# **Build the CNN model**

**The model is based on a pretrained VGG16 backbone with a small dense head on top for each task (one branch per task)**

In [None]:
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Lambda
from keras.layers.core import Dense
from keras.layers import Flatten
from keras.layers import Input
import tensorflow as tf

class MultiModel():
    """
    Builds one Keras model with three output branches (age, gender, race)
    on top of a frozen VGG16 convolutional base.
    """

    def _head(self, bottom_model, units, activation, name, D):
        """
        Attach Flatten -> Dense(D, relu) -> BatchNorm -> Dropout(0.5) ->
        Dense(units, activation) to ``bottom_model.output`` and return the
        output tensor. ``name`` is the name of the final layer.
        """
        branch = Flatten()(bottom_model.output)
        branch = Dense(D, activation="relu")(branch)
        branch = BatchNormalization()(branch)
        branch = Dropout(0.5)(branch)
        return Dense(units, activation=activation, name=name)(branch)

    def race_part(self, bottom_model, num_races, D=128):
        """Softmax head with ``num_races`` units, named ``race_output``."""
        return self._head(bottom_model, num_races, "softmax", "race_output", D)

    def gender_part(self, bottom_model, num_genders=2, D=128):
        """
        Single-unit sigmoid head named ``gender_output``.

        ``num_genders`` is accepted for interface compatibility only; the
        head always has one unit.
        """
        return self._head(bottom_model, 1, "sigmoid", "gender_output", D)

    def age_part(self, bottom_model, D=128):
        """Single-unit linear head named ``age_output``."""
        return self._head(bottom_model, 1, "linear", "age_output", D)

    def full_model(self, width, height, num_races):
        """
        Assemble the three-branch model.

        Args:
            width: Input image width in pixels.
            height: Input image height in pixels.
            num_races: Number of units of the race head.

        Returns:
            A Keras ``Model`` taking the VGG16 input and producing the
            outputs in the order [age, gender, race].
        """
        from keras.applications import VGG16

        # The input shape comes from the arguments rather than from the
        # module-level IM_WIDTH / IM_HEIGHT, so the method honours what the
        # caller asks for.
        backbone = VGG16(weights='imagenet', include_top=False,
                         input_shape=(height, width, 3))

        # Freeze the base so that only the three heads are trained.
        for layer in backbone.layers:
            layer.trainable = False

        age = self.age_part(backbone)
        race = self.race_part(backbone, num_races)
        gender = self.gender_part(backbone)

        return Model(inputs=backbone.input, outputs=[age, gender, race])
    
# Build the three-branch network; the race head gets one unit per known race label.
model = MultiModel().full_model(
    width=IM_WIDTH,
    height=IM_HEIGHT,
    num_races=len(dataset_dict['race_alias']),
)

# **Train the model**

**loss_weights can be tuned to prioritize one branch over others**

In [None]:
from keras.optimizers import Adam

# Initial learning rate and number of training epochs.
init_lr = 0.007
epochs = 20

# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
opt = Adam(learning_rate=init_lr, decay=init_lr / epochs)

# One loss, one weight and one metric per named output layer. The weights set
# how much each branch contributes to the total loss.
model.compile(optimizer=opt,
              loss={
                  'age_output': 'mse',
                  'gender_output': 'binary_crossentropy',
                  'race_output': 'sparse_categorical_crossentropy',
                  },
              loss_weights={
                  'age_output': 4.,
                  'gender_output': 3.,
                  'race_output': 3.,
                  },
              metrics={
                  'age_output': 'mae',
                  'gender_output': 'accuracy',
                  'race_output': 'sparse_categorical_accuracy',
                  })

In [None]:
from keras.callbacks import ModelCheckpoint

# Save the model to disk only when the monitored quantity improves.
checkpointer = ModelCheckpoint(
    filepath='multi_model.hdf5'
    , save_best_only=True
    , mode = 'auto'
)

# `Model.fit` accepts Python generators directly and replaces the deprecated
# `fit_generator`. `callbacks` is passed as a list, as the API documents.
# Both generators loop forever, so the step counts bound each epoch.
history = model.fit(generate_data_generator(dataGen, df, columns, 'training'),
                    epochs=epochs,
                    steps_per_epoch=362,
                    validation_data=generate_data_generator(dataGen, df, columns, 'validation'),
                    validation_steps=150,
                    callbacks=[checkpointer])


`Model.fit_generator` is deprecated and will be removed in a future version. Please use `Model.fit`, which supports generators.



Found 18964 validated image filenames.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# **Plot accuracy and error**

In [None]:
import plotly.graph_objects as go
plt.clf()

# Training vs. validation accuracy of the race branch, one point per epoch.
fig = go.Figure()
for trace_label, metric_key in (
        ('Train', 'race_output_sparse_categorical_accuracy'),
        ('Valid', 'val_race_output_sparse_categorical_accuracy')):
    fig.add_trace(go.Scatter(y=history.history[metric_key], name=trace_label))

fig.update_layout(title='Accuracy for race feature',
                  xaxis_title='Epoch',
                  yaxis_title='Accuracy',
                  width=600,
                  height=450)

# Keep a standalone copy of the chart next to the notebook.
fig.write_html('acc_race.html', include_plotlyjs='cdn')

fig.show()

<Figure size 432x288 with 0 Axes>

In [None]:
plt.clf()

# Training vs. validation accuracy of the gender branch, one point per epoch.
fig = go.Figure()
for trace_label, metric_key in (
        ('Train', 'gender_output_accuracy'),
        ('Valid', 'val_gender_output_accuracy')):
    fig.add_trace(go.Scatter(y=history.history[metric_key], name=trace_label))

fig.update_layout(title='Accuracy for gender feature',
                  xaxis_title='Epoch',
                  yaxis_title='Accuracy',
                  width=600,
                  height=450)

# Keep a standalone copy of the chart next to the notebook.
fig.write_html('acc_gender.html', include_plotlyjs='cdn')

fig.show()

<Figure size 432x288 with 0 Axes>

In [None]:
plt.clf()

# Training vs. validation mean absolute error of the age branch, per epoch.
fig = go.Figure()
for trace_label, metric_key in (
        ('Train', 'age_output_mae'),
        ('Valid', 'val_age_output_mae')):
    fig.add_trace(go.Scattergl(y=history.history[metric_key], name=trace_label))

fig.update_layout(title='Mean Absolute Error for age feature',
                  xaxis_title='Epoch',
                  yaxis_title='Mean Absolute Error',
                  width=600,
                  height=450)

# Keep a standalone copy of the chart next to the notebook.
fig.write_html('mae_age.html', include_plotlyjs='cdn')

fig.show()

<Figure size 432x288 with 0 Axes>

In [None]:
# Training vs. validation total loss, one point per epoch.
fig = go.Figure()
for trace_label, metric_key in (('Train', 'loss'), ('Valid', 'val_loss')):
    fig.add_trace(go.Scattergl(y=history.history[metric_key], name=trace_label))

fig.update_layout(title='Overall loss',
                  xaxis_title='Epoch',
                  yaxis_title='Loss',
                  width=600,
                  height=450)

# Keep a standalone copy of the chart next to the notebook.
fig.write_html('overall_loss.html', include_plotlyjs='cdn')

fig.show()