In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.data import AUTOTUNE
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D, SpatialDropout2D, MaxPool2D, Dense, Input, Flatten, BatchNormalization

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the face dataset and discard the image file-name column straight away.
csv_path = '/kaggle/input/age-gender-and-ethnicity-face-data-csv/age_gender.csv'
df = pd.read_csv(csv_path).drop(columns = 'img_name')
df.head()

Each image is stored in the `pixels` column as a flat, space-separated string of pixel values. The images are 48x48, so we'll convert these strings into 2-D arrays in a bit.

In [None]:
# Number of distinct values in each of the three label columns.
label_columns = ['age', 'ethnicity', 'gender']
df[label_columns].nunique()

In [None]:
# --- Output-head sizes -------------------------------------------------------
# NOTE(review): AGE_UNIQUE is not referenced in the visible cells; the age head
# in ident_model is a single regression unit.
AGE_UNIQUE = 120
# Width of the softmax ethnicity head in ident_model.
ETH_UNIQUE = 5
# Width of the sigmoid gender head in ident_model: one output unit, not a
# count of gender classes.
GENDER_UNIQUE = 1

# 2-D shape every pixel string is reshaped to; also the spatial part of the
# model's input shape.
IMAGE_SIZE = (48, 48)
# NOTE(review): not referenced in the visible cells; ident_model uses a
# dropout rate of 0.3 instead.
DROP_RATE = 0.5
# NOTE(review): the training cell passes a literal epochs = 50 rather than
# this constant.
EPOCHS = 30
# Learning rate handed to the Adam optimizer when the model is compiled.
LEARNING_RATE = 1e-3
# NOTE(review): not passed to model.fit in the visible cells.
BATCH_SIZE = 512
# Passed to train_test_split as random_state so the split is repeatable.
RANDOM_SEED = 2

In [None]:
# Histogram of ages. The y-axis is capped at 100 (presumably so the sparsely
# populated ages stay readable); bars taller than that are clipped.
sns.displot(df['age'])
plt.ylim(0, 100)

# Keep the count table as the cell's last expression so it is actually shown.
# As a bare statement in the middle of the cell its result was discarded.
df['age'].value_counts()

In [None]:
# Samples per ethnicity class, most frequent first.
ethnicity_counts = df['ethnicity'].value_counts()
ethnicity_counts

There are five different ethnicities in this dataset: classes 1-3 have fairly similar counts, class 4 is the least represented, and class 0 is heavily overrepresented. This imbalance could cause problems, so I'm going to stratify the train/validation split on ethnicity.

In [None]:
# Samples per gender label, most frequent first.
gender_counts = df['gender'].value_counts()
gender_counts

In [None]:
def _pixels_to_image(pixel_string):
    """Parse one space-separated pixel string into an IMAGE_SIZE float array divided by 255."""
    values = np.array(pixel_string.split(' '), dtype = float)
    return values.reshape(IMAGE_SIZE) / 255


# Parsed images go into a temporary column; the raw strings stay in 'pixels'.
df['pixels_test'] = df['pixels'].map(_pixels_to_image)

Converting the string of pixels into a NumPy array.

In [None]:
# Overwrite the raw pixel strings with the parsed image arrays, then drop the
# temporary column that held them.
df = df.assign(pixels = df['pixels_test']).drop(columns = 'pixels_test')

In [None]:
def ident_model(drop_rate = 0.3):
    """Build the multi-output CNN that predicts age, ethnicity and gender.

    A shared convolutional trunk (six 3x3 'same'-padded ReLU convolutions with
    16, 16, 32, 32, 64 and 128 filters, four 2x2 max-pools, batch
    normalisation and spatial dropout) is flattened and fed to three heads:

    * ``Final_Age``    - two 256-unit ReLU layers, then a single ReLU unit.
    * ``Final_ETH``    - two 256-unit ReLU layers, then an ``ETH_UNIQUE``-way
      softmax.
    * ``Final_GENDER`` - a ``GENDER_UNIQUE``-unit sigmoid layer applied
      directly to the flattened features.

    Args:
        drop_rate: Rate used by all three ``SpatialDropout2D`` layers.
            Defaults to 0.3.

    Returns:
        An uncompiled ``Model`` named ``'Age_Eth_Gender_Model'`` whose input
        has shape ``(*IMAGE_SIZE, 1)`` and whose outputs are ordered
        ``[Final_Age, Final_ETH, Final_GENDER]``.
    """
    def conv(filters, index):
        # Every trunk convolution shares kernel size, padding and activation;
        # only the filter count and the numbered layer name vary.
        return Conv2D(filters, 3, activation = 'relu', name = f'Conv_{index}', padding = 'same')

    input_layer = Input(shape = (*IMAGE_SIZE, 1), name = 'Input_Layer')

    # Stage 1: two 16-filter convolutions -> dropout -> pool -> batch norm.
    x = conv(16, 1)(input_layer)
    x = conv(16, 2)(x)
    x = SpatialDropout2D(drop_rate)(x)
    x = MaxPool2D((2,2), name = 'Pool_1')(x)
    x = BatchNormalization()(x)

    # Stage 2: two 32-filter convolutions -> dropout -> pool -> batch norm.
    x = conv(32, 3)(x)
    x = conv(32, 4)(x)
    x = SpatialDropout2D(drop_rate)(x)
    x = MaxPool2D((2,2), name = 'Pool_2')(x)
    x = BatchNormalization()(x)

    # Stage 3: one 64-filter convolution -> pool -> batch norm.
    x = conv(64, 5)(x)
    x = MaxPool2D((2,2), name = 'Pool_3')(x)
    x = BatchNormalization()(x)

    # Stage 4: one 128-filter convolution -> pool -> batch norm -> dropout.
    x = conv(128, 6)(x)
    x = MaxPool2D((2,2), name = 'Pool_4')(x)
    x = BatchNormalization()(x)
    x = SpatialDropout2D(drop_rate)(x)

    flat = Flatten()(x)

    # Age head: regression to a single non-negative value.
    age = Dense(256, activation = 'relu', name = 'Age_Dense_1')(flat)
    age = Dense(256, activation = 'relu', name = 'Age_Dense_2')(age)
    final_age = Dense(1, activation = 'relu', name = 'Final_Age')(age)

    # Ethnicity head: class probabilities.
    eth = Dense(256, activation = 'relu', name = 'Eth_Density_1')(flat)
    eth = Dense(256, activation = 'relu', name = 'Eth_Density_2')(eth)
    final_eth = Dense(ETH_UNIQUE, activation = 'softmax', name = 'Final_ETH')(eth)

    # Gender head: a sigmoid layer directly on the shared features, with no
    # hidden layer of its own.
    final_gender = Dense(GENDER_UNIQUE, activation = 'sigmoid', name = 'Final_GENDER')(flat)

    return Model(inputs = input_layer, outputs = [final_age, final_eth, final_gender], name = 'Age_Eth_Gender_Model')
    

In [None]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation and batch shuffling repeat across runs (some GPU kernels may
# still introduce small non-determinism).
tf.keras.utils.set_random_seed(RANDOM_SEED)

model = ident_model()

# One loss and one metric per output head, keyed by the output layer's name.
model.compile(
    optimizer = Adam(LEARNING_RATE),
    loss = {
        'Final_Age': 'mean_squared_error',
        'Final_ETH': 'sparse_categorical_crossentropy',
        'Final_GENDER': 'binary_crossentropy',
    },
    metrics = {
        'Final_Age': 'mean_absolute_error',
        'Final_ETH': 'sparse_categorical_accuracy',
        'Final_GENDER': 'accuracy',
    },
)


In [None]:
# Render the layer graph, annotated with tensor shapes, to model.jpg.
plot_model(
    model,
    to_file = 'model.jpg',
    show_shapes = True,
)

The model stacks pairs of Conv2D layers before max pooling in its first two stages. The additional conv layer allows for more feature extraction before the dimensions are reduced, and I saw a slight boost in accuracy from doing so. However, training that model gave excellent eth/gender accuracy (>94%) on the training set but a much more modest 70-80% on the validation set, meaning that I was overfitting. The SpatialDropout2D layers should help with the overfitting.

In [None]:
# Pull the images and labels out of the frame as dense NumPy arrays in one
# vectorised step each, instead of filling preallocated arrays row by row.
# Assumes the 'pixels' column already holds the parsed IMAGE_SIZE arrays.
pixels = np.stack(df['pixels'].tolist())
age = df['age'].to_numpy(dtype = 'int32')
eth = df['ethnicity'].to_numpy(dtype = 'int')
gender = df['gender'].to_numpy(dtype = 'int')

# Split all four arrays with the same shuffled indices. Stratifying on
# ethnicity keeps its class proportions equal in both parts; no test_size is
# given, so scikit-learn's default hold-out fraction applies.
(X_train, X_val,
 age_train, age_val,
 eth_train, eth_val,
 gender_train, gender_val) = train_test_split(
    pixels, age, eth, gender,
    stratify = eth,
    random_state = RANDOM_SEED,
)

Setting up the training/validation split. I decided to stratify on eth since gender should be relatively easy to distinguish, and age is relatively hard (compare a 20-year-old to a 25-year-old to a 30-year-old). Also, eth has a major imbalance in the case of class 0, and a minor imbalance in the case of class 4.

In [None]:
# Stop once the total validation loss has failed to improve for 5 consecutive
# epochs, and roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

# Targets are listed in the same order as the model's outputs
# (age, ethnicity, gender).
# NOTE(review): epochs is a literal 50 even though an EPOCHS constant exists;
# left unchanged here so the training schedule stays the same.
history = model.fit(
    x = X_train,
    y = [age_train, eth_train, gender_train],
    validation_data = (X_val, [age_val, eth_val, gender_val]),
    epochs = 50,
    # Keras documents `callbacks` as a list of Callback instances.
    callbacks = [early_stopping],
)

Validation Scores:

Age MAE = 6.1

Eth Accuracy = 72.9%

Gender Accuracy = 84.6%

I'm mostly satisfied with these values for now.

In [None]:
# Total validation and training loss, one point per completed epoch.
epochs = range(len(history.history['val_loss']))
ax = plt.gca()
for history_key, colour, legend_label in (
    ('val_loss', 'r', 'Validation'),
    ('loss', 'b', 'Training'),
):
    sns.lineplot(x = epochs, y = history.history[history_key], color = colour, label = legend_label, ax = ax)
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.set_title('Loss vs Epoch')
ax.legend()

In [None]:
# Validation accuracy per epoch for the two classification heads.
sns.lineplot(x = epochs, y = history.history['val_Final_GENDER_accuracy'], label = 'Gender')
sns.lineplot(x = epochs, y = history.history['val_Final_ETH_sparse_categorical_accuracy'], label = 'Ethnicity')
plt.xlabel('Epoch')
# These curves are accuracies, so the axis must not be labelled 'Loss'.
plt.ylabel('Accuracy')
plt.title('Validation Accuracy vs Epoch')
plt.legend()

In [None]:
# Run the validation images through the trained network. The result holds one
# array per output head, in the order the model declares them:
# age, ethnicity, gender.
predictions = model.predict(x = X_val)

In [None]:
# Age head: flatten the single-column regression output and round to the
# nearest whole year. A bare astype(int) truncates toward zero, which biases
# every predicted age downward.
pred_ages = np.rint(predictions[0].reshape(-1)).astype(int)

# Ethnicity head: index of the most probable class in each row.
pred_eth = np.argmax(predictions[1], axis = 1)

# Gender head: threshold the sigmoid output at 0.5. The comparison builds a
# new array, so the probabilities stored in `predictions` are left untouched
# (in-place assignment through the reshaped view would overwrite them).
pred_gender = (predictions[2].reshape(-1) >= 0.5).astype(int)

- `pred_ages` starts off as an array with one column per sample, hence the reshape to a flat vector.
- `pred_eth` starts off as an array with one row per sample, each holding the 5 class probabilities. Argmax gives the class with the highest probability.
- `pred_gender` starts off as values between 0 and 1; I binarize them at 0.5 to get the predicted gender.

Let's see some examples.

In [None]:
# Show the first 25 validation faces in a 5x5 grid, each titled with the
# predicted and actual value for every head.
fig, axes = plt.subplots(5, 5, figsize = (20, 20))
axes = axes.flatten()
for i in range(25):
    # The images are single-channel; without an explicit colormap imshow
    # renders them in false colour.
    axes[i].imshow(X_val[i], cmap = 'gray')
    axes[i].set_title(f'''    (Predicted, Actual)
                Age ({pred_ages[i]}, {age_val[i]})
                Eth ({pred_eth[i]}, {eth_val[i]} )
                Gender ({pred_gender[i]}, {gender_val[i]})''')
fig.tight_layout()

I'm pretty satisfied with these quick examples