In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Load the face dataset. Later cells read the 'age', 'ethnicity' and 'gender'
# columns as labels and the 'pixels' column as a whitespace-separated string
# of pixel values that is reshaped to 48x48.
data = pd.read_csv('../input/age-gender-and-ethnicity-face-data-csv/age_gender.csv')

In [None]:
# Preview the first rows to check the columns and the raw 'pixels' format.
data.head()

In [None]:
# Sample some pictures

# Parse only the rows being displayed, and keep the result local: `data` is
# left untouched so this cell can be re-run without failing on pixels that
# were already converted to arrays.
plt.figure(figsize=(16,16))
for slot, i in enumerate(range(2500,2510), start=1):
    image = np.array(data['pixels'].iloc[i].split(), dtype="float32").reshape(48,48)
    plt.subplot(5,5,slot)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # Single-channel image: use a gray colormap instead of the default
    # false-colour one.
    plt.imshow(image, cmap='gray')
    plt.xlabel(
        "Age:"+str(data['age'].iloc[i])+
        "  Ethnicity:"+str(data['ethnicity'].iloc[i])+
        "  Gender:"+ str(data['gender'].iloc[i])
    )
plt.show()

# DATA PREPROCESSING

In [None]:
# Reshape the pictures' pixels

# Reload the CSV so that 'pixels' is guaranteed to hold the raw strings,
# whatever earlier cells did to `data`.
data = pd.read_csv('../input/age-gender-and-ethnicity-face-data-csv/age_gender.csv')

# Each 'pixels' entry is a whitespace-separated list of integer pixel values.
# Parse with the builtin `int` dtype: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.stack(
    [np.array(pixel_string.split(), dtype=int) for pixel_string in data['pixels']],
    axis=0,
)

# One 48x48 single-channel image per row: (n_samples, 48, 48, 1).
pictures = np.reshape(X, (-1, 48, 48, 1))

In [None]:
# Extract the label columns as standalone NumPy arrays (copies, so later
# changes to `data` do not affect them).

y_ethnicity = data['ethnicity'].to_numpy(copy=True)
y_gender = data['gender'].to_numpy(copy=True)

In [None]:
# Split the data into training, validation, and test sets for each task

random_seed = 2

# Gender: 70% train; the remaining 30% is split evenly into validation and test.
x_gender_train, x_gender_tv, y_gender_train, y_gender_tv = train_test_split(
    pictures, y_gender, test_size=0.3, random_state=random_seed)

x_gender_val, x_gender_test, y_gender_val, y_gender_test = train_test_split(
    x_gender_tv, y_gender_tv, test_size=0.5, random_state=random_seed)

# Ethnicity: 90% train; the held-out 10% is split evenly into validation and
# test, mirroring the gender split. Using test_size=0.1 for this second split
# would leave only about 1% of the data for testing, which is too small for a
# stable accuracy estimate.
x_ethnicity_train, x_ethnicity_tv, y_ethnicity_train, y_ethnicity_tv = train_test_split(
    pictures, y_ethnicity, test_size=0.1, random_state=random_seed)

x_ethnicity_val, x_ethnicity_test, y_ethnicity_val, y_ethnicity_test = train_test_split(
    x_ethnicity_tv, y_ethnicity_tv, test_size=0.5, random_state=random_seed)


# Data Augmentation
# Random geometric and brightness perturbations applied on the fly to each
# training batch. `rescale` multiplies the generated images by 1/255.
# NOTE: the rescaling only happens for batches produced by this generator;
# arrays passed directly to fit/evaluate/predict are not rescaled by it.
datagen = ImageDataGenerator(rescale=1/255,
                             rotation_range=20,
                             brightness_range=[0.8,1.2],
                             zoom_range=0.2,
                             shear_range = 0.2,
                             width_shift_range=0.2,
                             height_shift_range=0.2,
                             horizontal_flip=True)
# No datagen.fit(...) call is needed: fit() only computes statistics for
# featurewise_center, featurewise_std_normalization and zca_whitening, none of
# which are enabled here, so calling it would just copy the training arrays.

# MODEL FOR PREDICTING GENDER

In [None]:
# Build the gender classifier: four Conv2D + MaxPooling2D stages followed by a
# dense head that ends in a single sigmoid unit (binary output).

gender_model = tf.keras.models.Sequential()

# Convolutional feature extractor on 48x48 single-channel input
gender_model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=(3,3), activation='relu',
                                        input_shape=(48,48,1)))
gender_model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))
gender_model.add(tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), activation='relu'))
gender_model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))
gender_model.add(tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), activation='relu'))
gender_model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))
gender_model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=(3,3), activation='relu'))
gender_model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))

# Dense classification head with dropout
gender_model.add(tf.keras.layers.Flatten())
gender_model.add(tf.keras.layers.Dense(units=512, activation='relu'))
gender_model.add(tf.keras.layers.Dropout(rate=0.5))
gender_model.add(tf.keras.layers.Dense(units=8, activation='relu'))
gender_model.add(tf.keras.layers.Dropout(rate=0.2))
gender_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

gender_model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

gender_model.summary()

In [None]:
# Augmented training batches; the generator rescales them by 1/255.
gender_train_batches = datagen.flow(x_gender_train, y_gender_train, batch_size=64)

gender_history = gender_model.fit(
    gender_train_batches,
    # The generator's rescaling is not applied to arrays passed directly, so
    # scale the validation images the same way as the training batches.
    validation_data=(x_gender_val / 255.0, y_gender_val),
    epochs=15,
    verbose=2,
    # Whole number of batches per epoch (len of the iterator rounds up);
    # a float such as len(x) / 64 is not a valid step count.
    steps_per_epoch=len(gender_train_batches))

In [None]:
# Plot the performance history of the model

acc = gender_history.history['accuracy']
val_acc = gender_history.history['val_accuracy']
loss = gender_history.history['loss']
val_loss = gender_history.history['val_loss']

epochs = range(len(acc))

# Figure 1: accuracy per epoch
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc=0)

# Figure 2: loss per epoch (so the second figure is not left empty and the
# loss history is actually shown)
plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc=0)

plt.show()

In [None]:
# Evaluate the model on the test set
# The model is trained on generator batches rescaled by 1/255, so the test
# images must be scaled the same way before evaluation.

gender_model.evaluate(x_gender_test / 255.0, y_gender_test)

We reach 86.28% accuracy on the test set with this relatively simple CNN that uses data augmentation.

# MODEL FOR PREDICTING ETHNICITY

In [None]:
# First, let's find which distinct ethnicity labels exist; the classifier
# built below ends in a 5-unit softmax layer, one unit per class.

data['ethnicity'].unique()

In [None]:
# Build the ethnicity classifier: the same convolutional stack as the gender
# model, ending in a 5-way softmax for the multi-class output.

ethnicity_model = tf.keras.models.Sequential()

# Convolutional feature extractor on 48x48 single-channel input
ethnicity_model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=(3,3), activation='relu',
                                           input_shape=(48,48,1)))
ethnicity_model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))
ethnicity_model.add(tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), activation='relu'))
ethnicity_model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))
ethnicity_model.add(tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), activation='relu'))
ethnicity_model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))
ethnicity_model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=(3,3), activation='relu'))
ethnicity_model.add(tf.keras.layers.MaxPooling2D(pool_size=2, strides=2))

# Dense classification head with dropout
ethnicity_model.add(tf.keras.layers.Flatten())
ethnicity_model.add(tf.keras.layers.Dense(units=512, activation='relu'))
ethnicity_model.add(tf.keras.layers.Dropout(rate=0.5))
ethnicity_model.add(tf.keras.layers.Dense(units=8, activation='relu'))
ethnicity_model.add(tf.keras.layers.Dropout(rate=0.2))
ethnicity_model.add(tf.keras.layers.Dense(units=5, activation='softmax'))

ethnicity_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

ethnicity_model.summary()

In [None]:
# Augmented training batches; the generator rescales them by 1/255.
ethnicity_train_batches = datagen.flow(x_ethnicity_train, y_ethnicity_train, batch_size=64)

ethnicity_history = ethnicity_model.fit(
    ethnicity_train_batches,
    # The generator's rescaling is not applied to arrays passed directly, so
    # scale the validation images the same way as the training batches.
    validation_data=(x_ethnicity_val / 255.0, y_ethnicity_val),
    epochs=15,
    verbose=2,
    # Whole number of batches per epoch (len of the iterator rounds up);
    # a float such as len(x) / 64 is not a valid step count.
    steps_per_epoch=len(ethnicity_train_batches))

In [None]:
# Plot the performance history of the model

acc = ethnicity_history.history['accuracy']
val_acc = ethnicity_history.history['val_accuracy']
loss = ethnicity_history.history['loss']
val_loss = ethnicity_history.history['val_loss']

epochs = range(len(acc))

# Figure 1: accuracy per epoch
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc=0)

# Figure 2: loss per epoch (so the second figure is not left empty and the
# loss history is actually shown)
plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc=0)

plt.show()

In [None]:
# Evaluate the model on the test set
# The model is trained on generator batches rescaled by 1/255, so the test
# images must be scaled the same way before evaluation.

ethnicity_model.evaluate(x_ethnicity_test / 255.0, y_ethnicity_test)

We get an accuracy of 63.87% on the test set for predicting ethnicity. This is much lower than the accuracy I saw during training, but that can happen, and since this is only a basic CNN it makes sense that the performance is not great.

Credit to Gabriel Atkin and Shahraiz for some of the preprocessing code.