# Age, Gender and Ethnicity Predictions

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from keras.models import Sequential
from keras.layers import Conv2D, Dropout, BatchNormalization, Flatten, Dense, MaxPooling2D
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import mean_absolute_error, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, roc_curve, auc, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
# `tf` is only a conventional alias, not an importable package; the module is `tensorflow`
# (the same package this file imports plot_model and load_model from).
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

In [None]:
def plot_roc_curve(true_y, y_prob):
    """
    Plot the ROC curve for the given scores with matplotlib's pyplot interface.

    true_y: ground-truth labels, forwarded to roc_curve as the true targets.
    y_prob: predicted scores/probabilities, forwarded to roc_curve as the scores.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(true_y, y_prob)

    plt.plot(false_pos_rate, true_pos_rate)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

In [None]:
# Load the dataset; later cells read its 'age', 'gender', 'ethnicity' and 'pixels' columns.
df = pd.read_csv('age_gender.csv')

In [None]:
# Preview the first rows.
df.head()


In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Age distribution.
# sns.distplot is deprecated in seaborn; histplot with kde=True and stat='density'
# draws the equivalent histogram with a KDE overlay on a density scale.
sns.histplot(df['age'], kde=True, stat='density');

In [None]:
# Gender class balance.
# Pass the column by keyword: seaborn >= 0.12 interprets the first positional
# argument as `data` rather than `x`, so a bare Series no longer gives per-class counts.
sns.countplot(x=df['gender'])

In [None]:
# Ethnicity class balance.
# Pass the column by keyword: seaborn >= 0.12 interprets the first positional
# argument as `data` rather than `x`, so a bare Series no longer gives per-class counts.
sns.countplot(x=df['ethnicity'])


In [None]:
def _parse_pixels(pixel_string):
    """Convert one space-separated pixel string into a 48x48 float32 array."""
    tokens = pixel_string.split(' ')
    return np.array(tokens, dtype=np.float32).reshape(48, 48)


df['pixels'] = df['pixels'].map(_parse_pixels)

In [None]:
# Augmentation settings: random rotations, shifts, zooms and horizontal flips.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)
datagen = ImageDataGenerator(**augmentation_options)

# Gather the per-row 48x48 images into one array and append a channel axis.
X = np.array(df['pixels'].to_list())
n_images = X.shape[0]
X_reshaped = X.reshape(n_images, 48, 48, 1)

datagen.fit(X_reshaped)

In [None]:
# Show five randomly chosen faces together with their labels
fig, axes = plt.subplots(1, 5, figsize=(20, 10))

for ax in axes:
    # One independent random row per panel (repeats are possible).
    picked = np.random.choice(len(df))
    title = 'Age: {0}, Ethnicity: {1}, Sex: {2}'.format(
        df.loc[picked, 'age'],
        df.loc[picked, 'ethnicity'],
        df.loc[picked, 'gender'],
    )

    ax.set_title(title)
    ax.imshow(df.loc[picked, 'pixels'], cmap='gray')
    ax.axis('off')


## CNN

In [None]:
# Divide by 255 only when some value exceeds 1, i.e. the data has not been scaled yet
needs_scaling = np.max(X_reshaped) > 1
if needs_scaling:
    X_reshaped = X_reshaped / 255

In [None]:
# Set some useful variables
# Per-sample input shape: everything after the leading (sample) axis of X_reshaped.
input_shape = X_reshaped.shape[1:] 

# Training hyperparameters; batch_size is passed to each model.fit call below.
n_epochs = 30
batch_size = 64

## Age prediction

In [None]:
# Set y
# Regression target: the 'age' column as a NumPy array.
y = df['age'].values

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each array produced by the train/test split.
split_arrays = (
    ('X Train Shape: ', X_train),
    ('y Train Shape: ', y_train),
    ('X Test Shape: ', X_test),
    ('y Test Shape: ', y_test),
)
for label, arr in split_arrays:
    print(label, arr.shape)

In [None]:
# Regression CNN for age: four convolutional stages followed by a small dense head.
age_model = Sequential([
    # Stage 1: one convolution, then downsample.
    Conv2D(64, kernel_size=(3,3), input_shape=input_shape, activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    BatchNormalization(),

    # Stage 2: two 128-filter convolutions.
    Conv2D(128, kernel_size=(3,3), activation='relu', padding='same'),
    Conv2D(128, kernel_size=(3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.3),
    BatchNormalization(),

    # Stage 3: two 256-filter convolutions.
    Conv2D(256, kernel_size=(3,3), activation='relu', padding='same'),
    Conv2D(256, kernel_size=(3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.3),
    BatchNormalization(),

    # Stage 4: one 512-filter convolution.
    Conv2D(512, kernel_size=(3,3), activation='relu', padding='same'),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.5),
    BatchNormalization(),

    # Dense head ending in a single output unit with no activation (the predicted age).
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(1),
])

# Mean squared error as the loss, mean absolute error as the tracked metric.
age_model.compile(loss='mse', optimizer='adam', metrics=['mae'])

age_model.summary()

In [None]:
# plot_model is imported here and reused for the gender and ethnicity models further down.
from tensorflow.keras.utils import plot_model
# Write an architecture diagram (with tensor shapes and layer names) to age_model.png.
plot_model(age_model, to_file='age_model.png', show_shapes=True, show_layer_names=True)

In [None]:

# Save to age_model.h5 only when val_loss improves (lower is better).
checkpoint = ModelCheckpoint('age_model.h5', monitor='val_loss', save_best_only=True, mode='min')
callbacks = [checkpoint]

In [None]:
# Train the age model.
# - epochs comes from the shared n_epochs setting instead of a repeated literal 30.
# - Validation uses a slice held out from the training data (Keras takes the last 20%
#   of the arrays passed to fit) rather than (X_test, y_test). The checkpoint callback
#   picks the best epoch by val_loss, so validating on the test set would leak it into
#   model selection and make the later test metrics optimistically biased.
history = age_model.fit(X_train, y_train,
                        epochs=n_epochs,
                        batch_size=batch_size,
                        callbacks=callbacks,
                        validation_split=0.2)

In [None]:
# Plot every series recorded in history.history, one line per metric.
pd.DataFrame(history.history).plot();

In [None]:
# load_model is imported here and reused for the gender and ethnicity models further down.
from tensorflow.keras.models import load_model
# Replace the in-memory model with the one saved to age_model.h5 by the checkpoint callback.
age_model = load_model('age_model.h5')

In [None]:
# Loss and the compiled metric (MAE) on the held-out test split.
valid_score = age_model.evaluate(X_test, y_test, verbose=1)

In [None]:
# Make predictions 
y_pred = age_model.predict(X_test) 

In [None]:
# Mean absolute error between true and predicted ages.
print("MAE: {}".format(mean_absolute_error(y_test, y_pred)))

In [None]:
# Mean squared error.
print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))

In [None]:
# Root mean squared error (square root of the MSE).
print('RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred))))

In [None]:
# Coefficient of determination.
print('R2 Score: {}'.format(r2_score(y_test, y_pred)))

In [None]:
# Recompute the same four metrics and keep them in variables for the summary table.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [None]:
# Collect the metrics in a dict ("sozluk" is Turkish for "dictionary").
sozluk = {'Mean Absolute Error' : mae,
          'Mean Squared Error' : mse,
          'Root Mean Squared Error' : rmse, 
          'R2 Score' : r2}

sozluk

In [None]:
# One-row DataFrame presenting the metrics.
df2 = pd.DataFrame(sozluk, index = ['Age Model Metrics'])
df2

In [None]:
# Scatter of predicted ages (x axis) against true ages (y axis).
plt.figure(figsize=(8, 5))
plt.scatter(y_pred, y_test); 

## Gender prediction

In [None]:
# Set y
# Classification target: the 'gender' column (fed to a sigmoid / binary-crossentropy model below).
y = df['gender'].values

# Train test split
# Same 20% hold-out and seed as the age split; X_train/X_test/y_train/y_test are rebound here.
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

In [None]:
# Binary classifier CNN for gender: three convolutional stages and a dense head.
gender_model = Sequential([
    # Stage 1: one convolution, then downsample.
    Conv2D(64, kernel_size=(3,3), input_shape=input_shape, activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    BatchNormalization(),

    # Stage 2: a single 128-filter convolution.
    Conv2D(128, kernel_size=(3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.4),
    BatchNormalization(),

    # Stage 3: two 256-filter convolutions.
    Conv2D(256, kernel_size=(3,3), activation='relu'),
    Conv2D(256, kernel_size=(3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.5),
    BatchNormalization(),

    # Dense head ending in one sigmoid unit.
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with accuracy as the tracked metric.
gender_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

gender_model.summary()

In [None]:
# Write an architecture diagram (with tensor shapes and layer names) to gender_model.png.
plot_model(gender_model, to_file='gender_model.png', show_shapes=True, show_layer_names=True)

In [None]:
# Save to gender_model.h5 only when val_loss improves (lower is better).
checkpoint = ModelCheckpoint('gender_model.h5', monitor='val_loss', save_best_only=True, mode='min')
# Stop after 4 epochs without a val_accuracy gain; reduce the learning rate after
# 2 stagnant epochs of ReduceLROnPlateau's default monitored quantity.
callbacks = [EarlyStopping(patience=4, monitor='val_accuracy', mode='max'), 
              ReduceLROnPlateau(patience=2, verbose=1), checkpoint]

In [None]:
# Train the gender model.
# Validation uses a slice held out from the training data (Keras takes the last 20% of
# the arrays passed to fit) rather than (X_test, y_test). Early stopping, LR reduction
# and the best-checkpoint selection all react to the validation metrics, so validating
# on the test set would leak it into training decisions and bias the test scores.
history = gender_model.fit(X_train, y_train,
                           epochs=15,
                           batch_size=batch_size,
                           callbacks=callbacks,
                           validation_split=0.2)

In [None]:
# Replace the in-memory model with the one saved to gender_model.h5 by the checkpoint callback.
gender_model = load_model('gender_model.h5')

In [None]:
# Metrics
# evaluate returns the loss followed by the compiled metric, so index 1 is the accuracy.
valid_score = gender_model.evaluate(X_test, y_test, verbose=1)
print('Accuracy for test set: {0}'.format(valid_score[1])) 

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels (as a plain list).
gender_scores = gender_model.predict(X_test)
y_pred = np.where(gender_scores.ravel() < 0.5, 0, 1).tolist()

In [None]:
# Classification report for the thresholded predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix
# Annotated heatmap with integer cell counts (fmt='d') and no colour bar.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, cmap='Greens', cbar=False, annot=True, fmt='d'); 

In [None]:
# ROC/AUC must be computed from predicted probabilities, not from the thresholded 0/1
# labels held in y_pred: hard labels collapse the curve to a single operating point
# and understate the AUC. Re-run predict here to recover the sigmoid scores.
y_prob = gender_model.predict(X_test).ravel()
plot_roc_curve(y_test, y_prob)
print(f'Gender model 1 AUC score: {roc_auc_score(y_test, y_prob)}')

## Ethnicity prediction

In [None]:
# Classification target: the 'ethnicity' column, one-hot encoded into 5 classes.
y = df['ethnicity'].values
y = to_categorical(y, num_classes=5)

In [None]:
# Train test split
# Same 20% hold-out and seed as the earlier splits; y_train/y_test are one-hot here.
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)

In [None]:
# Five-class classifier CNN for ethnicity: three convolutional stages and a dense head.
ethnicity_model = Sequential([
    # Stage 1: one convolution, then downsample.
    Conv2D(64, kernel_size=(3,3), input_shape=input_shape, activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    BatchNormalization(),

    # Stage 2: a single 128-filter convolution.
    Conv2D(128, kernel_size=(3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.4),
    BatchNormalization(),

    # Stage 3: two 256-filter convolutions.
    Conv2D(256, kernel_size=(3,3), activation='relu'),
    Conv2D(256, kernel_size=(3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),
    Dropout(0.5),
    BatchNormalization(),

    # Dense head ending in a 5-way softmax.
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(5, activation='softmax'),
])

# Categorical cross-entropy loss with accuracy as the tracked metric.
ethnicity_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

ethnicity_model.summary()

In [None]:
# Write an architecture diagram (with tensor shapes and layer names) to ethnicity_model.png.
plot_model(ethnicity_model, to_file='ethnicity_model.png', show_shapes=True, show_layer_names=True)

In [None]:
# Save to ethnicity_model.h5 only when val_loss improves (lower is better).
checkpoint = ModelCheckpoint('ethnicity_model.h5', monitor='val_loss', save_best_only=True, mode='min')

# Stop after 4 epochs without a val_accuracy gain; reduce the learning rate after
# 2 stagnant epochs of ReduceLROnPlateau's default monitored quantity.
callbacks = [EarlyStopping(patience=4, monitor='val_accuracy', mode='max'), 
              ReduceLROnPlateau(patience=2, verbose=1), checkpoint]

In [None]:
# Train the ethnicity model.
# Validation uses a slice held out from the training data (Keras takes the last 20% of
# the arrays passed to fit) rather than (X_test, y_test). Early stopping, LR reduction
# and the best-checkpoint selection all react to the validation metrics, so validating
# on the test set would leak it into training decisions and bias the test scores.
history = ethnicity_model.fit(X_train, y_train,
                              epochs=15,
                              batch_size=batch_size,
                              callbacks=callbacks,
                              validation_split=0.2)

In [None]:
# Replace the in-memory model with the one saved to ethnicity_model.h5 by the checkpoint callback.
ethnicity_model = load_model('ethnicity_model.h5')

In [None]:
# Metrics
# evaluate returns the loss followed by the compiled metric, so index 1 is the accuracy.
valid_score = ethnicity_model.evaluate(X_test, y_test, verbose=1)
print('Accuracy for test set: {0}'.format(valid_score[1]))

In [None]:
# Predict the per-class scores, then reduce both predictions and one-hot targets to
# class indices. NOTE: this overwrites y_pred (scores) and y_test (one-hot) in place.
y_pred =ethnicity_model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)

In [None]:
# Classification report for the predicted class indices.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix
# Annotated heatmap with integer cell counts (fmt='d') and no colour bar.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, cmap='Greens', cbar=False, annot=True, fmt='d'); 

In [None]:
# Display the raw confusion-matrix array.
cm

In [None]:

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
import numpy as np

# One-vs-rest ROC curves for the ethnicity classes.
n_classes = 5

# ROC needs per-class scores. y_pred holds class indices once the argmax cell has run,
# so the softmax outputs are recomputed here instead of indexing y_pred[:, i].
y_score = ethnicity_model.predict(X_test)

# y_test may still be one-hot (straight from the split) or already class indices
# (after the argmax cell); normalise to indices, then build one indicator column per
# class. This defines y_test_bin, which was previously used without being created.
y_true = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else np.asarray(y_test)
y_test_bin = label_binarize(y_true, classes=np.arange(n_classes))

plt.figure(figsize=(7, 7))
for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='Sınıf %d (area = %0.2f)' % (i, roc_auc))

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Pozitif Rate')
plt.ylabel('True Pozitif Rate')
plt.title('Ethnicity Model ROC Eğrisi')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Display fpr as last assigned by the ROC loop in the preceding cell (the final class plotted).
fpr

In [None]:
# Display tpr from that same final loop iteration.
tpr