# Ethnicity, age and gender classifier using vanilla CNN

## Import Packages and Data Preprocessing

In [None]:
# import misc packages
import numpy as np
import pandas
import seaborn
import matplotlib.pyplot as plt

# import keras from tensorflow
from tensorflow import keras

# import sklearn packages
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

In [None]:
# Load the face dataset into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle dataset mount — adjust when running elsewhere.
df = pandas.read_csv('../input/age-gender-and-ethnicity-face-data-csv/age_gender.csv')

In [None]:
# Display the first row to see which columns the CSV provides.
df.iloc[0]

> Delete the **img_name** column

In [None]:
# Remove the image file-name column; the cells below only read pixels and labels.
df = df.drop(columns = ['img_name'])

In [None]:
# Parse each space-separated pixel string into a flat float array, then fold
# every flat array into a 48x48 image. Iterating the Series directly avoids the
# label-based X[i] lookup (which only works with a default 0..n-1 index) and
# the element-by-element Python copy of every image.
X = df['pixels'].apply(lambda x : np.array(x.split(), dtype = float))
X_bar = [pixels.reshape(48, 48) for pixels in X]

In [None]:
# Stack the list of 48x48 images into a single array and divide by 255
# (assumes 8-bit pixel intensities, giving values in [0, 1]).
X = np.array(X_bar) / 255.0

In [None]:
# Sanity check: shape of the stacked image array.
print(X.shape)

In [None]:
# Summarise the three label columns: number of distinct values, non-null
# count, maximum and minimum, printed as an org-mode style table.
from tabulate import tabulate
info = [[col, len(df[col].unique()), df[col].count(), df[col].max(), df[col].min()] for col in ['age', 'ethnicity', 'gender']]
print(tabulate(info, headers = ['Feature', 'Unique vals', 'Count', 'Maximum', 'Minimum'], tablefmt = 'orgtbl'))

In [None]:
# Preview six randomly chosen faces in a 2 x 3 grid.
fig = plt.figure(figsize = (10,10))
i = 6
while i:
    # Sample from every row of X; the previous fixed bound of 20000 ignored
    # later rows and would raise IndexError on a smaller dataset.
    x = np.random.randint(0, X.shape[0])
    plt.subplot(2, 3, i)
    plt.imshow(X[x])
    plt.axis ('off')
    i -= 1
plt.show()

## First predict their ethnicity

In [None]:
# One-hot encode the ethnicity column so it can feed a softmax output layer.
y = keras.utils.to_categorical(np.array(df['ethnicity']))

In [None]:
# Show a few one-hot encoded rows (35 to 49) as a sanity check.
print(y[35:50])

In [None]:
# Hold out 30% of the samples for testing. The seed is fixed (matching the
# gender split later in the notebook) so that results are reproducible
# between runs; previously every run produced a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [None]:
# Shapes of the training images and labels, one per line.
print(X_train.shape, y_train.shape, sep = '\n')

In [None]:
# Shapes of the held-out test images and labels.
print(X_test.shape)
print(y_test.shape)

In [None]:
def cnn (num_filters_1, num_filters_2, hidden_nodes, num_classes = None) :
    """Build and compile a two-block CNN classifier for 48x48 single-channel images.

    Architecture: two blocks of Conv2D(3x3, relu) -> Dropout(0.2) -> MaxPooling2D,
    then Flatten -> Dense(hidden_nodes, relu) -> Dense(num_classes, softmax).

    Parameters
    ----------
    num_filters_1 : int
        Number of filters in the first convolution layer.
    num_filters_2 : int
        Number of filters in the second convolution layer.
    hidden_nodes : int
        Number of units in the fully connected hidden layer.
    num_classes : int, optional
        Number of units in the softmax output layer. When omitted, it is read
        from the notebook-level one-hot label array ``y`` (``y.shape[1]``) at
        call time, which is the original behaviour.

    Returns
    -------
    keras.Model
        Model compiled with categorical cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    if num_classes is None :
        # Fall back to the global labels so existing callers (including the
        # scikit-learn wrapper, which only passes the grid parameters) still work.
        num_classes = y.shape[1]

    # add input  layer
    feature = keras.Input(shape = (48, 48, 1, ))

    # 1st convolution layer
    conv_layer_1 = keras.layers.Conv2D(num_filters_1, (3,3), padding = 'valid', activation = 'relu')(feature)
    drop_layer_1 = keras.layers.Dropout(0.2)(conv_layer_1)
    maxp_layer_1 = keras.layers.MaxPooling2D()(drop_layer_1)

    # 2nd convolution layer
    conv_layer_2 = keras.layers.Conv2D(num_filters_2, (3,3), padding = 'valid', activation = 'relu')(maxp_layer_1)
    drop_layer_2 = keras.layers.Dropout(0.2)(conv_layer_2)
    maxp_layer_2 = keras.layers.MaxPooling2D()(drop_layer_2)

    # 1st hidden layer
    flatten = keras.layers.Flatten()(maxp_layer_2)
    hidden_layer = keras.layers.Dense(hidden_nodes, activation = 'relu')(flatten)

    # add output layer
    output  = keras.layers.Dense(num_classes, activation= 'softmax')(hidden_layer)

    # compile the layers; the model is labelled 'CNN' (it was mislabelled 'MLP'),
    # and the local name no longer shadows this function's own name
    model = keras.Model(inputs = feature, outputs = output, name = 'CNN')
    model.compile(loss = 'categorical_crossentropy', optimizer  = 'adam', metrics = ['accuracy'])

    return model

In [None]:
# Wrap the cnn builder as a scikit-learn estimator and set up a grid search
# over the two filter counts (3 x 3 combinations, hidden layer fixed at 10
# nodes) using shuffled 3-fold cross-validation with a fixed seed.
# NOTE(review): keras.wrappers.scikit_learn is, as far as I know, absent from
# newer TensorFlow releases — confirm the runtime's version before re-running.
model = keras.wrappers.scikit_learn.KerasClassifier(build_fn = cnn, verbose = 1)
param = {'num_filters_1' : [80, 100, 150],
         'num_filters_2' : [50,  60,  75],
         'hidden_nodes'  : [10]}
grd = GridSearchCV(estimator = model, param_grid = param, cv = KFold(n_splits = 3, shuffle = True, random_state = 1), verbose = 1)

In [None]:
# Run the grid search on the training split: 9 parameter combinations x 3
# folds, each trained for 100 epochs with batch size 32 and Keras output silenced.
search = grd.fit(X_train, y_train, epochs = 100, batch_size = 32, verbose = 0)

In [None]:
# Show the hyper-parameter combination selected by the grid search.
print(search.best_params_)

In [None]:
# Build a fresh, untrained model from the winning hyper-parameters. The keys
# of best_params_ ('num_filters_1', 'num_filters_2', 'hidden_nodes') are
# exactly cnn's argument names, so the dict can be unpacked directly.
model = cnn(**search.best_params_)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Train the rebuilt model on the whole training split.
# NOTE(review): 135 epochs here versus 100 during the grid search — confirm this is intended.
model.fit(X_train, y_train, epochs = 135, batch_size = 32)

In [None]:
# Predict class probabilities (softmax outputs) for the held-out test images.
y_pred = model.predict(X_test)

In [None]:
# Inspect the raw softmax output for the first test sample.
print(y_pred[0,:])

In [None]:
# Convert each row of softmax probabilities into a one-hot vector for the most
# likely class. Rounding with np.around left a row all zeros whenever no class
# probability exceeded 0.5, so such samples could never match their one-hot
# label even when the top-scoring class was correct.
y_pred = np.eye(y_pred.shape[1])[np.argmax(y_pred, axis = 1)]

In [None]:
# Compare the decoded prediction with the true one-hot label for the first test sample.
print(y_pred[0,:])
print(y_test[0,:])

> Results for classifying **Ethnicity**

In [None]:
# Overall accuracy followed by the per-class precision/recall/F1 report.
print(f'accuracy : {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

## Next let's go for Gender

In [None]:
# Reminder of the image array's shape before starting the gender task.
print(X.shape)

In [None]:
# Display the image at index 9 of X on a 10x10 inch figure with the axes hidden.
fig = plt.figure(figsize = (10,10))
plt.imshow(X[9])
plt.axis ('off')

In [None]:
# One-hot encode the gender column so the same softmax classifier builder can be reused.
y = keras.utils.to_categorical(np.array(df['gender']))

> Applying to_categorical() was not needed at all here, but this was sheer **laziness XD**

In [None]:
# Show the first ten one-hot encoded gender labels.
print(y[:10, :])

In [None]:
# Shape of the one-hot gender label array.
print(y.shape)

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [None]:
# Shapes of the training images and gender labels.
print(X_train.shape)
print(y_train.shape)

In [None]:
# Shapes of the held-out test images and gender labels.
print(X_test.shape)
print(y_test.shape)

In [None]:
# Wrap the cnn builder as a scikit-learn estimator and set up a grid search
# over 2 x 2 x 2 = 8 combinations of filter counts and hidden-layer width,
# using shuffled 3-fold cross-validation with a fixed seed.
# NOTE(review): keras.wrappers.scikit_learn is, as far as I know, absent from
# newer TensorFlow releases — confirm the runtime's version before re-running.
model = keras.wrappers.scikit_learn.KerasClassifier(build_fn = cnn, verbose = 1)
param = {'num_filters_1' : [75,100],
         'num_filters_2' : [40, 50],
         'hidden_nodes'  : [25, 50]}
grd = GridSearchCV(estimator = model, param_grid = param, cv = KFold(n_splits = 3, shuffle = True, random_state = 1), verbose = 1)

In [None]:
# Run the grid search on the training split: 8 parameter combinations x 3
# folds, each trained for 50 epochs with batch size 32 and Keras output silenced.
search = grd.fit(X_train, y_train, epochs = 50, batch_size = 32, verbose = 0)

In [None]:
# Show the hyper-parameter combination selected for the gender classifier.
print(search.best_params_)

In [None]:
# Build a fresh, untrained gender classifier from the winning hyper-parameters;
# the best_params_ keys match cnn's argument names, so the dict is unpacked.
model = cnn(**search.best_params_)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Train the rebuilt model on the whole training split, using the same 50 epochs as the grid search.
model.fit(X_train, y_train, epochs = 50, batch_size = 32)

In [None]:
# Predict class probabilities (softmax outputs) for the held-out test images.
y_pred = model.predict(X_test)

In [None]:
# Convert each row of softmax probabilities into a one-hot vector for the most
# likely class. Rounding with np.around turns an exact 0.5/0.5 tie into an
# all-zero row (0.5 rounds to 0), which can never match a one-hot label;
# taking the argmax always selects exactly one class.
y_pred = np.eye(y_pred.shape[1])[np.argmax(y_pred, axis = 1)]

In [None]:
# Compare the decoded prediction with the true one-hot label for the first test sample.
print(y_pred[0,:])
print(y_test[0,:])

> Results for gender

In [None]:
# Overall accuracy followed by the per-class precision/recall/F1 report.
print(f'Accuracy : {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

## Lastly, work on age. We will treat age as a continuous value (even though ages are finite and discrete, this makes more sense).

In [None]:
# Reminder of the image array's shape before starting the age task.
print(X.shape)

In [None]:
# Use the age column as a float regression target.
y = np.asarray(df['age']).astype(float)

In [None]:
# Print the age targets (NumPy abbreviates long arrays).
print(y)

In [None]:
# Shape of the age target array.
print(y.shape)

> define a new **cnn** model for this task

In [None]:
def cnn_cont (num_filters_1, num_filters_2, hidden_nodes) :
    """Build and compile a two-block CNN regressor for 48x48 single-channel images.

    Architecture: two blocks of Conv2D(3x3, relu) -> Dropout(0.2) ->
    MaxPooling2D(2x2), then Flatten -> Dense(hidden_nodes, relu) ->
    Dense(1, linear).

    Parameters
    ----------
    num_filters_1 : int
        Number of filters in the first convolution layer.
    num_filters_2 : int
        Number of filters in the second convolution layer.
    hidden_nodes : int
        Number of units in the fully connected hidden layer.

    Returns
    -------
    keras.Model
        Model named 'CNN', compiled with mean-squared-error loss, the Adam
        optimizer and mean absolute error as a metric.
    """
    inputs = keras.Input(shape = (48, 48, 1))

    # two identical convolution blocks that differ only in their filter count
    net = inputs
    for n_filters in (num_filters_1, num_filters_2) :
        net = keras.layers.Conv2D(n_filters, (3,3), padding = 'valid', activation = 'relu')(net)
        net = keras.layers.Dropout(0.2)(net)
        net = keras.layers.MaxPooling2D((2,2), padding = 'valid')(net)

    # dense head: flatten, one hidden layer, then a single linear output unit
    net = keras.layers.Flatten()(net)
    net = keras.layers.Dense(hidden_nodes, activation = 'relu')(net)
    prediction = keras.layers.Dense(1, activation = 'linear')(net)

    regressor = keras.Model(inputs = inputs, outputs = prediction, name = 'CNN')
    regressor.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae'])

    return regressor

In [None]:
# Hold out 30% of the samples for testing. The seed is fixed (matching the
# gender split earlier in the notebook) so that results are reproducible
# between runs; previously every run produced a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [None]:
# Wrap the cnn_cont builder as a scikit-learn regressor and set up a grid
# search over 2 x 2 x 2 = 8 combinations of filter counts and hidden-layer
# width, using shuffled 3-fold cross-validation with a fixed seed.
# NOTE(review): keras.wrappers.scikit_learn is, as far as I know, absent from
# newer TensorFlow releases — confirm the runtime's version before re-running.
model = keras.wrappers.scikit_learn.KerasRegressor(build_fn = cnn_cont, verbose = 1)
param = {'num_filters_1' : [75,100],
         'num_filters_2' : [30, 40],
         'hidden_nodes' : [250,400]}
grd = GridSearchCV(estimator = model, param_grid = param, cv = KFold(n_splits = 3, shuffle = True, random_state = 1), verbose = 1)

In [None]:
# Shapes of the training images and age targets.
print(X_train.shape)
print(y_train.shape)

In [None]:
# Shapes of the held-out test images and age targets.
print(X_test.shape)
print(y_test.shape)

In [None]:
# Run the grid search on the training split: 8 parameter combinations x 3
# folds, each trained for 150 epochs with batch size 32 and Keras output silenced.
search = grd.fit(X_train, y_train, epochs = 150, batch_size= 32, verbose = 0)

In [None]:
# Report the best mean cross-validated score and the parameters that achieved
# it. The first line was labelled "Best score" but printed best_params_, so
# the parameters appeared twice and the score was never shown.
# NOTE(review): for the Keras regressor wrapper the score is presumably the
# negated loss (higher is better) — confirm before interpreting the sign.
print('Best score : ' + str(search.best_score_))
print(search.best_params_)

In [None]:
# Build a fresh, untrained age regressor from the winning hyper-parameters;
# the best_params_ keys match cnn_cont's argument names, so the dict is unpacked.
model = cnn_cont(**search.best_params_)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Train the rebuilt regressor on the whole training split, using the same 150 epochs as the grid search.
model.fit(X_train, y_train, epochs = 150, batch_size = 32)

In [None]:
# Predict ages for the held-out test images.
y_pred = model.predict(X_test)

In [None]:
# Check the shape of the raw prediction array before flattening it.
print(y_pred.shape)

In [None]:
# Collapse the predictions into a 1-D copy so they line up with y_test.
y_pred = y_pred.flatten()

In [None]:
# Compare ten predicted ages with the corresponding true ages (test indices 3000 to 3009).
print(y_pred[3000:3010])
print(y_test[3000:3010])

In [None]:
# Mean absolute error between the true and predicted ages on the test split.
from sklearn.metrics import mean_absolute_error
print(f'Error : {mean_absolute_error(y_test, y_pred)}')

> To get a better understanding of how well the model is performing, draw several plots

In [None]:
# Draw ten figures, each overlaying actual and predicted ages for a random
# window of 150 consecutive test samples.
i = 10
while i :
    fig = plt.figure(figsize = (15,5))
    # Pick the window start from the real test-set size instead of a fixed
    # 5000: every window then fits inside y_test and the tail of the test set
    # can be sampled too. max(1, ...) keeps randint valid for tiny test sets.
    x = np.random.randint(0, max(1, len(y_test) - 149))
    plt.plot(y_test[x:x+150])
    plt.plot(y_pred[x:x+150])
    
    plt.legend(['actual values', 'predicted values'])
    plt.show()
    i -= 1

In [None]:
# Show test image 355 with the axes hidden and print the age predicted for it.
fig = plt.figure(figsize=(5,5))
plt.imshow(X_test[355])
plt.axis ('off')
print(y_pred[355])

In [None]:
# Show test image 877 with the axes hidden and print the age predicted for it.
fig = plt.figure(figsize=(5,5))
plt.imshow(X_test[877])
plt.axis ('off')
print(y_pred[877])

In [None]:
# Show test image 999 with the axes hidden and print the age predicted for it.
fig = plt.figure(figsize=(5,5))
plt.imshow(X_test[999])
plt.axis ('off')
print(y_pred[999])

In [None]:
# Show test image 666 with the axes hidden and print the age predicted for it.
fig = plt.figure(figsize=(5,5))
plt.imshow(X_test[666])
plt.axis ('off')
print(y_pred[666])