# **Predicting gender with an ensemble learning approach: VotingClassifier**

![](https://scx2.b-cdn.net/gfx/news/hires/2018/gender.jpg)

# ***== Import libraries ==***

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# ***== Data reading & cleaning ==***

In [None]:
# Load the gender-classification dataset and take a first look at it
DATA_PATH = '../input/gender-classification-dataset/gender_classification_v7.csv'
data = pd.read_csv(DATA_PATH)
print(data.shape)  # (number of rows, number of columns)
data.head()

In [None]:
# Encode the 'gender' variable: 0 for 'Male' & 1 for 'Female'
code = {'Male':0, 'Female':1}

# Series.map turns every value that is not a key of `code` into NaN, so running
# this cell a second time (when the column already holds 0/1) would wipe out the
# target. Only encode while the column is not yet made of the encoded values.
if not data['gender'].isin(list(code.values())).all():
    encoded_gender = data['gender'].map(code)
    if encoded_gender.isna().any():
        # Fail loudly instead of silently carrying NaN targets into the models
        unexpected = data.loc[encoded_gender.isna(), 'gender'].unique().tolist()
        raise ValueError(f'Unexpected gender labels: {unexpected}')
    data['gender'] = encoded_gender
data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
data.info()

As can be observed, most of the features are actually categorical; only 'forehead_width_cm' and 'forehead_height_cm' are numerical.

In [None]:
# Turn the features into the right data type:
# every column currently stored as int64 is re-typed as a pandas categorical
categories = data.columns[data.dtypes == 'int64'].tolist()
data = data.astype({column: 'category' for column in categories})

data.info()

In [None]:
# Investigate the number of unique values in each column
for column_name, column_values in data.items():
    n_unique = len(column_values.value_counts())
    print(f'The column "{column_name}" has {n_unique} unique values.')

# ***== Exploratory data analysis (EDA) ==***

In [None]:
# Class balance of the target
gender_counts = data['gender'].value_counts()
print(gender_counts)

# value_counts() orders its result by frequency, not by class value, so the
# slice order is not known in advance. Derive the labels and the colours from
# the index of the counts so that each slice always carries its own class.
gender_colors = {1: '#7b77ff', 0: '#7df691'}
labels = gender_counts.index.tolist()
colors = [gender_colors[gender] for gender in labels]

pie, ax = plt.subplots(figsize=[15,10])
ax.pie(x = gender_counts, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors)
ax.set_title('Gender distribution')
plt.show()

In [None]:
# Spread of forehead width within each gender; colours are keyed by the encoded gender
my_pall = {1: '#7b77ff', 0: '#7df691'}
fig, ax = plt.subplots(figsize=(8, 12))
sns.boxplot(data=data, x='gender', y='forehead_width_cm', palette=my_pall, ax=ax)
ax.set_title('Boxplot of forehead_width_cm by gender')
ax.grid()
plt.show()

In [None]:
# Spread of forehead height within each gender; colours are keyed by the encoded gender
my_pall = {1: '#7b77ff', 0: '#7df691'}
fig, ax = plt.subplots(figsize=(8, 12))
sns.boxplot(data=data, x='gender', y='forehead_height_cm', palette=my_pall, ax=ax)
ax.set_title('Boxplot of forehead_height_cm by gender')
ax.grid()
plt.show()

In [None]:
# Mean forehead width / height per gender group.
# The first row of the grouped result is used for the bars labelled '0',
# the second row for the bars labelled '1'.
genders_diff = data.groupby('gender')[['forehead_width_cm','forehead_height_cm' ]].mean()

labels = ['forehead width', 'forehead height']
m_means = genders_diff.iloc[0].tolist()
f_means = genders_diff.iloc[1].tolist()

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars
half_width = width / 2

fig, ax = plt.subplots(figsize=(12,8))
# Two bars per measurement, shifted half a bar width to either side of the tick
rects1 = ax.bar(x - half_width, m_means, width, label='0', color='#7df691')
rects2 = ax.bar(x + half_width, f_means, width, label='1', color='#7b77ff')

ax.set(ylabel='Mean', title="Forehead's MEAN width & height by gender")
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
ax.grid()
plt.show()

In [None]:
# Split the rows by encoded gender (0 / 1); the comparison cells below reuse both frames
males = data[data['gender'] == 0]
females = data[data['gender'] == 1]

# HAIR comparison
pie, ax = plt.subplots(1,2, figsize=[15,10])
labels = ['Long', 'Not-Long']
colors_m = ['#35f154', '#adf9b9']
colors_f = ['#4a44ff', '#adaaff']

# value_counts() orders its result by frequency, so the majority value would
# always receive the first label, whichever value it is. Count each flag value
# explicitly so that slice i always corresponds to labels[i].
# NOTE(review): assumes long_hair == 1 means "long" - confirm against the dataset description.
flag_order = (1, 0)
males_hair = [(males['long_hair'] == flag).sum() for flag in flag_order]
females_hair = [(females['long_hair'] == flag).sum() for flag in flag_order]

ax[0].pie(x = males_hair, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_m)
ax[0].set_title('Males hair type')
ax[1].pie(x = females_hair, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_f)
ax[1].set_title('Females hair type')
plt.show()

In [None]:
# NOSE WIDE comparison
pie, ax = plt.subplots(1,2, figsize=[15,10])
labels = ['Wide', 'Non-wide']
colors_m = ['#35f154', '#adf9b9']
colors_f = ['#4a44ff', '#adaaff']

# value_counts() orders its result by frequency, so the majority value would
# always receive the first label, whichever value it is. Count each flag value
# explicitly so that slice i always corresponds to labels[i].
# NOTE(review): assumes nose_wide == 1 means "wide" - confirm against the dataset description.
flag_order = (1, 0)
males_nose_wide = [(males['nose_wide'] == flag).sum() for flag in flag_order]
females_nose_wide = [(females['nose_wide'] == flag).sum() for flag in flag_order]

ax[0].pie(x = males_nose_wide, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_m)
ax[0].set_title('Males wide / non-wide nose')
ax[1].pie(x = females_nose_wide, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_f)
ax[1].set_title('Females wide / non-wide nose')
plt.show()

In [None]:
# NOSE LONG comparison
pie, ax = plt.subplots(1,2, figsize=[15,10])
labels = ['Long', 'Non-long']
colors_m = ['#35f154', '#adf9b9']
colors_f = ['#4a44ff', '#adaaff']

# value_counts() orders its result by frequency, so the majority value would
# always receive the first label, whichever value it is. Count each flag value
# explicitly so that slice i always corresponds to labels[i].
# NOTE(review): assumes nose_long == 1 means "long" - confirm against the dataset description.
flag_order = (1, 0)
males_nose_long = [(males['nose_long'] == flag).sum() for flag in flag_order]
females_nose_long = [(females['nose_long'] == flag).sum() for flag in flag_order]

ax[0].pie(x = males_nose_long, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_m)
ax[0].set_title('Males long / non-long nose')
ax[1].pie(x = females_nose_long, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_f)
ax[1].set_title('Females long / non-long nose')
plt.show()

In [None]:
# LIPS comparison
pie, ax = plt.subplots(1,2, figsize=[15,10])
labels = ['Thin', 'Non-thin']
colors_m = ['#35f154', '#adf9b9']
colors_f = ['#4a44ff', '#adaaff']

# value_counts() orders its result by frequency, so the majority value would
# always receive the first label, whichever value it is. Count each flag value
# explicitly so that slice i always corresponds to labels[i].
# NOTE(review): assumes lips_thin == 1 means "thin" - confirm against the dataset description.
flag_order = (1, 0)
males_lips = [(males['lips_thin'] == flag).sum() for flag in flag_order]
females_lips = [(females['lips_thin'] == flag).sum() for flag in flag_order]

ax[0].pie(x = males_lips, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_m)
ax[0].set_title('Males thin / non-thin lips')
ax[1].pie(x = females_lips, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_f)
ax[1].set_title('Females thin / non-thin lips')
plt.show()

In [None]:
# LIPS-NOSE distance comparison
pie, ax = plt.subplots(1,2, figsize=[15,10])
labels = ['Long', 'Short']
colors_m = ['#35f154', '#adf9b9']
colors_f = ['#4a44ff', '#adaaff']

# value_counts() orders its result by frequency, so the majority value would
# always receive the first label, whichever value it is. Count each flag value
# explicitly so that slice i always corresponds to labels[i].
# NOTE(review): assumes distance_nose_to_lip_long == 1 means "long" - confirm against the dataset description.
flag_order = (1, 0)
males_distance = [(males['distance_nose_to_lip_long'] == flag).sum() for flag in flag_order]
females_distance = [(females['distance_nose_to_lip_long'] == flag).sum() for flag in flag_order]

ax[0].pie(x = males_distance, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_m)
ax[0].set_title('Males distance between nose and lips')
ax[1].pie(x = females_distance, autopct='%.2f%%', explode=[0.02]*2, labels=labels, pctdistance=0.5, textprops={'fontsize': 14}, colors = colors_f)
ax[1].set_title('Females distance between nose and lips')
plt.show()

# ***== Data preparation ==***

In [None]:
# Separate features and target: the last column of `data` is the target,
# every column before it is a feature
features, target = data.iloc[:, :-1], data.iloc[:, -1]

# Split them into training and testing set:
# 20% held out, shuffled with a fixed seed and stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=target,
)

for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

# ***== Modelling ==***

## Initialize classifiers and hyperparameters

In [None]:
# Initiate classifiers
# The grid below searches both the 'l1' and the 'l2' penalty. The default lbfgs
# solver does not support 'l1', so every 'l1' candidate used to fail to fit
# (unnoticed, because warnings are silenced); liblinear supports both penalties.
LR = LogisticRegression(solver='liblinear')
SGDC = SGDClassifier()
# Build the instance through the module: the name SVC is rebound to this
# instance, so calling SVC() again on a re-run of the cell would fail.
SVC = svm.SVC()
KNN = KNeighborsClassifier()
DT = DecisionTreeClassifier()

# Initiate hyperparameters for classifiers
param_LR = {'C':[0.01, 0.1, 1, 10], 'penalty':['l1', 'l2']}
param_SGDC = {'alpha':[0.01, 0.1, 1, 10], 'loss':['hinge', 'log'], 'penalty':['l1', 'l2']}
param_SVC = {'C':[0.01, 0.1, 1, 10], 'gamma':[0.01, 0.1, 1, 10]}
param_KNN = {'n_neighbors':[2,3,4,5,6]}
# A float min_samples_leaf is read as a fraction of the training samples, so the
# former 1.5 entry was not a valid value and its candidates could never be fitted.
param_DT = {'criterion':['gini', 'entropy'], 'max_depth': [3,4,5,6], 'min_samples_leaf':[0.1, 0.5, 1, 2]}

## Hyperparameters tuning

In [None]:
# Logistic regression: grid search over param_LR, then report the best
# cross-validated result and the accuracy on the held-out test split
search_LR = GridSearchCV(estimator=LR, param_grid=param_LR).fit(X_train, y_train)
print(
    f'Best CV params {search_LR.best_params_}',
    f'Best CV accuracy {search_LR.best_score_}',
    f'Test accuracy of best hypers {search_LR.score(X_test, y_test)}',
    sep='\n',
)

In [None]:
# SGDClassifier: grid search over param_SGDC, then report the best
# cross-validated result and the accuracy on the held-out test split
search_SGDC = GridSearchCV(estimator=SGDC, param_grid=param_SGDC).fit(X_train, y_train)
print(
    f'Best CV params {search_SGDC.best_params_}',
    f'Best CV accuracy {search_SGDC.best_score_}',
    f'Test accuracy of best hypers {search_SGDC.score(X_test, y_test)}',
    sep='\n',
)

In [None]:
# SVC: grid search over param_SVC, then report the best
# cross-validated result and the accuracy on the held-out test split
search_SVC = GridSearchCV(estimator=SVC, param_grid=param_SVC).fit(X_train, y_train)
print(
    f'Best CV params {search_SVC.best_params_}',
    f'Best CV accuracy {search_SVC.best_score_}',
    f'Test accuracy of best hypers {search_SVC.score(X_test, y_test)}',
    sep='\n',
)

In [None]:
# KNN: grid search over param_KNN, then report the best
# cross-validated result and the accuracy on the held-out test split
search_KNN = GridSearchCV(estimator=KNN, param_grid=param_KNN).fit(X_train, y_train)
print(
    f'Best CV params {search_KNN.best_params_}',
    f'Best CV accuracy {search_KNN.best_score_}',
    f'Test accuracy of best hypers {search_KNN.score(X_test, y_test)}',
    sep='\n',
)

In [None]:
# DecisionTree: grid search over param_DT, then report the best
# cross-validated result and the accuracy on the held-out test split
search_DT = GridSearchCV(estimator=DT, param_grid=param_DT).fit(X_train, y_train)
print(
    f'Best CV params {search_DT.best_params_}',
    f'Best CV accuracy {search_DT.best_score_}',
    f'Test accuracy of best hypers {search_DT.score(X_test, y_test)}',
    sep='\n',
)

## Ensemble Learning

In [None]:
# Re-initiate the models with their best hyperparameters.
# The winners are taken straight from the grid searches instead of being copied
# by hand from an earlier run: best_estimator_ is the estimator that the search
# refit with its best parameters, so this cell always agrees with the tuning
# cells above and no longer needs to re-import the SVC class.
LR = search_LR.best_estimator_
SGDC = search_SGDC.best_estimator_
SVC = search_SVC.best_estimator_
KNN = search_KNN.best_estimator_
DT = search_DT.best_estimator_

# Define a list with tuples that contains classifier's name & classifier
classifiers = [('Logistic Regression', LR),
               ('SGDClassifier', SGDC),
               ('SVC', SVC),
               ('KNN', KNN),
               ('Decision Tree', DT)]

In [None]:
# Fit each classifier on the training split and print its accuracy on the test split
for model_name, model in classifiers:
    preds = model.fit(X_train, y_train).predict(X_test)
    model_accuracy = accuracy_score(y_test, preds)
    print(f'{model_name} accuracy: {model_accuracy}')

In [None]:
# Initiate Voting Classifier: combine all entries of `classifiers` into one
# ensemble, fit it on the training split and score it on the test split
VC = VotingClassifier(estimators=classifiers).fit(X_train, y_train)
preds = VC.predict(X_test)
voting_accuracy = accuracy_score(y_test, preds)
print(f'Voting Classifier score: {voting_accuracy}')

Using the ensemble learning, the accuracy was not improved.

The best performance was achieved by the DecisionTreeClassifier.

In [None]:
# Refit the decision tree on the training split, predict the test split
# and tabulate true vs. predicted classes
preds = DT.fit(X_train, y_train).predict(X_test)
confusion_matrix(y_true=y_test, y_pred=preds)

In [None]:
# Plot the decision tree's confusion matrix for the test split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (ConfusionMatrixDisplay.from_estimator replaces it) - confirm
# which scikit-learn version this notebook is expected to run on.
plot_confusion_matrix(DT, X_test, y_test)

Out of 1001 samples, the DecisionTree misclassified 27.


In [None]:
# Text report of the per-class metrics for the current `preds` against the test labels
report = classification_report(y_test, preds)
print(report)

In [None]:
# ROC curve of the decision tree on the test split.
# The second column of predict_proba is used as the score of the positive class.
probs = DT.predict_proba(X_test)
pred = probs[:, 1]
fpr, tpr, threshold = roc_curve(y_test, pred)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('ROC')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()