In [1]:
import pandas as pd 
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

import warnings
warnings.filterwarnings('always')

In [2]:
df = pd.read_csv('gender-prediction.csv')
df

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
0,71,176,yes,short,44,no,black,male
1,68,165,no,bald,41,no,black,male
2,62,132,no,medium,37,yes,blue,female
3,65,138,no,long,38,no,gray,female
4,70,197,yes,medium,43,no,gray,male
...,...,...,...,...,...,...,...,...
75,65,99,no,short,39,yes,green,female
76,61,98,no,short,37,no,brown,female
77,67,119,yes,short,40,no,black,male
78,70,190,yes,medium,43,no,gray,male


## Question 1

In [3]:
labels = preprocessing.LabelEncoder()

beard_encoded = labels.fit_transform(df['beard'])
hairLength_encoded = labels.fit_transform(df['hair_length'])
scarf_encoded = labels.fit_transform(df['scarf'])
eyeColor_encoded = labels.fit_transform(df['eye_color'])

x_encoded = list(zip(df['height'], df['weight'], beard_encoded, hairLength_encoded, df['shoe_size'], scarf_encoded, eyeColor_encoded))

In [4]:
y = df['gender']
y_encoded = labels.fit_transform(y)

In [5]:
X_train, x_test, Y_train, y_test = train_test_split(x_encoded, y_encoded, test_size = 0.33, random_state = 2)

In [6]:
randomForest_model = RandomForestClassifier()
svc_model = SVC()
mlp_model = MLPClassifier()

In [7]:
randomForest_model.fit(X_train,Y_train)
svc_model.fit(X_train,Y_train)
mlp_model.fit(X_train,Y_train)



MLPClassifier()

In [8]:
randomForest_prediction = randomForest_model.predict(x_test)
svc_prediction = svc_model.predict(x_test)
mlp_prediction = mlp_model.predict(x_test)

In [9]:
# Confusion Matrices
randomForestModel_cm = confusion_matrix(y_test, randomForest_prediction)
svcModel_cm = confusion_matrix(y_test, svc_prediction)
mlpModel_cm = confusion_matrix(y_test, mlp_prediction)

print('Random Forest Conf Matrix')
print(randomForestModel_cm)

print('Support Vector Machine Conf Matrix')
print(svcModel_cm)

print('Multilayer Perceptron Conf Matrix')
print(mlpModel_cm)

Random Forest Conf Matrix
[[10  0]
 [ 0 17]]
Support Vector Machine Conf Matrix
[[ 7  3]
 [ 3 14]]
Multilayer Perceptron Conf Matrix
[[ 9  1]
 [ 1 16]]


In [10]:
randomForestModel_acc = accuracy_score(y_test, randomForest_prediction)*100
svcModel_acc = accuracy_score(y_test, svc_prediction)*100
mlpModel_acc = accuracy_score(y_test, mlp_prediction)*100
print("Random forest accuracy", randomForestModel_acc, "%")
print("Support Vector Machine accuracy", svcModel_acc, "%")
print("Multilayer Perceptron accuracy", mlpModel_acc, "%")

Random forest accuracy 100.0 %
Support Vector Machine accuracy 77.77777777777779 %
Multilayer Perceptron accuracy 92.5925925925926 %


In [11]:
#using train/test split ratio of 80/20
X_train, x_test, Y_train, y_test = train_test_split(x_encoded, y_encoded, test_size = 0.2, random_state = 2)

In [12]:
randomForest_model = RandomForestClassifier()
svc_model = SVC()
mlp_model = MLPClassifier()

In [13]:
randomForest_model.fit(X_train,Y_train)
svc_model.fit(X_train,Y_train)
mlp_model.fit(X_train,Y_train)

MLPClassifier()

In [14]:
randomForest_prediction = randomForest_model.predict(x_test)
svc_prediction = svc_model.predict(x_test)
mlp_prediction = mlp_model.predict(x_test)

In [15]:
# Q2 - 2
# The Accuracy of the Support Vector Machine and Multilayer Perceptron have gone up
randomForestModel_acc = accuracy_score(y_test, randomForest_prediction)*100
svcModel_acc = accuracy_score(y_test, svc_prediction)*100
mlpModel_acc = accuracy_score(y_test, mlp_prediction)*100
print("Random forest accuracy", randomForestModel_acc, "%")
print("Support Vector Machine accuracy", svcModel_acc, "%")
print("Multilayer Perceptron accuracy", mlpModel_acc, "%")

Random forest accuracy 100.0 %
Support Vector Machine accuracy 81.25 %
Multilayer Perceptron accuracy 75.0 %


<!-- ## Question 2 -->

In [16]:
# 2 most powerful attributes are believed to be 
# "Beard" and "scarf" 
# as these can easily distiguish between a male and a female

In [17]:
#Rerunning the code after removing "Beard" and "scarf" 
x_encoded = list(zip(df['height'], df['weight'], hairLength_encoded, df['shoe_size'], eyeColor_encoded))
y = df['gender']
y_encoded = labels.fit_transform(y)

In [18]:
X_train, x_test, Y_train, y_test = train_test_split(x_encoded, y_encoded, test_size = 0.2, random_state = 2)

In [19]:
randomForest_model = RandomForestClassifier()
svc_model = SVC()
mlp_model = MLPClassifier()

In [20]:
randomForest_model.fit(X_train,Y_train)
svc_model.fit(X_train,Y_train)
mlp_model.fit(X_train,Y_train)



MLPClassifier()

In [21]:
randomForest_prediction = randomForest_model.predict(x_test)
svc_prediction = svc_model.predict(x_test)
mlp_prediction = mlp_model.predict(x_test)

In [22]:
randomForestModel_acc = accuracy_score(y_test, randomForest_prediction)*100
svcModel_acc = accuracy_score(y_test, svc_prediction)*100
mlpModel_acc = accuracy_score(y_test, mlp_prediction)*100
print("Random forest accuracy", randomForestModel_acc, "%")
print("Support Vector Machine accuracy", svcModel_acc, "%")
print("Multilayer Perceptron accuracy", mlpModel_acc, "%")

print('\n')

randomForestModel_prec = precision_score(y_test, randomForest_prediction)*100
svcModel_prec = precision_score(y_test, svc_prediction)*100
mlpModel_prec = precision_score(y_test, mlp_prediction)*100
print("Random forest precision_score", randomForestModel_prec, "%")
print("Support Vector Machine precision_score", svcModel_prec, "%")
print("Multilayer Perceptron precision_score", mlpModel_prec, "%")

print('\n')

randomForestModel_rec = recall_score(y_test, randomForest_prediction)*100
svcModel_rec = recall_score(y_test, svc_prediction)*100
mlpModel_rec = recall_score(y_test, mlp_prediction)*100
print("Random forest recall_score", randomForestModel_rec, "%")
print("Support Vector Machine recall_score", svcModel_rec, "%")
print("Multilayer Perceptron recall_score", mlpModel_rec, "%")

print('\n')

randomForestModel_f1 = f1_score(y_test, randomForest_prediction)*100
svcModel_f1 = f1_score(y_test, svc_prediction)*100
mlpModel_f1 = f1_score(y_test, mlp_prediction)*100
print("Random forest f1_score", randomForestModel_f1, "%")
print("Support Vector Machinef1_score", svcModel_f1, "%")
print("Multilayer Perceptron f1_score", mlpModel_f1, "%")

Random forest accuracy 93.75 %
Support Vector Machine accuracy 81.25 %
Multilayer Perceptron accuracy 100.0 %


Random forest precision_score 90.9090909090909 %
Support Vector Machine precision_score 81.81818181818183 %
Multilayer Perceptron precision_score 100.0 %


Random forest recall_score 100.0 %
Support Vector Machine recall_score 90.0 %
Multilayer Perceptron recall_score 100.0 %


Random forest f1_score 95.23809523809523 %
Support Vector Machinef1_score 85.71428571428572 %
Multilayer Perceptron f1_score 100.0 %


## Question 3

In [23]:
x_encoded = list(zip(df['height'], df['weight'], beard_encoded, hairLength_encoded, df['shoe_size'], scarf_encoded, eyeColor_encoded))
y = df['gender']
y_encoded = labels.fit_transform(y)

In [24]:
#Monte Carlo cross-validation
monte_carlo = ShuffleSplit(n_splits=4,test_size=0.33,random_state=1)

In [25]:
decisionTree_model = DecisionTreeClassifier()

In [26]:
monteCarlo_acc = cross_val_score(decisionTree_model,x_encoded,y_encoded,cv=monte_carlo).mean() * 100
monteCarlo_f1 = cross_val_score(decisionTree_model,x_encoded,y_encoded, scoring="f1", cv=monte_carlo).mean() * 100
print("Monte Carlo cross-validation accuracy", monteCarlo_acc, "%")
print("Monte Carlo cross-validation F1 score", monteCarlo_f1, "%")

Monte Carlo cross-validation accuracy 97.22222222222221 %
Monte Carlo cross-validation F1 score 97.72727272727273 %


In [27]:
#Leave P-Out cross-validation
lpout = LeavePOut(2)
lpout.get_n_splits(x_encoded)

3160

In [28]:
decisionTree_model = DecisionTreeClassifier()

In [29]:
leavePout_acc = cross_val_score(decisionTree_model,x_encoded,y_encoded,cv=lpout).mean() *100
leavePout_f1 = cross_val_score(decisionTree_model,x_encoded,y_encoded,cv=lpout, scoring="f1_weighted").mean() * 100
print("Leave P-Out cross-validation accuracy", leavePout_acc, "%")
print("Leave P-Out cross-validation F1 score", leavePout_f1, "%")

Leave P-Out cross-validation accuracy 94.08227848101266 %
Leave P-Out cross-validation F1 score 94.01898734177216 %


## Question 4

In [30]:
df2 = pd.read_csv('gender-prediction-modified.csv')
df2

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
0,71,176,yes,short,44,no,black,male
1,68,165,no,bald,41,no,black,male
2,62,132,no,medium,37,yes,blue,female
3,65,138,no,long,38,no,gray,female
4,70,197,yes,medium,43,no,gray,male
...,...,...,...,...,...,...,...,...
80,69,158,no,medium,43,no,black,male
81,71,157,yes,long,42,no,brown,male
82,67,152,yes,short,43,no,brown,male
83,62,106,no,long,38,yes,brown,female


In [31]:
train_data = df2.iloc[:80]
train_data

test_data = df2.iloc[80:]
test_data

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
80,69,158,no,medium,43,no,black,male
81,71,157,yes,long,42,no,brown,male
82,67,152,yes,short,43,no,brown,male
83,62,106,no,long,38,yes,brown,female
84,67,130,no,long,37,yes,black,female


In [32]:
labels = preprocessing.LabelEncoder()

train_beard_encoded = labels.fit_transform(train_data['beard'])
train_hair_length_encoded = labels.fit_transform(train_data['hair_length'])
train_scarf_encoded = labels.fit_transform(train_data['scarf'])
train_eye_color_encoded = labels.fit_transform(train_data['eye_color'])

In [33]:
X_train = list(zip(train_data['height'], train_data['weight'], train_beard_encoded, train_hair_length_encoded, train_data['shoe_size'], train_scarf_encoded, train_eye_color_encoded))
Y_train = train_data['gender']

In [34]:
test_beard_encoded = labels.fit_transform(test_data['beard'])
test_hair_length_encoded = labels.fit_transform(test_data['hair_length'])
test_scarf_encoded = labels.fit_transform(test_data['scarf'])
test_eye_color_encoded = labels.fit_transform(test_data['eye_color'])

In [36]:
x_test = list(zip(test_data['height'], test_data['weight'], test_beard_encoded, test_hair_length_encoded, test_data['shoe_size'], test_scarf_encoded, test_eye_color_encoded))
y_test = test_data['gender']

In [37]:
gnb_model = GaussianNB()

In [38]:
gnb_model.fit(X_train,Y_train)

GaussianNB()

In [39]:
prediction = gnb_model.predict(x_test)

In [41]:
model_acc = accuracy_score(y_test, prediction)*100
model_precision = precision_score(y_test, prediction, average=None).mean() *100
model_recall = recall_score(y_test, prediction, average=None).mean() *100

print("Gaussian Naive Bayes accuracy = ", model_acc, "%")
print("Gaussian Naive Bayes precision = ", model_precision, "%")
print("Gaussian Naive Bayes recall = ", model_recall, "%")

Gaussian Naive Bayes accuracy =  100.0 %
Gaussian Naive Bayes precision =  100.0 %
Gaussian Naive Bayes recall =  100.0 %
