In [1]:
import pandas as pd 
from sklearn import preprocessing

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import warnings
warnings.filterwarnings('always')

In [3]:
df = pd.read_csv('gender-prediction.csv')
df

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
0,71,176,yes,short,44,no,black,male
1,68,165,no,bald,41,no,black,male
2,62,132,no,medium,37,yes,blue,female
3,65,138,no,long,38,no,gray,female
4,70,197,yes,medium,43,no,gray,male
...,...,...,...,...,...,...,...,...
75,65,99,no,short,39,yes,green,female
76,61,98,no,short,37,no,brown,female
77,67,119,yes,short,40,no,black,male
78,70,190,yes,medium,43,no,gray,male


## Question 1

In [4]:
labels = preprocessing.LabelEncoder()

beard_encoded = labels.fit_transform(df['beard'])
hair_length_encoded = labels.fit_transform(df['hair_length'])
scarf_encoded = labels.fit_transform(df['scarf'])
eye_color_encoded = labels.fit_transform(df['eye_color'])

x_encoded = list(zip(df['height'], df['weight'], beard_encoded, hair_length_encoded, df['shoe_size'], scarf_encoded, eye_color_encoded))

In [5]:
y = df['gender']
y_encoded = labels.fit_transform(y)

In [6]:
X_train, x_test, Y_train, y_test = train_test_split(x_encoded, y_encoded, test_size = 0.33, random_state = 2)

In [7]:
random_forest_model = RandomForestClassifier()
svc_model = SVC()
mlp_model = MLPClassifier()

In [8]:
random_forest_model.fit(X_train,Y_train)
svc_model.fit(X_train,Y_train)
mlp_model.fit(X_train,Y_train)

MLPClassifier()

In [9]:
random_forest_prediction = random_forest_model.predict(x_test)
svc_prediction = svc_model.predict(x_test)
mlp_prediction = mlp_model.predict(x_test)

In [10]:
random_forest_model_acc = accuracy_score(y_test, random_forest_prediction)*100
svc_model_acc = accuracy_score(y_test, svc_prediction)*100
mlp_model_acc = accuracy_score(y_test, mlp_prediction)*100
print("Random forest accuracy", random_forest_model_acc, "%")
print("Support Vector Machine accuracy", svc_model_acc, "%")
print("Multilayer Perceptron accuracy", mlp_model_acc, "%")

Random forest accuracy 100.0 %
Support Vector Machine accuracy 77.77777777777779 %
Multilayer Perceptron accuracy 62.96296296296296 %


In [11]:
#using train/test split ratio of 80/20

X_train, x_test, Y_train, y_test = train_test_split(x_encoded, y_encoded, test_size = 0.2, random_state = 2)

In [12]:
random_forest_model = RandomForestClassifier()
svc_model = SVC()
mlp_model = MLPClassifier()

In [13]:
random_forest_model.fit(X_train,Y_train)
svc_model.fit(X_train,Y_train)
mlp_model.fit(X_train,Y_train)

MLPClassifier()

In [14]:
random_forest_prediction = random_forest_model.predict(x_test)
svc_prediction = svc_model.predict(x_test)
mlp_prediction = mlp_model.predict(x_test)

In [15]:
random_forest_model_acc = accuracy_score(y_test, random_forest_prediction)*100
svc_model_acc = accuracy_score(y_test, svc_prediction)*100
mlp_model_acc = accuracy_score(y_test, mlp_prediction)*100
print("Random forest accuracy", random_forest_model_acc, "%")
print("Support Vector Machine accuracy", svc_model_acc, "%")
print("Multilayer Perceptron accuracy", mlp_model_acc, "%")

Random forest accuracy 100.0 %
Support Vector Machine accuracy 81.25 %
Multilayer Perceptron accuracy 62.5 %


## Question 2

In [16]:
# 2 attributes believed to be most “powerful”
# Beard and scarf
#remvoing these two attributes

In [17]:
x_encoded = list(zip(df['height'], df['weight'], hair_length_encoded, df['shoe_size'], eye_color_encoded))
y = df['gender']
y_encoded = labels.fit_transform(y)

In [18]:
X_train, x_test, Y_train, y_test = train_test_split(x_encoded, y_encoded, test_size = 0.2, random_state = 2)

In [19]:
random_forest_model = RandomForestClassifier()
svc_model = SVC()
mlp_model = MLPClassifier()

In [20]:
random_forest_model.fit(X_train,Y_train)
svc_model.fit(X_train,Y_train)
mlp_model.fit(X_train,Y_train)

MLPClassifier()

In [21]:
random_forest_prediction = random_forest_model.predict(x_test)
svc_prediction = svc_model.predict(x_test)
mlp_prediction = mlp_model.predict(x_test)

In [22]:
random_forest_model_acc = accuracy_score(y_test, random_forest_prediction)*100
svc_model_acc = accuracy_score(y_test, svc_prediction)*100
mlp_model_acc = accuracy_score(y_test, mlp_prediction)*100
print("Random forest accuracy", random_forest_model_acc, "%")
print("Support Vector Machine accuracy", svc_model_acc, "%")
print("Multilayer Perceptron accuracy", mlp_model_acc, "%")

Random forest accuracy 100.0 %
Support Vector Machine accuracy 81.25 %
Multilayer Perceptron accuracy 81.25 %


## Question 3

In [23]:
x_encoded = list(zip(df['height'], df['weight'], beard_encoded, hair_length_encoded, df['shoe_size'], scarf_encoded, eye_color_encoded))
y = df['gender']
y_encoded = labels.fit_transform(y)

In [24]:
#Monte Carlo cross-validation
monte_carlo = ShuffleSplit(n_splits=5,test_size=0.33,random_state=7)

In [25]:
decision_tree_model = DecisionTreeClassifier()

In [26]:
monte_carlo_acc = cross_val_score(decision_tree_model,x_encoded,y_encoded,cv=monte_carlo).mean() * 100
monte_carlo_f1 = cross_val_score(decision_tree_model,x_encoded,y_encoded, scoring="f1", cv=monte_carlo).mean() * 100
print("Monte Carlo cross-validation accuracy", monte_carlo_acc, "%")
print("Monte Carlo cross-validation F1 score", monte_carlo_f1, "%")

Monte Carlo cross-validation accuracy 94.07407407407406 %
Monte Carlo cross-validation F1 score 95.32820512820513 %


In [27]:
#Leave P-Out cross-validation
lpo = LeavePOut(2)
lpo.get_n_splits(x_encoded)

3160

In [28]:
decision_tree_model = DecisionTreeClassifier()

In [29]:
leave_pout_acc = cross_val_score(decision_tree_model,x_encoded,y_encoded,cv=lpo).mean() *100
leave_pout_f1 = cross_val_score(decision_tree_model,x_encoded,y_encoded,cv=lpo, scoring="f1_weighted").mean() * 100
print("Leave P-Out cross-validation accuracy", leave_pout_acc, "%")
print("Leave P-Out cross-validation F1 score", leave_pout_f1, "%")

Leave P-Out cross-validation accuracy 94.06645569620254 %
Leave P-Out cross-validation F1 score 94.24050632911393 %


## Question 4

In [30]:
df2 = pd.read_csv('gender-prediction_new.csv')
df2

FileNotFoundError: [Errno 2] No such file or directory: 'gender-prediction_new.csv'

In [None]:
train_data = df2.iloc[:80]
train_data

test_data = df2.iloc[80:]
test_data

In [None]:
labels = preprocessing.LabelEncoder()

train_beard_encoded = labels.fit_transform(train_data['beard'])
train_hair_length_encoded = labels.fit_transform(train_data['hair_length'])
train_scarf_encoded = labels.fit_transform(train_data['scarf'])
train_eye_color_encoded = labels.fit_transform(train_data['eye_color'])

In [None]:
X_train = list(zip(train_data['height'], train_data['weight'], train_beard_encoded, train_hair_length_encoded, train_data['shoe_size'], train_scarf_encoded, train_eye_color_encoded))
Y_train = train_data['gender']

In [None]:
test_beard_encoded = labels.fit_transform(test_data['beard'])
test_hair_length_encoded = labels.fit_transform(test_data['hair_length'])
test_scarf_encoded = labels.fit_transform(test_data['scarf'])
test_eye_color_encoded = labels.fit_transform(test_data['eye_color'])

In [None]:
x_test = list(zip(test_data['height'], test_data['weight'], test_beard_encoded, test_hair_length_encoded, test_data['shoe_size'], test_scarf_encoded, test_eye_color_encoded))
y_test = test_data['gender']

In [None]:
gnb_model = GaussianNB()

In [None]:
gnb_model.fit(X_train,Y_train)

In [None]:
prediction = gnb_model.predict(x_test)

In [None]:
model_acc = accuracy_score(y_test, prediction)*100
model_precision = precision_score(y_test, prediction, average=None).mean() *100
model_recall = recall_score(y_test, prediction, average=None).mean() *100
print("Gaussian Naive Bayes accuracy = ", model_acc, "%")
print("Gaussian Naive Bayes precision = ", model_precision, "%")
print("Gaussian Naive Bayes recall = ", model_recall, "%")