In [1]:
# to identify gender of a person using a SVM

In [2]:
# Initial Step is to import the required libraries
# import libraries
import numpy as vsk_np

# This is the SVM Classifies
from sklearn.svm import SVC # Support Vector Classifier
# sklearn is a package
# svm is a library under sklearn
# SVC is a class under svm library

# We need to use Logistic Regression Library to compare our result with SVM
# and check performance of both models becuase of both are classifiers
from sklearn.linear_model import LogisticRegression

# This library is used to provide the result as accuracy score
# and also check the performance of the model
from sklearn.metrics import accuracy_score

In [3]:
# Step-1 Defining datasets using arrays
# We have to use two features one for height and other one for weight

# [height, weight]
# X is an independent variable and y is dependent variable
# because based on height & weight you will be determining the gender of a person
X = [[181, 80], [177, 70], [160, 60], [154, 54], [166, 65], [190, 90], 
     [175, 64], [177, 70], [159, 55], [171, 75], [181, 85], [168, 75], 
     [168, 77]]

# y is also having the data values in form of categorical (male and female) way
y = ['male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
     'female', 'male', 'male', 'female', 'female']

In [4]:
X # features or values from independent variables

[[181, 80],
 [177, 70],
 [160, 60],
 [154, 54],
 [166, 65],
 [190, 90],
 [175, 64],
 [177, 70],
 [159, 55],
 [171, 75],
 [181, 85],
 [168, 75],
 [168, 77]]

In [5]:
y

['male',
 'male',
 'female',
 'female',
 'male',
 'male',
 'female',
 'female',
 'female',
 'male',
 'male',
 'female',
 'female']

In [6]:
# Step-2 Spliting dataset into Training and Testing datasets

# We are getting the training data and testing data
# Import library for splitting the dataset into train and test.
from sklearn.model_selection import train_test_split

#Using the train_test_split to create train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10, test_size = 0.40)

In [7]:
print(X_train) # Training datasets 

[[168, 77], [190, 90], [181, 85], [177, 70], [181, 80], [166, 65], [171, 75]]


In [8]:
print (y_train)

['female', 'male', 'male', 'male', 'male', 'male', 'male']


In [9]:
print (X_test)

[[154, 54], [177, 70], [168, 75], [175, 64], [159, 55], [160, 60]]


In [10]:
print (y_test)

['female', 'female', 'female', 'female', 'female', 'female']


In [11]:
# Now we have two models for comparing between SVM and Logistic Regression
# Next step will be to build the models

#Step 3: Building the model using Support Vector Machine
#Support Vector Classifier

from sklearn.svm import SVC #importing SVM classifier
svm_clf = SVC() #Creating dynamic object for SVC class 
# we are creating object of the SVC class to access the various methods 
# of the SVC class

# Fit the model as per the training dataset
svm_clf.fit(X_train,y_train)

print ("Now SVM model is fit using training dataset.")

Now SVM model is fit using training dataset.


In [12]:
# Here we are building the logistic regression model
# Building the model using Logistic Regression class
from sklearn.linear_model import LogisticRegression

# we need to create an object to access the methods for logistic regression.
log_clf = LogisticRegression()

# Fitting the model as per training dataset
log_clf.fit(X_train,y_train)

print ("Now Logistic Regression model is fit using training dataset.")

Now Logistic Regression model is fit using training dataset.


In [13]:
# 1. imported libraries
# 2. Prepared the dataset
# 3. Split the dataset into Training and Testing datasets 
# 4. Built the models (SVM and Logistic Regression)

In [14]:
# Step 4: To Predict the model by testing dataset
# By using predict method we can predict the model as per testing dataset

#prediction
# we need to use features or values from independent variables
# data from testing dataset
y_pred_svm = svm_clf.predict(X_test)  
# X_test contains the features on which we are predicting 

y_pred_log = log_clf.predict(X_test)
#Prediting the model by testing data

In [15]:
print ("Actual Values:", y_test) # actual testing dataset
print ("Predicted by SVM:", y_pred_svm) # predicted testing dataset for SVM
print ("Predicted by Logistic Regression:", y_pred_log) # predicted testing dataset for Logistic regression

# From the displayed values you can observe that logistic regression for this
# problem is better

Actual Values: ['female', 'female', 'female', 'female', 'female', 'female']
Predicted by SVM: ['male' 'male' 'male' 'male' 'male' 'male']
Predicted by Logistic Regression: ['male' 'male' 'female' 'male' 'male' 'male']


In [16]:
# Step: 5 predicting a new value from dataset or newly inputted value

y_pred_svm_new = svm_clf.predict([[157, 100]]) 

y_pred_log_new = log_clf.predict([[157, 100]])

# print the predicted gender
print("Predicted Gender of a person by Logistic Regression Model :", y_pred_log_new)
print("Predicted Gender of a person by SVM Model : ", y_pred_svm_new)


Predicted Gender of a person by Logistic Regression Model : ['female']
Predicted Gender of a person by SVM Model :  ['male']


In [17]:
# Step 6: Model Evaluation

# When we need to calculate the accuracy of the model under classification 
# always use testing dataset to get the result

# we are measuring the performance of the model by accuracy score
from sklearn.metrics import accuracy_score

#accuracy scores
log_acc = accuracy_score(y_pred_log, y_test) #(prediction value , actual value)
svm_acc = accuracy_score(y_pred_svm, y_test)

print ("Accuracy of Logistic Regression Model in %:",log_acc*100)
print("Accuracy of Support Vector Machine Model in %:",svm_acc*100)

Accuracy of Logistic Regression Model in %: 16.666666666666664
Accuracy of Support Vector Machine Model in %: 0.0


In [18]:
# The main drawback of the SVM is that when we have small or limited amount
# of datasets, it may not provide better result.

In [19]:
# You want to print the classification report of the classification models
# use classification_report method from metrics library

from sklearn import metrics

print(metrics.classification_report(y_test, y_pred_log))
print(metrics.classification_report(y_test, y_pred_svm))


              precision    recall  f1-score   support

      female       1.00      0.17      0.29         6
        male       0.00      0.00      0.00         0

    accuracy                           0.17         6
   macro avg       0.50      0.08      0.14         6
weighted avg       1.00      0.17      0.29         6

              precision    recall  f1-score   support

      female       0.00      0.00      0.00       6.0
        male       0.00      0.00      0.00       0.0

    accuracy                           0.00       6.0
   macro avg       0.00      0.00      0.00       6.0
weighted avg       0.00      0.00      0.00       6.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
classifiers = ['Logistic Regression' , 'SVC']
accuracy = vsk_np.array([log_acc, svm_acc])

# argmax is methos which returns the maximum value from the array's values
max_acc = vsk_np.argmax(accuracy)

print(classifiers[max_acc] + ' is the better classifier for this problem')

Logistic Regression is the better classifier for this problem
