# Gender  Recognition using ML

In [1]:
import numpy as np  # used for working with arrays 
import pandas as pd # to make the dataframe 
from sklearn.feature_extraction.text import CountVectorizer as CV #extracting the features of text data 
from sklearn.feature_extraction import DictVectorizer as dv #Transforms lists of feature-value mappings to vectors.
from sklearn.model_selection import train_test_split  # for splitting the dataset 

In [2]:
#loading the dataset
df=pd.read_csv('name_gender.csv')# load a CSV file as a pandas dataframe.


In [3]:
df.head() #displays the first five rows of the dataframe

Unnamed: 0,name,gender,probability
0,Aaban,M,1.0
1,Aabha,F,1.0
2,Aabid,M,1.0
3,Aabriella,F,1.0
4,Aada,F,1.0


In [4]:
df.shape #  tells the number of rows and columns of a given DataFrame

(95023, 3)

In [5]:
df.columns

Index(['name', 'gender', 'probability'], dtype='object')

In [6]:
df=df.drop(['probability'],axis=1) #Remove rows or columns

In [7]:
df.shape

(95023, 2)

In [8]:
df.isnull().isnull().sum() # for checking the null value 

name      0
gender    0
dtype: int64

In [9]:
df[df.gender=='M'].size

69444

In [10]:
df[df.gender=='F'].size

120602

In [11]:
 df.gender.replace({'F':0,'M':1},inplace=True) # replaces the specified value with another specified value

In [12]:
df.gender.unique() #returns all unique elements of a column

array([1, 0], dtype=int64)

In [13]:
df.dtypes  # returns data type of each column in the DataFrame

name      object
gender     int64
dtype: object

In [14]:
#Feature extraction

In [15]:
x=df.name
x

0            Aaban
1            Aabha
2            Aabid
3        Aabriella
4             Aada
           ...    
95018        Zyvon
95019      Zyyanna
95020        Zyyon
95021        Zzyzx
95022    undefined
Name: name, Length: 95023, dtype: object

In [16]:
cv=CV() # creating object for count vectorizer
x=cv.fit_transform(x)
print(x)

  (0, 0)	1
  (1, 1)	1
  (2, 2)	1
  (3, 3)	1
  (4, 4)	1
  (5, 5)	1
  (6, 6)	1
  (7, 7)	1
  (8, 8)	1
  (9, 9)	1
  (10, 10)	1
  (11, 11)	1
  (12, 12)	1
  (13, 13)	1
  (14, 14)	1
  (15, 15)	1
  (16, 16)	1
  (17, 17)	1
  (18, 18)	1
  (19, 19)	1
  (20, 20)	1
  (21, 21)	1
  (22, 22)	1
  (23, 23)	1
  (24, 24)	1
  :	:
  (94998, 94999)	1
  (94999, 95000)	1
  (95000, 95001)	1
  (95001, 95002)	1
  (95002, 95003)	1
  (95003, 95004)	1
  (95004, 95005)	1
  (95005, 95006)	1
  (95006, 95007)	1
  (95007, 95008)	1
  (95008, 95009)	1
  (95009, 95010)	1
  (95010, 95011)	1
  (95011, 95012)	1
  (95012, 95013)	1
  (95013, 95014)	1
  (95014, 95015)	1
  (95015, 95016)	1
  (95016, 95017)	1
  (95017, 95018)	1
  (95018, 95019)	1
  (95019, 95020)	1
  (95020, 95021)	1
  (95021, 95022)	1
  (95022, 88881)	1


In [17]:
print(x[1])

  (0, 1)	1


In [18]:
x.shape

(95023, 95023)

In [19]:
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [20]:
# labels
y=df.gender
y

0        1
1        0
2        1
3        0
4        0
        ..
95018    1
95019    0
95020    1
95021    1
95022    0
Name: gender, Length: 95023, dtype: int64

In [21]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.35,random_state=0)

In [22]:
x_train

<61764x95023 sparse matrix of type '<class 'numpy.int64'>'
	with 61764 stored elements in Compressed Sparse Row format>

In [23]:
y_train

49220    0
4477     0
87222    0
75966    0
4181     0
        ..
21243    0
45891    0
42613    1
43567    1
68268    0
Name: gender, Length: 61764, dtype: int64

**Binary Classification**

Binary classification refers to those classification tasks that have two class labels
Here the  class with the gender male is assigned with the class label  1 and female is assigned with the class label  0

In [24]:
from sklearn.naive_bayes import  BernoulliNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier  
from sklearn.linear_model import LogisticRegression  

In [25]:
# Naive bayes classifier 
# Bernaulli NB
from sklearn.naive_bayes import  BernoulliNB
clf1=BernoulliNB()
clf1.fit(x_train,y_train)
clf1.score(x_test,y_test)


0.6342343425839623

In [26]:
#SVM
from sklearn import svm
clf2= svm.SVC(kernel='linear') # Linear Kernel
clf2.fit(x_train, y_train)
clf2.score(x_test,y_test)

0.6342343425839623

In [27]:
clf3=svm.SVC(kernel='rbf')#rbf kernel
clf3.fit(x_train,y_train)
clf3.score(x_test,y_test)


0.6342343425839623

In [28]:
#Decision tree
clf4=DecisionTreeClassifier(random_state=0,criterion='entropy')
clf4.fit(x_train,y_train)
clf4.score(x_test,y_test)

0.6342343425839623

In [29]:
#Logistic Regression
clf5=LogisticRegression(random_state=0)
clf5.fit(x_train,y_train)
clf5.score(x_test,y_test)



0.6342343425839623

**Sample Prediction**

In [30]:
sample1=["Sakshi"]

In [31]:
vect1=cv.transform(sample1).toarray()

In [32]:
clf1.predict(vect1)

array([0], dtype=int64)

In [33]:
sample2=["Sakshi","Hrithik","Aniket","Namrata","Dan","Stuti"]
vect2=cv.transform(sample2).toarray()
clf1.predict(vect2)

array([0, 0, 0, 0, 0, 0], dtype=int64)

In [34]:
 def genderpredictor(a):
   test_name=[a]
   vector=cv.transform(test_name).toarray()
   if clf1.predict(vector)==0:
     print("Female")
   else:
     print("Male")

In [35]:
genderpredictor("Sakina")  

Female


In [36]:
namelist = ['abigahil',
 'abigai',
 'abigail',
 'aaliya',
 'abigailgrace']
for i in namelist:
    print(genderpredictor(i))

Female
None
Female
None
Female
None
Female
None
Female
None


# Using a custom function for feature analysis 


In [37]:
def features(name):
 name=name.lower()
 return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
  }

In [38]:
# Vectorize the features function
features = np.vectorize(features)
print(features(["Robert","abira","abony"]))

[{'first-letter': 'r', 'first2-letters': 'ro', 'first3-letters': 'rob', 'last-letter': 't', 'last2-letters': 'rt', 'last3-letters': 'ert'}
 {'first-letter': 'a', 'first2-letters': 'ab', 'first3-letters': 'abi', 'last-letter': 'a', 'last2-letters': 'ra', 'last3-letters': 'ira'}
 {'first-letter': 'a', 'first2-letters': 'ab', 'first3-letters': 'abo', 'last-letter': 'y', 'last2-letters': 'ny', 'last3-letters': 'ony'}]


In [39]:
# Extract the features for the dataset
df_X = features(df['name'])

In [40]:
df_y = df['gender']

In [41]:
from sklearn.feature_extraction import DictVectorizer
 
corpus = features(["Mike", "Julia"]) #machine readable text
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [42]:
dv.get_feature_names()

['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [43]:

# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [44]:
dfX_train

array([{'first-letter': 'm', 'first2-letters': 'ma', 'first3-letters': 'mal', 'last-letter': 'z', 'last2-letters': 'az', 'last3-letters': 'laz'},
       {'first-letter': 'l', 'first2-letters': 'lu', 'first3-letters': 'lun', 'last-letter': 'd', 'last2-letters': 'rd', 'last3-letters': 'ord'},
       {'first-letter': 'i', 'first2-letters': 'id', 'first3-letters': 'ide', 'last-letter': 'a', 'last2-letters': 'ha', 'last3-letters': 'sha'},
       ...,
       {'first-letter': 's', 'first2-letters': 'se', 'first3-letters': 'sep', 'last-letter': 'r', 'last2-letters': 'hr', 'last3-letters': 'ehr'},
       {'first-letter': 'a', 'first2-letters': 'ad', 'first3-letters': 'ada', 'last-letter': 'l', 'last2-letters': 'al', 'last3-letters': 'dal'},
       {'first-letter': 'c', 'first2-letters': 'ch', 'first3-letters': 'cha', 'last-letter': 'h', 'last2-letters': 'ph', 'last3-letters': 'eph'}],
      dtype=object)

In [45]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<63665x8139 sparse matrix of type '<class 'numpy.float64'>'
	with 381990 stored elements in Compressed Sparse Row format>

In [60]:
# Model building Using DecisionTree
 
clf1= DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
clf1.fit(my_xfeatures, dfy_train)
# Accuracy on test set
score1=clf1.score(dv.transform(dfX_test), dfy_test)
score1

0.8727597423305058

In [61]:
# Naive bayes classifier 
# Bernaulli NB

clf2=BernoulliNB()
dfx_train=dv.transform(dfX_train)
clf2.fit(dfx_train,dfy_train)
dfx_test=dv.transform(dfX_test)
score2=clf2.score(dfx_test,dfy_test)
score2



0.8579628802857325

In [62]:
#SVM

clf3= svm.SVC(kernel='linear') # Linear Kernel
clf3.fit(dfx_train, dfy_train)
score3=clf3.score(dfx_test,dfy_test)
score3

0.8734294278971874

In [63]:
clf4= svm.SVC(kernel='rbf') # rbf Kernel
clf4.fit(dfx_train, dfy_train)
score4=clf4.score(dfx_test,dfy_test)
score4

0.8873333758530518

In [64]:
#Logistic Regression
clf5=LogisticRegression(random_state=0)
clf5.fit(dfx_train,dfy_train)
score5=clf5.score(dfx_test,dfy_test)
score5

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8797754958862173

In [65]:
models=pd.DataFrame({'Model':['Decision Tree','Naive Bayes','SVM_linear','SVM_rbf','Logistic Regression'],
                    'Accuracy':[score1,score2,score3,score4,score5]})
models.sort_values(by='Accuracy',ascending=False)

Unnamed: 0,Model,Accuracy
3,SVM_rbf,0.887333
4,Logistic Regression,0.879775
2,SVM_linear,0.873429
0,Decision Tree,0.87276
1,Naive Bayes,0.857963


In [51]:
# Build Features and Transform them
sample= ["Ravan"]
transform_dv =dv.transform(features(sample))

In [52]:
vect3 = transform_dv.toarray()

In [55]:
# Predicting Gender of Name
# Male is 1,female = 0
clf4.predict(vect3)

array([1], dtype=int64)

In [56]:
if clf4.predict(vect3) == 0:
    print("Female")
else:
    print("Male")

Male


In [57]:
# A function to do it
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if clf4.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [58]:
random_name_list = ["Sakshi","Hrithik","Aniket","Namrata","Dan","Stuti"]

In [59]:
for n in random_name_list:
    print(genderpredictor1(n))

Female
None
Male
None
Male
None
Female
None
Male
None
Female
None


In [None]:
#We can use accuracy when we are interested in predicting both 0 and 1 correctly and our dataset is balanced enough