In [1]:
import nltk

In [2]:
nltk.download('names')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [3]:
nltk.corpus.names.fileids()

['female.txt', 'male.txt']

In [4]:
fnames = nltk.corpus.names.words('female.txt')
mnames = nltk.corpus.names.words('male.txt')

In [5]:
len(fnames)

5001

In [6]:
len(mnames)

2943

In [7]:
import pandas as pd

In [8]:
names = pd.DataFrame(fnames,columns=['names'])
names['gender'] = 'female'

In [9]:
names

Unnamed: 0,names,gender
0,Abagael,female
1,Abagail,female
2,Abbe,female
3,Abbey,female
4,Abbi,female
...,...,...
4996,Zorine,female
4997,Zsa Zsa,female
4998,Zsazsa,female
4999,Zulema,female


In [10]:
all_names = pd.concat([names, pd.DataFrame({'names':mnames,'gender':'male'})])

In [11]:
all_names

Unnamed: 0,names,gender
0,Abagael,female
1,Abagail,female
2,Abbe,female
3,Abbey,female
4,Abbi,female
...,...,...
2938,Zeus,male
2939,Zippy,male
2940,Zollie,male
2941,Zolly,male


In [12]:
def feature_extraction(x):
    x = x.lower()
    features = {}
    features['first'] = x[0]
    features['last'] = x[-1]
    features['nchar'] = len(x)
    features['vowel_count'] = 0
    for char in 'abcdefghijklmnopqrstuvwxyz':
        features['contains_'+char] = char in x
        features['countof_'+char] = x.count(char)

        if (char in 'aeiou') & (char in x):
            features['vowel_count'] += x.count(char) # count total. no of vowels
    if features['vowel_count'] > 0:
        features['vowels'] = 1
    else:
        features['vowels'] = 0
    return features


In [13]:
feature_extraction('Shubham')

{'first': 's',
 'last': 'm',
 'nchar': 7,
 'vowel_count': 2,
 'contains_a': True,
 'countof_a': 1,
 'contains_b': True,
 'countof_b': 1,
 'contains_c': False,
 'countof_c': 0,
 'contains_d': False,
 'countof_d': 0,
 'contains_e': False,
 'countof_e': 0,
 'contains_f': False,
 'countof_f': 0,
 'contains_g': False,
 'countof_g': 0,
 'contains_h': True,
 'countof_h': 2,
 'contains_i': False,
 'countof_i': 0,
 'contains_j': False,
 'countof_j': 0,
 'contains_k': False,
 'countof_k': 0,
 'contains_l': False,
 'countof_l': 0,
 'contains_m': True,
 'countof_m': 1,
 'contains_n': False,
 'countof_n': 0,
 'contains_o': False,
 'countof_o': 0,
 'contains_p': False,
 'countof_p': 0,
 'contains_q': False,
 'countof_q': 0,
 'contains_r': False,
 'countof_r': 0,
 'contains_s': True,
 'countof_s': 1,
 'contains_t': False,
 'countof_t': 0,
 'contains_u': True,
 'countof_u': 1,
 'contains_v': False,
 'countof_v': 0,
 'contains_w': False,
 'countof_w': 0,
 'contains_x': False,
 'countof_x': 0,
 'contain

In [14]:
df = pd.DataFrame(feature_extraction('Shubham').values(),index=feature_extraction('Shubham').keys()).T
df

Unnamed: 0,first,last,nchar,vowel_count,contains_a,countof_a,contains_b,countof_b,contains_c,countof_c,...,countof_v,contains_w,countof_w,contains_x,countof_x,contains_y,countof_y,contains_z,countof_z,vowels
0,s,m,7,2,True,1,True,1,False,0,...,0,False,0,False,0,False,0,False,0,1


In [15]:
labled_list = [(name,'Female') for name in fnames] + [(name,'Male') for name in mnames]

In [16]:
labled_list[0]

('Abagael', 'Female')

In [17]:
feature_list = [(feature_extraction(name),gender) for name, gender in labled_list]

In [18]:
from numpy import random

In [19]:
random.shuffle(feature_list)

In [20]:
len(feature_list)

7944

In [21]:
len(feature_list)*.80

6355.200000000001

In [22]:
train,test = feature_list[:7000],feature_list[7000:]

In [23]:
clf = nltk.NaiveBayesClassifier.train(train)

In [24]:
print(nltk.classify.accuracy(clf,test))

0.7616525423728814


In [28]:
clf.classify(feature_extraction('Shubham'))

'Male'

In [29]:
pred = clf.classify_many([test_data for test_data, label in test])

In [30]:
lables = [label for test_data, label in test]

In [31]:
pd.crosstab(pd.Series(pred),pd.Series(lables))

col_0,Female,Male
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,478,116
Male,109,241


In [None]:
clf.classify(feature_extraction('Cameron'))

'Female'

In [None]:
clf.classify(feature_extraction('Scott'))

'Male'