In [34]:
import nltk
# male and female names dataset
from nltk.corpus import names
import random

# Tag every name with the gender of the corpus file it was read from.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
# The list above is all male names followed by all female names; shuffle it
# in place so the positional train/test slices used later mix both labels.
random.shuffle(labeled_names)
# dataset size
# Call form of print: same output on Python 2 for a single argument, and it
# matches the print(...) style used by the later cells.
print(len(names.words()))

7944


In [35]:
# Use the last letter of a name as the only feature.
# Hypothesis: names ending in 'y' are most probably female, e.g. emily, amy, penny.
def gender_features(word):
    """Return a feature dict holding the final character of ``word``.

    Args:
        word: the name to extract the feature from (a string).

    Returns:
        ``{'last_letter': <last character>}``. The slice ``word[-1:]`` is
        used instead of ``word[-1]`` so an empty string yields
        ``{'last_letter': ''}`` rather than raising IndexError; for any
        non-empty string the result is the same as indexing.
    """
    return {'last_letter': word[-1:]}
gender_features('sanju')

{'last_letter': 'u'}

In [36]:
from nltk import NaiveBayesClassifier
# Pair each name's last-letter feature dict with its gender label.
featuresets = []
for n, gender in labeled_names:
    featuresets.append((gender_features(n), gender))
# The first 500 (already shuffled) entries are held out for evaluation;
# everything after them is used for training.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = NaiveBayesClassifier.train(train_set)

In [44]:
# Accuracy of whichever `classifier` / `test_set` are currently bound.
# NOTE(review): this cell's execution count (44) is later than every other
# cell, so the recorded output may belong to a classifier trained further
# down rather than the last-letter one -- re-run top to bottom to confirm.
# Call form of print matches the later cells and also runs on Python 3.
print(nltk.classify.accuracy(classifier, test_set))

0.77


In [38]:
# List the ten feature values with the strongest male/female likelihood ratios.
classifier.show_most_informative_features(n=10)

Most Informative Features
             last_letter = u'a'           female : male   =     33.6 : 1.0
             last_letter = u'k'             male : female =     30.9 : 1.0
             last_letter = u'f'             male : female =     16.4 : 1.0
             last_letter = u'm'             male : female =     12.1 : 1.0
             last_letter = u'p'             male : female =     11.7 : 1.0
             last_letter = u'v'             male : female =     11.1 : 1.0
             last_letter = u'd'             male : female =      9.2 : 1.0
             last_letter = u'o'             male : female =      8.1 : 1.0
             last_letter = u'r'             male : female =      6.6 : 1.0
             last_letter = u'u'             male : female =      5.5 : 1.0


In [39]:
# Features: the final one and the final two characters of the name.
def gender_features2(word):
    """Return the 1- and 2-character suffixes of ``word`` as features.

    The result has the keys ``'suffix1'`` (last character) and ``'suffix2'``
    (last two characters). Slicing is used, so a word shorter than the
    requested width simply contributes whatever characters it has.
    """
    suffixes = {}
    for width in (1, 2):
        suffixes['suffix%d' % width] = word[-width:]
    return suffixes

In [40]:
# Re-shuffle in place, so this experiment is evaluated on a different split
# from the previous one.
random.shuffle(labeled_names)
# Pair each name's suffix features with its gender label.
featuresets = []
for n, gender in labeled_names:
    featuresets.append((gender_features2(n), gender))
# Hold out the first 500 entries for testing, train on the remainder.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.792


In [41]:
# New feature set: the first and last character of the name plus, for every
# letter a-z, how many times it occurs in the name and whether it occurs at all.
def gender_features3(name):
    """Build a 54-entry feature dict describing the letters of ``name``.

    Args:
        name: the name to featurize (a string; case is ignored).

    Returns:
        A dict with ``'first_letter'`` and ``'last_letter'`` (lower-cased),
        and for each letter a-z a ``'count(x)'`` integer and a ``'has(x)'``
        boolean. An empty name yields empty-string first/last letters and
        zero counts instead of raising IndexError.
    """
    # Lower-case once up front; the loop below would otherwise redo it
    # twice for each of the 26 letters.
    lowered = name.lower()
    features = {}
    # Slices (not indexing) so an empty name does not raise.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = (letter in lowered)
    return features

In [42]:
# Sanity check: display the full feature dict produced for one sample name.
gender_features3('sanju') 

{'count(a)': 1,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 0,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 0,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 1,
 'count(t)': 0,
 'count(u)': 1,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 's',
 'has(a)': True,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': False,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': False,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': True,
 'has(t)': False,
 'has(u)': True,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'u'}

In [43]:
# Shuffle in place again, so this run uses its own train/test split.
random.shuffle(labeled_names)
# Pair each name's first/last-letter and per-letter count features with its label.
featuresets = []
for n, gender in labeled_names:
    featuresets.append((gender_features3(n), gender))
# First 500 entries are the held-out test set; the remainder trains the model.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.77
