In [10]:
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import names
import random
# Fetch the NLTK 'names' corpus; later cells read male.txt / female.txt from it
# through the `names` corpus reader imported above.
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\nax\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.


True

In [11]:
def gender_features(word):
    """
    Build the feature dict for a name: just its final character.

    >>> gender_features('Max')
    {'last_letter': 'x'}
    >>> gender_features('')
    {'last_letter': ''}

    :param word: the name to turn into features (a string).
    :return: dict with the single key ``'last_letter'``; the value is the
        last character of ``word`` exactly as written (case is preserved),
        or ``''`` when ``word`` is empty.
    """
    # Slice rather than index so an empty string yields '' instead of
    # raising IndexError.
    return {'last_letter': word[-1:]}

In [12]:
# Read the corpus through nltk.corpus.names rather than the bare `names`
# reader: this cell rebinds `names` to a plain list of (name, gender) pairs,
# so using the bare name here made the cell fail with AttributeError as soon
# as it was run a second time.
names = ([(name, 'male') for name in nltk.corpus.names.words('male.txt')] +
         [(name, 'female') for name in nltk.corpus.names.words('female.txt')])
# Fixed seed so the shuffle -- and therefore every split and accuracy figure
# derived from it -- is the same on each run.
random.seed(42)
random.shuffle(names)
print(names[:3])

feature_sets = [(gender_features(n), g) for (n, g) in names]
print(feature_sets[:3])
# The first 500 shuffled entries are held out for testing; the rest train the model.
train_set, test_set = feature_sets[500:], feature_sets[:500]

[('Yank', 'male'), ('Faun', 'female'), ('Vanny', 'female')]
[({'last_letter': 'k'}, 'male'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 'y'}, 'female')]


In [13]:
# Train on the last-letter features and spot-check two names.
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))
# Fraction of held-out names whose predicted label matches the true one.
print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
classifier.show_most_informative_features(5)


male
female
0.732
Most Informative Features
             last_letter = 'k'              male : female =     44.1 : 1.0
             last_letter = 'a'            female : male   =     34.6 : 1.0
             last_letter = 'f'              male : female =     13.1 : 1.0
             last_letter = 'p'              male : female =     11.1 : 1.0
             last_letter = 'v'              male : female =     10.4 : 1.0
None


In [14]:
from nltk.classify import apply_features
# Same split as before (first 500 names for testing, the rest for training),
# but built through apply_features instead of a list comprehension.
# NOTE(review): apply_features is documented to return a lazy, list-like view
# rather than a fully materialised list -- confirm against the NLTK docs.
train_set, test_set = (
    apply_features(gender_features, names[500:]),
    apply_features(gender_features, names[:500]),
)

In [15]:
from collections import OrderedDict
from string import ascii_lowercase

def gender_features2(name):
    """
    Build a richer, case-insensitive feature set for a name.

    Features, in insertion order:
      * ``'firstletter'`` / ``'lastletter'`` -- first and last character of
        the name, lower-cased ('' for an empty name).
      * for every ASCII letter a-z: ``'count(x)'`` (how many times the letter
        occurs) followed by ``'has(x)'`` (whether it occurs at all).

    >>> f = gender_features2('Anna')
    >>> f['firstletter'], f['lastletter'], f['count(n)'], f['has(z)']
    ('a', 'a', 2, False)

    :param name: the name to turn into features (a string).
    :return: an ``OrderedDict`` with 54 entries as described above.
    """
    # Lower-case once; the original recomputed name.lower() twice per letter.
    lowered = name.lower()
    features = OrderedDict()
    # Slice before lower-casing: same result as name[0].lower() for non-empty
    # names, but '' instead of IndexError when the name is empty.
    features['firstletter'] = name[:1].lower()
    features['lastletter'] = name[-1:].lower()
    for letter in ascii_lowercase:
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
    return features

In [16]:
# Re-featurise every (name, gender) pair with the richer feature extractor.
feature_sets2 = [(gender_features2(name), label) for name, label in names]
# Same split as before: first 500 held out for testing, the rest for training.
test_set = feature_sets2[:500]
train_set = feature_sets2[500:]
classifier = NaiveBayesClassifier.train(train_set)
# Last expression of the cell, so the accuracy is shown as the cell's output.
nltk.classify.accuracy(classifier, test_set)


0.754

In [17]:

# Three-way split of the shuffled names: 500 for testing, the next 1000 as a
# dev-test set for error analysis, and everything after that for training.
test_names = names[:500]
devtest_names = names[500:1500]
train_names = names[1500:]

# Featurise each split with the last-letter extractor.
test_set = [(gender_features(name), label) for name, label in test_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
train_set = [(gender_features(name), label) for name, label in train_names]

classifier = NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))


0.732


In [18]:
# Gather every dev-test name the current classifier labels incorrectly, as
# (true label, predicted label, name) triples.
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correct prediction, nothing to record
    errors.append((tag, guess, name))
print(errors[:10])

def gender_features3(word):
    """
    Describe a name by its one- and two-character suffixes.

    >>> gender_features3('Shrek')
    {'suffix1': 'k', 'suffix2': 'ek'}

    :param word: the name to turn into features (a string).
    :return: dict with keys ``'suffix1'`` (last character) and ``'suffix2'``
        (last two characters); slices are used, so shorter inputs simply give
        shorter (possibly empty) values.
    """
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features

[('female', 'male', 'Charo'), ('male', 'female', 'Sidnee'), ('male', 'female', 'Jeremie'), ('female', 'male', 'Annabal'), ('male', 'female', 'Abbie'), ('male', 'female', 'Spense'), ('female', 'male', 'Beitris'), ('male', 'female', 'Towny'), ('female', 'male', 'Gwendolyn'), ('female', 'male', 'Jesselyn')]


In [19]:
# Rebuild all three splits with the suffix-based extractor and retrain.
test_set = [(gender_features3(name), label) for name, label in test_names]
devtest_set = [(gender_features3(name), label) for name, label in devtest_names]
train_set = [(gender_features3(name), label) for name, label in train_names]

classifier = NaiveBayesClassifier.train(train_set)
# NOTE(review): accuracy is reported on test_set; devtest_set is built here but
# not used in the visible cells -- confirm which split was meant for tuning.
print(nltk.classify.accuracy(classifier, test_set))

0.752
