In [1]:
from nltk.classify import apply_features
from nltk.corpus import names
import nltk
import random

In [2]:
class NameGenderFeatureExtractor(object):
    """Turn a first name into the feature dict fed to the gender classifier."""

    def number_of_vowels(self, word):
        """Return how many characters of ``word`` are one of a, e, i, o, u.

        The word is lowercased first, so upper-case vowels are counted too.
        """
        vowels_found = [char for char in word.lower() if char in 'aeiou']
        return len(vowels_found)

    def features(self, word):
        """Return the feature dict for ``word``.

        Two features are emitted: the last character (``suffix_1``) and the
        last two characters (``suffix_2``). Slicing is used rather than
        indexing, so names shorter than the suffix (including the empty
        string) yield whatever characters exist instead of raising.
        """
        extracted = {}
        extracted['suffix_1'] = word[-1:]
        extracted['suffix_2'] = word[-2:]
        # Candidate features that are currently switched off:
        # extracted['name_length'] = len(word)
        # extracted['first_letter'] = word[0]
        # extracted['number_of_vowels'] = self.number_of_vowels(word)
        # extracted['ratio_vowels'] = self.number_of_vowels(word) / len(word)
        return extracted

In [4]:
class NameGenderData(object):
    """Load, shuffle and split the NLTK ``names`` corpus for gender classification.

    The labeled names are read once at construction time. All randomness
    (the shuffle and the dev-test sample) goes through a single generator so
    that a run can be reproduced by passing ``seed``.
    """

    def __init__(self, feature_extractor, seed=None):
        """Read and shuffle the labeled names.

        Args:
            feature_extractor: object with a ``features(name)`` method that
                returns the feature dict for one name.
            seed: optional seed for the shuffle and the dev-test sample.
                When omitted, the shared ``random`` module is used, so any
                global ``random.seed(...)`` call made by the caller is still
                honoured.
        """
        self._feature_extractor = feature_extractor
        # ``random`` (the module) and ``random.Random`` expose the same
        # ``shuffle``/``sample`` API, so either can serve as the generator.
        self._rng = random if seed is None else random.Random(seed)
        self._labeled_names = self._label_names()

    def _label_names(self):
        """Return a shuffled list of ``(name, gender)`` pairs from the corpus.

        Names from ``male.txt`` are labeled ``'male'`` and names from
        ``female.txt`` are labeled ``'female'``. The total count is printed.
        """
        labeled_names = ([(name, 'male') for name in names.words('male.txt')]
                         + [(name, 'female') for name in names.words('female.txt')])
        self._rng.shuffle(labeled_names)
        print('labeled_names: {} elements'.format(len(labeled_names)))

        return labeled_names

    def _featurize(self, labeled_names):
        """Pair the feature dict of every name with that name's label."""
        return [(self._feature_extractor.features(name), gender)
                for (name, gender) in labeled_names]

    def split_train_devtest_test_set(self, test_size=500, devtest_pool_size=1000,
                                     devtest_sample_size=100):
        """Split the shuffled names into train, dev-test and test portions.

        The first ``test_size`` names form the test portion, the next
        ``devtest_pool_size`` names form the pool the dev-test portion is
        sampled from, and everything after that is the training portion.

        Args:
            test_size: number of leading names reserved for testing.
            devtest_pool_size: number of names, right after the test names,
                from which the dev-test names are drawn.
            devtest_sample_size: how many names to draw (without
                replacement) from the dev-test pool.

        Returns:
            dict with keys ``'train'``, ``'devtest'`` and ``'test'``; each
            value is a ``(names, feature_set)`` tuple where ``names`` is a
            list of ``(name, gender)`` pairs and ``feature_set`` is the
            matching list of ``(features, gender)`` pairs.

        Raises:
            ValueError: if ``devtest_sample_size`` exceeds the number of
                names available in the dev-test pool.
        """
        devtest_end = test_size + devtest_pool_size

        train_names = self._labeled_names[devtest_end:]
        devtest_pool = self._labeled_names[test_size:devtest_end]
        devtest_names = self._rng.sample(devtest_pool, devtest_sample_size)
        test_names = self._labeled_names[:test_size]

        return {
            'train': (train_names, self._featurize(train_names)),
            'devtest': (devtest_names, self._featurize(devtest_names)),
            'test': (test_names, self._featurize(test_names))
        }

    def evaluate(self, classifier, devtest_set):
        """Print the classifier's accuracy and its 10 most informative features.

        Args:
            classifier: trained classifier accepted by
                ``nltk.classify.accuracy`` and providing
                ``show_most_informative_features``.
            devtest_set: list of ``(features, gender)`` pairs to score on.
        """
        accuracy = nltk.classify.accuracy(classifier, devtest_set)
        print('Accuracy: {}'.format(accuracy))
        classifier.show_most_informative_features(10)

    def error_analysis(self, classifier, devtest_set):
        """Print every misclassified name, sorted by (correct label, guess, name).

        Args:
            classifier: trained classifier with a ``classify(features)`` method.
            devtest_set: despite the parameter name, this must be a list of
                raw ``(name, gender)`` pairs (the ``names`` half returned by
                ``split_train_devtest_test_set``), because the features are
                extracted again here. The name is kept for compatibility
                with existing callers.
        """
        errors = []
        for (name, tag) in devtest_set:
            guess = classifier.classify(self._feature_extractor.features(name))
            if guess != tag:
                errors.append((tag, guess, name))
        for (tag, guess, name) in sorted(errors):
            print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))


In [5]:
# Seed the shared generator before the data is built: NameGenderData shuffles
# the corpus and samples the dev-test names through ``random``, so without a
# fixed seed every run yields a different split and a different accuracy.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

my_test_names = ['Neo', 'Trinity']

feature_extractor = NameGenderFeatureExtractor()

data_prep = NameGenderData(feature_extractor)
split_sets = data_prep.split_train_devtest_test_set()
train_names, train_set = split_sets['train']
devtest_names, devtest_set = split_sets['devtest']
test_names, test_set = split_sets['test']

classifier = nltk.NaiveBayesClassifier.train(train_set)

# Spot-check the trained classifier on a couple of hand-picked names.
for name in my_test_names:
    print('Classification for [{}]: {}'.format(name, classifier.classify(feature_extractor.features(name))))

data_prep.evaluate(classifier, devtest_set)
# error_analysis extracts the features itself, so it is given the raw
# (name, gender) pairs rather than the featurized dev-test set.
data_prep.error_analysis(classifier, devtest_names)


labeled_names: 7944 elements
Classification for [Neo]: male
Classification for [Trinity]: female
Accuracy: 0.8
Most Informative Features
                suffix_2 = u'na'          female : male   =     92.2 : 1.0
                suffix_2 = u'la'          female : male   =     64.8 : 1.0
                suffix_2 = u'ia'          female : male   =     48.8 : 1.0
                suffix_1 = u'k'             male : female =     42.1 : 1.0
                suffix_1 = u'a'           female : male   =     33.6 : 1.0
                suffix_2 = u'sa'          female : male   =     32.9 : 1.0
                suffix_2 = u'us'            male : female =     28.7 : 1.0
                suffix_2 = u'rd'            male : female =     26.6 : 1.0
                suffix_2 = u'ra'          female : male   =     23.3 : 1.0
                suffix_2 = u'ld'            male : female =     22.8 : 1.0
correct=female   guess=male     name=Brandais                      
correct=female   guess=male     name=Caron   