In [1]:
import nltk

In [3]:
# define a feature extraction function for each name
def gender_features(word):
    """Return the feature dict for one name: its final character.

    Args:
        word: the name as a string.

    Returns:
        A dict with the single key 'last_letter'. The character keeps its
        original case. An empty string yields '' instead of raising
        IndexError.
    """
    # slicing (word[-1:]) rather than indexing (word[-1]) tolerates ''
    return {'last_letter': word[-1:]}


print(gender_features('Shrek'))

{'last_letter': 'k'}


In [4]:
# resource for male and female first names
from nltk.corpus import names

# preview the first 20 entries of each name file, male first
for fileid in ('male.txt', 'female.txt'):
    print(names.words(fileid)[:20])

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim', 'Abdullah', 'Abe', 'Abel', 'Abelard', 'Abner', 'Abraham', 'Abram', 'Ace', 'Adair', 'Adam']
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale', 'Abra', 'Acacia', 'Ada', 'Adah', 'Adaline', 'Adara', 'Addie', 'Addis', 'Adel', 'Adela']


In [5]:
# make list of male and female names paired with gender
#   (all male names come first, followed by all female names)
namesgender = ([(name, 'male') for name in names.words('male.txt')] +
          [(name, 'female') for name in names.words('female.txt')])
print(len(namesgender))
print(namesgender[:20])   # first 20
# negative slice instead of a hard-coded start index, so this stays
# "the last 20" even if the corpus size changes
print(namesgender[-20:])  # last 20

7944
[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male'), ('Abdullah', 'male'), ('Abe', 'male'), ('Abel', 'male'), ('Abelard', 'male'), ('Abner', 'male'), ('Abraham', 'male'), ('Abram', 'male'), ('Ace', 'male'), ('Adair', 'male'), ('Adam', 'male')]
[('Zena', 'female'), ('Zenia', 'female'), ('Zia', 'female'), ('Zilvia', 'female'), ('Zita', 'female'), ('Zitella', 'female'), ('Zoe', 'female'), ('Zola', 'female'), ('Zonda', 'female'), ('Zondra', 'female'), ('Zonnya', 'female'), ('Zora', 'female'), ('Zorah', 'female'), ('Zorana', 'female'), ('Zorina', 'female'), ('Zorine', 'female'), ('Zsa Zsa', 'female'), ('Zsazsa', 'female'), ('Zulema', 'female'), ('Zuzana', 'female')]


In [6]:
# put the list into random order
import random

# fixed seed so that Restart Kernel -> Run All reproduces the same shuffle,
# and therefore the same train/test split and accuracy figures below
SEED = 42
random.seed(SEED)
# note: shuffle works in place, so re-running only this cell reshuffles the
# already-shuffled list
random.shuffle(namesgender)
print(namesgender[:20])

[('Charlot', 'female'), ('Carol-Jean', 'female'), ('Leonerd', 'male'), ('Demetrius', 'male'), ('Tabatha', 'female'), ('Baillie', 'male'), ('Norina', 'female'), ('Nancey', 'female'), ('Pier', 'female'), ('Nichole', 'female'), ('Lauraine', 'female'), ('Udell', 'male'), ('Barth', 'male'), ('Reuben', 'male'), ('Katinka', 'female'), ('Christophe', 'male'), ('Christ', 'male'), ('Klee', 'male'), ('Andee', 'female'), ('Kacey', 'female')]


In [7]:
# featuresets pair each name's feature dict with its gender label
featuresets = [(gender_features(name), gender) for name, gender in namesgender]
print(featuresets[:20])

[({'last_letter': 't'}, 'female'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 'd'}, 'male'), ({'last_letter': 's'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'r'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'h'}, 'male'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 't'}, 'male'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'y'}, 'female')]


In [10]:
# create training and test sets and train a Naive Bayes classifier
#   the first 500 entries of the shuffled feature sets are held out as the
#   test set; everything after them is used for training
#   (accuracy on the test set is printed in a later cell)
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# classify new instances
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

male
female


In [11]:
# nltk.classify.accuracy runs the classifier on the test set and reports
#   how the predicted labels compare with the actual/gold labels
#   (a single number; 1.0 would mean every prediction matched)
print(nltk.classify.accuracy(classifier, test_set))

0.774


In [12]:
# this function is available for naive bayes classifiers
#   it prints the table itself and returns None, so it is called directly
#   rather than wrapped in print() (which would add a stray "None" line)
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'a'            female : male   =     34.3 : 1.0
             last_letter = 'k'              male : female =     30.9 : 1.0
             last_letter = 'f'              male : female =     16.0 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0
             last_letter = 'v'              male : female =     11.3 : 1.0
             last_letter = 'd'              male : female =      9.4 : 1.0
             last_letter = 'm'              male : female =      9.2 : 1.0
             last_letter = 'o'              male : female =      8.5 : 1.0
             last_letter = 'r'              male : female =      7.0 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0
             last_letter = 'g'              male : female =      5.1 : 1.0
             last_letter = 'z'              male : female =      4.4 : 1.0
             last_letter = 's'              male : female =      4.1 : 1.0

In [13]:
# creating lots of features
#   there are probably too many features but we are demonstrating different
#     types of features
def gender_features2(name):
    """Return a large, case-insensitive feature dict for one name.

    Args:
        name: the name as a string.

    Returns:
        A dict with 54 entries, in this insertion order:
          'firstletter' / 'lastletter' - lowercased first and last character
              ('' for an empty name instead of raising IndexError);
          then for each letter a-z:
          'count(x)' - number of occurrences of x in the lowercased name;
          'has(x)'   - True when x occurs at least once.
    """
    # lowercase once instead of twice per alphabet letter
    lowered = name.lower()

    features = {}
    # slices rather than indexes so an empty name does not crash
    features["firstletter"] = name[:1].lower()
    features["lastletter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count({})".format(letter)] = occurrences
        features["has({})".format(letter)] = occurrences > 0
    return features

In [14]:
# try the larger feature extractor on one name: show how many features it
# produces, then the full feature dict
features = gender_features2('Shrek')
print(len(features))
print(features)

54
{'firstletter': 's', 'lastletter': 'k', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 1, 'has(e)': True, 'count(f)': 0, 'has(f)': False, 'count(g)': 0, 'has(g)': False, 'count(h)': 1, 'has(h)': True, 'count(i)': 0, 'has(i)': False, 'count(j)': 0, 'has(j)': False, 'count(k)': 1, 'has(k)': True, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 0, 'has(n)': False, 'count(o)': 0, 'has(o)': False, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 1, 'has(r)': True, 'count(s)': 1, 'has(s)': True, 'count(t)': 0, 'has(t)': False, 'count(u)': 0, 'has(u)': False, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False}


In [15]:
# build (feature dict, gender) pairs with the larger feature extractor
featuresets2 = [(gender_features2(name), gender) for name, gender in namesgender]

In [16]:
# create new training and test sets, classify and look at accuracy
test_set = featuresets2[:500]    # first 500 entries are held out for testing
train_set = featuresets2[500:]   # the remainder is used for training
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.778


In [17]:
## Experiment related to lab exercise for today
# go back and separate the names into training and test
#   this splits the raw (name, gender) pairs, not feature dicts, at the same
#   500-item boundary used for the feature sets, so the names stay available
#   for error analysis
train_names = namesgender[500:]
test_names = namesgender[:500]

In [18]:
# use our original features to train a classifier and test on the development test set
train_set = [(gender_features(n), g) for (n, g) in train_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# print the classifier accuracy for reference in the exercise
print(nltk.classify.accuracy(classifier, test_set))

0.774


In [19]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test):
    """Collect the items that the notebook-level `classifier` gets wrong.

    Args:
        test: iterable of (name, gold_label) pairs.

    Returns:
        A list of (gold_label, predicted_label, name) tuples, in input order,
        one for every name whose prediction differs from its gold label.
        Predictions use the features from gender_features(name).
    """
    # lazily pair each name with its prediction, then keep only the mismatches
    predictions = ((gold, classifier.classify(gender_features(name)), name)
                   for (name, gold) in test)
    return [(gold, predicted, name)
            for (gold, predicted, name) in predictions
            if predicted != gold]

In [20]:
# collect the misclassified test names and report how many there are
errors = geterrors(test_names)
print(len(errors))

113


In [21]:
# define a function to print the errors
def printerrors(errors):
    """Print one aligned line per error tuple, in sorted order.

    Args:
        errors: iterable of (gold_label, predicted_label, name) string tuples.
            Sorting the tuples orders the lines by gold label, then guess,
            then name.
    """
    row_format = 'correct={:<8s} guess={:<8s} name={:<30s}'
    for gold, predicted, name in sorted(errors):
        print(row_format.format(gold, predicted, name))

# list every misclassified test name (printerrors sorts the tuples, so the
# lines come out ordered by gold label, then guess, then name)
printerrors(errors)

correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Allys                         
correct=female   guess=male     name=Alys                          
correct=female   guess=male     name=Alyson                        
correct=female   guess=male     name=Angil                         
correct=female   guess=male     name=Bliss                         
correct=female   guess=male     name=Caitrin                       
correct=female   guess=male     name=Carol-Jean                    
correct=female   guess=male     name=Caron                         
correct=female   guess=male     name=Ceil                          
correct=female   guess=male     name=Charleen                      
correct=female   guess=male     name=Charlot                       
correct=female   guess=male     name=Coral                         
correct=female   guess=male     name=Corliss                       
correct=female   guess=male     name=Cris       

In [22]:
# evaluation measures showing performance of classifier

from nltk.metrics import *

# gold labels and classifier predictions for the test set, in matching order
#   comprehensions keep their loop variables local, so the notebook-level
#   `features` dict created in an earlier cell is not overwritten here
reflist = [label for (_, label) in test_set]
testlist = [classifier.classify(feats) for (feats, _) in test_set]

print(reflist[:30])
print(testlist[:30])

['female', 'female', 'male', 'male', 'female', 'male', 'female', 'female', 'female', 'female', 'female', 'male', 'male', 'male', 'female', 'male', 'male', 'male', 'female', 'female', 'female', 'male', 'female', 'female', 'female', 'female', 'male', 'female', 'male', 'male']
['male', 'male', 'male', 'male', 'female', 'female', 'female', 'female', 'male', 'female', 'female', 'male', 'female', 'male', 'female', 'female', 'male', 'female', 'female', 'female', 'female', 'male', 'male', 'female', 'female', 'male', 'female', 'male', 'female', 'female']


In [23]:
# Confusion matrix gives true positives, false negatives, false positives, and true negatives
#   where we interpret female as "yes" and male as "no"
#   (the gold/reference labels are passed first, the predicted labels second;
#    in the printed table rows are reference labels and columns are predictions)
cm = ConfusionMatrix(reflist, testlist)
print(cm)

       |   f     |
       |   e     |
       |   m   m |
       |   a   a |
       |   l   l |
       |   e   e |
-------+---------+
female |<255> 51 |
  male |  62<132>|
-------+---------+
(row = reference; col = test)



In [25]:
# for each gender, build the set of test-item positions carrying that label:
#   once from the gold labels (ref*) and once from the predictions (test*)
# items are identified by their index in reflist / testlist

reffemale = {idx for idx, gold in enumerate(reflist) if gold == 'female'}
refmale = {idx for idx, gold in enumerate(reflist) if gold == 'male'}
testfemale = {idx for idx, predicted in enumerate(testlist) if predicted == 'female'}
testmale = {idx for idx, predicted in enumerate(testlist) if predicted == 'male'}

print(reffemale)

{0, 1, 4, 6, 7, 8, 9, 10, 14, 18, 19, 20, 22, 23, 24, 25, 27, 32, 33, 35, 36, 38, 39, 42, 44, 46, 47, 48, 51, 53, 54, 55, 56, 57, 59, 60, 61, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 77, 78, 79, 83, 84, 85, 88, 90, 92, 94, 96, 97, 99, 101, 102, 103, 104, 105, 106, 107, 113, 115, 116, 117, 122, 124, 125, 126, 128, 130, 132, 133, 135, 136, 138, 140, 142, 143, 144, 145, 146, 148, 150, 151, 152, 154, 158, 161, 162, 163, 164, 165, 169, 173, 175, 177, 179, 180, 182, 183, 184, 186, 189, 191, 193, 195, 196, 197, 198, 200, 201, 202, 203, 204, 205, 206, 208, 209, 212, 213, 214, 215, 218, 219, 220, 222, 223, 224, 225, 226, 227, 229, 231, 233, 234, 236, 241, 243, 247, 248, 250, 251, 252, 254, 255, 256, 258, 260, 261, 263, 264, 265, 268, 269, 270, 276, 277, 278, 279, 280, 281, 282, 284, 285, 287, 288, 289, 290, 292, 293, 295, 300, 304, 306, 307, 308, 310, 311, 313, 314, 316, 317, 318, 319, 322, 323, 324, 327, 329, 330, 331, 332, 338, 339, 341, 345, 347, 348, 349, 350, 351, 354, 357, 358, 359, 361, 3

In [26]:
# compute precision, recall and F-measure for each label

def printmeasures(label, refset, testset):
    """Print precision, recall and F-measure for one label, one per line.

    Args:
        label: text printed at the start of each line.
        refset: set of item ids whose gold label is this label.
        testset: set of item ids predicted as this label.
    """
    metrics = (
        ('precision:', precision),
        ('recall:', recall),
        ('F-measure:', f_measure),
    )
    for caption, metric in metrics:
        print(label, caption, metric(refset, testset))

printmeasures('female', reffemale, testfemale)
printmeasures('male', refmale, testmale)

female precision: 0.804416403785489
female recall: 0.8333333333333334
female F-measure: 0.8186195826645264
male precision: 0.7213114754098361
male recall: 0.6804123711340206
male F-measure: 0.7002652519893899


In [27]:
# another feature extraction function for the exercise
def gender_features3(word):
    """Return the last and second-to-last characters of a name as features.

    Args:
        word: the name as a string.

    Returns:
        A dict with 'suffix1' (final character) and 'suffix2' (the character
        before it), both in their original case. A missing character is
        reported as '' so that one-letter and empty names do not raise
        IndexError.
    """
    # slices instead of indexes: out-of-range slices give '' rather than raising
    return {'suffix1': word[-1:], 'suffix2': word[-2:-1]}

print(gender_features3('Shrek'))

{'suffix1': 'k', 'suffix2': 'e'}
