In [47]:
"""
Natural Language Processing with Python - Chapter 6

http://nltk.org/book/ch06.html
"""

import nltk
import random
#from show import show

def gender_features(word):
    return {'last_letter': word[-3:]}

names = ([(name, 'male') for name in nltk.corpus.names.words('male.txt')] + \
        [(name, 'female') for name in nltk.corpus.names.words('female.txt')])
random.shuffle(names)
#show(names[0:4])

print(names[0:20])


[('Valentine', 'female'), ('Delmar', 'male'), ('Clarie', 'female'), ('Euphemia', 'female'), ('Stefano', 'male'), ('Taylor', 'male'), ('Greggory', 'male'), ('Kirby', 'female'), ('Hollis', 'male'), ('Madel', 'female'), ('Galina', 'female'), ('Joey', 'female'), ('Kaitlynn', 'female'), ('Dyna', 'female'), ('Rebekkah', 'female'), ('Bettie', 'female'), ('Virge', 'male'), ('Patsy', 'male'), ('Dita', 'female'), ('Eben', 'male')]


In [48]:
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]

classifier = nltk.NaiveBayesClassifier.train(train_set)

print(gender_features('Neo'))

print(classifier.classify(gender_features('Neo')) )
# 'male'
print( classifier.classify(gender_features('Trinity')) )
print( classifier.classify(gender_features('Snoopy')) )
print( classifier.classify(gender_features('gamja')) )
print( classifier.classify(gender_features('fjksdjfsdlt')) )
# 'female'
print( nltk.classify.accuracy(classifier, test_set) )
# 0.758
classifier.show_most_informative_features(5)
# Most Informative Features
#              last_letter = 'a'            female : male   =     38.3 : 1.0
#              last_letter = 'k'              male : female =     31.4 : 1.0
#              last_letter = 'f'              male : female =     15.3 : 1.0
#              last_letter = 'p'              male : female =     10.6 : 1.0
#              last_letter = 'w'              male : female =     10.6 : 1.0



{'last_letter': 'Neo'}
female
female
female
female
female
0.72
Most Informative Features
             last_letter = 'ard'            male : female =     29.5 : 1.0
             last_letter = 'ana'          female : male   =     26.4 : 1.0
             last_letter = 'tta'          female : male   =     25.5 : 1.0
             last_letter = 'nne'          female : male   =     20.5 : 1.0
             last_letter = 'old'            male : female =     16.1 : 1.0


### 위의 사항에 대한 정리

#### 이름 글자의 마지막을 통계적으로 산출하여 입력된 분류기로 분류하는 방법
#### 정확한 분류로 볼수는 없어 보임


In [50]:

from nltk.classify import apply_features
train_set = apply_features(gender_features, names[500:])
test_set = apply_features(gender_features, names[:500])

classifier = nltk.NaiveBayesClassifier.train(train_set)

print ( classifier.classify(gender_features('Neo')) )
# 'male'
print ( classifier.classify(gender_features('Trinity')) )
# 'female'
print ( nltk.classify.accuracy(classifier, test_set) )
# 0.758
classifier.show_most_informative_features(5)


female
female
0.72
Most Informative Features
             last_letter = 'ard'            male : female =     29.5 : 1.0
             last_letter = 'ana'          female : male   =     26.4 : 1.0
             last_letter = 'tta'          female : male   =     25.5 : 1.0
             last_letter = 'nne'          female : male   =     20.5 : 1.0
             last_letter = 'old'            male : female =     16.1 : 1.0


In [51]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

print (str(gender_features2('John'))[0:100])
# {'count(j)': 1, 'has(d)': False, 'count(b)': 0, ...}



random.shuffle(names)
featuresets = [(gender_features2(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print ( nltk.classify.accuracy(classifier, test_set) )
# 0.748


random.shuffle(names)
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set) 
print ( nltk.classify.accuracy(classifier, devtest_set) )
# 0.765

"""
Generate list of errors.
"""

errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

for (tag, guess, name) in sorted(errors)[0:5]: 
    print ( 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name) )
# correct=female   guess=male     name=Cindelyn
# correct=female   guess=male     name=Katheryn
# correct=female   guess=male     name=Kathryn
# correct=male     guess=female   name=Aldrich
# correct=male     guess=female   name=Mitch
# correct=male     guess=female   name=Rich


def gender_features(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}



random.shuffle(names)
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print ( nltk.classify.accuracy(classifier, devtest_set) )
# 0.782


{'count(v)': 0, 'count(b)': 0, 'count(l)': 0, 'count(k)': 0, 'has(r)': False, 'has(t)': False, 'coun
0.794
0.751
correct=female   guess=male     name=Agace                         
correct=female   guess=male     name=Aimil                         
correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=Britt                         
correct=female   guess=male     name=Brook                         
0.77
