# Name Gender Identifier

In [58]:
def gender_features(word, n=2):
    """Build the feature set used to guess the gender of a name.

    Args:
        word: The name (any string) to extract features from.
        n: Number of trailing characters to use as the suffix feature.
            Defaults to 2, which is the original behaviour.

    Returns:
        A one-entry dict mapping 'last_letter' to the final ``n``
        characters of ``word`` (the whole word when it is shorter than ``n``).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # word[-0:] would silently return the entire word, so reject it.
        raise ValueError("n must be at least 1")
    # The key stays 'last_letter' for every suffix length so that feature
    # sets built with the default remain unchanged.
    return {'last_letter': word[-n:]}



In [59]:
# Quick check: the feature is the word's final two characters.
gender_features('WORD')

{'last_letter': 'RD'}

#### The returned dictionary is known as a feature set.

## Exploring the names corpus

In [60]:
import nltk
# Make sure the 'names' corpus is available locally before it is imported below.
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [61]:
from nltk.corpus import names

In [62]:
# Show the corpus README on a single line by turning every newline into a space.
' '.join(names.readme().split('\n'))

'Names Corpus, Version 1.3 (1994-03-29) Copyright (C) 1991 Mark Kantrowitz Additions by Bill Ross  This corpus contains 5001 female names and 2943 male names, sorted alphabetically, one per line.  You may use the lists of names for any purpose, so long as credit is given in any published work. You may also redistribute the list if you provide the recipients with a copy of this README file. The lists are not in the public domain (I retain the copyright on the lists) but are freely redistributable.  If you have any additions to the lists of names, I would appreciate receiving them.  Mark Kantrowitz <mkant+@cs.cmu.edu> http://www-2.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/'

In [63]:
# List the files that make up the names corpus.
names.fileids()

['female.txt', 'male.txt']

In [64]:
# Peek at the first ten entries of the female list.
names.words('female.txt')[:10]

['Abagael',
 'Abagail',
 'Abbe',
 'Abbey',
 'Abbi',
 'Abbie',
 'Abby',
 'Abigael',
 'Abigail',
 'Abigale']

## Building the classifier

In [65]:
# Pair every name with the gender of the corpus file it was read from;
# all female entries come first, followed by the male ones.
labeled_names = [
    (name, label)
    for label, fileid in (('female', 'female.txt'), ('male', 'male.txt'))
    for name in names.words(fileid)
]
labeled_names[:5]

[('Abagael', 'female'),
 ('Abagail', 'female'),
 ('Abbe', 'female'),
 ('Abbey', 'female'),
 ('Abbi', 'female')]

In [66]:
import random

# Shuffle with a dedicated, seeded generator so that "Restart & Run All"
# always yields the same ordering, and therefore the same train/test split
# and the same accuracy figures further down.
random.Random(42).shuffle(labeled_names)
labeled_names[:5]

[('Jehu', 'male'),
 ('Pieter', 'male'),
 ('Gerianne', 'female'),
 ('Sholom', 'male'),
 ('Romeo', 'male')]

In [67]:
# Convert each (name, gender) pair into a (feature dict, gender) pair.
feature_set = [
    (gender_features(name), label)
    for name, label in labeled_names
]
feature_set[:5]

[({'last_letter': 'hu'}, 'male'),
 ({'last_letter': 'er'}, 'male'),
 ({'last_letter': 'ne'}, 'female'),
 ({'last_letter': 'om'}, 'male'),
 ({'last_letter': 'eo'}, 'male')]

In [68]:
# Total number of labelled examples available for training and testing.
len(feature_set)

7944

In [69]:
from nltk import NaiveBayesClassifier

In [70]:
# Despite its name, `train_set` is a count: the index at which the data is
# split, placing 70% of the examples in the training portion.
train_set = round(0.7 * len(feature_set))
train_set

5561

In [71]:
# The first `train_set` examples train the model; the remainder is held out.
train_data = feature_set[:train_set]
test_data = feature_set[train_set:]

In [84]:
# First five training examples.
train_data[:5]

[({'last_letter': 'hu'}, 'male'),
 ({'last_letter': 'er'}, 'male'),
 ({'last_letter': 'ne'}, 'female'),
 ({'last_letter': 'om'}, 'male'),
 ({'last_letter': 'eo'}, 'male')]

In [85]:
# First five held-out examples.
test_data[:5]

[({'last_letter': 'el'}, 'male'),
 ({'last_letter': 'da'}, 'female'),
 ({'last_letter': 'el'}, 'male'),
 ({'last_letter': 'an'}, 'male'),
 ({'last_letter': 'ey'}, 'male')]

In [72]:
# Fit a Naive Bayes model on the training portion only.
classifier = NaiveBayesClassifier.train(train_data)

In [73]:
# Last one letter
# classifier.show_most_informative_features(10)

In [74]:
# Last two letters: the ten suffixes with the most lopsided female/male ratios.
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'na'           female : male   =    122.7 : 1.0
             last_letter = 'la'           female : male   =     57.6 : 1.0
             last_letter = 'us'             male : female =     53.0 : 1.0
             last_letter = 'ta'           female : male   =     35.0 : 1.0
             last_letter = 'ia'           female : male   =     32.0 : 1.0
             last_letter = 'rt'             male : female =     25.9 : 1.0
             last_letter = 'rd'             male : female =     23.2 : 1.0
             last_letter = 'ch'             male : female =     21.3 : 1.0
             last_letter = 'ra'           female : male   =     20.0 : 1.0
             last_letter = 'ld'             male : female =     18.0 : 1.0


In [75]:
# Last three letters
# NOTE(review): `classifier` is not retrained before this call, so this prints
# the same two-letter model as the previous cell -- confirm whether a
# three-letter run was intended here.
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'na'           female : male   =    122.7 : 1.0
             last_letter = 'la'           female : male   =     57.6 : 1.0
             last_letter = 'us'             male : female =     53.0 : 1.0
             last_letter = 'ta'           female : male   =     35.0 : 1.0
             last_letter = 'ia'           female : male   =     32.0 : 1.0
             last_letter = 'rt'             male : female =     25.9 : 1.0
             last_letter = 'rd'             male : female =     23.2 : 1.0
             last_letter = 'ch'             male : female =     21.3 : 1.0
             last_letter = 'ra'           female : male   =     20.0 : 1.0
             last_letter = 'ld'             male : female =     18.0 : 1.0


### Testing the classifier

In [76]:
# The class labels the trained model can predict.
classifier.labels()

['male', 'female']

In [77]:
from nltk.classify import accuracy

# Score the Naive Bayes model on the held-out examples, rounded to two decimals.
round(accuracy(classifier, test_data), 2)

0.77

In [78]:
# Spot-check the Naive Bayes model on a single name.
classifier.classify(gender_features('Laura'))

'female'

In [79]:
# Spot-check the Naive Bayes model on another name.
classifier.classify(gender_features('Jobin'))

'male'

## Edit Distance

In [80]:
from nltk.metrics import edit_distance

# The two names differ by a single character substitution ('h' vs 'a').
edit_distance("John", "Joan")

1

In [81]:
from nltk import MaxentClassifier

# Train a maximum-entropy model on the same training split, capped at 25 iterations.
me_classifier = MaxentClassifier.train(train_data, max_iter=25)

  ==> Training (25 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.373
             2          -0.34780        0.810
             3          -0.33436        0.810
             4          -0.32638        0.810
             5          -0.32109        0.810
             6          -0.31733        0.810
             7          -0.31452        0.810
             8          -0.31234        0.810
             9          -0.31060        0.810
            10          -0.30918        0.810
            11          -0.30799        0.810
            12          -0.30699        0.810
            13          -0.30614        0.810
            14          -0.30540        0.810
            15          -0.30475        0.810
            16          -0.30418        0.810
            17          -0.30367        0.810
            18          -0.30321        0.810
            19          -0.30280        0.810
  

In [82]:
# Score the max-ent model on the same held-out examples, rounded to two decimals.
round(accuracy(me_classifier, test_data), 2)

0.77

In [83]:
# Ten (feature, label) weights from the max-ent model, largest magnitude first.
me_classifier.show_most_informative_features(10)

  -7.238 last_letter=='na' and label is 'male'
  -6.150 last_letter=='la' and label is 'male'
  -5.180 last_letter=='ta' and label is 'male'
  -4.954 last_letter=='ia' and label is 'male'
   4.644 last_letter=='da' and label is 'female'
   4.644 last_letter=='sa' and label is 'female'
   4.644 last_letter=='om' and label is 'male'
   4.644 last_letter=='dd' and label is 'male'
   4.644 last_letter=='no' and label is 'male'
   4.644 last_letter=='os' and label is 'male'
