# Text Classification: Names Corpus  

## `nltk`  

From the site: 
https://gist.github.com/vinovator/6e5bf1e1bc61687a1e809780c30d6bf6
https://www.geeksforgeeks.org/python-gender-identification-by-name-using-nltk/

### Training on the last letter of the name

In [174]:
import nltk
from nltk.corpus import names
import random

In [175]:
mcorpus = [(name, "male") for name in names.words("male.txt")]
fcorpus = [(name, "female") for name in names.words("female.txt")]
random.shuffle(mcorpus); random.shuffle(fcorpus)
print(mcorpus[0:5],len(mcorpus));
print(fcorpus[0:5],len(fcorpus))

[('King', 'male'), ('Osbert', 'male'), ('Dov', 'male'), ('Kelly', 'male'), ('Jodie', 'male')] 2943
[('Freddi', 'female'), ('Kati', 'female'), ('Carlita', 'female'), ('Indira', 'female'), ('Gerda', 'female')] 5001


In [176]:
# Divide the feature sets into training and test sets
train_setm, dev_setm, test_setm = mcorpus[1000:], mcorpus[1001:2000], mcorpus[2001:2943]
train_setf, dev_setf, test_setf = fcorpus[1700:], fcorpus[1701:3400], fcorpus[3401:5001]

train_set = train_setm + train_setf
dev_set = dev_setm + dev_setf
test_set = test_setm + test_setf

In [177]:
def features(word):
    return {"last_letter": word[-1]}

In [178]:
# Process the names through feature extractor
train_set1 = [(features(n), gender) for (n, gender) in train_set]
dev_set1 = [(features(n), gender) for (n, gender) in dev_set]
test_set1 = [(features(n), gender) for (n, gender) in test_set]

In [183]:
# Train the naiveBayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set1)

# Test the accuracy of the classifier on the test data
print(nltk.classify.accuracy(classifier, test_set1))  # returns 0.78 for now

# examine classifier to determine which feature is most effective for
# distinguishing the name's gender
print(classifier.show_most_informative_features(10))

0.7734067663257278
Most Informative Features
             last_letter = 'a'            female : male   =     38.1 : 1.0
             last_letter = 'f'              male : female =     20.9 : 1.0
             last_letter = 'w'              male : female =     16.4 : 1.0
             last_letter = 'o'              male : female =     14.1 : 1.0
             last_letter = 'z'              male : female =     10.7 : 1.0
             last_letter = 'd'              male : female =     10.1 : 1.0
             last_letter = 'p'              male : female =      8.5 : 1.0
             last_letter = 'v'              male : female =      7.8 : 1.0
             last_letter = 'm'              male : female =      7.5 : 1.0
             last_letter = 'r'              male : female =      6.6 : 1.0
None


### Training on the last 3 letters of the name

In [180]:
def features2(word):
    return {"last3letters": word[-3:]}  # feature set

# Process the names through feature extractor
train_set2 = [(features2(n), gender) for (n, gender) in train_set]
dev_set2 = [(features2(n), gender) for (n, gender) in dev_set]
test_set2 = [(features2(n), gender) for (n, gender) in test_set]

In [182]:
# Train the naiveBayes classifier
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

# Test the accuracy of the classifier on the test data
print(nltk.classify.accuracy(classifier2, test_set2))  # returns 0.78 for now

# examine classifier to determine which feature is most effective for
# distinguishing the name's gender
print(classifier2.show_most_informative_features(10))
 

0.8619197482297404
Most Informative Features
            last3letters = 'nne'          female : male   =     22.5 : 1.0
            last3letters = 'ita'          female : male   =     20.3 : 1.0
            last3letters = 'tta'          female : male   =     19.9 : 1.0
            last3letters = 'ard'            male : female =     19.8 : 1.0
            last3letters = 'ana'          female : male   =     19.4 : 1.0
            last3letters = 'ela'          female : male   =     12.4 : 1.0
            last3letters = 'ene'          female : male   =     11.1 : 1.0
            last3letters = 'dra'          female : male   =     10.7 : 1.0
            last3letters = 'son'            male : female =      9.4 : 1.0
            last3letters = 'lyn'          female : male   =      8.8 : 1.0
None


## `sklearn`  

From the site: https://blog.ayoungprogrammer.com/2016/04/determining-gender-of-name-with-80.html/    

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

Create the names corpus with gender assignment; each name is converted to lowercase.

In [None]:
labeled_names = ([(name.lower(), "M") for name in names.words("male.txt")] +
                 [(name.lower(), "F") for name in names.words("female.txt")])

my_data = np.asarray(labeled_names) 

Using `sklearn`, we train a model on numbers associated with each letter in the name, where a=1, b=2, c=3, ... in this manner a model is created based on integer values.

In [232]:
def name_count(name):
    arr = np.zeros(65)
    for ind, x in enumerate(name):
        arr[ord(x)-ord('a')] += 1
    return arr

name_map = np.vectorize(name_count, otypes=[np.ndarray])
Xlist = name_map(np.asarray(list(zip(*my_data))[0],dtype=str))
X = np.array(Xlist.tolist())
y = [x[1] for x in my_data]

for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))

0.7219679633867276
0.7292143401983219
0.7315026697177727
0.7257818459191457
0.7265446224256293


In [186]:
def name_count2(name):
    arr = np.zeros(65+26)
    for ind, x in enumerate(name):
        arr[ord(x)-ord('a')] += 1
        arr[ord(x)-ord('a')+26] += ind+1
    return arr

name_map2 = np.vectorize(name_count2, otypes=[np.ndarray])
Xlist = name_map2(np.asarray(list(zip(*my_data))[0],dtype=str))
X = np.array(Xlist.tolist())
y = [x[1] for x in my_data]

for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))

0.7829900839054157
0.7807017543859649
0.7864225781845919
0.7688787185354691
0.7936689549961862


In [193]:
def name_count3(name):
    arr = np.zeros(1800)
    # Iterate each character
    for ind, x in enumerate(name):
        arr[ord(x)-ord('a')] += 1
        arr[ord(x)-ord('a')+26] += ind+1
    # Iterate every 2 characters
    for x in range(len(name)-1):
        ind = (ord(name[x])-ord('a'))*26 + (ord(name[x+1])-ord('a')) + 52
        arr[ind] += 1
    return arr

name_map3 = np.vectorize(name_count3, otypes=[np.ndarray])
Xlist = name_map3(np.asarray(list(zip(*my_data))[0],dtype=str))
X = np.array(Xlist.tolist())
y = [x[1] for x in my_data]

for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))

0.7963386727688787
0.7967200610221206
0.7894736842105263
0.7917620137299771
0.7921434019832189


In [194]:
def name_count4(name):
    arr = np.zeros(1800)
    # Iterate each character
    for ind, x in enumerate(name):
        arr[ord(x)-ord('a')] += 1
        arr[ord(x)-ord('a')+26] += ind+1
    # Iterate every 2 characters
    for x in range(len(name)-1):
        ind = (ord(name[x])-ord('a'))*26 + (ord(name[x+1])-ord('a')) + 52
        arr[ind] += 1
    # Last character
    arr[-3] = ord(name[-1])-ord('a')+1
    # Second Last character
    arr[-2] = ord(name[-2])-ord('a')+1
    # Length of name
    arr[-1] = len(name)
    return arr

name_map4 = np.vectorize(name_count4, otypes=[np.ndarray])
Xlist = name_map4(np.asarray(list(zip(*my_data))[0],dtype=str))
X = np.array(Xlist.tolist())
y = [x[1] for x in my_data]

for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))

0.8180778032036613
0.8188405797101449
0.8096872616323417
0.8032036613272311
0.8081617086193745


In [195]:
def name_count7(name):
    arr = np.zeros(1800)
    # Iterate each character
    for ind, x in enumerate(name):
        arr[ord(x)-ord('a')] += 1
        arr[ord(x)-ord('a')+26] += ind+1
    # Iterate every 2 characters
    for x in range(len(name)-1):
        ind = (ord(name[x])-ord('a'))*26 + (ord(name[x+1])-ord('a')) + 52
        arr[ind] += 1
    # Last character
    arr[-3] = ord(name[-1])-ord('a')+1
    # Second Last character
    arr[-2] = ord(name[-2])-ord('a')+1
    # Length of name
    arr[-1] = len(name)
    return arr

name_map7 = np.vectorize(name_count7, otypes=[np.ndarray])
Xlist = name_map7(np.asarray(list(zip(*my_data))[0],dtype=str))
X = np.array(Xlist.tolist())
y = [x[1] for x in my_data]

for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=150, min_samples_split=20)
    clf.fit(Xtr, ytr)
    print(clf.feature_importances_.argsort()[-10:][::-1])
    print(np.mean(clf.predict(Xte) == yte))

[1797   26 1798   40   30   34    0   43   29    8]
0.8081617086193745
[1797   26 1798   40   30    0   34   43   22   29]
0.8119755911517925
[1797   26 1798   40    0   30   34   43 1799    8]
0.8005339435545386
[1797   26 1798   40   30    0   34   43   29 1799]
0.8012967200610221
[1797   26 1798   40    0   30   34   43   29 1799]
0.8154080854309688


The following code trains a model based on the last letter of the name.

In [224]:
def name_count8(name):
    arr = np.zeros(1)
    arr[0] = ord(name[-1])-ord('a')+1
    return arr

name_map8 = np.vectorize(name_count8, otypes=[np.ndarray])
Xlist = name_map8(np.asarray(list(zip(*my_data))[0],dtype=str))
X = np.array(Xlist.tolist())
y = [x[1] for x in my_data]

for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=150, min_samples_split=20)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))

0.761632341723875
0.7715484363081617
0.7528604118993135
0.7570556826849733
0.7589626239511823


The following code trains a model based on the last three letters of the name.

In [228]:
def name_count9(name):
    arr = np.zeros(3)
    arr[0] = ord(name[-1])-ord('a')+1
    arr[1] = ord(name[-2])-ord('a')+1
    for ind, x in enumerate(name):
        if len(name)>=3:
            arr[2] = ord(name[-3])-ord('a')+1
    
    return arr

name_map9 = np.vectorize(name_count9, otypes=[np.ndarray])
Xlist = name_map9(np.asarray(list(zip(*my_data))[0],dtype=str))
X = np.array(Xlist.tolist())
y = [x[1] for x in my_data]

for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=150, min_samples_split=20)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))

0.7890922959572845
0.7913806254767353
0.8012967200610221
0.7940503432494279
0.8032036613272311


In [221]:
def name_count10(name):
    arr = np.zeros(3)
    arr[0] = ord(name[-1])-ord('a')+1
    arr[1] = ord(name[-2])-ord('a')+1
    # Order of a's
    for ind, x in enumerate(name):
        if x == 'a':
            arr[2] += ind+1
    
    return arr

name_map10 = np.vectorize(name_count10, otypes=[np.ndarray])
Xlist = name_map10(np.asarray(list(zip(*my_data))[0],dtype=str))
X = np.array(Xlist.tolist())
y = [x[1] for x in my_data]

for x in range(5):
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.33)
    clf = RandomForestClassifier(n_estimators=150, min_samples_split=20)
    clf.fit(Xtr, ytr)
    print(np.mean(clf.predict(Xte) == yte))


0.7784134248665141
0.7738367658276125
0.7734553775743707
0.7837528604118993
0.7738367658276125


In [207]:
idx = np.random.choice(np.arange(len(Xlist)), 10, replace=False)
Xname = [x[0] for x in my_data]
xs = [Xname[x] for x in idx]
ys = [y[x] for x in idx]
pred = clf.predict(X[idx])

for a,b, p in zip(xs,ys, pred):
    print(a,b, p)

reid M M
galina F F
bishop M M
channa F F
uta F F
tracy M M
gillian F M
cindi F F
eustacia F F
raleigh M M


In [231]:
for x in range(5): 
    print(X[x])
    print(Xname[x])

[18.  9. 13.]
aamir
[14. 15. 18.]
aaron
[25.  5.  2.]
abbey
[5. 9. 2.]
abbie
[20. 15.  2.]
abbot
