#Load the gender names from NLTK (nltk.corpus.names, female.txt and male.txt).
Create a notebook that performs word classification that assigns the names to the correct gender.

In [1]:
import nltk
from nltk.corpus import names

In [2]:
import random
random.seed(0)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix,
)

In [3]:
def gender_features(word):
    return {'last_letter':word[-1]}
  
# preparing a list of examples and corresponding class labels.
labeled_names = ([(name, 'male') for name in names.words('male.txt')]+
             [(name, 'female') for name in names.words('female.txt')])
  
random.shuffle(labeled_names)
  
# we use the feature extractor to process the names data.
featuresets = [(gender_features(n), gender) 
               for (n, gender)in labeled_names]
  
# Divide the resulting list of feature
# sets into a training set and a test set.
train_set, test_set = featuresets[500:], featuresets[:500]

# The training set is used to 
# train a new "naive Bayes" classifier.
classifier = nltk.NaiveBayesClassifier.train(train_set)
  
print(classifier.classify(gender_features('Hannah')))
  
# output should be 'female'
print(nltk.classify.accuracy(classifier, train_set))
  
# it shows accurancy of our classifier and train_set. 

female
0.7624932831810854


In [4]:
def gender_features(word):
    return {'last_letter':word[-1]}
  
# preparing a list of examples and corresponding class labels.
labeled_names = ([(name, 'male') for name in names.words('male.txt')]+
             [(name, 'female') for name in names.words('female.txt')])
  
random.shuffle(labeled_names)
print(labeled_names)

[('Noble', 'male'), ('Indira', 'female'), ('Rochelle', 'female'), ('Chanda', 'female'), ('Carlene', 'female'), ('Harrie', 'female'), ('Bryna', 'female'), ('Vince', 'male'), ('Sal', 'female'), ('Tiena', 'female'), ('Daniella', 'female'), ('Charlean', 'female'), ('Casey', 'female'), ('Zahara', 'female'), ('Abbie', 'female'), ('Caron', 'female'), ('Mauricio', 'male'), ('Milton', 'male'), ('Georgeanna', 'female'), ('Waylen', 'male'), ('Shana', 'female'), ('Hillary', 'female'), ('Zola', 'female'), ('Reena', 'female'), ('Andy', 'male'), ('Brea', 'female'), ('Wylma', 'female'), ('Sallyann', 'female'), ('Kurt', 'male'), ('Jenda', 'female'), ('Vasilis', 'male'), ('Effie', 'female'), ('Marsiella', 'female'), ('Imogen', 'female'), ('Robert', 'male'), ('Jacklyn', 'female'), ('Anatol', 'male'), ('Duane', 'male'), ('Urbanus', 'male'), ('Blaire', 'female'), ('Fina', 'female'), ('Alidia', 'female'), ('Brice', 'male'), ('Sandie', 'female'), ('Lorene', 'female'), ('Alfred', 'male'), ('Susan', 'female'),

#1.Make sure that the data is balanced. One label should not be x1.5 the other one. 

In [8]:
females = ([name for name in names.words('female.txt')])
males = ([name for name in names.words('male.txt')])
len(females), len(males)

(5001, 2943)

In [9]:
females = list(females)
random.shuffle(females)

In [10]:
females = females[:len(males)]

In [11]:
words = females + list(males)
labels = [0] * len(females) + [1] * len(males)

len(words), len(labels)

(5886, 5886)

#2.Create features that might be an indicator of a name’s gender.

In [12]:
def word_features(word):
    return {'last_letter':word[-1]}
word_features = [
    word_features(w)
    for w in words
]

#3.Split the dataset to 75% 10%, 15% for train, validation, and test respectively.

In [13]:
train_features, test_features, train_labels, test_labels = \
    train_test_split(word_features, labels, test_size=0.15, stratify=labels, random_state=0)

train_features, val_features, train_labels, val_labels = \
    train_test_split(train_features, train_labels, test_size=1/10, stratify=train_labels, random_state=0)

len(train_features), len(test_features), len(val_features)

(4502, 883, 501)

#4.Use a Random Forest to do the classification

In [14]:
featuresets = [(gender_features(n), gender) 
               for (n, gender)in labeled_names]
  
# Divide the resulting list of feature
# sets into a training set and a test set.
train_set, test_set = featuresets[500:], featuresets[:500]

# The training set is used to 
# train a new "naive Bayes" classifier.
classifier = nltk.NaiveBayesClassifier.train(train_set)
  
print(classifier.classify(gender_features('Hannah')))
  
# output should be 'female'
print(nltk.classify.accuracy(classifier, train_set))
  
# it shows accurancy of our classifier and train_set. 

female
0.761687264911338


#5.Calculate the ROC AUC, precision, recall, accuracy, and confusion matrix for each dataset split

In [15]:
dv = DictVectorizer(sparse=False)
train_vectors = dv.fit_transform(train_features)
val_vectors = dv.transform(val_features)
test_vectors = dv.transform(test_features)

In [16]:
nb = BernoulliNB()
nb.fit(train_vectors, train_labels)

BernoulliNB()

In [17]:
train_predict = nb.predict(train_vectors)
val_predict = nb.predict(val_vectors)
test_predict =  nb.predict(test_vectors)

Training Performance

In [18]:
roc_auc_score(train_labels, train_predict)

0.770990670812972

In [19]:
print(classification_report(train_labels, train_predict))

              precision    recall  f1-score   support

           0       0.80      0.72      0.76      2251
           1       0.75      0.82      0.78      2251

    accuracy                           0.77      4502
   macro avg       0.77      0.77      0.77      4502
weighted avg       0.77      0.77      0.77      4502



In [20]:
confusion_matrix(train_labels, train_predict)

array([[1625,  626],
       [ 405, 1846]], dtype=int64)

Validation Performance

In [21]:
roc_auc_score(val_labels, val_predict)

0.7483665338645418

In [22]:
print(classification_report(val_labels, val_predict))

              precision    recall  f1-score   support

           0       0.79      0.68      0.73       250
           1       0.72      0.82      0.76       251

    accuracy                           0.75       501
   macro avg       0.75      0.75      0.75       501
weighted avg       0.75      0.75      0.75       501



In [23]:
confusion_matrix(val_labels, val_predict)

array([[170,  80],
       [ 46, 205]], dtype=int64)

Test Performance

In [24]:
roc_auc_score(test_labels, test_predict)

0.72599809154431

In [25]:
print(classification_report(test_labels, test_predict))

              precision    recall  f1-score   support

           0       0.76      0.67      0.71       442
           1       0.70      0.78      0.74       441

    accuracy                           0.73       883
   macro avg       0.73      0.73      0.73       883
weighted avg       0.73      0.73      0.73       883



In [26]:
confusion_matrix(test_labels, test_predict)

array([[296, 146],
       [ 96, 345]], dtype=int64)

#7.Show the top 10 and the bottom 10 features based on importance

In [27]:
best_neg = nb.feature_log_prob_[0].argsort()
best_pos = nb.feature_log_prob_[1].argsort()

In [28]:
[dv.feature_names_[idx] for idx in best_neg[-10:][::-1]]

['last_letter=a',
 'last_letter=e',
 'last_letter=y',
 'last_letter=n',
 'last_letter=i',
 'last_letter=l',
 'last_letter=h',
 'last_letter=s',
 'last_letter=t',
 'last_letter=r']

In [29]:
[dv.feature_names_[idx] for idx in best_pos[-10:][::-1]]

['last_letter=n',
 'last_letter=e',
 'last_letter=y',
 'last_letter=s',
 'last_letter=d',
 'last_letter=l',
 'last_letter=r',
 'last_letter=o',
 'last_letter=t',
 'last_letter=h']

In [30]:
diff_pos_neg = (nb.feature_log_prob_[1] - nb.feature_log_prob_[0]).argsort()

In [31]:
[dv.feature_names_[idx] for idx in diff_pos_neg[:10]]

['last_letter=a',
 'last_letter=i',
 'last_letter=e',
 'last_letter=y',
 'last_letter=h',
 'last_letter=l',
 'last_letter=x',
 'last_letter=j',
 'last_letter=n',
 'last_letter=z']

In [32]:
[dv.feature_names_[idx] for idx in diff_pos_neg[-10:][::-1]]

['last_letter=f',
 'last_letter=c',
 'last_letter=k',
 'last_letter=o',
 'last_letter=v',
 'last_letter=d',
 'last_letter=r',
 'last_letter=m',
 'last_letter=g',
 'last_letter=p']