In [187]:
## Importing relevant packages

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import nltk
import random
from random import shuffle
from nltk.metrics.scores import (precision, recall)
from sklearn.model_selection import cross_val_score


In [188]:
## Define the function that extracts the last letter and last 2 letters of the names 
## as features

def gender_features(word): ## accuracy of 0.858 with European names only, 0.742 with Maori inc.
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

In [197]:
## Classification algorithm
if __name__ == "__main__":
    """ Starting block """

    # Extract the data sets
    labeled_names = (pd.read_csv("Names_Combined.csv")[['Name', 'Gender', 'Ethnicity']])
    
    print(len(labeled_names))  # 782 names
    
     # Shuffle the names in the list
    labeled_names = labeled_names.sample(frac=1)
    
    # Extract features from dataset
    feature_sets = [(gender_features(labeled_names.Name.iloc[i]),
                                   labeled_names.Gender.iloc[i]) 
                    for i in range(0, len(labeled_names))]
    
    #Extract ethnicity from dataset
    feature_sets_ethnicity = [labeled_names.Ethnicity.iloc[i] 
                              for i in range(0, len(labeled_names))]
    
    # Divide the feature sets into training and test sets
    train_set, test_set = feature_sets[:500], feature_sets[500:]
    
    train_ethnicity, test_ethnicity = feature_sets_ethnicity[:500], feature_sets_ethnicity[500:]
    
    # Train the naiveBayes classifier
    classifierNB = nltk.NaiveBayesClassifier.train(train_set)

    # Test the accuracy of the classifier on the test data
    print(nltk.classify.accuracy(classifierNB, test_set))# returns 0.78 for now

    # Examine classifier to determine which feature is most effective for
    # distinguishing the name's gender
    print(classifierNB.show_most_informative_features(10))

782
0.75177304964539
Most Informative Features
                 suffix1 = 'd'                 M : F      =      9.4 : 1.0
                 suffix1 = 'a'                 F : M      =      9.3 : 1.0
                 suffix2 = 'in'                M : F      =      5.1 : 1.0
                 suffix1 = 's'                 M : F      =      5.1 : 1.0
                 suffix2 = 'ie'                F : M      =      4.7 : 1.0
                 suffix2 = 'on'                M : F      =      4.4 : 1.0
                 suffix2 = 'is'                M : F      =      4.3 : 1.0
                 suffix2 = 'an'                M : F      =      3.9 : 1.0
                 suffix2 = 'ra'                F : M      =      3.9 : 1.0
                 suffix2 = 'te'                F : M      =      3.7 : 1.0
None


In [198]:
## Turning test set into a DataFrame from a list  
test_df = pd.DataFrame(test_set)

## Extracting the column of actual genders from the test df
y_test = test_df[1]


In [199]:
## Predict the genders of the test dataset and save these in a list. 
y_pred = []

for i in range(0, 282):
        y_pred.append(classifierNB.classify(test_df.iloc[i,0]))

## Print a confusion matrix of actual vs predicted genders for the test dataset. 
print(confusion_matrix(y_test, y_pred)) ## Accuracy of 77%

## Get a classification report - precision, recall
print(classification_report(y_test, y_pred))
    

[[114  41]
 [ 29  98]]
             precision    recall  f1-score   support

          F       0.80      0.74      0.77       155
          M       0.71      0.77      0.74       127

avg / total       0.76      0.75      0.75       282



In [200]:
## Add ethnicity information to test dataset
test_ethnicity_df = pd.DataFrame(test_ethnicity)

ethnicities_test_df = test_df.merge(right = test_ethnicity_df, left_index = True, right_index=True)
ethnicities_test_df.columns = ['features', 'gender', 'ethnicity']
print(ethnicities_test_df.head())

                            features gender ethnicity
0  {'suffix1': 'r', 'suffix2': 'er'}      M  European
1  {'suffix1': 'a', 'suffix2': 'la'}      F  European
2  {'suffix1': 'o', 'suffix2': 'lo'}      M  European
3  {'suffix1': 'l', 'suffix2': 'll'}      M  European
4  {'suffix1': 'y', 'suffix2': 'ey'}      F  European


In [201]:
## Split test set up by ethnicity
maori_test_subset = ethnicities_test_df[ethnicities_test_df.ethnicity == 'Maori'][['features', 'gender']]

european_test_subset = ethnicities_test_df[ethnicities_test_df.ethnicity == 'European'][['features', 'gender']]


In [202]:
print(len(maori_test_subset)) 
print(len(european_test_subset)) 

27
255


In [203]:
## Predict the genders of the Maori test dataset and save these in a list. 

y_maori_test = maori_test_subset['gender']

y_maori_pred = []

for i in range(0, len(maori_test_subset)):
        y_maori_pred.append(classifierNB.classify(maori_test_subset.iloc[i,0]))

## Print a confusion matrix of actual vs predicted genders for the Maori test dataset. 
print(confusion_matrix(y_maori_test, y_maori_pred)) 

## Get classification report - precision, recall etc. 
print(classification_report(y_maori_test, y_maori_pred))

[[13  2]
 [ 7  5]]
             precision    recall  f1-score   support

          F       0.65      0.87      0.74        15
          M       0.71      0.42      0.53        12

avg / total       0.68      0.67      0.65        27



In [204]:
## predict the genders of the european test dataset and save these in a list. 
y_european_test = european_test_subset['gender']

y_european_pred = []

for i in range(0, len(european_test_subset)):
        y_european_pred.append(classifierNB.classify(european_test_subset.iloc[i,0]))

## print a confusion matrix of actual vs predicted genders for the european test dataset. 
print(confusion_matrix(y_european_test, y_european_pred)) ## Accuracy of 73%

## get precision, recall etc. 
print(classification_report(y_european_test, y_european_pred))


[[101  39]
 [ 22  93]]
             precision    recall  f1-score   support

          F       0.82      0.72      0.77       140
          M       0.70      0.81      0.75       115

avg / total       0.77      0.76      0.76       255

