In [1]:
# https://blog.ayoungprogrammer.com/2016/04/determining-gender-of-name-with-80.html/
# https://www.kdnuggets.com/2019/01/solve-90-nlp-problems-step-by-step-guide.html

In [2]:
# https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns
import random, string
import nltk
from pprint import pprint
import pandas as pd
import numpy as np
pd.set_option('display.expand_frame_repr', False)
# nltk.download('names')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

import matplotlib.pyplot as plt
# plt.rcParams['axes.labelsize'] = 10
# plt.rcParams['xtick.labelsize'] = 8
# plt.rcParams['ytick.labelsize'] = 8
import seaborn as sns
# sns.set_style('darkgrid')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

### Load Data

In [4]:
from nltk.corpus import names

In [5]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] 
                 + [(name, 'female') for name in names.words('female.txt')])

# seed the shuffle to get always the same results
random.Random(123).shuffle(labeled_names)

df_orig = pd.DataFrame(labeled_names, columns=['name', 'label'])

#### Set aside test data

In [6]:
df_train = df_orig.sample(frac=0.8, random_state=200)
df_test = df_orig.drop(df_train.index)
print(len(df_train), len(df_test))
df_train.head()

6355 1589


Unnamed: 0,name,label
7073,Bryn,male
1837,Caty,female
2263,Cletus,male
426,Bryce,male
6680,Sadie,female


In [7]:
# male / female distribution
print('Cnt male names: {}'.format(len(df_train[df_train['label']=='male'])))
print('Cnt female names: {}'.format(len(df_train[df_train['label']=='female'])))

Cnt male names: 2376
Cnt female names: 3979


In [8]:
# get max name length
max = 0
for n in df_orig['name']:
    if len(n) > max:
        max = len(n)
#         print(n)
print(max)

15


#### Data Preprocessing

These actions were prompted by the CountVectorizer().get_feature_names() output

In [9]:
# view compound names
# print(df[df['name'].str.contains(' ')].name)
# print(df[df['name'].str.contains("'")].name)
# print(df[df['name'].str.contains('-')].name)

df = df_train.copy()

# remove name E'Lane
df = df[~df['name'].str.contains("'")]

# collapse compound names
# df['name'] = df['name'].apply(lambda x: x.replace('-', ''))
# df['name'] = df['name'].apply(lambda x: x.replace(' ', ''))
print(len(df))

6355


### Feature Engineering

#### Character Order

In [10]:
def get_char_order(name):
    name = name.strip().lower()
    arr = np.zeros(15).astype(int)
    for ind, l in enumerate(name):
        try:
            arr[ind] = string.ascii_lowercase.index(l)+1
        except:
            arr[ind] = -1 # compound name
    return arr

# test
# get_char_order('eartha')

df['tmp'] = df['name'].apply(lambda x: get_char_order(x))

df = (pd.concat([df.reset_index(drop=True), 
                pd.DataFrame(df['tmp'].to_list()).reset_index(drop=True)],
               axis=1)
      .drop(df[['tmp']], axis=1))
df.head()

Unnamed: 0,name,label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Bryn,male,2,18,25,14,0,0,0,0,0,0,0,0,0,0,0
1,Caty,female,3,1,20,25,0,0,0,0,0,0,0,0,0,0,0
2,Cletus,male,3,12,5,20,21,19,0,0,0,0,0,0,0,0,0
3,Bryce,male,2,18,25,3,5,0,0,0,0,0,0,0,0,0,0
4,Sadie,female,19,1,4,9,5,0,0,0,0,0,0,0,0,0,0


In [11]:
df[df['name']=='Jean-Christophe']

Unnamed: 0,name,label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
4025,Jean-Christophe,male,10,5,1,14,-1,3,8,18,9,19,20,15,16,8,5


#### Compund Name Feature

In [12]:
# def is_compound(name):
#     name = name.strip()
#     if (' ' in name):
#         return 1
#     return 0

# df['is_compound'] = df['name'].apply(lambda x: is_compound(x))
# print(len(df))
# df.head()

#### Suffix Last Letter Feature

In [13]:
def suffix1(name):
    name = name.strip()
    return name[-1:].lower()

df['suffix1'] = df['name'].apply(lambda x: suffix1(x))
print(len(df))
df.head()

6355


Unnamed: 0,name,label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,suffix1
0,Bryn,male,2,18,25,14,0,0,0,0,0,0,0,0,0,0,0,n
1,Caty,female,3,1,20,25,0,0,0,0,0,0,0,0,0,0,0,y
2,Cletus,male,3,12,5,20,21,19,0,0,0,0,0,0,0,0,0,s
3,Bryce,male,2,18,25,3,5,0,0,0,0,0,0,0,0,0,0,e
4,Sadie,female,19,1,4,9,5,0,0,0,0,0,0,0,0,0,0,e


#### Suffix 2nd Last Letter Feature

In [14]:
def suffix2(name):
    name = name.strip()
    return name[-2:-1].lower()

df['suffix2'] = df['name'].apply(lambda x: str(suffix2(x)))
print(len(df))
df.head()

6355


Unnamed: 0,name,label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,suffix1,suffix2
0,Bryn,male,2,18,25,14,0,0,0,0,0,0,0,0,0,0,0,n,y
1,Caty,female,3,1,20,25,0,0,0,0,0,0,0,0,0,0,0,y,t
2,Cletus,male,3,12,5,20,21,19,0,0,0,0,0,0,0,0,0,s,u
3,Bryce,male,2,18,25,3,5,0,0,0,0,0,0,0,0,0,0,e,c
4,Sadie,female,19,1,4,9,5,0,0,0,0,0,0,0,0,0,0,e,i


#### Name Length Feature

In [15]:
def name_length(name):
    name = name.strip()
    return len(name)

df['name_length'] = df['name'].apply(lambda x: name_length(x))
print(len(df))
df.head()

6355


Unnamed: 0,name,label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,suffix1,suffix2,name_length
0,Bryn,male,2,18,25,14,0,0,0,0,0,0,0,0,0,0,0,n,y,4
1,Caty,female,3,1,20,25,0,0,0,0,0,0,0,0,0,0,0,y,t,4
2,Cletus,male,3,12,5,20,21,19,0,0,0,0,0,0,0,0,0,s,u,6
3,Bryce,male,2,18,25,3,5,0,0,0,0,0,0,0,0,0,0,e,c,5
4,Sadie,female,19,1,4,9,5,0,0,0,0,0,0,0,0,0,0,e,i,5


#### Character Frequency Feature

In [16]:
# Test CounVectorizer
# corpus = ['Raoul', 'Anna']

# count_vectorizer = CountVectorizer(analyzer='char', min_df=1)
# X = count_vectorizer.fit_transform(corpus)
# print(count_vectorizer.get_feature_names())
# print(pd.DataFrame(X.toarray()))

# count_vectorizer = CountVectorizer(analyzer='char', min_df=0.6)
# X = count_vectorizer.fit_transform(corpus)
# print(count_vectorizer.get_feature_names())
# print(pd.DataFrame(X.toarray()))

In [17]:
corpus = df['name'].tolist()

# apply CountVectorizer()
count_vectorizer = CountVectorizer(analyzer='char', min_df=1)
X = count_vectorizer.fit_transform(corpus)
X = pd.DataFrame(X.toarray())

# inspect
print(count_vectorizer.get_feature_names())
X.head()
print(len(X), len(df))

# add the new columns to df
df = pd.concat([df.reset_index(drop=True), 
                X.reset_index(drop=True)], axis=1)

# inspect
print(len(df))
# print(df.columns)
df.head()

[' ', '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
6355 6355
6355


Unnamed: 0,name,label,0,1,2,3,4,5,6,7,...,18,19,20,21,22,23,24,25,26,27
0,Bryn,male,2,18,25,14,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,Caty,female,3,1,20,25,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,Cletus,male,3,12,5,20,21,19,0,0,...,0,0,1,1,1,0,0,0,0,0
3,Bryce,male,2,18,25,3,5,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,Sadie,female,19,1,4,9,5,0,0,0,...,0,0,1,0,0,0,0,0,0,0


#### Inspect Null Values

In [18]:
null_columns=df.columns[df.isnull().any()]
# print(df[null_columns].isnull().sum())
print(df[df.isnull().any(axis=1)][null_columns].head())

Empty DataFrame
Columns: []
Index: []


#### Encoding Binary Categorical Features

In [19]:
gender_mapping = {'male': 1, 'female': 0}
df['label'] = df['label'].map(lambda x: gender_mapping[x])
df.head()

Unnamed: 0,name,label,0,1,2,3,4,5,6,7,...,18,19,20,21,22,23,24,25,26,27
0,Bryn,1,2,18,25,14,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,Caty,0,3,1,20,25,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,Cletus,1,3,12,5,20,21,19,0,0,...,0,0,1,1,1,0,0,0,0,0
3,Bryce,1,2,18,25,3,5,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,Sadie,0,19,1,4,9,5,0,0,0,...,0,0,1,0,0,0,0,0,0,0


#### Encoding Multiclass Categorical Features

https://towardsdatascience.com/encoding-categorical-features-21a2651a065c

In [20]:
# set 'name' to be the index
df.set_index('name', inplace=True)

# Convert categorical variable into dummy/indicator variables
df = pd.get_dummies(df, prefix_sep='_', columns=['suffix1', 'suffix2'], drop_first=True)
print(len(df))
df.head()

6355


Unnamed: 0_level_0,label,0,1,2,3,4,5,6,7,8,...,suffix2_p,suffix2_r,suffix2_s,suffix2_t,suffix2_u,suffix2_v,suffix2_w,suffix2_x,suffix2_y,suffix2_z
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bryn,1,2,18,25,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Caty,0,3,1,20,25,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Cletus,1,3,12,5,20,21,19,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Bryce,1,2,18,25,3,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sadie,0,19,1,4,9,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Classify

#### Split Train/Test

In [21]:
tmp = df.copy()
y = tmp.pop('label')
X = tmp

# 1st pass to create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

print('len X_train: {}'.format(len(X_train)))
print('len X_test: {}'.format(len(X_test)))

len X_train: 4766
len X_test: 1589


#### Model

In [22]:
# Train the model using the training sets

nb = GaussianNB()
nb.fit(X_train.select_dtypes(include=[np.number]), 
       y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

#### Predict & Evaluate

In [23]:
# Accuracy
# nltk.classify.accuracy(classifier, dev_set)
nb.score(X_train.select_dtypes(include=[np.number]), 
         y_train)

0.7742341586235837

#### Test unseen data

In [24]:
name = 'Arlo'
name = 'Anna'
name = 'Laura'
name = 'Eileen'

# get test name features (transpose to accomodate next step)
name_features = pd.DataFrame(get_features(name)).T

name_features = pd.concat([name_features, 
                           pd.DataFrame(vectorizer.transform([name]).toarray())],
                         axis=1)
name_features

# # make a helper df of the test name features
# name_features_enc = (pd.concat([
#                  pd.DataFrame(le_last_letter.transform(name_features.loc[:, 0]), columns=['last_letter_enc']),
#                  pd.DataFrame(le_last_2letters.transform(name_features.loc[:, 1]), columns=['last_2letters_enc']),
#                 ], 
#                 axis=1))
# print(name_features_enc)

# # predict label (must use first row of helper df & pass as list NOT as Series hence the outer [])
# predicted_label = nb.predict([name_features_enc.iloc[0,:]])
# print("Predicted label for {}: {}".format(name, predicted_label[0]))


NameError: name 'get_features' is not defined