In [1]:
# https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns
import random
import nltk
from pprint import pprint
import pandas as pd
import numpy as np
pd.set_option('display.expand_frame_repr', False)
# nltk.download('names')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

import matplotlib.pyplot as plt
# plt.rcParams['axes.labelsize'] = 10
# plt.rcParams['xtick.labelsize'] = 8
# plt.rcParams['ytick.labelsize'] = 8
import seaborn as sns
# sns.set_style('darkgrid')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing # for LabelEncoder()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

### Load Data

In [3]:
from nltk.corpus import names

In [4]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] 
                 + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)
df_orig = pd.DataFrame(labeled_names, columns=['name', 'label'])
print(len(df_orig))
df_orig.head()

7944


Unnamed: 0,name,label
0,Murdock,male
1,Marrilee,female
2,Isabelle,female
3,Juliette,female
4,Hilarie,female


#### Data Preprocessing

These actions were prompted by the CountVectorizer().get_feature_names() output

* Removing name E'Lane

In [5]:
# view compound names
# print(df[df['name'].str.contains(' ')].name)
# print(df[df['name'].str.contains("'")].name)
# print(df[df['name'].str.contains('-')].name)

df = df_orig.copy()
df = df[~df['name'].str.contains("'")]
print(len(df))

7943


### Feature Engineering

#### Compund Name Feature

In [6]:
def is_compound(name):
    name = name.strip()
    if (' ' in name) or ('-' in name):
        return 1
    return 0

df['is_compound'] = df['name'].apply(lambda x: is_compound(x))
print(len(df))
df.head()

7943


Unnamed: 0,name,label,is_compound
0,Murdock,male,0
1,Marrilee,female,0
2,Isabelle,female,0
3,Juliette,female,0
4,Hilarie,female,0


#### Suffix Bigram Feature

In [7]:
def suffix_bigram(name):
    name = name.strip()
    return name[-2:].lower()

df['suffix_bigram'] = df['name'].apply(lambda x: str(suffix_bigram(x)))
print(len(df))
df.head()

7943


Unnamed: 0,name,label,is_compound,suffix_bigram
0,Murdock,male,0,ck
1,Marrilee,female,0,ee
2,Isabelle,female,0,le
3,Juliette,female,0,te
4,Hilarie,female,0,ie


#### Suffix Unigram Feature

In [8]:
def suffix_unigram(name):
    name = name.strip()
    return name[-1:].lower()

df['suffix_unigram'] = df['name'].apply(lambda x: suffix_unigram(x))
print(len(df))
df.head()

7943


Unnamed: 0,name,label,is_compound,suffix_bigram,suffix_unigram
0,Murdock,male,0,ck,k
1,Marrilee,female,0,ee,e
2,Isabelle,female,0,le,e
3,Juliette,female,0,te,e
4,Hilarie,female,0,ie,e


#### Name Length Feature

In [9]:
def name_length(name):
    name = name.strip()
    return len(name)

df['name_length'] = df['name'].apply(lambda x: name_length(x))
print(len(df))
df.head()

7943


Unnamed: 0,name,label,is_compound,suffix_bigram,suffix_unigram,name_length
0,Murdock,male,0,ck,k,7
1,Marrilee,female,0,ee,e,8
2,Isabelle,female,0,le,e,8
3,Juliette,female,0,te,e,8
4,Hilarie,female,0,ie,e,7


#### Letter Frequency Feature

In [10]:
# Test CounVectorizer
# corpus = ['Raoul', 'Anna']

# count_vectorizer = CountVectorizer(analyzer='char', min_df=1)
# X = count_vectorizer.fit_transform(corpus)
# print(count_vectorizer.get_feature_names())
# print(pd.DataFrame(X.toarray()))

# count_vectorizer = CountVectorizer(analyzer='char', min_df=0.6)
# X = count_vectorizer.fit_transform(corpus)
# print(count_vectorizer.get_feature_names())
# print(pd.DataFrame(X.toarray()))

In [11]:
corpus = df['name'].tolist()

# apply CountVectorizer()
count_vectorizer = CountVectorizer(analyzer='char', min_df=1)
X = count_vectorizer.fit_transform(corpus)
X = pd.DataFrame(X.toarray())

# inspect
print(count_vectorizer.get_feature_names())
X.head()
print(len(X), len(df))

# add the new columns to df
df = pd.concat([df.reset_index(drop=True), 
                X.reset_index(drop=True)], axis=1)

# inspect
print(len(df))
# print(df.columns)
df.head()

[' ', '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
7943 7943
7943


Unnamed: 0,name,label,is_compound,suffix_bigram,suffix_unigram,name_length,0,1,2,3,...,18,19,20,21,22,23,24,25,26,27
0,Murdock,male,0,ck,k,7,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,Marrilee,female,0,ee,e,8,0,0,1,0,...,0,2,0,0,0,0,0,0,0,0
2,Isabelle,female,0,le,e,8,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0
3,Juliette,female,0,te,e,8,0,0,0,0,...,0,0,0,2,1,0,0,0,0,0
4,Hilarie,female,0,ie,e,7,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


#### Inspect Null Values

In [12]:
null_columns=df.columns[df.isnull().any()]
# print(df[null_columns].isnull().sum())
print(df[df.isnull().any(axis=1)][null_columns].head())

Empty DataFrame
Columns: []
Index: []


#### Encoding Categorical Features

In [13]:
df.dtypes
# le_label = preprocessing.LabelEncoder()
# le_last_letter = preprocessing.LabelEncoder()
# le_last_2letters = preprocessing.LabelEncoder()

# # verify label encoder output
# le_label.fit_transform(df['label'])
# print(le_label.classes_)

# df = (pd.concat([df,
#                  pd.DataFrame(le_label.fit_transform(df['label']), 
#                               columns=['label_enc']),
#                  pd.DataFrame(le_last_letter.fit_transform(df['last_letter']), 
#                               columns=['last_letter_enc']),
#                  pd.DataFrame(le_last_2letters.fit_transform(df['last_2letters']), 
#                               columns=['last_2letters_enc'])
#                 ], 
#                 axis=1))


# df.head()

name              object
label             object
is_compound        int64
suffix_bigram     object
suffix_unigram    object
name_length        int64
0                  int64
1                  int64
2                  int64
3                  int64
4                  int64
5                  int64
6                  int64
7                  int64
8                  int64
9                  int64
10                 int64
11                 int64
12                 int64
13                 int64
14                 int64
15                 int64
16                 int64
17                 int64
18                 int64
19                 int64
20                 int64
21                 int64
22                 int64
23                 int64
24                 int64
25                 int64
26                 int64
27                 int64
dtype: object

### Classify

#### Split Train/Test

In [14]:
tmp = df.copy()
y = tmp.pop('label_enc')
X = tmp

# 1st pass to create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     stratify=X.label, 
                                                    random_state=0
                                                    )

# 2nd pass to divide train set into train and dev sets
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, random_state=0)

print('len X_train: {}'.format(len(X_train)))
print('len X_dev: {}'.format(len(X_dev)))
print('len X_test: {}'.format(len(X_test)))

KeyError: 'label_enc'

In [None]:
print('Count male names in X_train: {}'.format(len(X_train.loc[X_train['label']=='male'])))
print('Count female names in X_train: {}'.format(len(X_train.loc[X_train['label']=='female'])))

#### Model

In [None]:
# Train the model using the training sets

nb = GaussianNB()
nb.fit(X_train.select_dtypes(include=[np.number]), 
       y_train)

#### Predict & Evaluate

In [None]:
# Accuracy
# nltk.classify.accuracy(classifier, dev_set)
nb.score(X_dev.select_dtypes(include=[np.number]), 
         y_dev)

#### Test unseen data

In [None]:
name = 'Arlo'
name = 'Anna'
name = 'Laura'
name = 'Eileen'

# get test name features (transpose to accomodate next step)
name_features = pd.DataFrame(get_features(name)).T

name_features = pd.concat([name_features, 
                           pd.DataFrame(vectorizer.transform([name]).toarray())],
                         axis=1)
name_features

# # make a helper df of the test name features
# name_features_enc = (pd.concat([
#                  pd.DataFrame(le_last_letter.transform(name_features.loc[:, 0]), columns=['last_letter_enc']),
#                  pd.DataFrame(le_last_2letters.transform(name_features.loc[:, 1]), columns=['last_2letters_enc']),
#                 ], 
#                 axis=1))
# print(name_features_enc)

# # predict label (must use first row of helper df & pass as list NOT as Series hence the outer [])
# predicted_label = nb.predict([name_features_enc.iloc[0,:]])
# print("Predicted label for {}: {}".format(name, predicted_label[0]))


#### Error Analysis

In [None]:
# Predict labels for X_dev and isolate records with incorrect pred.

# get predictons for X_dev
pred = nb.predict(X_dev[['last_letter_enc', 'last_2letters_enc']])

# make a df composed of X_dev cols
df = (pd.concat([X_dev.reset_index(drop=True), 
                  y_dev.reset_index(drop=True), 
                  pd.DataFrame(pred, columns=['predicted'])], axis=1))

# get all incorrect predictions
df_e = df[df['label_enc'] != df['predicted']]
# get all correct predictions
df_c= df[df['label_enc'] == df['predicted']]

In [None]:
# get subset of incorrect predictions
print('incorrect predictions\n')
i = 'n'
for g in ['male', 'female']:
    res = df_e[(df_e['last_letter']==i) & (df_e['label']==g)]
    print('cnt {} ending in {}: {}'.format(g, i,len(res)))
    if len(res)>0:
        print(res.head())

In [None]:
# get subset of correct predictions
print('correct predictions\n')
i = 'n'
for g in ['male', 'female']:
    res = df_c[(df_c['last_letter']==i) & (df_c['label']==g)]
    print('cnt {} ending in {}: {}'.format(g, i,len(res)))
    if len(res)>0:
        print(res.head())

### Appendix: Exploratory Data Analysis

In [None]:
# sns.countplot(x=df.label, data=df) 
# plt.show()
pd.DataFrame(df.groupby('label').size(), columns=['count']).T

In [None]:
sns.countplot(x=df.last_letter, hue=df.label, data=df) 
plt.show()

In [None]:
df_agg = df.groupby(['last_2letters', 'label']).count().reset_index()
# df_agg = df_agg[df_agg['count']>=50]
df_agg.head()

# print(df_agg[(df_agg['label']=='male') & (df_agg['count']>=100)])
# print(df_agg[(df_agg['label']=='female') & (df_agg['count']>=100)])

# sns.barplot(x=df_agg['last_2letters'], y=df_agg['count'], hue=df_agg['label'], data=df_agg) 
# plt.show()

In [None]:
# Most Informative Features
# classifier.show_most_informative_features(5)
# """
# Most Informative Features
#              last_letter = 'k'              male : female =     42.2 : 1.0
#              last_letter = 'a'            female : male   =     33.8 : 1.0
#              last_letter = 'v'              male : female =     17.2 : 1.0
#              last_letter = 'f'              male : female =     15.6 : 1.0
#              last_letter = 'p'              male : female =     11.6 : 1.0
# """