In [107]:
# https://blog.ayoungprogrammer.com/2016/04/determining-gender-of-name-with-80.html/
# https://www.kdnuggets.com/2019/01/solve-90-nlp-problems-step-by-step-guide.html
# https://towardsdatascience.com/encoding-categorical-features-21a2651a065c
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# https://eli5.readthedocs.io/en/latest/tutorials/sklearn-text.html
# https://fizzylogic.nl/2017/11/07/learn-how-to-build-flexible-machine-learning-pipelines-in-sklearn/
# https://gist.github.com/amberjrivera/8c5c145516f5a2e894681e16a8095b5c

In [18]:
import nltk
# nltk.download('names')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

In [22]:
import random, string
import pandas as pd
import numpy as np

In [93]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

### Load Data

In [4]:
from nltk.corpus import names

In [5]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] 
                 + [(name, 'female') for name in names.words('female.txt')])

# seed the shuffle to get always the same results
random.Random(123).shuffle(labeled_names)

df_orig = pd.DataFrame(labeled_names, columns=['name', 'label'])

In [6]:
# get max name length
max = 0
for n in df_orig['name']:
    if len(n) > max:
        max = len(n)
#         print(n)
print(max)

15


#### Data Preprocessing

In [7]:
gender_mapping = {'male': 1, 
                  'female': 0}

df_orig['label'] = df_orig['label'].map(lambda x: gender_mapping[x])
df_orig.head()

Unnamed: 0,name,label
0,Cordelie,0
1,Peggie,0
2,Solange,0
3,Rana,0
4,Jessy,0


In [8]:
# male / female distribution
print('Cnt male names: {}'.format(len(df_orig[df_orig['label']==1])))
print('Cnt female names: {}'.format(len(df_orig[df_orig['label']==0])))

Cnt male names: 2943
Cnt female names: 5001


In [9]:
# Inspect Null Values
null_columns=df_orig.columns[df_orig.isnull().any()]
# print(df[null_columns].isnull().sum())
print(df_orig[df_orig.isnull().any(axis=1)][null_columns].head())

Empty DataFrame
Columns: []
Index: []


In [10]:
# view compound names
# print(df_orig[df_orig['name'].str.contains(' ')].name)
# print(df_orig[df_orig['name'].str.contains("'")].name)
# print(df_orig[df_orig['name'].str.contains('-')].name)

# df[df['name']=='Jean-Christophe']

# remove name E'Lane
df_orig = df_orig[~df_orig['name'].str.contains("'")]

print(len(df_orig))
df_orig.head()

7943


Unnamed: 0,name,label
0,Cordelie,0
1,Peggie,0
2,Solange,0
3,Rana,0
4,Jessy,0


#### Train / Test Split

In [11]:
# Train data
df_train = df_orig.sample(frac=0.8, random_state=200)
y_train = df_train.pop('label')

# Test data
df_test = df_orig.drop(df_train.index)
y_test = df_test.pop('label')

print(len(df_train), len(df_test))

6354 1589


### Feature Engineering

In [217]:
class PreProcessor(BaseEstimator, TransformerMixin):  
    
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return X
    
    def encode(self, df):
        # Convert categorical variable into dummy/indicator variables
#         print(df.shape)
        return pd.get_dummies(df, drop_first=False)
    
    def transform(self, X):
        
#         print(X[:5])
        res = []
        
        # character order
        for name in X:
            feats = []
            arr = np.zeros(15).astype(int)
            name = name.strip().lower()
            for ind, l in enumerate(name):
                try:
                    arr[ind] = string.ascii_lowercase.index(l)+1
                except:
                    arr[ind] = -1 # compound name
            [feats.append(i) for i in arr]

            # get name length
            feats.append(len(name))
            
            # get last letter
            feats.append(name[-1:].lower())

            # get second last letter
            feats.append(name[-2:-1].lower())
            
            # append set of features to main list
            res.append(feats)
        X = self.encode(pd.DataFrame(res))
        print(type(X.values))
        return X.values

#### Pipeline

In [218]:
features = []

pp = PreProcessor()
features.append(('preprocess', pp))

cv = CountVectorizer(analyzer='char')
features.append(('cv', cv))

# combine
all_features = FeatureUnion(features)

pipeline = Pipeline([
    ('all', all_features),
    ('clf', GaussianNB())
])

corpus = df_train['name'].copy()
# corpus = ['Raoul', 'Anna', 'Eileen']
pipeline.fit_transform(corpus, y_train)
# X = X.toarray() # dense data is required for fitting

# Fit Model
# nb = GaussianNB()
# nb.fit(X, y_train)

# Accuracy
# nb.score(X, y_train)
# nb.predict(X)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# tmp = df.copy()
# y = tmp.pop('label')
# X = tmp

# # 1st pass to create train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

# print('len X_train: {}'.format(len(X_train)))
# print('len X_test: {}'.format(len(X_test)))

#### Unseen Data

In [187]:
"""
To try to predict the outcome on a new document we need to extract the features 
using almost the same feature extracting chain as before. 
The difference is that we call transform instead of fit_transform on the transformers, 
since they have already been fit to the training set.
"""

corpus_new = pd.Series(['Raoul', 'Anna', 'Eileen'])
X_new = pipeline.transform(corpus_new)

X_new = X_new.toarray() # dense data is required for fitting
print(X_new)
print(X.shape, X_new.shape)

# Predict
nb.predict(X_new)

[[18  1 15 21 12  0  0  0  0  0  0  0  0  0  0  5  0  1  0  0  0  1  0  0
   1  0  0  0  0  0  0  0  0  0  0  1  0  0  1  0  0  1  0  0  1  0  0  0
   0  0]
 [ 1 14 14  1  0  0  0  0  0  0  0  0  0  0  0  4  1  0  0  0  1  0  0  0
   2  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 5  9 12  5  5 14  0  0  0  0  0  0  0  0  0  6  0  0  1  1  0  0  0  0
   0  0  0  0  3  0  0  0  1  0  0  1  0  1  0  0  0  0  0  0  0  0  0  0
   0  0]]
(6354, 94) (3, 50)


ValueError: operands could not be broadcast together with shapes (3,50) (94,) 