# DATA 620 Project 3
### Natural Language Classification Model
### Steven Ellingson

In [1]:
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import re
import warnings
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')
os.chdir('C:\\Users\\steven.ellingson\\PycharmProjects\\cuny')

from nltk.corpus import names

In [2]:
df = pd.DataFrame()
df['name'] = names.words('male.txt')
df['male'] = 1
df2 = pd.DataFrame()
df2['name'] = names.words('female.txt')
df2['male'] = 0
df = df.append(df2)
df.reset_index(drop=False,inplace=True)
df['name'] = df['name'].str.lower()
df.sort_values('name').head(20)

Unnamed: 0,index,name,male
0,0,aamir,1
1,1,aaron,1
2943,0,abagael,0
2944,1,abagail,0
2945,2,abbe,0
2946,3,abbey,0
2,2,abbey,1
2947,4,abbi,0
2948,5,abbie,0
3,3,abbie,1


One thing I noticed is that they names are not distinct.  There are names that show up in both the male and female list:

In [3]:
dupe_df = df[df.duplicated('name')]
dupe_df.sort_values('name', inplace=True)
dupes = list(dupe_df.name)
print('There are ' + str(len(dupes)) + ' duplicate names in this dataset!')
print(dupes[0:20])

There are 368 duplicate names in this dataset!
['abbey', 'abbie', 'abby', 'addie', 'adrian', 'adrien', 'ajay', 'alex', 'alexis', 'alfie', 'ali', 'alix', 'allie', 'allyn', 'andie', 'andrea', 'andy', 'angel', 'angie', 'ariel']


Obviously this makes sense - Alex could be either a male or a female.  That being said I don't like that each of these just shows up twice in the dataset - i/e Stacy might be an androgynous name, but it's more likely to be female. This should either be a third category for our model, or we should be creating a regression to predict the percentage of a certain name that is male vs. female.  That being said the assignment was pretty specific about how to use this data and so I just left it for now.

In [4]:
seed = 1254
train,test = train_test_split(df, test_size=1000, random_state=seed)
devtest,test = train_test_split(test, test_size=500, random_state=seed)

#store these indexes for later, so we can easily pull from main df
train_indexes = list(train.index)
test_indexes = list(test.index)
devtest_indexes = list(devtest.index)

print(len(df),len(train),len(test),len(devtest))

7944 6944 500 500


I'll take starts and ends of names to create features.

Let's just pull them into a dataframe to see how they perform.

In [5]:
feature_df = pd.DataFrame(columns=['location','string','length','count','male_pct'])

for i in range(1, 6):
    for j in [-1,1]:
        #pull only names where length at least i
        df_i = train[train['name'].str.len()>=i]
        
        n = i*j
        if n > 0:
            temp_df = pd.DataFrame(df_i['name'].str[:n])
        else:
            temp_df = pd.DataFrame(df_i['name'].str[n:])
        temp_df['male'] = df_i['male']
        temp_df = temp_df.groupby('name', as_index=False).agg(['count','mean'])
        
        #Flatten and rename df
        temp_df.columns = list(map('_'.join, temp_df.columns.values))
        temp_df.reset_index(inplace=True)
        temp_df.columns =['string','count','male_pct']

        #Set the rest of the features
        if n > 0:
            temp_df['location'] = 'beginning'
        else:
            temp_df['location'] = 'end'
        temp_df['length'] = i
        feature_df = feature_df.append(temp_df)

feature_df.sort_values('count', ascending=False).head(20)


Unnamed: 0,location,string,length,count,male_pct
5,end,e,1,1656,0.246981
1,end,a,1,1566,0.016603
14,end,n,1,742,0.551213
24,end,y,1,692,0.416185
12,beginning,m,1,602,0.299003
0,beginning,a,1,578,0.318339
2,beginning,c,1,548,0.25365
100,end,ie,2,499,0.270541
18,beginning,s,1,488,0.440574
3,beginning,d,1,398,0.316583


In [6]:
filtered_df = feature_df[feature_df['count']>10]


filtered_df.sort_values(['male_pct','count'],ascending = [True,False]).head(10)

Unnamed: 0,location,string,length,count,male_pct
493,end,ina,3,157,0.0
38,end,da,2,113,0.0
1151,end,tte,3,88,0.0
905,end,ette,4,84,0.0
208,end,sa,2,81,0.0
642,end,lla,3,78,0.0
751,end,elle,4,69,0.0
774,end,nna,3,68,0.0
750,end,ella,4,60,0.0
625,end,lia,3,59,0.0


In [7]:
filtered_df.sort_values(['male_pct','count'],ascending = [False,False]).head(10)

Unnamed: 0,location,string,length,count,male_pct
1140,end,ton,3,47,1.0
35,end,ck,2,39,1.0
453,end,ick,3,26,1.0
3,end,c,1,21,1.0
2308,end,rick,4,19,1.0
98,end,ic,2,15,1.0
162,end,od,2,15,1.0
221,end,to,2,14,1.0
212,beginning,wo,2,14,1.0
76,end,art,3,14,1.0


I'll create a function to pull all of the features for a given name. It's important that I'm using ONLY the training set to give me the numbers involved.

In [8]:
def get_features(name, n, feature_df):
    if len(name) <  abs(n):
        male_pct = np.NaN
        count = np.NaN
    else:
        if n > 0:
            location = 'beginning'
            string = name[:n]
        else:
            location = 'end'
            string = name[n:]
            
        temp_df = feature_df[(feature_df['location']==location) & 
                              (feature_df['length']==abs(n)) &
                              (feature_df['string']==string)                          
            ]
        
        if len(temp_df) > 0:
            male_pct = temp_df.loc[temp_df.index[0], 'male_pct']
            count = temp_df.loc[temp_df.index[0], 'count']
        else:
            male_pct = np.NaN
            count = 0
    return([count, male_pct])

Let's check that the function works:

In [9]:
get_features('rosalinda', -3, feature_df)

[45, 0.0]

In [10]:
temp_df = train[train['name'].str[-3:]=='nda']
print(list(temp_df['name']))
print('Number of names that end with nda: ',str(len(temp_df)))
print('Number of males that end with nda: ',str(sum(temp_df['male'])))

['corenda', 'minda', 'wenda', 'vanda', 'melisenda', 'ramonda', 'kalinda', 'marylinda', 'linda', 'yolanda', 'fernanda', 'rhonda', 'lorinda', 'hinda', 'lynda', 'ferdinanda', 'zonda', 'celinda', 'selinda', 'glenda', 'manda', 'ronda', 'eolanda', 'shanda', 'clarinda', 'rosmunda', 'romonda', 'rosalinda', 'jacinda', 'secunda', 'belinda', 'malynda', 'melinda', 'lucinda', 'arlinda', 'delinda', 'randa', 'hynda', 'cinda', 'myranda', 'miranda', 'chanda', 'amanda', 'glynda', 'yalonda']
Number of names that end with nda:  45
Number of males that end with nda:  0


In [11]:
for i in range(1, 6):
    for j in [-1,1]:
        n = i*j
        if n > 0:
            feature_name = 'beg_'
        else:
            feature_name = 'end_'
        feature_list = df.apply(lambda x: get_features(x['name'], n, feature_df), axis=1)
        counts, males = map(list, zip(*feature_list))
        df[feature_name + str(i) + '_count'] = counts
        df[feature_name + str(i) + '_male_pct'] = males

df.head()

Unnamed: 0,index,name,male,end_1_count,end_1_male_pct,beg_1_count,beg_1_male_pct,end_2_count,end_2_male_pct,beg_2_count,...,beg_3_count,beg_3_male_pct,end_4_count,end_4_male_pct,beg_4_count,beg_4_male_pct,end_5_count,end_5_male_pct,beg_5_count,beg_5_male_pct
0,0,aamir,1,210,0.804762,578,0.318339,13,0.692308,1,...,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,aaron,1,742,0.551213,578,0.318339,156,0.858974,1,...,0.0,,4.0,0.75,0.0,,0.0,,0.0,
2,2,abbey,1,692,0.416185,578,0.318339,164,0.463415,24,...,10.0,0.5,3.0,0.333333,3.0,0.333333,3.0,0.333333,2.0,0.5
3,3,abbie,1,1656,0.246981,578,0.318339,499,0.270541,24,...,10.0,0.5,14.0,0.285714,3.0,0.333333,4.0,0.5,2.0,0.5
4,4,abbot,1,202,0.693069,578,0.318339,13,0.615385,24,...,10.0,0.5,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0


I will need to impute values for all the NAs. Because the NAs are not random, I don't just want to impute the means.  What I'll do is add binary variables to tell me if the value is NA or not, then I'll impute the mean for the male_pct, and impute 0 for the "count" variable.

In [12]:
df['length'] = df['name'].str.len()
df['end_1_na'] = 0
df['beg_1_na'] = 0
df['end_2_na'] = 0
df['beg_2_na'] = 0
df['end_3_na'] = 0
df['beg_3_na'] = 0
df['end_4_na'] = 0
df['beg_4_na'] = 0
df['end_5_na'] = 0
df['beg_5_na'] = 0
df.loc[df['end_1_male_pct'].isna(),'end_1_na'] = 1
df.loc[df['beg_1_male_pct'].isna(),'beg_1_na'] = 1
df.loc[df['end_2_male_pct'].isna(),'end_2_na'] = 1
df.loc[df['beg_2_male_pct'].isna(),'beg_2_na'] = 1
df.loc[df['end_3_male_pct'].isna(),'end_3_na'] = 1
df.loc[df['beg_3_male_pct'].isna(),'beg_3_na'] = 1
df.loc[df['end_4_male_pct'].isna(),'end_4_na'] = 1
df.loc[df['beg_4_male_pct'].isna(),'beg_4_na'] = 1
df.loc[df['end_5_male_pct'].isna(),'end_5_na'] = 1
df.loc[df['beg_5_male_pct'].isna(),'beg_5_na'] = 1

Now that we have our binary variables to indicate na, we can impute the mean male pct (0.37) for the percents, and 0 for the counts.

In [13]:
count_columns = [c for c in df.columns if '_count' in c]
male_pct_columns = [c for c in df.columns if '_male_pct' in c]
na_columns = [c for c in df.columns if '_na' in c]
X_columns = count_columns.copy()
X_columns.extend(male_pct_columns)
X_columns.extend(na_columns)
X_columns.append('length')

df[count_columns] = df[count_columns].fillna(0)
df[male_pct_columns] = df[male_pct_columns].fillna(0.37)
df.sort_values('beg_5_count').head()

Unnamed: 0,index,name,male,end_1_count,end_1_male_pct,beg_1_count,beg_1_male_pct,end_2_count,end_2_male_pct,beg_2_count,...,end_1_na,beg_1_na,end_2_na,beg_2_na,end_3_na,beg_3_na,end_4_na,beg_4_na,end_5_na,beg_5_na
5648,2705,kay,0,692,0.416185,301,0.20598,29,0.586207,134,...,0,0,0,0,0,0,1,1,1,1
5275,2332,janessa,0,1566,0.016603,378,0.325397,81,0.0,100,...,0,0,0,0,0,0,0,0,0,1
1771,1771,mort,1,202,0.693069,602,0.299003,48,0.958333,60,...,0,0,0,0,0,0,0,0,1,1
5284,2341,janie,0,1656,0.246981,378,0.325397,499,0.270541,100,...,0,0,0,0,0,0,0,0,1,1
5295,2352,jany,0,692,0.416185,378,0.325397,61,0.344262,100,...,0,0,0,0,0,0,0,0,1,1


Let's pull all the new features into all of our datasets.


In [14]:
train = df.iloc[train_indexes]
test =  df.iloc[test_indexes]
devtest =  df.iloc[devtest_indexes]

y_train = train['male']
X_train = train[X_columns]

y_test = test['male']
X_test = test[X_columns]

y_devtest = devtest['male']
X_devtest = devtest[X_columns]
print ('train mean:' + str(round(y_train.mean(),3)) + ', devtest mean:' + str(round(y_devtest.mean(),3)) + ', test mean:' + str(round(y_test.mean(),3)))

train mean:0.37, devtest mean:0.352, test mean:0.39


In [15]:
log = LogisticRegression(penalty='l1', solver='liblinear')
log.fit(X_train, y_train)
print ("Training Accuracy: " + str(round(100*log.score(X_train,y_train),1)) + '%')
print("Dev-Test Accuracy: " + str(round(100*log.score(X_devtest,y_devtest),1)) + '%')
cm = confusion_matrix(y_devtest, log.predict(X_devtest))
print(str(cm[0,0]) + " accurately predicted males, " + str(cm[1,1]) + " accurately predicted females")
print(str(cm[0,1]) + " false-males, " + str(cm[1,0]) + " false-females")

Training Accuracy: 95.6%
Dev-Test Accuracy: 80.6%
289 accurately predicted males, 114 accurately predicted females
35 false-males, 62 false-females


Here are the male names in which my model predicted female:

In [16]:
print(list(devtest.loc[(y_devtest==1) & (log.predict(X_devtest)==0), 'name']))

['del', 'rutledge', 'bernie', 'mickie', 'hillary', 'wallis', 'rory', 'randal', 'kelly', 'lex', 'demosthenis', 'petey', 'brice', 'alan', 'dyson', 'christy', 'christofer', 'hilliard', 'albrecht', 'blair', 'mayor', 'guido', 'horst', 'tyson', 'isadore', 'allah', 'kareem', 'klee', 'meredeth', 'zacharia', 'ossie', 'carroll', 'eddy', 'luigi', 'maximilian', 'gale', 'tiler', 'dean', 'shelby', 'tobe', 'chanderjit', 'marlow', 'pooh', 'joe', 'patty', 'giuseppe', 'franky', 'ted', 'ehud', 'gretchen', 'tommy', 'odin', 'mugsy', 'roni', 'axel', 'brooks', 'phil', 'jeb', 'gabriele', 'leslie', 'rube', 'alic']


Here are the female names in which my model predicted male:

In [17]:
print(list(devtest.loc[(y_devtest==0) & (log.predict(X_devtest)==1), 'name']))

['felipa', 'clemmy', 'brook', 'morlee', 'france', 'gillan', 'izzi', 'corny', 'saundra', 'jody', 'marnie', 'wilma', 'winifred', 'maddie', 'pavla', 'shayne', 'bill', 'grace', 'donnie', 'ricca', 'demetria', 'christian', 'karon', 'nan', 'sher', 'rhiamon', 'kerrin', 'rickie', 'shaine', 'cammy', 'flower', 'ronny', 'clemmie', 'pietra', 'sharron']


I think one set of features that could be helpful would be the interactions between the counts and the percentages.

Let's add those and see how the model reacts.

In [18]:
df['end_1_int'] = df['end_1_count']*df['end_1_male_pct']
df['beg_1_int'] = df['beg_1_count']*df['beg_1_male_pct']
df['end_2_int'] = df['end_2_count']*df['end_2_male_pct']
df['beg_2_int'] = df['beg_2_count']*df['beg_2_male_pct']
df['end_3_int'] = df['end_3_count']*df['end_3_male_pct']
df['beg_3_int'] = df['beg_3_count']*df['beg_3_male_pct']
df['end_4_int'] = df['end_4_count']*df['end_4_male_pct']
df['beg_4_int'] = df['beg_4_count']*df['beg_4_male_pct']
df['end_5_int'] = df['end_5_count']*df['end_5_male_pct']
df['beg_5_int'] = df['beg_5_count']*df['beg_5_male_pct']

int_columns = [c for c in df.columns if '_int' in c]
X_columns.extend(int_columns)

Pull from main DF again

In [19]:
train = df.iloc[train_indexes]
test =  df.iloc[test_indexes]
devtest =  df.iloc[devtest_indexes]

y_train = train['male']
X_train = train[X_columns]

y_devtest = devtest['male']
X_devtest = devtest[X_columns]

y_test = test['male']
X_test = test[X_columns]

print ('train mean:' + str(round(y_train.mean(),3)) + ', devtest mean:' + str(round(y_devtest.mean(),3)) + ', test mean:' + str(round(y_test.mean(),3)))

train mean:0.37, devtest mean:0.352, test mean:0.39


In [20]:
log = LogisticRegression(penalty='l1', solver='liblinear')
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
print(pd.Series(log.coef_[0], index=X_columns).sort_values())

beg_4_int        -0.218164
length           -0.207504
beg_5_int        -0.110689
end_5_int        -0.109138
end_4_int        -0.056079
end_3_int        -0.021172
beg_3_int        -0.008926
beg_2_int        -0.005054
end_1_male_pct   -0.004933
end_1_count      -0.000500
beg_1_count      -0.000248
end_2_int        -0.000217
beg_2_na          0.000000
end_2_na          0.000000
beg_1_na          0.000000
end_1_na          0.000000
beg_2_male_pct    0.000000
end_2_male_pct    0.000000
beg_1_male_pct    0.000000
beg_2_count       0.000353
beg_1_int         0.001060
end_2_count       0.001801
end_1_int         0.002113
beg_3_count       0.002878
end_3_count       0.008263
end_4_na          0.011498
end_4_count       0.017161
end_3_na          0.057487
beg_4_count       0.076161
end_5_count       0.085914
beg_5_count       0.099309
beg_3_na          0.142310
end_5_na          0.327175
beg_4_na          0.347594
beg_5_na          0.368448
beg_3_male_pct    2.359304
end_3_male_pct    2.448311
e

In [22]:
print ("Training Accuracy: " + str(round(100*log.score(X_train,y_train),1)) + '%')
print("Dev-Test Accuracy: " + str(round(100*log.score(X_devtest,y_devtest),1)) + '%')
cm = confusion_matrix(y_devtest, log.predict(X_devtest))
print(str(cm[0,0]) + " accurately predicted males, " + str(cm[1,1]) + " accurately predicted females")
print(str(cm[0,1]) + " false-males, " + str(cm[1,0]) + " false-females")

Training Accuracy: 95.7%
Dev-Test Accuracy: 80.6%
289 accurately predicted males, 114 accurately predicted females
35 false-males, 62 false-females


So, pretty similar performance.  Let's try LASSO, which should help a bit with the overfitting.  It will use 5 fold cross-validation to give us the best value for Alpha.

In [23]:
from sklearn.linear_model import LassoCV
lasso = LassoCV(alphas=[1, 0.8, 0.5, 0.2, 0.1, 0.01, 0.001, 0.0005, 0]).fit(X=X_train, y=y_train)

###Model Lasso regression
print(pd.Series(lasso.coef_, index=X_columns).sort_values())


length           -7.617802e-03
end_4_int        -3.669512e-03
end_3_int        -1.188740e-03
beg_5_count      -9.149040e-04
beg_4_int        -3.687655e-04
beg_3_int        -2.978957e-04
end_2_int        -1.181934e-04
beg_2_count      -3.856826e-05
beg_1_count      -3.430547e-05
beg_2_na          0.000000e+00
end_3_na          0.000000e+00
beg_3_na          0.000000e+00
beg_5_na          0.000000e+00
end_1_na          0.000000e+00
end_2_male_pct    0.000000e+00
beg_1_male_pct    0.000000e+00
end_2_na          0.000000e+00
beg_2_int        -0.000000e+00
beg_2_male_pct    0.000000e+00
beg_1_na          0.000000e+00
beg_4_na          8.015003e-16
end_1_count       8.542678e-06
end_2_count       2.897347e-05
end_1_int         3.341123e-05
beg_1_int         1.344904e-04
beg_3_count       1.786433e-04
end_3_count       1.915037e-04
end_5_count       2.765536e-04
end_4_count       4.730704e-04
beg_4_count       6.868930e-04
beg_5_male_pct    1.082240e-03
beg_5_int         5.854027e-03
end_5_in

In [24]:

log = LogisticRegressionCV(penalty='l1', solver='liblinear', Cs = 10, cv = 5)
log.fit(X_train, y_train)

print ("Training Accuracy: " + str(round(100*log.score(X_train,y_train),1)) + '%')
print("Dev-Test Accuracy: " + str(round(100*log.score(X_devtest,y_devtest),1)) + '%')
cm = confusion_matrix(y_devtest, log.predict(X_devtest))
print(str(cm[0,0]) + " accurately predicted males, " + str(cm[1,1]) + " accurately predicted females")
print(str(cm[0,1]) + " false-males, " + str(cm[1,0]) + " false-females")

Training Accuracy: 95.7%
Dev-Test Accuracy: 81.8%
287 accurately predicted males, 122 accurately predicted females
37 false-males, 54 false-females


OK, one last try.  Let's use a boosted tree to see if we can get better results than with the Logistic Regression.  XGBoost has a bunch of different hyperparameters - I will use a grid search to select the best ones.

In [25]:
xgb = XGBClassifier(objective = 'binary:logistic', random_state = seed)
param_dist = {'n_estimators': stats.randint(5, 1000),
              'learning_rate': stats.uniform(0.01, 0.07),
              'subsample': stats.uniform(0.3, 0.7),
              'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.45),
              'min_child_weight': [1, 2, 3]
             }
xg = RandomizedSearchCV(xgb, param_distributions = param_dist, n_iter = 25, scoring = 'f1', error_score = 0, verbose = 3, n_jobs = -1)
xg.fit(X_train, y_train)
xg.best_estimator_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   58.5s finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6795544161540505, gamma=0,
              learning_rate=0.0403783678365819, max_delta_step=0, max_depth=3,
              min_child_weight=2, missing=None, n_estimators=129, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1254,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.7118978672086654, verbosity=1)

In [26]:
print ("Training Accuracy: " + str(round(100*sum(y_train== xg.predict(X_train))/len(y_train),1)) + '%')
print("Dev-Test Accuracy: " + str(round(100*sum(y_devtest== xg.predict(X_devtest))/len(y_devtest),1)) + '%')
cm = confusion_matrix(y_devtest, xg.predict(X_devtest))
print(str(cm[0,0]) + " accurately predicted males, " + str(cm[1,1]) + " accurately predicted females")
print(str(cm[0,1]) + " false-males, " + str(cm[1,0]) + " false-females")

Training Accuracy: 95.7%
Dev-Test Accuracy: 81.0%
288 accurately predicted males, 117 accurately predicted females
36 false-males, 59 false-females


Well our accuracy went down a bit now. So I guess we should be back to the LASSO regression.  For fun we'll see which one works better on the test set. First, the regression:

In [50]:
print("Test Accuracy: " + str(round(100*sum(y_test== log.predict(X_test))/len(y_test),1)) + '%')
cm2 = confusion_matrix(y_test, log.predict(X_test))
print(str(cm2[0,0]) + " accurately predicted males, " + str(cm2[1,1]) + " accurately predicted females")
print(str(cm2[0,1]) + " false-males, " + str(cm2[1,0]) + " false-females")

Test Accuracy: 76.6%
262 accurately predicted males, 121 accurately predicted females
43 false-males, 74 false-females


Quite a bit down there.  And the boosted tree:

In [51]:
print("Test Accuracy: " + str(round(100*sum(y_test== xg.predict(X_test))/len(y_test),1)) + '%')
cm2 = confusion_matrix(y_test, xg.predict(X_test))
print(str(cm2[0,0]) + " accurately predicted males, " + str(cm2[1,1]) + " accurately predicted females")
print(str(cm2[0,1]) + " false-males, " + str(cm2[1,0]) + " false-females")

Test Accuracy: 77.0%
266 accurately predicted males, 119 accurately predicted females
39 false-males, 76 false-females


OK, so both seemed to lose accuracy when moving to the test set.  This makes sense as we added an extra layer of overfitting by tuning using the devtest set.  

Vido: https://youtu.be/1zqCuJvXSXs