<h1 align="center"> Deep Learning gender from name - RNN LSTMs </h1>

#### we will use an LSTM RNN to learn gender as f(name). we will use a stacked LSTM with many-to-one architecture feeding charecter data_sets and predicting a binary outcome M/F. loss function used will be binary_crossentropy (a special case of categorical_crossentropy with m=2) and using adam optimizer (modified SGD) sample data_set /output would like this <br> ['r','a','k','e','s','h',' '] - male<br> ['p','r','a','d','e','e','p'] - male<br> ['g','a','n','g','a',' '] - female<br> and so on...

<img src="LSTM_RNN_architecture.jpg" width="800" height="600"/>

In [24]:
from __future__ import print_function

from sklearn.preprocessing import OneHotEncoder
from keras.layers.core import Dense, Activation, Dropout
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
import pandas as pd
import numpy as np
import os

In [25]:
#parameters
maxlen = 30
labels = 2

In [26]:
data_set = pd.read_csv("gender_data.csv",header=None)
data_set.columns = ['name','m_or_f']
data_set['namelen']= [len(str(i)) for i in data_set['name']]
data_set1 = data_set[(data_set['namelen'] >= 2) ]

In [27]:
data_set1.groupby('m_or_f')['name'].count()

m_or_f
f    6705
m    8475
Name: name, dtype: int64

In [28]:
names = data_set['name']
gender = data_set['m_or_f']
vocab = set(' '.join([str(i) for i in names]))
vocab.add('END')
len_vocab = len(vocab)

In [161]:
print(vocab)
print("vocab length is ",len_vocab)
print ("length of data_set is ",len(data_set1))

set([' ', '.', '1', '0', '3', '2', '5', '4', '7', '6', '9', '8', 'END', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
vocab length is  39
length of input is  15226


In [162]:
char_index = dict((c, i) for i, c in enumerate(vocab))

In [163]:
print(char_index)

{' ': 0, '.': 1, '1': 2, '0': 3, '3': 4, '2': 5, '5': 6, '4': 7, '7': 8, '6': 9, '9': 10, '8': 11, 'END': 12, 'a': 13, 'c': 14, 'b': 15, 'e': 16, 'd': 17, 'g': 18, 'f': 19, 'i': 20, 'h': 21, 'k': 22, 'j': 23, 'm': 24, 'l': 25, 'o': 26, 'n': 27, 'q': 28, 'p': 29, 's': 30, 'r': 31, 'u': 32, 't': 33, 'w': 34, 'v': 35, 'y': 36, 'x': 37, 'z': 38}


In [164]:
#train test split
msk = np.random.rand(len(data_set1)) < 0.8
train = data_set1[msk]
test = data_set1[~msk]     

In [165]:
#take data_set upto max and truncate rest
#encode to vector space(one hot encoding)
#padd 'END' to shorter sequences
train_X = []
trunc_train_name = [str(i)[0:30] for i in train.name]
for i in trunc_train_name:
    tmp = [char_index[j] for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(char_index["END"])
    train_X.append(tmp)

In [166]:
np.asarray(train_X).shape

(12198, 30)

In [179]:
def set_flag(i):
    tmp = np.zeros(39);
    tmp[i] = 1
    return(tmp)

In [184]:
set_flag(3)

array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

#### modify the code above to also convert each index to one-hot encoded representation

In [195]:
#take data_set upto max and truncate rest
#encode to vector space(one hot encoding)
#padd 'END' to shorter sequences
#also convert each index to one-hot encoding
train_X = []
train_Y = []
trunc_train_name = [str(i)[0:maxlen] for i in train.name]
for i in trunc_train_name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    train_X.append(tmp)
for i in train.m_or_f:
    if i == 'm':
        train_Y.append([1,0])
    else:
        train_Y.append([0,1])
    

In [196]:
np.asarray(train_X).shape

(12198, 30, 39)

In [197]:
np.asarray(train_Y).shape

(12198, 2)

#### build model in keras ( a stacked LSTM model with many-to-one arch ) here 30 sequence and 2 output each for one category(m/f)

In [212]:
#build the model: 2 stacked LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(512, return_sequences=True, data_set_shape=(maxlen,len_vocab)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

Build model...


In [206]:
test_X = []
test_Y = []
trunc_test_name = [str(i)[0:maxlen] for i in test.name]
for i in trunc_test_name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    test_X.append(tmp)
for i in test.m_or_f:
    if i == 'm':
        test_Y.append([1,0])
    else:
        test_Y.append([0,1])
    

In [207]:
print(np.asarray(test_X).shape)
print(np.asarray(test_Y).shape)

(3028, 30, 39)
(3028, 2)


In [215]:
batch_size=1000
model.fit(train_X, train_Y,batch_size=batch_size,nb_epoch=10,validation_data=(test_X, test_Y))

Train on 12198 samples, validate on 3028 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5ff409ba10>

In [216]:
score, acc = model.evaluate(test_X, test_Y)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.453434576998
Test accuracy: 0.789299867978


In [288]:
name=["sandhya","jaspreet","rajesh"]
X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))

In [289]:
pred

array([[ 0.62356585,  0.37643418],
       [ 0.72094178,  0.27905828],
       [ 0.90337974,  0.09662029]], dtype=float32)

#### Lets train more, clearly some very simple female names it doesnt get right like mentioned above (inspite it exists in training data)

In [290]:
batch_size=1000
model.fit(train_X, train_Y,batch_size=batch_size,nb_epoch=50,validation_data=(test_X, test_Y))

Train on 12198 samples, validate on 3028 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5fe98ba8d0>

In [460]:
score, acc = model.evaluate(test_X, test_Y)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.448404541104
Test accuracy: 0.864266842879


<h3 align="center"> lets look at the loss and accuracy chart as a function of epochs </h3><img src="loss_charts.bmp" alt="loss charts" width="500" height="350"/><img src="acc_charts.bmp" alt="loss charts"  width="500" height="350"/>

In [342]:
name=["sandhya","jaspreet","rajesh","kaveri","aditi deepak","arihant","sasikala","aditi","ragini rajaram"]
X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))
pred

array([[ 0.0859881 ,  0.91401184],
       [ 0.96310365,  0.03689628],
       [ 0.7148453 ,  0.28515476],
       [ 0.02246205,  0.97753793],
       [ 0.13607673,  0.86392319],
       [ 0.99559009,  0.00440993],
       [ 0.05380283,  0.94619709],
       [ 0.55060732,  0.44939268],
       [ 0.10676169,  0.89323831]], dtype=float32)

In [345]:
name=["abhi","abhi deepak","mr. abhi"]
X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))
pred

array([[ 0.15557961,  0.84442037],
       [ 0.25342518,  0.74657482],
       [ 0.8618474 ,  0.13815261]], dtype=float32)

In [502]:
name=["rajini","rajinikanth","mr. rajini"]
X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))
pred

array([[ 0.33718896,  0.66281104],
       [ 0.99896383,  0.00103616],
       [ 0.99664474,  0.00335527]], dtype=float32)

In [450]:
#save our model and data
model.save_weights('gender_model',overwrite=True)
train.to_csv("train_split.csv")
test.to_csv("test_split.csv")

In [464]:
evals = model.predict(test_X)
prob_m = [i[0] for i in evals]

In [479]:
out = pd.DataFrame(prob_m)
out['name'] = test.name.reset_index()['name']
out['m_or_f']=test.m_or_f.reset_index()['m_or_f']

In [483]:
out.head(10)
out.columns = ['prob_m','name','actual']
out.head(10)
out.to_csv("gender_pred_out.csv")