### LSTM MODEL 

Used character sequences which make up the name as our feature variable, with gender as labels. Used a stack LSTM model and a final dense layer with softmax activation (many-to-one setup). categorical cross-entropy loss is used with adam optimizer. A 20% dropout layer is added for regularization to avoid over-fitting.

This Model is trained on *GPU instance* took 5 mins to run 10 epochs.

This model gave the **test accuracy 100%**. The model might be overfitting. Try hyperparam tuning to reduce it. 

In [3]:
import pandas as pd
import re
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers.core import Dense, Activation, Dropout

Using TensorFlow backend.


## Load Data

In [4]:
df = pd.read_csv('data/allnames.tsv', sep='\t')
df = df.drop(columns='Person ID')
df = df.drop_duplicates(subset="Person Name")
print(df.shape)
print(df.head())

(99093, 3)
    Person Name Gender Train/Test
0         -minu   Male       Test
1  (.)p(...)nin   Male      Train
2   12th Planet   Male      Train
3      2 Chainz   Male      Train
4       50 Cent   Male      Train


# Preprocessing

In [5]:
# remove non latin chars

def clean_col(df, col_list, non_latin):
    df = df.copy().dropna()
    for col in col_list:
        # df[col] = df[col].str.strip()
        # df[col] = df[col].str.replace(r'([^\s\w]|_)+', '')
        # df[col] = df[col].apply(lambda x: x.encode("ascii", errors="ignore").decode())

        contains_non_latin = df[col].str.contains(non_latin)
        series = df[col].apply(
            lambda x: ''.join([c for c in
                               re.sub(r'\s+', ' ', x).strip()]).strip())
        df[col] = series

        # Get the mask of overly long utterances
        #keep = series.str.encode(encoding='utf-8').apply(len) < max_len
        df = df[(series != '') &
                (series != 'None') &
                (~contains_non_latin)]
        
    return df
    

In [6]:
col_list = ['Person Name']
non_latin = r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]'
clean_col(df, col_list, non_latin)

Unnamed: 0,Person Name,Gender,Train/Test
0,-minu,Male,Test
1,(.)p(...)nin,Male,Train
2,12th Planet,Male,Train
3,2 Chainz,Male,Train
4,50 Cent,Male,Train
5,A-Lin,Female,Train
6,A-Mei,Female,Test
7,A. C. Crispin,Female,Train
8,A. C. Newman,Male,Train
9,A. D. Walsh,Male,Train


In [7]:
# remove non ascii chars
def remove_non_ascii(text):
    return ''.join(i for i in text if ord(i)<128)

df['Person Name'] = df['Person Name'].apply(remove_non_ascii)

In [8]:
# find the number of characters
# a-z, 0–9, space, dot and a special END token.
vocab = set(' '.join([str(i) for i in df['Person Name']]))
vocab.add('END')
len_vocab = len(vocab)

In [31]:
print("vocab", vocab)
print("len_vocab", len_vocab)

vocab {'o', '2', 'i', '-', 'h', 'u', 'J', 'k', 'd', '?', 'U', 'n', 'f', 'P', ')', '8', '+', '!', 'l', 'w', 'c', 'q', 'I', 'K', 'N', '"', 'B', 'M', '&', 'e', '1', '0', '_', "'", 'F', '$', 'L', 't', 'E', 'O', 'V', '5', 'Y', 'T', 'G', 'C', 'g', 'S', '9', ' ', 'r', '4', 'A', 'W', '7', 'D', 'X', 'R', 'H', 's', 'b', ':', '/', '.', 'a', '(', 'END', '3', 'z', 'x', 'p', 'y', 'Z', '6', 'm', ',', 'v', 'j', 'Q'}
len_vocab 79


In [10]:
char_idx = dict((c, i) for i, c in enumerate(vocab))
print(char_idx)

{'o': 0, '2': 1, 'i': 2, '-': 3, 'h': 4, 'u': 5, 'J': 6, 'k': 7, 'd': 8, '?': 9, 'U': 10, 'n': 11, 'f': 12, 'P': 13, ')': 14, '8': 15, '+': 16, '!': 17, 'l': 18, 'w': 19, 'c': 20, 'q': 21, 'I': 22, 'K': 23, 'N': 24, '"': 25, 'B': 26, 'M': 27, '&': 28, 'e': 29, '1': 30, '0': 31, '_': 32, "'": 33, 'F': 34, '$': 35, 'L': 36, 't': 37, 'E': 38, 'O': 39, 'V': 40, '5': 41, 'Y': 42, 'T': 43, 'G': 44, 'C': 45, 'g': 46, 'S': 47, '9': 48, ' ': 49, 'r': 50, '4': 51, 'A': 52, 'W': 53, '7': 54, 'D': 55, 'X': 56, 'R': 57, 'H': 58, 's': 59, 'b': 60, ':': 61, '/': 62, '.': 63, 'a': 64, '(': 65, 'END': 66, '3': 67, 'z': 68, 'x': 69, 'p': 70, 'y': 71, 'Z': 72, '6': 73, 'm': 74, ',': 75, 'v': 76, 'j': 77, 'Q': 78}


In [11]:
train = df[df['Train/Test'].str.contains('Train')]
train.count()

Person Name    79263
Gender         79263
Train/Test     79263
dtype: int64

In [12]:
test = df[df['Train/Test'].str.contains('Test')]
test.count()

Person Name    19830
Gender         19830
Train/Test     19830
dtype: int64

In [13]:
feature_train = train['Person Name']
print(feature_train.shape)
labels_train = train['Gender']
print(labels_train.shape)

(79263,)
(79263,)


In [14]:
feature_test = test['Person Name']
print(feature_test.shape)
labels_test = test['Gender']
print(labels_test.shape)

(19830,)
(19830,)


In [15]:
#take input upto max length and truncate rest
#encode to vector space(one hot encoding)
#pad 'END' to shorter sequences
maxlen = 30
train_X = []
trunc_train_name = [str(i)[0:30] for i in feature_train]
for i in trunc_train_name:
    tmp = [char_idx[j] for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(char_idx["END"])
    train_X.append(tmp)

In [16]:
np.asarray(train_X).shape

(79263, 30)

In [17]:
def set_flag(i):
    tmp = np.zeros(79)
    tmp[i] = 1
    return(tmp)

In [18]:
set_flag(3)

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
train_X = []
train_Y = []
trunc_train_name = [str(i)[0:maxlen] for i in feature_train]
for i in trunc_train_name:
    tmp = [set_flag(char_idx[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_idx["END"]))
    train_X.append(tmp)
for i in labels_train:
    if i == 'm':
        train_Y.append([1,0])
    else:
        train_Y.append([0,1])

In [32]:
#[ex, max len, vocab_len]
print(np.asarray(train_X).shape)

(79263, 30, 79)


## Build Model

In [21]:
print("Building model....")
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen,len_vocab)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

Building model....


In [22]:
test_X = []
test_Y = []
trunc_test_name = [str(i)[0:maxlen] for i in feature_test]
for i in trunc_test_name:
    tmp = [set_flag(char_idx[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_idx["END"]))
    test_X.append(tmp)
for i in labels_test:
    if i == 'm':
        test_Y.append([1,0])
    else:
        test_Y.append([0,1])

In [23]:
print(np.asarray(test_X).shape)
print(np.asarray(test_Y).shape)

(19830, 30, 79)
(19830, 2)


In [24]:
# have to do this somehow values doesn't convert into numpy array
test_X = np.array(test_X)
test_Y = np.array(test_Y)
train_X = np.array(train_X)
train_Y = np.array(train_Y)

## Train Model

In [25]:
batch_size=1000
model.fit(train_X, train_Y,batch_size=batch_size,nb_epoch=10,validation_data=(test_X, test_Y))


  


Train on 79263 samples, validate on 19830 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe97e3d2eb8>

## Evaluate

In [26]:
score, acc = model.evaluate(test_X, test_Y)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 1.1920930376163597e-07
Test accuracy: 1.0


## Prediction

In [29]:
name=["Mila Kunis","Jennifer Lawrence","Brad Pitt"]
X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [set_flag(char_idx[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_idx["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))

In [30]:
pred

array([[4.1511804e-16, 1.0000000e+00],
       [4.1513389e-16, 1.0000000e+00],
       [4.1512439e-16, 1.0000000e+00]], dtype=float32)