In [1]:
import numpy as np
import pandas as pd
from numpy import genfromtxt
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.models import load_model
from sklearn.utils import shuffle

Using TensorFlow backend.


In [2]:
# Remote comma-separated dump of names; column labels are supplied explicitly.
filename = "https://s3.amazonaws.com/name-gender/allnames.txt"
df = pd.read_csv(filename, names=["Name", "Gender", "Count"])
df.shape

(1891894, 3)

In [3]:
# Peek at the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,Name,Gender,Count
0,Mary,F,7065
1,Anna,F,2604
2,Emma,F,2003
3,Elizabeth,F,1939
4,Minnie,F,1746


In [4]:
# The per-row Count is not used as a feature; keep only Name and Gender.
df = df.drop('Count', axis='columns')
df.head()

Unnamed: 0,Name,Gender
0,Mary,F
1,Anna,F
2,Emma,F
3,Elizabeth,F
4,Minnie,F


In [5]:
# With Count removed, identical (Name, Gender) rows are redundant; keep one of each.
# Note: a name recorded under both genders still appears twice, once per gender.
df = df.drop_duplicates()
# Fixed seed so the row order, and everything downstream that depends on it,
# is reproducible across kernel restarts.
df = shuffle(df, random_state=42)
# A bare `df.shape` in the middle of a cell is never displayed; print it instead.
print(df.shape)
df.head()

Unnamed: 0,Name,Gender
891516,Cherline,F
119622,Rodrick,M
1468822,Aribella,F
1853493,Mucad,M
993054,Jeseca,F


In [6]:
# Row count of the frame, i.e. how many names will be encoded.
num_names = len(df.index)
print(num_names)

# Character length of the longest name; used later as the sequence length.
max_name_length = df['Name'].map(len).max()
print(max_name_length)

# Character length of the shortest name (informational only).
min_name_length = df['Name'].map(len).min()
print(min_name_length)

106695
15
2


In [7]:
# Plain-text preview of the Name column after shuffling.
print(df.Name.head())

891516     Cherline
119622      Rodrick
1468822    Aribella
1853493       Mucad
993054       Jeseca
Name: Name, dtype: object


In [8]:
# Labels stay a pandas Series (index preserved); names become a plain NumPy array.
genders = df['Gender']
names = df.Name.values
print(names)
genders.head()

['Cherline' 'Rodrick' 'Aribella' ..., 'Lacresha' 'Laurenne' 'Mayowa']


891516     F
119622     M
1468822    F
1853493    M
993054     F
Name: Gender, dtype: object

In [None]:
# Concatenate every lower-cased name to discover which characters actually occur.
# str.join builds the text in one pass instead of repeated `+=` re-allocation.
txt = "".join(n.lower() for n in names)
print(len(txt))
# Sorted unique characters define the candidate alphabet.
chars = sorted(set(txt))
alphabet_size = len(chars)
print('total chars:', len(chars))
print(chars)

In [None]:
# Map each observed character to its position in the sorted alphabet.
char_indices = {symbol: position for position, symbol in enumerate(chars)}
print(char_indices)

In [10]:
# Fixed a-z vocabulary: 'a' -> 0 ... 'z' -> 25.
# chr() works on both Python 2 and 3 for these ASCII code points, whereas
# unichr() exists only on Python 2 and raises NameError on Python 3.
char_indices = dict((chr(code), code - ord('a')) for code in range(ord('a'), ord('z') + 1))
print(char_indices)
# Derive the size from the table itself instead of hard-coded code-point arithmetic.
alphabet_size = len(char_indices)

{'a': 0, 'c': 2, 'b': 1, 'e': 4, 'd': 3, 'g': 6, 'f': 5, 'i': 8, 'h': 7, 'k': 10, 'j': 9, 'm': 12, 'l': 11, 'o': 14, 'n': 13, 'q': 16, 'p': 15, 's': 18, 'r': 17, 'u': 20, 't': 19, 'w': 22, 'v': 21, 'y': 24, 'x': 23, 'z': 25}


In [22]:
# Preview the vocabulary together with the sequence-length metadata that is
# stored alongside it at save time. A merged copy is printed so the
# char -> index lookup table itself is not polluted with a non-character key.
print(dict(char_indices, max_name_length=max_name_length))


{'max_name_length': 15, 'a': 0, 'c': 2, 'b': 1, 'e': 4, 'd': 3, 'g': 6, 'f': 5, 'i': 8, 'h': 7, 'k': 10, 'j': 9, 'm': 12, 'l': 11, 'o': 14, 'n': 13, 'q': 16, 'p': 15, 's': 18, 'r': 17, 'u': 20, 't': 19, 'w': 22, 'v': 21, 'y': 24, 'x': 23, 'z': 25}


In [11]:
# One-hot input tensor: (name, character position, letter index).
# float32 halves the memory of the default float64 for a tensor holding only 0/1.
X = np.zeros((num_names, max_name_length, alphabet_size), dtype=np.float32)
print(X.shape)

(106695, 15, 26)


In [12]:
# Set a single 1 per (name, position) at the index of that position's letter;
# positions past the end of a name stay all-zero (implicit padding).
for row, raw_name in enumerate(names):
    for pos, letter in enumerate(raw_name.lower()):
        X[row, pos, char_indices[letter]] = 1
X[0, :, :]

array([[ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [13]:
# Class balance of the label column.
df.Gender.value_counts()

F    66358
M    40337
Name: Gender, dtype: int64

In [14]:
# Two-column one-hot target: column 0 is "M", column 1 is "F".
# Start from all ones and zero out the column that does not apply to each row.
Y = np.ones((num_names, 2))
Y[(df['Gender'] == 'F').values, 0] = 0
Y[(df['Gender'] == 'M').values, 1] = 0
Y

array([[ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.]])

In [15]:
# Sanity check: inputs and targets must agree on the first dimension.
print(X.shape)
print(Y.shape)

(106695, 15, 26)
(106695, 2)


In [16]:
# Network dimensions: one timestep per character slot, one feature per letter,
# and one output unit per gender class.
num_classes = 2
timesteps, data_dim = max_name_length, alphabet_size

In [17]:
# Two stacked 512-unit LSTMs with dropout after each, followed by a softmax
# over the gender classes. The first LSTM returns the full sequence so the
# second one can consume it; the second returns only its final state.
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(timesteps, data_dim)),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(num_classes),
    Activation('softmax'),
])

In [18]:
# Categorical cross-entropy matches the two-column one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Train for 10 epochs in mini-batches of 64, holding out 20% of the rows
# for validation.
model.fit(x=X, y=Y, batch_size=64, epochs=10, validation_split=0.20)

Train on 85356 samples, validate on 21339 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f76b649c1d0>

In [20]:
# Spot-check the trained model on a handful of hand-picked names.
names_test = ['shahrukh', "rob", "victor","amaya","vetri", "swetha","binoy","moni","deep","deepa","rupu","rupa","gurpreet"]
num_test = len(names_test)

# Same one-hot layout as the training tensor: (name, position, letter index).
X_test = np.zeros((num_test, max_name_length, alphabet_size))

for i, name in enumerate(names_test):
    # Truncate to the tensor width: a name longer than the longest training
    # name would otherwise raise IndexError.
    for t, char in enumerate(name.lower()[:max_name_length]):
        idx = char_indices.get(char)
        # Characters outside the vocabulary (spaces, hyphens, accents, digits)
        # are left as all-zero steps instead of raising KeyError.
        if idx is not None:
            X_test[i, t, idx] = 1

predictions = model.predict(X_test)

# Column 0 of the target encoding is "M", column 1 is "F".
for name, (p_male, p_female) in zip(names_test, predictions):
    print("{} ({})".format(name, "M" if p_male > p_female else "F"))

shahrukh (M)
rob (M)
victor (M)
amaya (F)
vetri (F)
swetha (F)
binoy (F)
moni (F)
deep (M)
deepa (F)
rupu (M)
rupa (F)
gurpreet (F)


In [21]:
# Persist the trained network and, next to it, the vocabulary needed to
# encode names at inference time.
# NOTE(review): the tag says "epoch-1" while the fit call in this notebook uses
# epochs=10 -- confirm the intended label. It is left unchanged here because the
# loading cell reads the same file name.
model_type = '_layer-2_dropout-0.2_seq-512_act-softmax_opt-adam_epoch-1'
model.save('GenderLSTM{}.h5'.format(model_type))
# Store max_name_length alongside the char -> index table by saving a merged
# copy, so the in-memory lookup table is not mutated with a non-character key.
np.save('GenderLSTM{}.npy'.format(model_type),
        dict(char_indices, max_name_length=max_name_length))

In [24]:
# Restore the network and its vocabulary from the files written by the save cell.
model_type = '_layer-2_dropout-0.2_seq-512_act-softmax_opt-adam_epoch-1'
loaded_model = load_model('GenderLSTM{}.h5'.format(model_type))
# The .npy file holds a pickled dict wrapped in a 0-d object array. NumPy >= 1.16.3
# refuses to unpickle unless allow_pickle=True is passed explicitly. Only load
# files produced by this notebook: unpickling untrusted data can execute code.
loaded_char_indices = np.load('GenderLSTM{}.npy'.format(model_type), allow_pickle=True).item()
# Split the sequence-length metadata back out so only characters remain as keys.
max_name_length = loaded_char_indices.pop('max_name_length')
alphabet_size = len(loaded_char_indices)
print(loaded_char_indices)

{'a': 0, 'c': 2, 'b': 1, 'e': 4, 'd': 3, 'g': 6, 'f': 5, 'i': 8, 'h': 7, 'k': 10, 'j': 9, 'm': 12, 'l': 11, 'o': 14, 'n': 13, 'q': 16, 'p': 15, 's': 18, 'r': 17, 'u': 20, 't': 19, 'w': 22, 'v': 21, 'y': 24, 'x': 23, 'z': 25}


In [None]:
# Confirm the dimensions recovered from the saved vocabulary file.
for restored_value in (max_name_length, alphabet_size):
    print(restored_value)

In [None]:
# Run the reloaded model on a few names, using only the reloaded vocabulary.
names_test = ['kanadpriya',"kanad","treena","dean","osei","rui",]
num_test = len(names_test)

# Same one-hot layout the model was trained on: (name, position, letter index).
X_test = np.zeros((num_test, max_name_length, alphabet_size))

for i, name in enumerate(names_test):
    # Truncate to the tensor width: a name longer than max_name_length would
    # otherwise raise IndexError.
    for t, char in enumerate(name.lower()[:max_name_length]):
        idx = loaded_char_indices.get(char)
        # Characters missing from the saved vocabulary are left as all-zero
        # steps instead of raising KeyError.
        if idx is not None:
            X_test[i, t, idx] = 1

predictions = loaded_model.predict(X_test)

# Column 0 of the target encoding is "M", column 1 is "F".
for name, (p_male, p_female) in zip(names_test, predictions):
    print("{} ({})".format(name, "M" if p_male > p_female else "F"))

In [None]:
# Install Docker on the notebook host with yum (non-interactive, needs sudo).
!sudo yum install -y docker

In [None]:
# Start the Docker service on the notebook host.
!sudo service docker start

In [None]:
# Print Docker's system information to confirm the daemon is reachable.
!sudo docker info

In [None]:
# Clone the Sagemaker_BYOA-LSTM_Keras repository into the current working directory.
!git clone https://github.com/dbinoy/Sagemaker_BYOA-LSTM_Keras.git

In [None]:
# Recursively delete the local Sagemaker_BYOA-LSTM_Keras directory (the name the
# git clone in this notebook creates). Destructive: no confirmation is asked.
!sudo rm -rf Sagemaker_BYOA-LSTM_Keras

In [60]:
%%writefile testfile

a=1
b=2
c=3

Overwriting testfile


In [61]:
# Show the contents of testfile as written by the %%writefile cell.
%cat testfile


a=1
b=2
c=3

In [54]:
# Delete testfile from the working directory.
# NOTE(review): this cell's execution count (In [54]) is lower than the cells that
# write and display testfile (In [60], In [61]) -- it was run out of order.
%rm testfile