In [1]:
from __future__ import print_function
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model, load_model, model_from_json
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, Input, TimeDistributed
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import optimizers
import pandas as pd
from sklearn.metrics import r2_score
import numpy as np
import sys
import h5py
import pickle
import os

  from ._conv import register_converters as _register_converters


In [2]:
#parameters
maxlen = 30
labels = 2

In [3]:
czech = pd.read_excel('czech.xlsx', encoding='latin',header = None)
czech.columns = ['Name', 'male_or_female']
czech['namelen'] = [len(str(i)) for i in czech['Name']]


In [4]:
cname = czech['Name']
collect = []


for i in range(len(cname)):
    collect.extend(list(str(cname[i]).lower()))
# collect.extend(['END'])
collect = set(collect)

czech.head()

Unnamed: 0,Name,male_or_female,namelen
0,Abigail,f,7
1,Ada,f,3
2,Adalberta,f,9
3,Adéla,f,5
4,Adelaida,f,8


In [5]:
data_set = pd.read_csv("gender_data.csv",header=None)
data_set.columns = ['name','male_or_female']
data_set['namelen']= [len(str(i)) for i in data_set['name']]
data_set1 = data_set[(data_set['namelen'] >= 2) ]

In [6]:
data_set1.groupby('male_or_female')['name'].count()

male_or_female
f    6705
m    8475
Name: name, dtype: int64

In [7]:
names = data_set['name']
gender = data_set['male_or_female']
vocab = set(' '.join([str(i) for i in names]))
vocab.add('END')
vocab = vocab.union(collect)
len_vocab = len(vocab)

In [8]:
print(vocab)
print("vocab length is ",len_vocab)
print ("length of data_set is ",len(data_set1))

{'ů', 'p', 'a', 'ú', 'g', 'e', 'ň', 'k', 'š', '6', 'y', '8', 'l', 'ď', 'q', '7', 'ť', 'r', 'm', '4', 'í', 'END', 's', 'č', '1', 'ó', 'n', 'ž', 'x', '0', 'b', 'ý', '.', ' ', 'ř', 'é', 'v', '3', 'o', '9', 'd', 'ě', 'j', 'c', '5', 'á', 'w', 'h', 'z', 'u', 'i', 'f', '2', 't'}
vocab length is  54
length of data_set is  15226


In [9]:
char_index = dict((c, i) for i, c in enumerate(vocab))

In [10]:
# with open('char_index.txt', 'wb') as handle:
#     pickle.dump(char_index, handle)

with open('char_index.txt', 'rb') as handle:
    char_index = pickle.loads(handle.read())


In [11]:
print(char_index)

{'ů': 23, 'p': 5, 'a': 45, 'ú': 26, 'g': 9, 'e': 1, 'ň': 47, 'š': 36, '6': 12, 'y': 50, '8': 53, 'l': 0, 'ď': 13, 'q': 10, '7': 2, 'k': 35, 'ť': 46, 'á': 17, 'r': 6, '5': 40, 'm': 28, '4': 29, 'í': 4, 'END': 43, 's': 8, 'č': 38, '1': 41, 'ó': 52, 'n': 51, 'ž': 39, 'x': 15, 'ý': 11, 'b': 25, 'ř': 31, '.': 34, ' ': 22, 'v': 19, '3': 49, 'o': 37, '9': 7, 'd': 16, 'ě': 24, 'j': 21, 'c': 42, 'é': 20, 'w': 33, 'h': 14, 'z': 3, 'u': 30, '0': 44, 'i': 32, 'f': 27, '2': 18, 't': 48}


In [12]:
#train test split
msk = np.random.rand(len(data_set1)) < 0.9
train = data_set1[msk]
test = data_set1[~msk]     


In [13]:
def set_flag(i):
    tmp = np.zeros(len_vocab);
    tmp[i] = 1
    return(tmp)

In [14]:
set_flag(3)

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

#### modify the code above to also convert each index to one-hot encoded representation

In [15]:
#take data_set upto max and truncate rest
#encode to vector space(one hot encoding)
#padd 'END' to shorter sequences
#also convert each index to one-hot encoding
train_X = []
train_Y = []
trunc_train_name = [str(i)[0:maxlen] for i in train.name]
for i in trunc_train_name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    train_X.append(tmp)
for i in train.male_or_female:
    if i == 'm':
        train_Y.append([1,0])
    else:
        train_Y.append([0,1])
    
train_X=np.asarray(train_X)
train_Y=np.asarray(train_Y)

In [16]:
test_X = []
test_Y = []
trunc_test_name = [str(i)[0:maxlen] for i in test.name]
for i in trunc_test_name:
    tmp = [set_flag(char_index[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    test_X.append(tmp)
for i in test.male_or_female:
    if i == 'm':
        test_Y.append([1,0])
    else:
        test_Y.append([0,1])
    
test_X = np.asarray(test_X)
test_Y = np.asarray(test_Y)

In [17]:
print(np.asarray(test_X).shape)
print(np.asarray(test_Y).shape)

(1543, 30, 54)
(1543, 2)


In [18]:
vtrain_x = []
vtrain_y = []

train_name = [str(i) for i in czech.Name]
for i in train_name:
    tmp = [set_flag(char_index[j]) for j in str(i.lower())]
    for k in range(0, maxlen - len(str(i))):
        tmp.append(set_flag(char_index['END']))
    vtrain_x.append(tmp)
for i in czech.male_or_female:
    if i == 'm':
        vtrain_y.append([1,0])
    else:
        vtrain_y.append([0,1])
vtrain_x = np.asarray(vtrain_x)
vtrain_y = np.asarray(vtrain_y)

#### build model in keras ( a stacked LSTM model with many-to-one arch ) here 30 sequence and 2 output each for one category(m/f)

In [25]:
#build the model: 2 stacked LSTM
print('Build model...')
input_bilstm=Input(shape = (maxlen,len_vocab))
bi_one = Bidirectional(LSTM(512, return_sequences=True))(input_bilstm)
drop1 = Dropout(0.2)(bi_one)
bi_two = Bidirectional(LSTM(512, return_sequences=False))(drop1)
drop2 = Dropout(0.2)(bi_two)
output = Dense(2, activation='softmax')(drop2)
model = Model(input_bilstm, output)


optimizer = optimizers.Adam(lr = 0.01)
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])


checkpoint_path = 'tmp/model.ckpt'
early_stopping = EarlyStopping(monitor='val_acc',patience=10, verbose=1)
model_checkpoint = ModelCheckpoint(checkpoint_path,monitor='val_acc',save_best_only=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_acc',factor=0.5, patience=5, min_lr=0.0001, verbose=1)
print('Model Built')


Build model...
Model Built


In [26]:
batch_size=500
model.fit(train_X, train_Y,
          batch_size=batch_size,
          epochs=20,
          callbacks=[model_checkpoint,reduce_lr,early_stopping],
          validation_data=(vtrain_x, vtrain_y),
          verbose = 2
         )
# model.save('model.h5')


Train on 13725 samples, validate on 1250 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.63280, saving model to tmp/model.ckpt
 - 48s - loss: 0.6221 - acc: 0.6393 - val_loss: 0.6382 - val_acc: 0.6328
Epoch 2/20

Epoch 00002: val_acc improved from 0.63280 to 0.73520, saving model to tmp/model.ckpt
 - 17s - loss: 0.5186 - acc: 0.7493 - val_loss: 0.5307 - val_acc: 0.7352
Epoch 3/20

Epoch 00003: val_acc improved from 0.73520 to 0.77200, saving model to tmp/model.ckpt
 - 17s - loss: 0.4790 - acc: 0.7720 - val_loss: 0.4940 - val_acc: 0.7720
Epoch 4/20

Epoch 00004: val_acc did not improve from 0.77200
 - 17s - loss: 0.4557 - acc: 0.7892 - val_loss: 0.4912 - val_acc: 0.7720
Epoch 5/20

Epoch 00005: val_acc improved from 0.77200 to 0.83200, saving model to tmp/model.ckpt
 - 17s - loss: 0.4348 - acc: 0.8014 - val_loss: 0.4118 - val_acc: 0.8320
Epoch 6/20

Epoch 00006: val_acc improved from 0.83200 to 0.84080, saving model to tmp/model.ckpt
 - 17s - loss: 0.4060 - acc: 0.8224 -

<tensorflow.python.keras.callbacks.History at 0x7fe57dc99198>

In [19]:
new_model = load_model('tmp/model.ckpt')

In [20]:
score, acc = new_model.evaluate(vtrain_x, vtrain_y)
print('Test score:', score)
print('Test accuracy:', acc)

# pred = new_model.predict(vtrain_x)
# r2_score(np.around(pred), vtrain_y)

Test score: 0.30056820499897
Test accuracy: 0.9144


In [26]:
# name=[sys.argv[1]]
name = ["radha"]
# name=["sandhya","jaspreet","rajesh","kaveri","aditi deepak","arihant","sasikala","aditi","ragini rajaram"]

X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [set_flag(char_index[j]) for j in str(i.lower())]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_index["END"]))
    X.append(tmp)
pred=new_model.predict(np.asarray(X))
pred = pred.round()[0]
if pred[0] == 1.0:
    print( name[0], ".... is name of a male")
else:
    print(name[0], ".... is name of a female")

radha .... is name of a female


In [47]:
pred

array([0., 1.], dtype=float32)