In [25]:
import pandas as pd
import re
import numpy as np

In [3]:
df = pd.read_csv('data/allnames.tsv', sep='\t')
df = df.drop(columns='Person ID')
df = df.drop_duplicates(subset="Person Name")
print(df.shape)
print(df.head())

(99093, 3)
    Person Name Gender Train/Test
0         -minu   Male       Test
1  (.)p(...)nin   Male      Train
2   12th Planet   Male      Train
3      2 Chainz   Male      Train
4       50 Cent   Male      Train


In [8]:
# remove non latin chars

def clean_col(df, col_list, non_latin):
    df = df.copy().dropna()
    for col in col_list:
        # df[col] = df[col].str.strip()
        # df[col] = df[col].str.replace(r'([^\s\w]|_)+', '')
        # df[col] = df[col].apply(lambda x: x.encode("ascii", errors="ignore").decode())

        contains_non_latin = df[col].str.contains(non_latin)
        series = df[col].apply(
            lambda x: ''.join([c for c in
                               re.sub(r'\s+', ' ', x).strip()]).strip())
        df[col] = series

        # Get the mask of overly long utterances
        #keep = series.str.encode(encoding='utf-8').apply(len) < max_len
        df = df[(series != '') &
                (series != 'None') &
                (~contains_non_latin)]
        
    return df
    

In [9]:
col_list = ['Person Name']
non_latin = r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]'
clean_col(df, col_list, non_latin)
    

Unnamed: 0,Person Name,Gender,Train/Test
0,-minu,Male,Test
1,(.)p(...)nin,Male,Train
2,12th Planet,Male,Train
3,2 Chainz,Male,Train
4,50 Cent,Male,Train
5,A-Lin,Female,Train
6,A-Mei,Female,Test
7,A. C. Crispin,Female,Train
8,A. C. Newman,Male,Train
9,A. D. Walsh,Male,Train


In [10]:
def remove_non_ascii(text):
    return ''.join(i for i in text if ord(i)<128)

df['Person Name'] = df['Person Name'].apply(remove_non_ascii)

In [12]:
vocab = set(' '.join([str(i) for i in df['Person Name']]))
vocab.add('END')
len_vocab = len(vocab)

In [15]:
print("vocab", vocab)
print("len_vocab", vocab)

vocab {'!', '?', 'END', 'T', 'x', 'W', 'c', 'L', 'w', '2', 'S', 'l', 'a', '3', 'k', 'g', 'E', '(', 'u', '0', 's', 'U', '.', '/', 'J', ')', 'Y', 'B', '1', 'F', '4', 'b', 'q', 'm', 'P', ' ', 'o', 't', 'y', 'd', '$', 'f', '9', 'h', 'r', 'D', 'Z', 'R', "'", 'N', 'Q', ':', 'X', 'M', 'z', 'H', 'i', '-', '5', 'O', '8', '6', 'A', '+', 'C', '_', '&', 'G', '7', '"', ',', 'K', 'e', 'j', 'I', 'v', 'V', 'p', 'n'}
len_vocab {'!', '?', 'END', 'T', 'x', 'W', 'c', 'L', 'w', '2', 'S', 'l', 'a', '3', 'k', 'g', 'E', '(', 'u', '0', 's', 'U', '.', '/', 'J', ')', 'Y', 'B', '1', 'F', '4', 'b', 'q', 'm', 'P', ' ', 'o', 't', 'y', 'd', '$', 'f', '9', 'h', 'r', 'D', 'Z', 'R', "'", 'N', 'Q', ':', 'X', 'M', 'z', 'H', 'i', '-', '5', 'O', '8', '6', 'A', '+', 'C', '_', '&', 'G', '7', '"', ',', 'K', 'e', 'j', 'I', 'v', 'V', 'p', 'n'}


In [17]:
char_idx = dict((c, i) for i, c in enumerate(vocab))
print(char_idx)

{'!': 0, '?': 1, 'END': 2, 'T': 3, 'x': 4, 'W': 5, 'c': 6, 'L': 7, 'w': 8, '2': 9, 'S': 10, 'l': 11, 'a': 12, '3': 13, 'k': 14, 'g': 15, 'E': 16, '(': 17, 'u': 18, '0': 19, 's': 20, 'U': 21, '.': 22, '/': 23, 'J': 24, ')': 25, 'Y': 26, 'B': 27, '1': 28, 'F': 29, '4': 30, 'b': 31, 'q': 32, 'm': 33, 'P': 34, ' ': 35, 'o': 36, 't': 37, 'y': 38, 'd': 39, '$': 40, 'f': 41, '9': 42, 'h': 43, 'r': 44, 'D': 45, 'Z': 46, 'R': 47, "'": 48, 'N': 49, 'Q': 50, ':': 51, 'X': 52, 'M': 53, 'z': 54, 'H': 55, 'i': 56, '-': 57, '5': 58, 'O': 59, '8': 60, '6': 61, 'A': 62, '+': 63, 'C': 64, '_': 65, '&': 66, 'G': 67, '7': 68, '"': 69, ',': 70, 'K': 71, 'e': 72, 'j': 73, 'I': 74, 'v': 75, 'V': 76, 'p': 77, 'n': 78}


In [18]:
train = df[df['Train/Test'].str.contains('Train')]
train.count()

Person Name    79263
Gender         79263
Train/Test     79263
dtype: int64

In [19]:
test = df[df['Train/Test'].str.contains('Test')]
test.count()

Person Name    19830
Gender         19830
Train/Test     19830
dtype: int64

In [20]:
feature_train = train['Person Name']
print(feature_train.shape)
labels_train = train['Gender']
print(labels_train.shape)

(79263,)
(79263,)


In [21]:
feature_test = test['Person Name']
print(feature_test.shape)
labels_test = test['Gender']
print(labels_test.shape)

(19830,)
(19830,)


In [23]:
#take input upto max and truncate rest
#encode to vector space(one hot encoding)
#padd 'END' to shorter sequences
maxlen = 30
train_X = []
trunc_train_name = [str(i)[0:30] for i in feature_train]
for i in trunc_train_name:
    tmp = [char_idx[j] for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(char_idx["END"])
    train_X.append(tmp)

In [26]:
np.asarray(train_X).shape

(79263, 30)

In [41]:
def set_flag(i):
    tmp = np.zeros(79)
    tmp[i] = 1
    return(tmp)

In [42]:
set_flag(3)

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [43]:
train_X = []
train_Y = []
trunc_train_name = [str(i)[0:maxlen] for i in feature_train]
for i in trunc_train_name:
    tmp = [set_flag(char_idx[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_idx["END"]))
    train_X.append(tmp)
for i in labels_train:
    if i == 'm':
        train_Y.append([1,0])
    else:
        train_Y.append([0,1])

In [44]:
np.asarray(train_X).shape

(79263, 30, 79)

In [45]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers.core import Dense, Activation, Dropout
print("Building model....")
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen,len_vocab)))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

Building model....


In [46]:
test_X = []
test_Y = []
trunc_test_name = [str(i)[0:maxlen] for i in feature_test]
for i in trunc_test_name:
    tmp = [set_flag(char_idx[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(set_flag(char_idx["END"]))
    test_X.append(tmp)
for i in labels_test:
    if i == 'm':
        test_Y.append([1,0])
    else:
        test_Y.append([0,1])

In [47]:
print(np.asarray(test_X).shape)
print(np.asarray(test_Y).shape)

(19830, 30, 79)
(19830, 2)


In [48]:
test_X = np.array(test_X)
test_Y = np.array(test_Y)
train_X = np.array(train_X)
train_Y = np.array(train_Y)

In [49]:
batch_size=1000
model.fit(train_X, train_Y,batch_size=batch_size,nb_epoch=10,validation_data=(test_X, test_Y))


  


Train on 79263 samples, validate on 19830 samples
Epoch 1/10

KeyboardInterrupt: 