In [2]:
import numpy as np

In [16]:
def read_data(filename):
    return np.array(list(set(list(open(filename)))))

In [17]:
male = read_data("male.txt")
female = read_data("female.txt")

In [18]:
male.shape, female.shape

((2943,), (5001,))

In [19]:
male[:5]

array(['Pincas\n', 'Bard\n', 'Sterne\n', 'Dominique\n', 'Tammy\n'],
      dtype='<U16')

In [34]:
female[:5]

array(['Merl\n', 'Clari\n', 'Emmie\n', 'Lucia\n', 'Celestina\n'],
      dtype='<U16')

In [24]:
import pandas as pd

In [104]:
def create_pd_from_data(data, gender):
    df = pd.DataFrame(data={"name":data})
    df["name"] = df["name"].str.lower()
    df["name"] = df["name"].str.replace("\n", "")
    df["gender"] = gender
    return df

In [105]:
df_male = create_pd_from_data(male, 0)
df_male.head()

Unnamed: 0,name,gender
0,pincas,0
1,bard,0
2,sterne,0
3,dominique,0
4,tammy,0


In [106]:
df_female = create_pd_from_data(female, 1)
df_female.head()

Unnamed: 0,name,gender
0,merl,1
1,clari,1
2,emmie,1
3,lucia,1
4,celestina,1


In [107]:
corpus = df_female.append(df_male)
corpus.shape

(7944, 2)

In [119]:
corpus["name"].describe()

count       7944
unique      7576
top       mickie
freq           2
Name: name, dtype: object

In [127]:
# wow what a long name
max_len = max([len(x) for x in corpus["name"]])
max_len

15

In [126]:
{len(x):x for x in corpus["name"]}

{2: 'ez',
 3: 'art',
 4: 'winn',
 5: 'bernd',
 6: 'kermie',
 7: 'abelard',
 8: 'montague',
 9: 'standford',
 10: 'heathcliff',
 11: 'bartholomew',
 12: 'christorpher',
 13: 'jean-francois',
 14: 'sheila-kathryn',
 15: 'jean-christophe'}

In [108]:
from sklearn.model_selection import train_test_split

In [109]:
train, dev_test = train_test_split(corpus, stratify=corpus["gender"],test_size=0.3, random_state=26)
train.shape, dev_test.shape

((5560, 2), (2384, 2))

In [72]:
dev, test = train_test_split(dev_test, stratify=dev_test["gender"], test_size = 0.3, random_state=26  )

In [73]:
dev.shape, test.shape

((1668, 2), (716, 2))

In [111]:
train["gender"].value_counts()

1    3500
0    2060
Name: gender, dtype: int64

In [110]:
dev["gender"].value_counts()

1    1050
0     618
Name: gender, dtype: int64

In [112]:
test["gender"].value_counts()

1    451
0    265
Name: gender, dtype: int64

In [128]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [23]:
tokenizer = Tokenizer(char_level=True)

In [113]:
tokenizer.fit_on_texts(train["name"].append(dev["name"])) # i wont fit on test

In [157]:
def create_features(data):
    seq = tokenizer.texts_to_sequences(data)
    seq = pad_sequences(seq, maxlen=max_len)
    return seq

In [160]:
train_seq = create_features(train["name"])
train_seq.shape

(5560, 15)

In [230]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional, MaxPooling1D, Convolution1D, GlobalMaxPooling1D

In [251]:
model = Sequential([
    Embedding(input_length=max_len, input_dim=len(tokenizer.word_counts) + 1, output_dim=26),
    Bidirectional(LSTM(100)),
    Dropout(0.4),
    Dense(8, activation="relu"),
    Dropout(0.3),
    Dense(4, activation="relu"),
    Dropout(0.1),
    Dense(1, activation="sigmoid")
])

In [252]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 15, 26)            1456      
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 200)               101600    
_________________________________________________________________
dropout_40 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_54 (Dense)             (None, 8)                 1608      
_________________________________________________________________
dropout_41 (Dropout)         (None, 8)                 0         
_________________________________________________________________
dense_55 (Dense)             (None, 4)                 36        
_________________________________________________________________
dropout_42 (Dropout)         (None, 4)                 0         
__________

In [142]:
from keras.callbacks import EarlyStopping

In [143]:
stopping = EarlyStopping(patience=2) # should also restart training sometimes

In [149]:
def per_sample(data):
    return data["gender"].values.reshape(-1,1)

In [254]:
model.compile(optimizer="adam", loss="binary_crossentropy")
model.fit(create_features(train["name"]), per_sample(train),validation_data=(create_features(dev["name"]), per_sample(dev)), epochs=30, batch_size=32, callbacks=[stopping])

Train on 5560 samples, validate on 1668 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


<keras.callbacks.History at 0x7ff2f57f2be0>

In [256]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [257]:
def score_on(dataset, model):
    prediction = np.round(model.predict(create_features(dataset["name"])))
    print(confusion_matrix(y_true=dataset["gender"], y_pred=prediction, labels=[0,1]))
    print(classification_report(y_true=dataset["gender"], y_pred=prediction, labels=[0,1], target_names=["Male", "Female"]))
    print("acc", accuracy_score(y_true=dataset["gender"], y_pred=prediction))
    

In [258]:
score_on(dev, model)

[[486 132]
 [189 861]]
             precision    recall  f1-score   support

       Male       0.72      0.79      0.75       618
     Female       0.87      0.82      0.84      1050

avg / total       0.81      0.81      0.81      1668

acc 0.807553956835


In [260]:
score_on(test, model)

[[190  75]
 [ 83 368]]
             precision    recall  f1-score   support

       Male       0.70      0.72      0.71       265
     Female       0.83      0.82      0.82       451

avg / total       0.78      0.78      0.78       716

acc 0.779329608939
