In [1]:
import keras
import pandas as pd
import numpy as np
import datetime
import os
import time
import keras
import sklearn
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf; from keras.backend.tensorflow_backend import set_session

Using TensorFlow backend.


# Load baby name dataset

In [2]:
# Read the raw baby-name table; later cells use its `name` and `gender` columns.
db = pd.read_csv('./dataset/babynames.csv')

# There are nearly twice as many female names as male names
* 1 is male; 0 is female

In [3]:
# Show how many rows carry each gender label.
db.gender.value_counts()

0    62587
1    34723
Name: gender, dtype: int64

# Sampling test data with balanced classes
* Instead of splitting train/test with a stratified target, the test set is sampled with an equal number of names per class, so that its gender distribution better matches the real world.

In [4]:
# Balanced hold-out: draw the same number of rows from every gender class.
tt_split_ratio = 0.2
target_col = db.gender

# value_counts() is sorted in descending order, so the last entry is the
# size of the smallest class; the test quota per class is a fraction of it.
class_sizes = target_col.value_counts()
min_n = int(class_sizes.values[-1] * tt_split_ratio)

test_index = []
for gender_value in target_col.unique():
    class_rows = db[target_col == gender_value]
    picked = class_rows.sample(min_n, random_state=0)
    test_index.extend(picked.index)

# Everything that was not picked for the test set becomes training data.
in_test = db.index.isin(test_index)
train_db = db[~in_test].reset_index(drop=True)
test_db = db[in_test].reset_index(drop=True)

# Parameter setup

In [None]:
project_root = '.'
exp_name = 'gender_predict_by_name'

# Single dictionary holding every hyper-parameter and path of the experiment.
# Later cells add run metadata and metrics to it, and it is finally written
# out as one CSV row, so the key order below is kept stable.
par = {
    # text input
    'embedding_dim': 20,
    'target': 'gender',
    'text_column': 'name',
    # longest name (in characters) in the training frame; used as pad length
    'max_seq_len': max(train_db['name'].apply(len)),
    # output locations
    'model_dir': os.path.join(project_root, 'model', exp_name),
    'result_dir': os.path.join(project_root, 'result/'),
    'result_filename': exp_name,
    # first convolution stage
    'conv_kernel_size_1': 4,
    'conv_filters_1': 2048,
    'mp_filters_1': 1,
    # second convolution stage
    'conv_kernel_size_2': 2,
    'conv_filters_2': 128,
    'mp_filters_2': 1,
    # fully-connected head
    'fc_size_1': 1024,
    'fc_size_2': 512,
    # optimisation
    'lr': 1e-4,
    'decay': 0.0,
    'dropout': 0.1,
    # training loop
    'validation_ratio': 0.2,
    'epochs': 200,
    'batch_size': 32,
    'patience': 0,
}

In [None]:
# Optional clean start (disabled): uncomment to wipe earlier checkpoints and
# the earlier result file before running a new experiment.
# import shutil
# if os.path.exists(par['model_dir']):
#     print("delete %s ..."%(par['model_dir']))
#     shutil.rmtree(par['model_dir'])
# if os.path.exists(par['result_dir'] + par['result_filename']):
#     print("delete %s" % (par['result_dir'] + par['result_filename']))
#     os.remove(par['result_dir'] + par['result_filename'])

# The checkpoint/tokenizer directory must exist before anything is saved.
model_dir_missing = not os.path.exists(par['model_dir'])
if model_dir_missing:
    print("create folder %s" % (par['model_dir']))
    os.makedirs(par['model_dir'])

# Constrain GPU memory usage

In [None]:
# # constraint gpu usage
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# set_session(tf.Session(config=config))

# Feature engineering
* record which characters have been seen in the training data
* save the fitted feature-engineering object for later use in the prediction process
* if there were numeric features, the parameters of their normalization (mean/std, min/max, etc.) would be stored as well

In [None]:
# Learn the character vocabulary from the training names.
tokenizer = Tokenizer(char_level=True)
training_names = train_db[par['text_column']].tolist()
tokenizer.fit_on_texts(training_names)
# One extra slot on top of the learned characters (index 0 is not in word_index).
par['num_of_char'] = 1 + len(tokenizer.word_index)

# Derive the artefact names from a timestamp so that every run gets its own
# model file and matching feature-engineering (tokenizer) file.
par['start_time'] = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")
par['model_file'] = 'model_' + par['start_time'] + '.hdf5'
par['feature_engineering_file'] = par['model_file'] + '.fe'

# Persist the fitted tokenizer next to the model for the prediction process.
fe_path = os.path.join(par['model_dir'], par['feature_engineering_file'])
with open(fe_path, 'wb') as fe_file:
    pickle.dump(tokenizer, fe_file, protocol=pickle.HIGHEST_PROTOCOL)

# Train / validation data split
* the validation set is sampled with balanced targets (the same number of names per gender)

In [None]:
# Size the validation set from the *smallest* class so the per-class quota can
# always be drawn. The previous `value_counts().sort_values()[0]` did not do
# that: on an integer-labelled index `[0]` is a label lookup, so it returned the
# count of class 0 (whatever its rank) instead of the first, smallest, entry.
target_values = train_db[par['target']]
val_count = int(target_values.value_counts().min() * par['validation_ratio'])

# Draw the same number of female (0) and male (1) rows; fixed seed for
# reproducibility.
f_val_index = train_db[target_values == 0].sample(val_count // 2, random_state=0).index
m_val_index = train_db[target_values == 1].sample(val_count // 2, random_state=0).index
val_index = f_val_index.append(m_val_index)
train_index = train_db.index.difference(val_index)

# NOTE: `train_db` is overwritten with the reduced training set, so this cell
# is not idempotent; re-run the notebook from the train/test split cell.
val_db = train_db.loc[val_index].reset_index(drop=True)
train_db = train_db.loc[train_index].reset_index(drop=True)

# Over-sampling
* the training data is unbalanced
* over-sample the minority class so that the classes are balanced

In [None]:
# over-sampling
# Bring every class up to the size of the largest one by re-drawing rows with
# replacement. The extra rows are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration.
target_values = train_db[par['target']]
maxlen = target_values.value_counts().max()

extra_rows = []
for p in target_values.unique():
    class_rows = train_db[target_values == p]
    plen = len(class_rows)
    if maxlen > plen:
        # Seeded like every other sample in this notebook, so a re-run
        # reproduces the same training set.
        extra_rows.append(class_rows.sample(maxlen - plen, replace=True, random_state=0))

train_db = pd.concat([train_db] + extra_rows).reset_index(drop=True)

# Data transform for feeding into model
* transform each character sequence into an index sequence
* pad the sequences to the same length
* encode the binary target

In [None]:
# Encode the three splits identically: characters -> integer ids, then pad
# every sequence to the common length stored in par['max_seq_len'].
x_train_text, x_val_text, x_test_text = [
    pad_sequences(
        tokenizer.texts_to_sequences(split_db[par['text_column']].tolist()),
        maxlen=par['max_seq_len'],
    )
    for split_db in (train_db, val_db, test_db)
]

# One-hot encode the gender label of each split.
y_train, y_val, y_test = [
    keras.utils.to_categorical(split_db['gender'])
    for split_db in (train_db, val_db, test_db)
]

# Model setup

In [None]:
def model(par):
    """Build and compile the character-level CNN classifier.

    The network is: embedding -> two (Conv1D, BatchNorm, ReLU, MaxPool,
    Dropout) stages -> flatten -> two (Dense, BatchNorm, ReLU, Dropout)
    stages -> 2-way softmax.

    Parameters
    ----------
    par : dict
        Reads 'num_of_char', 'embedding_dim', 'max_seq_len',
        'conv_filters_1/2', 'conv_kernel_size_1/2', 'mp_filters_1/2',
        'fc_size_1/2', 'dropout', 'lr' and 'decay'.

    Returns
    -------
    keras.models.Model
        The model compiled with Adam and categorical cross-entropy, tracking
        accuracy. Its summary is printed as a side effect.
    """
    # Character ids -> trainable dense vectors.
    embedding_layer = keras.layers.Embedding(
        par['num_of_char'], par['embedding_dim'],
        embeddings_initializer='uniform',
        input_length=par['max_seq_len'], trainable=True)
    sequence_input = keras.layers.Input(shape=(par['max_seq_len'],), dtype='int32')
    x = embedding_layer(sequence_input)

    # Two 1-D convolution stages over the embedded sequence.
    conv_stages = (
        ('conv_filters_1', 'conv_kernel_size_1', 'mp_filters_1'),
        ('conv_filters_2', 'conv_kernel_size_2', 'mp_filters_2'),
    )
    for filters_key, kernel_key, pool_key in conv_stages:
        # The bias is dropped because batch normalisation follows directly.
        x = keras.layers.Conv1D(par[filters_key], par[kernel_key], use_bias=False)(x)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation('relu')(x)
        x = keras.layers.MaxPool1D(par[pool_key])(x)
        x = keras.layers.Dropout(par['dropout'])(x)
    text_output = keras.layers.Flatten()(x)

    # Two fully-connected stages on the flattened features.
    x = text_output
    for size_key in ('fc_size_1', 'fc_size_2'):
        x = keras.layers.Dense(par[size_key], use_bias=False)(x)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation('relu')(x)
        x = keras.layers.Dropout(par['dropout'])(x)

    # One probability per gender class.
    preds = keras.layers.Dense(2, activation='softmax')(x)
    network = keras.models.Model(inputs=sequence_input, outputs=preds)

    opt = keras.optimizers.Adam(lr=par['lr'], beta_1=0.9, beta_2=0.999,
                                epsilon=1e-08, decay=par['decay'])
    network.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
    network.summary()
    return network

# Training process
* setup callback list
* train model
* store result

In [None]:
# training process
start_time = time.time()

# `model` starts out as the builder function and is rebound to the trained
# network below. Keep a separate handle on the builder so this cell can be
# re-run without first re-running the cell that defines it.
if not isinstance(model, keras.models.Model):
    build_model = model
model = build_model(par)

# if no over-sampling, reassign class weight for unbalance target in training data
# Keras expects a {class_index: weight} mapping, not a bare array.
class_totals = np.sum(y_train, axis=0)
class_weight = {cls: float(np.max(class_totals) / total)
                for cls, total in enumerate(class_totals)}

checkpoint_path = os.path.join(par['model_dir'], par['model_file'])
callbacks_list = [
    # setup early stop to avoid overfitting
    keras.callbacks.EarlyStopping(monitor='val_acc', mode='max', patience=par['patience'], verbose=0),
    # save accuracy-increased model after each epoch
    keras.callbacks.ModelCheckpoint(checkpoint_path,
                                    monitor='val_acc', verbose=0, save_best_only=True, mode='max'),
    # log for each epoch
    keras.callbacks.CSVLogger(checkpoint_path + '.log'),
    # log for tensorboard
    keras.callbacks.TensorBoard(log_dir=checkpoint_path + '.tflog',
                                histogram_freq=0, write_graph=True, write_images=True)
]

# The callbacks must be passed to fit(): without ModelCheckpoint no file is
# ever written to `checkpoint_path` and the load_weights() call below fails;
# without EarlyStopping every one of par['epochs'] epochs is run.
history = model.fit(x_train_text, y_train, validation_data=(x_val_text, y_val),
                    class_weight=class_weight, epochs=par['epochs'],
                    batch_size=par['batch_size'], shuffle=True,
                    callbacks=callbacks_list,
                    verbose=1)

par['train_time'] = (time.time() - start_time) / 60 # minutes

# Report the epoch that ModelCheckpoint kept (first epoch reaching the best
# val_acc), so the train/val numbers describe the same weights that are
# evaluated on the test set below. The index is 0-based.
best_i = int(np.argmax(np.array(history.history['val_acc'])))
par['run_epochs'] = best_i
par['train_loss'] = history.history['loss'][best_i]
par['train_acc'] = history.history['acc'][best_i]
par['val_loss'] = history.history['val_loss'][best_i]
par['val_acc'] = history.history['val_acc'][best_i]

# testing process
# Restore the best checkpoint (the in-memory weights are those of the last
# epoch), then store it again as a full model file.
model.load_weights(checkpoint_path)
model.save(checkpoint_path)

test_result = model.evaluate(x_test_text, y_test)
par['test_loss'] = test_result[0]
par['test_acc'] = test_result[1]

# Baseline: always predict the single most frequent training class. argmax
# picks exactly one class even when the classes tie (which they do after
# over-sampling); a `>= max` mask would select every class and report 100%.
majority_class = int(np.argmax(np.sum(y_train, axis=0)))
popularity_pred = np.zeros(y_train.shape[1], dtype=int)
popularity_pred[majority_class] = 1
par['bl_acc'] = np.sum(np.sum(y_test * popularity_pred, axis=1)) / y_test.shape[0]

par['num_trainData'] = len(train_db)
par['num_valData'] = len(val_db)
par['num_testData'] = len(test_db)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 15)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 15, 20)            1060      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 12, 2048)          163840    
_________________________________________________________________
batch_normalization_1 (Batch (None, 12, 2048)          8192      
_________________________________________________________________
activation_1 (Activation)    (None, 12, 2048)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12, 2048)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 2048)          0         
__________

# Save result information

In [None]:
def output_result(par, result):
    """Write `result` to the experiment's CSV file, appending if it exists.

    The file is ``par['result_dir'] + par['result_filename'] + '.csv'``. Its
    directory is created when missing, so the first run on a clean checkout
    does not fail inside ``to_csv``.

    Parameters
    ----------
    par : dict
        Must provide 'result_dir' and 'result_filename'.
    result : pandas.DataFrame
        Row(s) to store. When the file already exists the rows are appended
        without a header, so the columns must be in the same order as in the
        existing file.
    """
    OUTPUT_FILEPATH = par['result_dir'] + par['result_filename'] + '.csv'

    # Nothing else in the notebook creates the result directory.
    output_dir = os.path.dirname(OUTPUT_FILEPATH)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if os.path.isfile(OUTPUT_FILEPATH):
        print("result is appended to %s" % (OUTPUT_FILEPATH))
        result.to_csv(OUTPUT_FILEPATH, encoding='utf-8', index=False, mode='a', header=False)
    else:
        print("%s is created." % (OUTPUT_FILEPATH))
        result.to_csv(OUTPUT_FILEPATH, encoding='utf-8', index=False)

# Wrap the parameter/metric dictionary as a one-row frame and persist it.
one_li = [par]
one_result = pd.DataFrame(one_li)
output_result(par, one_result)

# Test accuracy: > 88%

In [None]:
# Display the hyper-parameters together with the metrics recorded for this run.
par

# Tune model via grid search

In [None]:
# def output_result(par, result):
#     OUTPUT_FILEPATH = par['result_dir'] + par['result_filename'] + '.csv'
#     if os.path.isfile(OUTPUT_FILEPATH):
#         print("result is appended to %s" % (OUTPUT_FILEPATH))
#         result.to_csv(OUTPUT_FILEPATH, encoding='utf-8', index=False, mode='a', header=False)    
#     else:
#         print("%s is created." % (OUTPUT_FILEPATH))
#         result.to_csv(OUTPUT_FILEPATH, encoding='utf-8', index=False)  

# import itertools
# tp={'text_column':['name'],'embedding_dim':[40]
#     ,'conv_kernel_size_1':[4],'conv_filters_1':[2048],'mp_filters_1':[1]
#     ,'conv_kernel_size_2':[2],'conv_filters_2':[256,128,512],'mp_filters_2':[1,2,3]
#     ,'dropout':[0.1],'fc_size_1':[512],'fc_size_2':[256]
#     ,'lr':[1e-4],'decay':[0.,1e-5],'batch_size':[32],'epochs':[200],'patience':[6]}

# tp_com=list(itertools.product(*[p for p in tp.values()]))

# i=0
# while i<len(tp_com):
#     par=d_par.copy()    
#     one_li = []
#     print("\n %s/%s %s" % (i+1, len(tp_com), ''))
    
#     for k,j in zip(tp.keys(),tp_com[i]):
#         par[k]=j
        
#     print(tp_com[i])
#     result,model=process_all(par,train_db,test_db)
#     if result is not None:
#         one_li.append(result)
#         one_result = pd.DataFrame(one_li)
#         output_result(par, one_result)    
#     i+=1