In [1]:
import pandas as pd
import numpy as np
import keras
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Adam
import os
# Seed NumPy's global RNG; the np.random.shuffle call used for the
# train/validation split draws from it, so the split is repeatable.
# NOTE(review): this alone may not make Keras weight initialisation
# deterministic -- confirm for the backend in use.
np.random.seed(2017)

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Author code -> integer class id used as the training target.
mapper = {
    'EAP': 0,
    'HPL': 1,
    'MWS': 2,
}

# Read the training CSV and swap each author code for its class id.
# Indexing the dict directly means an unexpected code raises KeyError
# instead of silently turning into NaN.
train_data = pd.read_csv('../../data/spooky-author/download/train.csv')
train_data['author'] = train_data['author'].map(mapper.__getitem__)

In [3]:
# Fit a tokenizer on the raw sentences and keep its vocabulary.
texts = train_data['text'].tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Encode every sentence as a list of word ids, then force a fixed
# width of 90 ids per row so the batch is a rectangular array.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=90)

# One-hot encode the integer author ids.
labels = to_categorical(train_data['author'].tolist())

# Report the resulting array shapes.
for tag, arr in (('data:', data), ('label:', labels)):
    print(tag, arr.shape)

data: (19579, 90)
label: (19579, 3)


In [4]:
# Shuffle inputs and targets with one shared permutation so each row
# stays paired with its label.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]

# Hold out the trailing 20% of the shuffled rows for validation and
# train on everything before them.
nb_validation_samples = int(0.20 * data.shape[0])
train_part = slice(None, -nb_validation_samples)
val_part = slice(-nb_validation_samples, None)

x_train, y_train = data[train_part], labels[train_part]
x_val, y_val = data[val_part], labels[val_part]

In [5]:
# Parse the pre-trained GloVe file into a {word: vector} lookup.
# Each line is "<word> <v1> <v2> ..." separated by whitespace.
embeddings_index = {}

# `with` guarantees the handle is closed even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform
# default (GloVe text files are distributed as UTF-8).
with open('../../src/pip/glove/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the vector for the word whose tokenizer id is i.
# The matrix has len(word_index) + 1 rows so that every id in word_index
# is a valid row; words missing from GloVe keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, 50))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Drop the full lookup once the matrix has been built from it.
del embeddings_index

# Embedding layer initialised from the GloVe matrix; trainable=True lets
# the vectors be fine-tuned during training.
embedding_layer = Embedding(len(word_index) + 1, 50, weights=[embedding_matrix], input_length=90, trainable=True)

In [6]:
# Stack: embedding lookup -> 1D global average pooling -> 3-unit softmax
# output.
model = Sequential([
    embedding_layer,
    GlobalAveragePooling1D(),
    Dense(3, activation='softmax'),
])

# Adam with a learning rate of 1e-4; categorical cross-entropy matches
# the one-hot targets.
optimizer = Adam(lr=0.0001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [7]:
# Keyword arguments for model.fit: validate on the held-out split and
# let EarlyStopping watch val_loss with a patience of 2.
params = {
    'validation_data': (x_val, y_val),
    'batch_size': 8,
    'epochs': 20,
    'callbacks': [EarlyStopping(patience=2, monitor='val_loss')],
}

# Left as the last expression so the returned History object is shown.
model.fit(x_train, y_train, **params)

Train on 15664 samples, validate on 3915 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fdd2c051d68>

In [None]:
# Continue training the same model for up to 30 more epochs, using the
# same validation split, batch size and early-stopping rule as before.
params = {}
params['validation_data'] = (x_val, y_val)
params['batch_size'] = 8
params['epochs'] = 30
params['callbacks'] = [EarlyStopping(patience=2, monitor='val_loss')]

# Update the learning rate through the backend rather than assigning a
# Python float to `model.optimizer.lr`. A plain assignment replaces the
# backend variable with a float: the training function that was already
# built keeps using the old variable, and code that later reads the rate
# via the backend (LR-scheduling callbacks, optimizer config / saving)
# can fail on the float.
keras.backend.set_value(model.optimizer.lr, 0.0001)

# Left as the last expression so the returned History object is shown.
model.fit(x_train, y_train, **params)

Train on 15664 samples, validate on 3915 samples
Epoch 1/30