In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import json
import numpy as np

#local packages
import twitter
import text_processing

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding

# Seed NumPy's global random generator so NumPy-driven randomness in this
# notebook (e.g. np.random.shuffle) is repeatable across runs.
# NOTE(review): this seeds NumPy only; whether the Keras backend draws from
# the same generator is not visible here -- confirm if exact training
# reproducibility matters.
SEED = 152
np.random.seed(SEED)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Tokenisation settings
MAX_VOCAB_SIZE = 20000        # only the 20,000 most common tokens are kept by the tokenizer
MAX_SEQUENCE_LENGTH = 128     # length passed to pad_sequences as `maxlen`

# Import the tweet datasets.
# NOTE(review): import_data is a project helper; it presumably returns the tweet
# texts plus one label per tweet -- confirm against text_processing.
all_tweets, is_sexist = text_processing.import_data('sexist_tweets.json', 'control_tweets.json')

# Fit the vocabulary on the whole corpus, then encode every tweet as a
# sequence of integer token ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(all_tweets)
sequences = tokenizer.texts_to_sequences(all_tweets)

# Size of the full fitted vocabulary (word_index is not limited by num_words).
vocab_size = len(tokenizer.word_index)
print('Number of unique tokens found: %s' % vocab_size)

# Bring every sequence to the same length and turn the labels into an array.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(is_sexist)
print('Shape of data tensor:', data.shape)

Number of unique tokens found: 17650
Shape of data tensor: (11555, 128)


In [4]:
# Split data into training and validation sets.
#
# Samples and labels are reordered with one shared random permutation so every
# row keeps its own label; the trailing VALIDATION_SPLIT fraction of the
# shuffled rows is then held out for validation.
VALIDATION_SPLIT = 0.25

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice on an explicit boundary index instead of a negative offset: with a
# negative offset a validation count of 0 would make `[:-0]` an empty training
# set and `[-0:]` a validation set containing every sample.
split_at = data.shape[0] - nb_validation_samples

x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [5]:
#Prepare embedding layer
#Based on https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

# Read the pre-trained word vectors into a dict mapping token -> float32 vector.
EMBEDDING_DIM = 100
embeddings_index = {}

# `with` closes the file even when a line fails to parse.
with open('word_embeddings/glove.twitter.27B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        # Assumes the usual GloVe text layout "<token> <v1> ... <vN>" with
        # single-space separators. Splitting from the right keeps the last
        # EMBEDDING_DIM fields as the coefficients even when the token is a
        # character that a bare str.split() would treat as whitespace and drop
        # (which would store a short vector under a numeric key).
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Blank or truncated line: there is no complete vector to store.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1193514 word vectors.


In [6]:
# Weight matrix for the embedding layer: row `row` receives the pre-trained
# vector of the token whose tokenizer index is `row`.
# NOTE(review): the extra "+ 1" row presumably accounts for index 0 being
# unused by word_index (padding) -- confirm against the Tokenizer docs.
embedding_matrix = np.zeros((vocab_size + 1, EMBEDDING_DIM))
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pre-trained vector for this token: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [7]:
# Embedding layer initialised from the pre-computed matrix. trainable=False
# keeps these weights fixed while the rest of the model is fitted.
embedding_layer = Embedding(
    input_dim=vocab_size + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [8]:
# Binary classifier: embedding layer -> LSTM with 100 units -> one sigmoid unit.
model = Sequential([
    embedding_layer,
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit on the training split for 3 epochs, passing the validation split so it
# is evaluated alongside training. The call is the cell's last expression, so
# the returned History object is displayed.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=3,
    validation_data=(x_val, y_val),
)

Train on 8667 samples, validate on 2888 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x23bb3ed87f0>

In [9]:
# Persist the trained model and the tweet corpus so they can be loaded later
# without re-training.
model.save('model.h5')

with open('corpus_text.json', 'w', encoding='utf-8') as fh:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    fh.write(json.dumps(all_tweets, ensure_ascii=False))

In [10]:
# Quick look at the model's sigmoid outputs for the first ten rows of `data`.
# NOTE(review): after the shuffle/split cell these rows belong to the training
# portion (x_train is the leading slice of `data`), so this is not a held-out
# check -- use x_val for that.
first_ten = data[:10]
model.predict(first_ten)

array([[5.8705686e-04],
       [1.4754159e-04],
       [1.1528141e-03],
       [3.2123714e-03],
       [2.1283091e-04],
       [4.3426338e-03],
       [2.0193753e-03],
       [1.0523942e-04],
       [1.5341323e-02],
       [1.2159599e-01]], dtype=float32)