Ideas for improvement:
- Change the hashing method so that each of the 5000 most common tokens gets its own unique index and all other tokens map to zero
- Make random_tweets a more representative random sample, and filter out foreign-language tweets, etc.

In [15]:
%load_ext autoreload

In [97]:
%autoreload 2
import pandas as pd
import numpy as np
import twitter

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding

In [55]:
#Read the labelled tweet ids shipped with hatespeech-master (no header row:
#column 0 is used as the tweet id, column 1 as the annotation)
labelled_ids = pd.read_csv('hatespeech-master/NAACL_SRW_2016.csv', header=None)

#Keep only the rows annotated as sexism and pull their ids out as a plain list
sexism_mask = labelled_ids[1] == 'sexism'
tweet_ids = list(labelled_ids[sexism_mask][0])

#Fetch the statuses for those ids and keep just the text of each one
tweets = twitter.get_statuses(tweet_ids)
tweet_text = [status['text'] for status in tweets]

In [56]:
#Now add a list of random tweets from random_tweets.csv
#(no header row; column 1 is read as the tweet text)
random_tweets = list(pd.read_csv('random_tweets.csv', header=None)[1])

#Create a list of markers for sexist tweets:
#0 = not sexist; 1 = sexist
#The tweets fetched from the sexism-labelled ids come first in tweet_text,
#so they get the 1s and the random tweets that follow get the 0s.
is_sexist = [1]*len(tweets) + [0]*len(random_tweets)

#Concatenate the sexist and random tweets.
#Rebuilt from `tweets` rather than extended with += so that re-running this
#cell cannot append the random tweets a second time and desync the labels.
tweet_text = [tweet['text'] for tweet in tweets] + random_tweets

#Texts and labels must stay aligned one-to-one
assert len(tweet_text) == len(is_sexist)

In [102]:
#Settings shared by the hashing step and the embedding layer
vocab_size = 10000
embedding_vector_length = 32

#Convert words to integer hashes, then pad at the end of each sequence
#so every row has the same length
encoded_text = pad_sequences(
    twitter.create_hash_indices(tweet_text, hashspace_size=vocab_size),
    padding='post',
)

#Length of the longest (padded) sequence; reused when padding new input
max_tweet_length = max(len(row) for row in encoded_text)

In [103]:
#Split data into training and validation sets

def _select_rows(values, indices):
    """Return a numpy array holding values[i] for every i in indices, in order."""
    return np.array([values[i] for i in indices])

#Fixed seed so the same split is drawn on every run
np.random.seed(152)

#Hold out 25% of the samples (drawn without replacement) for validation;
#everything not drawn is used for training
n = len(encoded_text)
validation_indices = np.random.choice(n, size=n // 4, replace=False)
training_indices = np.setdiff1d(np.arange(n), validation_indices)

X_train = _select_rows(encoded_text, training_indices)
y_train = _select_rows(is_sexist, training_indices)
X_test = _select_rows(encoded_text, validation_indices)
y_test = _select_rows(is_sexist, validation_indices)

In [107]:
#Embedding -> LSTM -> single sigmoid unit for the binary sexist / not-sexist label
model = Sequential([
    Embedding(vocab_size, embedding_vector_length, input_length=max_tweet_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

#Train for two epochs, reporting loss/accuracy on the validation split after each
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=64,
)

Train on 9544 samples, validate on 3181 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2c186587fd0>

In [105]:
#Score the model on the validation split; with the loss and metrics it was
#compiled with, the result is [loss, accuracy]
model.evaluate(X_test, y_test, verbose=0)

[0.5424987262279423, 0.7673687520210041]

In [108]:
#Save the trained model to 'model.h5' (path is relative to the working directory)
model.save('model.h5')

In [95]:
def predict(text):
    """Classify one tweet or a list of tweets with the trained model.

    Parameters
    ----------
    text : str or list of str
        A single tweet, or a list of tweets. A lone string is wrapped in a
        list so it is treated as one tweet rather than iterated over.

    Returns
    -------
    numpy.ndarray
        Integer 0/1 predictions, one per input tweet, using the same label
        encoding as the `is_sexist` labels the model was trained on.
    """
    if isinstance(text, str):
        text = [text]
    #Encode and pad exactly as the training data was prepared: same hash
    #space size, same padded length, padding at the end
    encoded = twitter.create_hash_indices(text, hashspace_size=vocab_size)
    encoded = pad_sequences(encoded, maxlen=max_tweet_length, padding='post')
    #Sequential.predict_classes is deprecated and absent from newer Keras;
    #for a single sigmoid output it is just this 0.5 threshold
    return (model.predict(encoded) > 0.5).astype('int32')