In [1]:
import re
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Activation, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from collections import Counter
from keras.preprocessing import sequence

In [2]:
# Tokenization Regexes (listed in the order tokenize_texts applies them)
# 1. Hashtags: a '#' followed by everything up to the next whitespace character
hashtag_re = re.compile(r"#[^\s]+")
# 2. Single punctuation / symbol characters from the listed set
punct_re = re.compile(r"[!@#\$%\^&\*\(\)\-_\+=\{\}\[\]:;\"',<\.>\\/\?]")
# 3. Digit runs, optionally containing ',' or '.' separators (e.g. "1,000.5")
number_re = re.compile(r"(?:\d+[,\.\d]*)?\d")
# 4. Runs of two or more whitespace characters
multi_space_re = re.compile(r"\s\s+")

In [3]:
def load_embedding(path):
    '''
    Loads the word embedding into memory.

    The file is expected to be a space-delimited text file with one entry per
    line: the word first, followed by its vector components. An optional
    word2vec/fastText-style header line ("<vocab_size> <dim>") is skipped so it
    does not end up in the index as a bogus word with a 1-element vector.

    Args:
        path: Path to the embedding text file (read as UTF-8).

    Returns:
        dict mapping each word (str) to a 1-D float32 numpy array.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    '''
    embeddings_index = {}
    # Explicit encoding: the platform default may not be UTF-8, and the
    # vocabulary can contain non-ASCII tokens.
    with open(path, "r", encoding="utf-8") as _f:
        for line_no, line in enumerate(_f):
            values = line.rstrip().split(" ")
            # Header detection is limited to the first line and to exactly two
            # all-digit fields, so ordinary "word v1 v2 ..." lines are kept.
            if line_no == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                continue
            # Blank lines (or a word with no components) carry no usable vector.
            if len(values) < 2:
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    print("{} word vectors".format(len(embeddings_index)))
    return embeddings_index

def set_up_weight_matrix(embedding_index, word_index, embedding_dim=300):
    '''
    Sets up the weight matrix for the embedding layer by creating a new matrix and filling it
    with the vectors from the word embedding.

    Args:
        embedding_index: Mapping from word to its 1-D embedding vector.
        word_index: Mapping from word to its integer row index.
        embedding_dim: Width of every embedding vector (defaults to 300, the
            value that was previously hard-coded).

    Returns:
        numpy array of shape (len(word_index) + 1, embedding_dim). Rows for
        words without a usable vector stay all-zero.
    '''
    words_not_found = set()
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embedding_dim))

    for word, i in word_index.items():
        # Diagnostic: surface any word that is mapped to row 0.
        if i == 0:
            print(word)
        # Indices beyond the matrix are ignored rather than raising.
        if i >= nb_words:
            continue
        try:
            embedding_vector = embedding_index[word]
        except KeyError:
            words_not_found.add(word)
            continue
        # A wrong-sized vector would either be silently broadcast across the
        # whole row (length 1) or raise an opaque ValueError; count it as
        # missing instead so the row stays zero and the word is reported.
        if np.shape(embedding_vector) != (embedding_dim,):
            words_not_found.add(word)
            continue
        embedding_matrix[i] = embedding_vector
    print("{} words not found out of {} words".format(len(words_not_found), len(word_index)))
    return embedding_matrix

def tokenize_texts(texts):
    '''
    Normalises each text and returns the cleaned strings as a list.

    Per text, in order: lowercase, drop hashtags, drop punctuation, drop
    numbers, collapse whitespace runs to a single space, and strip the ends.
    '''
    def _clean(raw):
        lowered = raw.lower()
        without_tags = hashtag_re.sub("", lowered)
        without_punct = punct_re.sub("", without_tags)
        without_numbers = number_re.sub("", without_punct)
        return multi_space_re.sub(" ", without_numbers).strip()

    return [_clean(text) for text in texts]

In [4]:
# Read the word-vector file into a {word: vector} dict via load_embedding
# (which also prints how many vectors were loaded).
en_vec = load_embedding("../../Utils/word_vectors/wiki.en.align.vec")

2519371 word vectors


In [5]:
# Load both splits. Later cells read the "text" column from each frame and
# the "target" column from train_df only.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [6]:
# Add a cleaned-text column to each frame (same cleaning for both splits)
train_df["tokenized"] = tokenize_texts(train_df["text"])
test_df["tokenized"] = tokenize_texts(test_df["text"])
# Plain Python lists of the training texts and their labels
train_texts = train_df["tokenized"].tolist()
train_labs = train_df["target"].tolist()
# Train and test texts together, train first
all_texts = train_texts + test_df["tokenized"].tolist()

In [7]:
# Word-level tokenizer. Note the vocabulary is fit on all_texts, i.e. on the
# train AND test texts combined.
tokenizer = Tokenizer(lower=True, char_level=False)
tokenizer.fit_on_texts(all_texts)

In [8]:
# NOTE: despite its name, val_size is the fraction of rows kept for TRAINING;
# the remaining tail becomes the validation set. The split is positional
# (first rows train, last rows validate) with no shuffling.
val_size = .9
split_point = int(len(train_texts) * val_size)
x_texts, val_texts = train_texts[:split_point], train_texts[split_point:]
y_labs, val_labs = train_labs[:split_point], train_labs[split_point:]

In [9]:
# Word -> integer id mapping learned by the fitted tokenizer
word_index = tokenizer.word_index
# Encode each text as a list of word ids, then pad/truncate to 20 ids per text
x_texts_tok = sequence.pad_sequences(tokenizer.texts_to_sequences(x_texts), maxlen=20)
val_texts_tok = sequence.pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=20)

In [10]:
# Convert inputs and labels to numpy arrays before handing them to the model
x_texts_tok, val_texts_tok = np.array(x_texts_tok), np.array(val_texts_tok)
y_labs, val_labs = np.array(y_labs), np.array(val_labs)

In [11]:
# Build the (len(word_index) + 1, 300) weight matrix for the embedding layer;
# rows of words missing from en_vec stay zero and the miss count is printed.
embedding_matrix = set_up_weight_matrix(en_vec, word_index)

11620 words not found out of 26619 words


In [45]:
# Binary classifier: frozen embedding -> per-token dense -> LSTM -> dense head
lstm = Sequential([
    # Embedding initialised from embedding_matrix and kept fixed during training
    Embedding(len(word_index)+1,
              300,
              weights=[embedding_matrix],
              input_length=20,
              trainable=False),
    # Dense on the 3-D embedding output acts on the last axis (per token)
    Dense(512, activation="relu"),
    Dropout(0.2),
    # Alternatives previously left commented out for the recurrent layer:
    #   LSTM(256)
    #   LSTM(256, dropout = 0.3, recurrent_dropout = 0.3)
    LSTM(256, dropout = 0.2),
    Dense(256, activation="relu"),
    Dropout(0.3),
    # Single sigmoid unit for the binary target
    Dense(1, activation='sigmoid'),
])
lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [46]:
# Train for 20 epochs; the validation_data pair is scored after every epoch.
lstm.fit(x_texts_tok, y_labs, epochs=20, validation_data=(val_texts_tok, val_labs))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f5e719f2e50>

In [47]:
# Score the model on the same validation split passed to fit. With
# metrics=['acc'] at compile time, scores is [loss, acc]; print accuracy in %.
scores = lstm.evaluate(val_texts_tok, val_labs)
print(scores[1] * 100)

77.8215229511261
