# Word2Vec then RNN

## Load data

In [1]:
import pandas as pd
import numpy as np
from os import path

# Directory that holds the preprocessed splits as CSV files.
importDirectory = "../state/data/preprocessed-train-test/"

# Read the four CSV files in a fixed order and unpack them into one frame each.
train, test, data, contestTest = (
    pd.read_csv(path.join(importDirectory, filename))
    for filename in ("train.csv", "test.csv", "all.csv", "contest-test.csv"))

In [2]:
# Report the (rows, columns) shape of every loaded frame.
print("train: {}, test: {}, all: {}, contestTest: {}".format(
    *[frame.shape for frame in (train, test, data, contestTest)]))

train: (127656, 8), test: (31915, 8), all: (159571, 8), contestTest: (153164, 2)


In [3]:
def splitSentences(dataset):
    """Tokenise the ``comment_text`` column into lists of lowercase words.

    Every character that is neither an ASCII letter nor whitespace is removed,
    the text is lower-cased and then split on runs of whitespace.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``comment_text`` column of strings.

    Returns
    -------
    pandas.Series
        One list of tokens per row, aligned with ``dataset``'s index. Missing
        comments yield an empty list.
    """
    return (dataset.comment_text
        # A missing comment becomes an empty token list instead of NaN,
        # which downstream code could not iterate over.
        .fillna("")
        # The pattern is a regular expression: say so explicitly, because
        # newer pandas releases treat the pattern as a literal by default.
        .str.replace(r"[^A-Za-z\s]", "", regex=True)
        .str.lower()
        .str.split())

In [4]:
%%time
# Tokenise both splits with the same routine.
splitTrain, splitTest = map(splitSentences, (train, test))

CPU times: user 2.98 s, sys: 164 ms, total: 3.14 s
Wall time: 3.14 s


## Convert words to integers

In [5]:
from sklearn.preprocessing import LabelEncoder

# Vocabulary: every distinct token that occurs in the train or test sentences.
allWords = {
    word
    for sentence in pd.concat([splitTrain, splitTest])
    for word in sentence}

In [6]:
# Number the vocabulary in sorted order. Iterating the set directly yields an
# order that depends on string hashing and can differ between kernel restarts,
# which would make the word ids (and everything derived from them) irreproducible.
wordToInteger = {word: index for index, word in enumerate(sorted(allWords))}
# Inverse lookup built from the forward mapping so the two can never disagree;
# its keys are inserted in increasing id order.
integerToWord = {index: word for word, index in wordToInteger.items()}
# NOTE(review): id 0 is assigned to a real word, while padArrayWithZeros also
# fills unused positions with 0 — padding is therefore indistinguishable from
# that word. Consider reserving id 0 for padding.

In [7]:
# Replace every token by its integer id, sentence by sentence, for both splits.
integerTrain, integerTest = (
    sentences.apply(lambda sentence: [wordToInteger[word] for word in sentence])
    for sentences in (splitTrain, splitTest))

## Int2Vec embedding matrix

In [8]:
# Length of each word vector. The random fallback vectors created for words that
# are missing from the pretrained model use this size, so it has to equal the
# length of the pretrained vectors they are stacked with.
numDimensions = 50
# Fixed number of tokens per comment: padArrayWithZeros truncates or zero-pads to
# this length and the Embedding layer is declared with input_length=maxSeqLength.
maxSeqLength = 250

In [9]:
%%time
import gensim

# Load pretrained word vectors from a word2vec-format file.
# NOTE(review): the path suggests GloVe 6B 50-d vectors, presumably converted to
# word2vec text format beforehand — confirm.
w2vModel = gensim.models.KeyedVectors.load_word2vec_format("../state/external-models/glove.6B/w2v.glove.6B.50.txt")

CPU times: user 26 s, sys: 200 ms, total: 26.2 s
Wall time: 26.1 s


In [10]:
# Words of our vocabulary that also have a pretrained vector.
# gensim >= 4.0 exposes the model vocabulary as `key_to_index`; the older `vocab`
# attribute is only used when that one does not exist. Intersecting with the
# mapping directly avoids copying the whole pretrained vocabulary into a set.
availableWords = allWords.intersection(
    w2vModel.key_to_index
    if hasattr(w2vModel, "key_to_index")
    else w2vModel.vocab)

In [11]:
# Fix NumPy's global seed so the random fallback vectors below are reproducible
# for a given iteration order of integerToWord.
np.random.seed(432432)

# Map every word id to a vector: the pretrained vector when the word is known to
# w2vModel, otherwise a freshly drawn normal vector of length numDimensions.
# NOTE(review): scale=.644 is presumably meant to match the spread of the
# pretrained vectors — confirm where this value comes from.
int2vec = {index: w2vModel.word_vec(word) 
             if word in availableWords 
             else np.random.normal(scale=.644, size=(numDimensions,))
         for index, word in integerToWord.items()}

In [12]:
# Stack the vectors so that row i is the vector of word id i. The rows are looked
# up by id explicitly instead of relying on the dictionary's iteration order,
# because the Embedding layer indexes this matrix by word id.
embeddingMatrix = np.array([int2vec[index] for index in range(len(int2vec))])

## Oversample

In [13]:
def oversample(dataset, label):
    """Replicate the positive rows of ``dataset`` to counter class imbalance.

    The positive rows (``dataset[label] == 1``) are appended
    ``floor(#negatives / #positives)`` additional times after the original
    rows. The result is not shuffled: all replicated rows sit at the end.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the label column.
    label : str
        Name of a column holding 0 (negative) / 1 (positive) labels.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the replicated positives, with a fresh
        integer index; the previous index is kept as an ``index`` column.
        When there are no positive rows, the original rows are returned
        (re-indexed the same way) instead of raising ``ZeroDivisionError``.
    """
    isPositive = dataset[label] == 1
    numPositive = int(isPositive.sum())
    numNegative = int((dataset[label] == 0).sum())

    # Without positive rows there is nothing to replicate, and the ratio below
    # would divide by zero.
    multiples = numNegative // numPositive if numPositive else 0

    datasetPositive = dataset[isPositive]

    return pd.concat([dataset] + multiples * [datasetPositive]).reset_index()

In [14]:
# Training split with the rows labelled toxic == 1 replicated; the copies are
# appended after the original rows.
trainOversampled = oversample(train, "toxic")

In [24]:
# Same replication applied to the held-out split.
# NOTE(review): oversampling the evaluation data changes its class balance, so
# validation metrics no longer reflect the original distribution — confirm this
# is intended.
testOversampled = oversample(test, "toxic")

## Zero pad vectors

In [15]:
def padArrayWithZeros(array):
    """Return ``array`` as a float vector of exactly ``maxSeqLength`` entries.

    Shorter inputs are right-padded with zeros; longer inputs keep only their
    first ``maxSeqLength`` values.
    """
    keptLength = min(array.shape[0], maxSeqLength)
    padded = np.zeros(maxSeqLength)
    padded[:keptLength] = array[:keptLength]
    return padded

In [16]:
def prepareText(dataset):
    """Turn the comments of ``dataset`` into a matrix of padded word ids.

    Each comment is tokenised with ``splitSentences``, every token is replaced
    by its id from ``wordToInteger`` and the id sequence is brought to a fixed
    length with ``padArrayWithZeros``. A token that is not a key of
    ``wordToInteger`` raises ``KeyError``.

    Returns a 2-D NumPy array with one row per comment.
    """
    def encodeAndPad(sentence):
        # Token list -> id array -> fixed-length vector.
        wordIds = np.array([wordToInteger[word] for word in sentence])
        return padArrayWithZeros(wordIds)

    paddedSentences = splitSentences(dataset).apply(encodeAndPad)
    return np.array(paddedSentences.tolist())

In [17]:
%%time
# Cut the frame down to the rows that are kept *before* the expensive
# tokenise/encode/pad step; the resulting matrix is the same as slicing afterwards.
# NOTE(review): oversample() appends the replicated positives after the original
# rows without shuffling, so this head slice contains only original training rows
# and none of the oversampled copies — confirm that is intended.
trainSentences = prepareText(trainOversampled[0:10000])

CPU times: user 10.2 s, sys: 392 ms, total: 10.6 s
Wall time: 10.6 s


In [18]:
# One-hot labels for the same first 10000 rows used for trainSentences:
# [1, 0] for non-toxic, [0, 1] for toxic.
trainLabels = np.array([
    [0, 1] if label == 1 else [1, 0]
    for label in trainOversampled.toxic[0:10000]])

In [25]:
# Slice first, then tokenise/encode/pad: same matrix, without preparing rows
# that are thrown away.
# NOTE(review): oversample() appends the replicated positives after the original
# rows without shuffling, so this head slice contains only original test rows
# and none of the oversampled copies — confirm that is intended.
testSentences = prepareText(testOversampled[0:10000])

In [26]:
# One-hot labels for the same first 10000 rows used for testSentences:
# [1, 0] for non-toxic, [0, 1] for toxic.
testLabels = np.array([
    [0, 1] if label == 1 else [1, 0]
    for label in testOversampled.toxic[0:10000]])

## LSTM RNN with Keras

In [28]:
# Hidden units per LSTM layer; the model below reads the first entry.
lstmUnits = [100]
# Number of label classes; the label arrays are two-column one-hot vectors.
numClasses = 2
# Mini-batch size handed to model.fit.
batchSize = 1024

In [29]:
from tensorflow.python.client import device_lib

# Show which compute devices TensorFlow can see (the recorded run lists a CPU only).
print(device_lib.list_local_devices())

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12179981339408787876
]


In [30]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, TimeDistributed, Dropout, Embedding
from keras.optimizers import Adam

In [31]:
import keras.backend as K
import tensorflow as tf

def auc(y_true, y_pred):
    """Keras metric wrapping TensorFlow's streaming AUC.

    ``tf.metrics.auc`` returns a pair; the second element is the tensor this
    function hands back to Keras. The metric stores its running counts in
    TensorFlow local variables, which are initialised here right after the
    metric has been created.
    """
    _, runningAuc = tf.metrics.auc(y_true, y_pred)
    # Must run after the metric exists, otherwise its local variables would
    # not be covered by the initializer.
    K.get_session().run(tf.local_variables_initializer())
    return runningAuc

In [32]:
# Sequence classifier: frozen pretrained embeddings -> one LSTM layer -> softmax.
model = Sequential()

# Look-up table initialised from embeddingMatrix (one row per word id); it is not
# updated during training (trainable=False).
model.add(Embedding(
    embeddingMatrix.shape[0],
    embeddingMatrix.shape[1],
    weights=[embeddingMatrix],
    input_length=maxSeqLength,
    trainable=False))

model.add(LSTM(
    lstmUnits[0],
    dropout=0.2,
    recurrent_dropout=0.2,
    name="LSTM"))

# One output unit per class, matching the one-hot label vectors.
model.add(Dense(
    numClasses,
    activation="softmax",
    name="softmax_output"))

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy", auc])

# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 50)           11198300  
_________________________________________________________________
LSTM (LSTM)                  (None, 100)               60400     
_________________________________________________________________
softmax_output (Dense)       (None, 2)                 202       
Total params: 11,258,902
Trainable params: 60,602
Non-trainable params: 11,198,300
_________________________________________________________________
None


In [33]:
# Seed TensorFlow's graph-level RNG and NumPy's global RNG before training.
# NOTE(review): the model has already been built when this cell runs; TensorFlow
# derives op seeds when ops are created, so this may not make the initial weights
# reproducible — confirm, and consider seeding before the model is constructed.
tf.set_random_seed(43245)
np.random.seed(453252)

In [36]:
%%time
# Train for two passes over the data and evaluate on the held-out sentences after
# each epoch. `epochs` is the Keras 2 name of this argument; `nb_epoch` is the
# deprecated Keras 1 spelling.
model.fit(
    trainSentences,
    trainLabels,
    epochs=2,
    batch_size=batchSize,
    validation_data=(testSentences, testLabels))



Train on 10000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
CPU times: user 5min 40s, sys: 21.1 s, total: 6min 1s
Wall time: 1min 42s


<keras.callbacks.History at 0x7fd387e10b70>