# Word2Vec then RNN

## Load data

In [96]:
import pandas as pd
import numpy as np
from os import path

importDirectory = "../state/data/preprocessed-train-test/"

train, test, data, contestTest = map(
    lambda filename: pd.read_csv(path.join(importDirectory, filename)), 
    ["train.csv", "test.csv", "all.csv", "contest-test.csv"])

In [97]:
print("train: {}, test: {}, all: {}, contestTest: {}".format(
    train.shape, test.shape, data.shape, contestTest.shape))

train: (127656, 8), test: (31915, 8), all: (159571, 8), contestTest: (153164, 2)


In [98]:
def splitSentences(dataset):
    return (dataset.comment_text
    .str.replace("[^A-Za-z\s]", "")
    .str.lower()
    .str.split())

In [99]:
%%time
splitTrain = splitSentences(train)
splitTest = splitSentences(test)

CPU times: user 3.75 s, sys: 140 ms, total: 3.89 s
Wall time: 3.89 s


## Convert words to integers

In [100]:
from sklearn.preprocessing import LabelEncoder

allWords = set([word for sentence in pd.concat([splitTrain, splitTest]) for word in sentence])

In [101]:
wordToInteger = { word: index for index, word in enumerate(allWords) }
integerToWord = { index: word for index, word in enumerate(allWords) }

In [102]:
integerTrain = splitTrain.apply(lambda sentence: [wordToInteger[word] for word in sentence])
integerTest = splitTest.apply(lambda sentence: [wordToInteger[word] for word in sentence])

## Int2Vec embedding matrix

In [103]:
numDimensions = 50
maxSeqLength = 250

In [104]:
%%time
import gensim

w2vModel = gensim.models.KeyedVectors.load_word2vec_format("../state/external-models/glove.6B/w2v.glove.6B.50.txt")

CPU times: user 22.8 s, sys: 108 ms, total: 22.9 s
Wall time: 22.8 s


In [105]:
availableWords = set.intersection(allWords, set(w2vModel.vocab.keys()))

In [106]:
np.random.seed(432432)

int2vec = {index: w2vModel.word_vec(word) 
             if word in availableWords 
             else np.random.normal(scale=.644, size=(numDimensions,))
         for index, word in integerToWord.items()}

In [107]:
embeddingMatrix = np.array([vector for vector in int2vec.values()])

## Oversample

In [108]:
def oversample(dataset, label): 
    multiples = int(dataset[dataset[label] == 0].shape[0] / dataset[dataset[label] == 1].shape[0])
    
    datasetPositive = dataset[dataset[label] == 1]
    
    return pd.concat([dataset] + multiples * [datasetPositive]).reset_index()

In [109]:
trainOversampled = oversample(train, "toxic")

## Zero pad vectors

In [110]:
def padArrayWithZeros(array):
    fullArray = np.zeros(maxSeqLength)
    fullArray[:min(array.shape[0], maxSeqLength)] = array[:min(array.shape[0], maxSeqLength)]
    return fullArray

In [111]:
def prepareText(dataset):
    return np.array(splitSentences(dataset)
        .apply(lambda sentence: 
            padArrayWithZeros(np.array([
                wordToInteger[word]
                for word in sentence])))
        .tolist())

In [112]:
%%time
trainSentences = prepareText(trainOversampled)[0:10000]

CPU times: user 9.38 s, sys: 248 ms, total: 9.62 s
Wall time: 9.62 s


In [113]:
trainLabels = np.array(trainOversampled
    .toxic
    .apply(lambda label: np.array([0, 1]) if label == 1 else np.array([1, 0]))
    .tolist())[0:10000]

## LMST RNN with keras

In [114]:
lstmUnits = [100]
numClasses = 2
batchSize = 1024

In [115]:
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8606361863497408411
]


In [116]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, TimeDistributed, Dropout, Embedding
from keras.optimizers import Adam

In [117]:
import keras.backend as K
import tensorflow as tf

def auc(y_true, y_pred):
     auc = tf.metrics.auc(y_true, y_pred)[1]
     K.get_session().run(tf.local_variables_initializer())
     return auc

In [120]:
model = Sequential()

model.add(Embedding(
    embeddingMatrix.shape[0],
    embeddingMatrix.shape[1],
    weights=[embeddingMatrix],
    input_length=maxSeqLength,
    trainable=False))

model.add(LSTM(
    lstmUnits[0], 
    dropout=0.2, 
    recurrent_dropout=0.2,
    name="LSTM"))

model.add(Dense(
    2, 
    activation="softmax", 
    name="softmax_output"))

model.compile(
    loss="categorical_crossentropy", 
    optimizer="adam", 
    metrics=["accuracy", auc])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 250, 50)           11198300  
_________________________________________________________________
LSTM (LSTM)                  (None, 100)               60400     
_________________________________________________________________
softmax_output (Dense)       (None, 2)                 202       
Total params: 11,258,902
Trainable params: 60,602
Non-trainable params: 11,198,300
_________________________________________________________________
None


In [119]:
tf.set_random_seed(43245)
np.random.seed(453252)

In [None]:
%%time
model.fit(
    trainSentences, 
    trainLabels, 
    nb_epoch=5, 
    batch_size=batchSize)