# Word2Vec then RNN

## Imports and setting up TensorBoard

In [24]:
from data_preparation import *
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, TimeDistributed, Dropout, Embedding
from keras.optimizers import Adam
from keras_train_helper import tfauc
from tensorflow.python.client import device_lib
from keras_train_helper import rotateTensorboardLogs
from keras.callbacks import TensorBoard
from keras_train_helper import PrintAucCallback

# Keras TensorBoard callback; it is attached to model.fit in fitModel and is
# configured to log under ./tb-logs.
tensorBoardCallback = TensorBoard(
    log_dir="./tb-logs",
)

# List the compute devices TensorFlow can see on this machine.
localDevices = device_lib.list_local_devices()
print(localDevices)

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1919960022472173355
]


## Loading data and setting up preprocessing tools

In [9]:
# loadData comes from the data_preparation star import. Going by the shapes it
# prints, allData has as many rows as train and test combined, and contestTest
# is a separate 2-column set (presumably the unlabelled competition data --
# confirm in data_preparation).
train, test, allData, contestTest = loadData()

Loading datasets...
train: (127656, 8), test: (31915, 8), allData: (159571, 8), contestTest: (153164, 2)


In [10]:
from sklearn.pipeline import Pipeline

# Sequence length handed to ZeroPadder and to the Embedding layer's input_length.
maxSeqLength = 250

splitter = SentenceSplitter("comment_text")

# The word index is built from the comments of both the labelled data and the
# contest set, so the same Word2Int instance can transform either of them.
allWords = pd.concat(
    [splitter.transform(dataset) for dataset in (allData, contestTest)])
w2i = Word2Int(allWords)
zeroPadder = ZeroPadder(maxSeqLength)

# Steps run in order: split -> word-to-int -> zero padding.
preparationSteps = [
    ("split", splitter),
    ("w2i", w2i),
    ("zeroPadding", zeroPadder),
]
preparationPipeline = Pipeline(steps=preparationSteps)

Splitting sentences...
Splitting sentences...
Loading w2i and i2w dictionaries...


In [11]:
# Reload data_preparation so that edits made to the module on disk are picked
# up without restarting the kernel, then refresh the star-imported names.
# NOTE(review): objects created in earlier cells (splitter, w2i, zeroPadder,
# preparationPipeline) were built before this reload -- re-run that cell if
# the reloaded classes changed.
import data_preparation
import importlib
importlib.reload(data_preparation)
from data_preparation import *

In [18]:
%%time
# Build the Word2Vec helper from the index-to-word mapping of w2i, with a
# fixed seed. NOTE(review): "webcrawl" presumably names the pretrained vector
# set to load -- confirm in data_preparation.Word2Vec.
w2v = Word2Vec("webcrawl", w2i.i2w, seed=4324)

Loading word2vec dictionary...
CPU times: user 2.17 s, sys: 76 ms, total: 2.25 s
Wall time: 2.24 s


In [19]:
# embeddingMatrix initialises the frozen Embedding layer in defineModel, which
# uses shape[0] as the vocabulary size and shape[1] as the embedding width.
# i2v is not referenced anywhere else in the visible notebook.
embeddingMatrix, i2v = w2v.embeddingMatrix()

## Define the model

### Bidirectional LSTM on top of frozen word2vec embeddings

In [20]:
# Units of the LSTM that defineModel wraps in a Bidirectional layer.
lstmUnits = 100
# Intended width of the softmax output layer (one column per class).
numClasses = 2
# Mini-batch size used for training in fitModel and for test-set prediction.
batchSize = 1024

def defineModel():
    """Build and compile the bidirectional-LSTM classifier.

    Architecture, in order:
      1. Embedding layer initialised from ``embeddingMatrix`` and frozen
         (``trainable=False``), expecting sequences of ``maxSeqLength`` ids.
      2. Bidirectional LSTM with ``lstmUnits`` units per direction.
      3. Dropout with rate 0.2.
      4. Dense softmax output with ``numClasses`` units.

    Reads the notebook-level names ``embeddingMatrix``, ``maxSeqLength``,
    ``lstmUnits`` and ``numClasses``.

    Returns:
        The compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, "nadam" optimizer, ``tfauc`` as the tracked metric).
    """
    model = Sequential()

    model.add(Embedding(
        embeddingMatrix.shape[0],
        embeddingMatrix.shape[1],
        weights=[embeddingMatrix],
        input_length=maxSeqLength,
        trainable=False))

    model.add(Bidirectional(LSTM(lstmUnits), name="LSTM"))

    model.add(Dropout(.2, name="dropout"))

    # Use the shared numClasses constant rather than a hard-coded 2 so the
    # output width stays in sync with the notebook's configuration.
    model.add(Dense(
        numClasses,
        activation="softmax",
        name="softmax_output"))

    model.compile(
        loss="categorical_crossentropy",
        optimizer="nadam",
        metrics=[tfauc])

    # summary() prints the table itself and returns None; wrapping it in
    # print() would emit a stray "None" line after the summary.
    model.summary()
    return model

## Prepare data and train the model

In [43]:
def fitModel(model, trainDataset, testDataset, label, epochs, fast=False):
    """Train ``model`` on one target column and return it with its AUC callback.

    The training set is first passed through ``Oversampler(label)``; the test
    set is used as given. Both are converted to model inputs with the
    notebook-level ``preparationPipeline`` and to targets with
    ``Labelizer(label)``.

    Args:
        model: compiled Keras model, e.g. the result of ``defineModel()``.
        trainDataset: dataset to train on (oversampled before use).
        testDataset: dataset handed to ``PrintAucCallback`` for evaluation.
        label: name of the target column, e.g. ``"toxic"``.
        epochs: number of training epochs.
        fast: when True, train without any callbacks (no TensorBoard logging,
            no AUC tracking). The AUC callback object is still created and
            returned, but it is never attached to the training run.

    Returns:
        Tuple ``(model, printAucCallback)``.
    """
    print("Training model for target variable: {}".format(label))

    # Fixed seeds for TensorFlow and NumPy so repeated runs are comparable.
    tf.set_random_seed(43245)
    np.random.seed(453252)

    labelizer = Labelizer(label)
    oversampler = Oversampler(label)

    trainOversampled = oversampler.transform(trainDataset)

    trainSentences = preparationPipeline.transform(trainOversampled)
    trainLabels = labelizer.transform(trainOversampled)
    testSentences = preparationPipeline.transform(testDataset)
    testLabels = labelizer.transform(testDataset)

    # NOTE(review): the third argument looks like the batch size used when the
    # callback evaluates AUC, and printFrequency like a fraction of an epoch --
    # confirm against keras_train_helper.PrintAucCallback.
    printAucCallback = PrintAucCallback(
        [trainSentences, trainLabels],
        [testSentences, testLabels],
        4 * batchSize,
        printFrequency=0.1)

    # Keras 2 renamed fit()'s `nb_epoch` argument to `epochs`; the old name is
    # deprecated and rejected by later releases.
    model.fit(
        trainSentences,
        trainLabels,
        epochs=epochs,
        batch_size=batchSize,
        callbacks=[] if fast else [tensorBoardCallback, printAucCallback])

    return model, printAucCallback

In [None]:
%%time
# Train a fresh model on the "toxic" label for 20 epochs with callbacks
# enabled, keeping the AUC callback for the evaluation plot further down.
model = defineModel()
toxicTrainingResult = fitModel(model, train, test, label="toxic", epochs=20)
printAucCallback = toxicTrainingResult[1]

In [None]:
%%time
# (target label, number of epochs) for each model trained for the submission.
contestTrainingPlan = [
    ("toxic", 2),
    ("severe_toxic", 2),
    ("obscene", 1),
    ("threat", 1),
    ("insult", 1),
    ("identity_hate", 1),
]

# One freshly defined model per label, trained on allData without callbacks.
# Each list entry is exactly what fitModel returns for that label.
contestModels = [
    fitModel(defineModel(), allData, test, label=targetLabel, epochs=numEpochs, fast=True)
    for targetLabel, numEpochs in contestTrainingPlan
]

In [None]:
%%time
categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# The preprocessed contest set is the same for every category, so transform it
# once instead of once per model.
contestSentences = preparationPipeline.transform(contestTest)

# fitModel returns a (model, callback) tuple, so every entry of contestModels
# is such a pair; unpack the model before calling predict on it.
contestPredictionProbabilities = {
    category: categoryModel.predict(contestSentences)
    for category, (categoryModel, _) in zip(categories, contestModels)
}

In [33]:
# Collect the submission columns: the contest ids plus, for every category,
# column 1 of that model's predicted probabilities.
resultColumns = [("id", contestTest.id)]
resultColumns.extend(
    (name, preds[:, 1])
    for name, preds in contestPredictionProbabilities.items())
suffledResult = pd.DataFrame(dict(resultColumns))

# Put the columns in a fixed order: id first, then the categories as listed.
result = suffledResult[["id"] + categories]

In [34]:
# Preview the first rows of the submission table.
result.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.107428,0.08661,0.179781,0.043975,0.104099,0.086608
1,0000247867823ef7,0.147077,0.124943,0.256728,0.045869,0.097912,0.125053
2,00013b17ad220c46,0.144134,0.122432,0.231361,0.07396,0.156137,0.122611
3,00017563c3f7919a,0.094946,0.076373,0.186833,0.022637,0.050725,0.076566
4,00017695ad8997eb,0.105768,0.083794,0.182512,0.037039,0.066183,0.08365


In [None]:
# Sanity check: expect one row per contestTest row (153164 according to the
# load output) and 7 columns (id + 6 categories).
result.shape

In [45]:
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing the submission file.
os.makedirs("./submissions", exist_ok=True)
result.to_csv("./submissions/w2v-then-rnn.csv", index=False)

## Evaluate on test data

In [None]:
import matplotlib.pyplot as plt
# listOfAucs is iterated as (key, value) pairs; only the values are plotted,
# in the order they were recorded.
aucValues = [auc for _, auc in printAucCallback.listOfAucs]
plt.plot(aucValues)

plt.xlabel("Epoch")
plt.ylabel("Area under ROC")
plt.show()

In [None]:
from evaluate_predictions import evaluatePredictions

# testSentences / testLabels only exist as locals inside fitModel, so rebuild
# them here exactly as fitModel does for the "toxic" model trained above.
testSentences = preparationPipeline.transform(test)
testLabels = Labelizer("toxic").transform(test)

predictions = model.predict(testSentences, batch_size=batchSize)
# Column 1 of the softmax output is used as the positive-class probability
# (same column as in the submission); threshold it at 0.5 for hard labels.
binaryPredictions = np.where(predictions[:, 1] > 0.5, 1, 0)
labels = testLabels[:, 1]

evaluatePredictions(pd.Series(labels), binaryPredictions, predictions)