# Word2Vec then RNN

## Imports and setting up TensorBoard

In [1]:
from data_preparation import *
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, TimeDistributed, Dropout, Embedding
from keras.optimizers import Adam
from keras_train_helper import tfauc
from tensorflow.python.client import device_lib
from keras_train_helper import rotateTensorboardLogs
from keras.callbacks import TensorBoard
from keras_train_helper import PrintAucCallback

# Shared TensorBoard callback writing to ./tb-logs; fitModel attaches it to
# training runs that are not in fast mode.
tensorBoardCallback = TensorBoard(log_dir="./tb-logs")
# Show which devices TensorFlow can see (the captured run lists only a CPU).
print(device_lib.list_local_devices())

Using TensorFlow backend.


[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7715724662805126216
]


## Loading data and setting up preprocessing tools

In [2]:
# loadData() returns four datasets; their shapes are printed in the output
# below. contestTest has fewer columns than the others (2 vs 8).
train, test, allData, contestTest = loadData()

Loading datasets...
train: (127656, 8), test: (31915, 8), allData: (159571, 8), contestTest: (153164, 2)


In [3]:
from sklearn.pipeline import Pipeline

# Sequence length handed to ZeroPadder; also used as the Embedding layer's
# input_length in defineModel.
maxSeqLength = 250

# The word -> integer vocabulary is built from the split text of both the
# labelled data and the contest test set.
splitter = SentenceSplitter("comment_text")
allWords = pd.concat([splitter.transform(corpus) for corpus in (allData, contestTest)])
w2i = Word2Int(allWords)
zeroPadder = ZeroPadder(maxSeqLength)

# Preprocessing applied to every dataset before it reaches the network:
# split -> map words to integers -> zero-pad.
preparationPipeline = Pipeline([
    ("split", splitter),
    ("w2i", w2i),
    ("zeroPadding", zeroPadder),
])

Splitting sentences...
Splitting sentences...
Loading w2i and i2w dictionaries...


In [17]:
# Development helper: reload data_preparation and re-run the wildcard import so
# that edits to the module are picked up without restarting the kernel.
# NOTE(review): objects created in earlier cells (splitter, w2i, zeroPadder,
# preparationPipeline) are presumably still instances of the pre-reload classes;
# confirm whether those cells need to be re-run after a reload.
import data_preparation
import importlib
importlib.reload(data_preparation)
from data_preparation import *

In [21]:
%%time
# Build the Word2Vec helper from the w2i.i2w vocabulary with a fixed seed.
# The first argument (50) is presumably the embedding vector size: the model
# summary below shows 50-dimensional embeddings. TODO confirm.
w2v = Word2Vec(50, w2i.i2w, seed=4324)

Loading word2vec dictionary...
CPU times: user 2.05 s, sys: 12 ms, total: 2.06 s
Wall time: 2.14 s


In [22]:
# embeddingMatrix supplies the Embedding layer's shape and initial weights in
# defineModel; i2v is not used anywhere else in the visible notebook.
embeddingMatrix, i2v = w2v.embeddingMatrix()

## Define the model

### Bidirectional LSTM on top of frozen word2vec embeddings

In [23]:
# LSTM units per direction; only lstmUnits[0] is read by defineModel.
lstmUnits = [100]
# Number of output classes (the model's softmax layer has 2 outputs).
numClasses = 2
# Batch size passed to model.fit in fitModel and to model.predict in the
# evaluation cell.
batchSize = 1024

def defineModel():
    """Build and compile the bidirectional-LSTM classifier.

    Architecture: frozen Embedding layer initialised from the notebook-level
    ``embeddingMatrix`` -> bidirectional LSTM with ``lstmUnits[0]`` units per
    direction -> dropout (rate 0.2) -> softmax over ``numClasses`` outputs.

    Reads the notebook globals ``embeddingMatrix``, ``maxSeqLength``,
    ``lstmUnits`` and ``numClasses``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        "nadam" optimizer, ``tfauc`` metric). The layer summary is printed as
        a side effect.
    """
    model = Sequential()

    # Pre-trained embeddings are kept fixed (trainable=False), so only the
    # LSTM and the output layer are trained.
    model.add(Embedding(
        embeddingMatrix.shape[0],
        embeddingMatrix.shape[1],
        weights=[embeddingMatrix],
        input_length=maxSeqLength,
        trainable=False))

    model.add(Bidirectional(LSTM(lstmUnits[0]), name="LSTM"))

    model.add(Dropout(.2, name="dropout"))

    # Use the configured class count instead of a hard-coded 2.
    model.add(Dense(
        numClasses,
        activation="softmax",
        name="softmax_output"))

    model.compile(
        loss="categorical_crossentropy",
        optimizer="nadam",
        metrics=[tfauc])

    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

## Prepare data and train the model

In [29]:
def fitModel(model, trainDataset, testDataset, label, epochs, fast=False):
    """Train ``model`` to predict the single target column ``label``.

    The training data is oversampled for ``label``, run through the
    notebook-level ``preparationPipeline`` and fitted with the notebook-level
    ``batchSize``.

    Args:
        model: Compiled Keras model, e.g. the result of ``defineModel()``.
        trainDataset: Dataset to train on; passed to ``Oversampler`` and then
            to ``preparationPipeline`` / ``Labelizer``.
        testDataset: Dataset handed to ``PrintAucCallback`` after the same
            preprocessing. Not used when ``fast`` is true.
        label: Name of the target variable to train for.
        epochs: Number of training epochs.
        fast: When true, train without any callbacks (no TensorBoard logging,
            no per-epoch AUC) and skip preparing the test data.

    Returns:
        The same ``model`` object, after fitting.
    """
    print("Training model for target variable: {}".format(label))

    # NOTE: the seeds are set here, i.e. after ``model`` has already been
    # constructed by the caller.
    tf.set_random_seed(43245)
    np.random.seed(453252)

    labelizer = Labelizer(label)
    oversampler = Oversampler(label)

    trainOversampled = oversampler.transform(trainDataset)

    trainSentences = preparationPipeline.transform(trainOversampled)
    trainLabels = labelizer.transform(trainOversampled)

    if fast:
        callbacks = []
    else:
        # The test data is only needed by the AUC callback, so it is prepared
        # only when that callback is actually attached.
        testSentences = preparationPipeline.transform(testDataset)
        testLabels = labelizer.transform(testDataset)
        printAucCallback = PrintAucCallback(testSentences, testLabels, allTestDataPerEpochs=1)
        callbacks = [tensorBoardCallback, printAucCallback]

    # Keras 2 renamed the ``nb_epoch`` argument of fit() to ``epochs``.
    model.fit(
        trainSentences,
        trainLabels,
        epochs=epochs,
        batch_size=batchSize,
        callbacks=callbacks)

    return model

In [30]:
# Smoke test: two epochs on the first 1000 rows for "toxic", without callbacks.
model = defineModel()
fitModel(
    model,
    trainDataset=train[:1000],
    testDataset=test[:1000],
    label="toxic",
    epochs=2,
    fast=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 250, 50)           19255800  
_________________________________________________________________
LSTM (Bidirectional)         (None, 200)               120800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
softmax_output (Dense)       (None, 2)                 402       
Total params: 19,377,002
Trainable params: 121,202
Non-trainable params: 19,255,800
_________________________________________________________________
None
Building model for target variable: toxic
Oversampling...
Splitting sentences...
Converting words to integers...
Zero-padding...
Splitting sentences...
Converting words to integers...
Zero-padding...




Epoch 1/2
Epoch 2/2


<keras.models.Sequential at 0x7fa7b3b18320>

In [31]:
# One freshly defined model per target label, each trained on the full train
# set with its own epoch count. The order of the labels here is the order that
# the prediction cell pairs with its `categories` list.
contestModels = [
    fitModel(defineModel(), train, test, targetLabel, numEpochs)
    for targetLabel, numEpochs in [
        ("toxic", 2),
        ("severe_toxic", 2),
        ("obscene", 1),
        ("threat", 3),
        ("insult", 2),
        ("identity_hate", 2),
    ]
]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 250, 50)           19255800  
_________________________________________________________________
LSTM (Bidirectional)         (None, 200)               120800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
softmax_output (Dense)       (None, 2)                 402       
Total params: 19,377,002
Trainable params: 121,202
Non-trainable params: 19,255,800
_________________________________________________________________
None
Building model for target variable: toxic
Oversampling...
Splitting sentences...
Converting words to integers...
Zero-padding...
Splitting sentences...
Converting words to integers...
Zero-padding...




Epoch 1/2
 - all-test-data-auc: 0.4678
Epoch 2/2
 - all-test-data-auc: 0.5375
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 250, 50)           19255800  
_________________________________________________________________
LSTM (Bidirectional)         (None, 200)               120800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
softmax_output (Dense)       (None, 2)                 402       
Total params: 19,377,002
Trainable params: 121,202
Non-trainable params: 19,255,800
_________________________________________________________________
None
Building model for target variable: severe_toxic
Oversampling...
Splitting sentences...
Converting words to integers...
Zero-padding...
Splitting sentences...
Converting wor

In [32]:
# Target labels, in the same order in which contestModels was trained.
categories = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Preprocess the contest data once and reuse it for every model; the pipeline
# input does not depend on the category.
# NOTE(review): only the first 1000 contest rows are predicted here.
contestSentences = preparationPipeline.transform(contestTest[:1000])

# category -> predicted class probabilities from that category's model.
contestPredictionProbabilities = {
    category: categoryModel.predict(contestSentences)
    for (category, categoryModel) in zip(categories, contestModels)}

Splitting sentences...
Converting words to integers...
Zero-padding...
Splitting sentences...
Converting words to integers...
Zero-padding...
Splitting sentences...
Converting words to integers...
Zero-padding...
Splitting sentences...
Converting words to integers...
Zero-padding...
Splitting sentences...
Converting words to integers...
Zero-padding...
Splitting sentences...
Converting words to integers...
Zero-padding...


In [33]:
# Column 1 of each prediction array is taken as the probability that goes into
# the submission for that category.
positiveProbabilities = [
    (name, preds[:, 1]) for (name, preds) in contestPredictionProbabilities.items()]
suffledResult = pd.DataFrame(dict([("id", contestTest[:1000].id)] + positiveProbabilities))

# Put the columns into a fixed order: id first, then the categories.
result = suffledResult[["id"] + categories]

In [34]:
# Preview the first rows of the submission frame.
result.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.107428,0.08661,0.179781,0.043975,0.104099,0.086608
1,0000247867823ef7,0.147077,0.124943,0.256728,0.045869,0.097912,0.125053
2,00013b17ad220c46,0.144134,0.122432,0.231361,0.07396,0.156137,0.122611
3,00017563c3f7919a,0.094946,0.076373,0.186833,0.022637,0.050725,0.076566
4,00017695ad8997eb,0.105768,0.083794,0.182512,0.037039,0.066183,0.08365


In [45]:
# Write the submission file without the DataFrame index.
# NOTE(review): `result` is built from contestTest[:1000], so only 1000 of the
# 153164 contest rows are written — confirm this truncation is intended.
result.to_csv("./submissions/w2v-then-rnn.csv", index=False)

## Evaluate on test data

In [None]:
# NOTE(review): printAucCallback is only created as a local variable inside
# fitModel, so this cell raises NameError on a fresh kernel unless the callback
# is exposed at notebook level first.
import matplotlib.pyplot as plt
# listOfAucs is unpacked as (key, value) pairs; only the values are plotted.
plt.plot([value for key, value in printAucCallback.listOfAucs])

plt.ylabel("Area under ROC")
plt.xlabel("Epoch")
plt.show()

In [None]:
from evaluate_predictions import evaluatePredictions

# fitModel keeps its preprocessed test data in local variables, so the
# evaluation inputs are rebuilt here instead of relying on leftover kernel
# state (which made this cell fail with NameError on a fresh run).
# `model` is the notebook-level model trained for "toxic" above.
evaluationLabel = "toxic"
testSentences = preparationPipeline.transform(test)
testLabels = Labelizer(evaluationLabel).transform(test)

predictions = model.predict(testSentences, batch_size=batchSize)
# Threshold column 1 of the predictions at 0.5 to get hard 0/1 decisions.
binaryPredictions = np.where(predictions[:, 1] > 0.5, 1, 0)
labels = testLabels[:, 1]

evaluatePredictions(pd.Series(labels), binaryPredictions, predictions)