# Word2Vec then RNN

## Load data

In [1]:
import pandas as pd
import numpy as np
from os import path

importDirectory = "../state/data/preprocessed-train-test/"

train, test, data, contestTest = map(
    lambda filename: pd.read_csv(path.join(importDirectory, filename)), 
    ["train.csv", "test.csv", "all.csv", "contest-test.csv"])

In [2]:
print("train: {}, test: {}, all: {}, contestTest: {}".format(
    train.shape, test.shape, data.shape, contestTest.shape))

train: (127656, 8), test: (31915, 8), all: (159571, 8), contestTest: (153164, 2)


In [3]:
def splitSentences(dataset):
    return (dataset.comment_text
    .str.replace("[^A-Za-z\s]", "")
    .str.lower()
    .str.split())

In [4]:
%%time
splitTrain = splitSentences(train)
splitTest = splitSentences(test)

CPU times: user 3.04 s, sys: 180 ms, total: 3.22 s
Wall time: 3.21 s


## Convert words to integers

In [5]:
from sklearn.preprocessing import LabelEncoder

allWords = set([word for sentence in pd.concat([splitTrain, splitTest]) for word in sentence])

In [6]:
wordToInteger = { word: index for index, word in enumerate(allWords) }
integerToWord = { index: word for index, word in enumerate(allWords) }

In [7]:
integerTrain = splitTrain.apply(lambda sentence: [wordToInteger[word] for word in sentence])
integerTest = splitTest.apply(lambda sentence: [wordToInteger[word] for word in sentence])

## Int2Vec dictionary

In [28]:
numDimensions = 50
maxSeqLength = 250

In [8]:
%%time
import gensim

w2vModel = gensim.models.KeyedVectors.load_word2vec_format("../state/external-models/glove.6B/w2v.glove.6B.50.txt")

CPU times: user 27.6 s, sys: 176 ms, total: 27.8 s
Wall time: 27.7 s


In [13]:
availableWords = set.intersection(allWords, set(w2vModel.vocab.keys()))

In [18]:
int2vec = {index: w2vModel.word_vec(word) 
             if word in availableWords 
             else np.random.normal(scale=.644, size=(numDimensions,))
         for index, word in integerToWord.items()}

## Zero pad vectors

In [29]:
def padArrayWithZeros(array):
    fullArray = np.zeros(maxSeqLength)
    fullArray[:min(array.shape[0], maxSeqLength)] = array[:min(array.shape[0], maxSeqLength)]
    return fullArray

def padDictionary(dictionary):
    return np.array([padArrayWithZeros(vector) for vector in dictionary.values()])

In [34]:
embeddingMatrix = padDictionary(int2vec)

## Oversample

In [50]:
def oversample(dataset, features, label): 
    multiples = int(dataset[dataset[label] == 0].shape[0] / dataset[dataset[label] == 1].shape[0])
    
    datasetPositive = dataset[dataset[label] == 1]
    featuresPositive = features[datasetPositive.index.tolist()]
    
    datasetOversampled = pd.concat([dataset] + multiples * [datasetPositive]).reset_index() 
    featuresOversampled = pd.concat([features] + multiples * [featuresPositive]).reset_index().comment_text
    
    return datasetOversampled, featuresOversampled

In [51]:
trainOversampled, w2vTrainFeaturesOversampled = oversample(train, w2vTrainFeatures, "toxic")

## Helper function to get batches

In [52]:
# Is padding the words till position 250 with 0s the right approach?
def padWithZeros(array):
    fullArray = np.zeros([maxSeqLength, numDimensions])
    fullArray[:min(array.shape[0], maxSeqLength), :] = array[:min(array.shape[0], maxSeqLength), :]
    return fullArray

In [53]:
# always remember to reset the seed before using getTrainBatch
import numpy as np
np.random.seed(4324)

# Check if vectorizing this one can improve performance
def getBatch(dataset, features, size):
    indices = np.random.randint(0, len(features), size)
    
    features = np.array(features[indices].apply(padWithZeros).tolist())
    labels = (np
        .array(dataset.toxic[indices]
        .apply(
            lambda label: np.array([0., 1.]) if label == 0 else np.array([1., 0.])).tolist()))
    return features, labels

def getTrainBatch(size):
    return getBatch(trainOversampled, w2vTrainFeaturesOversampled, size)

## LMST with keras

In [54]:
def getFeaturesAndLabels(dataset, features, size):
    features = np.array(features[:size].apply(padWithZeros).tolist())
    labels = np.array(
        dataset
            .toxic[:size]
            .apply(
                lambda label: np.array([0., 1.]) if label == 0 else np.array([1., 0.])).tolist())
    
    return features, labels

In [30]:
%%time
# Beware, this will use ~40 GB of RAM
trainFeatures, trainLabels = getFeaturesAndLabels(trainOversampled, w2vTrainFeaturesOversampled, trainOversampled.shape[0])

CPU times: user 328 ms, sys: 104 ms, total: 432 ms
Wall time: 432 ms


In [55]:
%%time
# Small version, only 10000 datapoints
trainFeatures, trainLabels = getFeaturesAndLabels(trainOversampled, w2vTrainFeaturesOversampled, 10000)

CPU times: user 260 ms, sys: 284 ms, total: 544 ms
Wall time: 543 ms


In [136]:
lstmUnits = [100]
numClasses = 2
batchSize = 1024

In [57]:
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 362579018187557503
]


In [131]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, TimeDistributed, Dropout
from keras.optimizers import Adam

In [132]:
model = Sequential()
# model.add(Merge(mergeLayers, mode='concat')) 
        

cnt = 0
for size in lstmUnits:
    model.add(Bidirectional(LSTM(size, return_sequences=False), name="LSTM_"+str(cnt), input_shape=trainFeatures.shape[1:]))   
    model.add(TimeDistributed(Dropout(0.3, name="dropout_"+str(cnt))))
    cnt += 1

model.add(TimeDistributed(Dense(2, activation='softmax', name='softmax_output')))
       
model.compile(loss="categorical_crossentropy", optimizer=Adam())
print(model.summary())

AssertionError: 

In [133]:
tf.set_random_seed(43245)
np.random.seed(453252)

In [157]:


y = np.array([0, 0, 1, 1])
pred = np.array([0.1, 0.4, 0.35, 0.8])
roc_auc_score(y, pred)

0.75

In [145]:
thresholds

array([0.8 , 0.4 , 0.35, 0.1 ])

In [None]:
K.

In [162]:
import keras.backend as K
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, TimeDistributed, Dropout, Embedding
from keras.optimizers import Adam

lstmUnits = [100]
numClasses = 2
batchSize = 1024

def auc(y_true, y_pred):
     auc = tf.metrics.auc(y_true, y_pred)[1]
     K.get_session().run(tf.local_variables_initializer())
     return auc
    
model = Sequential()
model.add(Embedding())
model.add(LSTM(
    lstmUnits[0], 
    dropout=0.2, 
    recurrent_dropout=0.2, 
    input_shape=trainFeatures.shape[1:]))
model.add(Dense(2, activation="softmax"))
model.compile(
    loss="categorical_crossentropy", 
    optimizer="adam", 
    metrics=["accuracy", auc])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_64 (LSTM)               (None, 100)               60400     
_________________________________________________________________
dense_22 (Dense)             (None, 2)                 202       
Total params: 60,602
Trainable params: 60,602
Non-trainable params: 0
_________________________________________________________________
None


In [163]:
%%time
model.fit(
    trainFeatures, 
    trainLabels, 
    nb_epoch=5, 
    batch_size=batchSize)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 10min 15s, sys: 19.1 s, total: 10min 34s
Wall time: 3min 3s


<keras.callbacks.History at 0x7fb095c50c18>

## Defining the RNN

In [None]:
lstmUnits = 64
numClasses = 2
batchSize = 2048

In [None]:
import tensorflow as tf

tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses], name="labels")
data = tf.placeholder(tf.float32, [batchSize, maxSeqLength, numDimensions], name="data")

lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCellWithDropout = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCellWithDropout, data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

auc = 1 - tf.metrics.auc(tf.argmax(labels, 1), tf.clip_by_value(prediction[:, 0], 0, 1), name="auc")[1]
acc = tf.metrics.accuracy(tf.argmax(labels, 1), tf.argmax(prediction, 1), name="accuracy")[1]

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels), name="loss")
optimizer = tf.train.AdamOptimizer().minimize(loss)

## Persist for tensorboard

In [None]:
import datetime

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Area_under_roc', auc)
tf.summary.scalar("Accuracy", acc)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

## Training the model

In [None]:
%%time
saver = tf.train.Saver()
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
np.random.seed(4324)
tf.set_random_seed(43245)

epochs = 10
samplesPerEpoch = int(trainOversampled.shape[0] / batchSize)

with tf.Session() as sess:
    sess.run(init)
    writer = tf.summary.FileWriter(logdir, sess.graph)
    
    for epoch in range(epochs):
        print("Epoch #{}".format(epoch))
        for i in range(samplesPerEpoch):
            nextBatch, nextBatchLabels = getTrainBatch(batchSize)
            
            sess.run(optimizer, {data: nextBatch, labels: nextBatchLabels})

            #Save the network every 1,000 training iterations
            iteration = epoch * samplesPerEpoch + i
            
            if (iteration % 1000 == 0 and iteration != 0):
                save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=iteration)
                print("saved to %s" % save_path)
                
            #Write summary to Tensorboard
            if (iteration % 100 == 0 and iteration != 0):
                print(f"Loss: {sess.run(loss, {data: nextBatch, labels: nextBatchLabels})}")
                print(f"Acc: {sess.run(acc, {data: nextBatch, labels: nextBatchLabels})}")
                print(f"Auc: {sess.run(auc, {data: nextBatch, labels: nextBatchLabels})}")
                summary = sess.run(merged, {data: nextBatch, labels: nextBatchLabels})
                writer.add_summary(summary, iteration)
    writer.close()

In [None]:
def getTestBatch(size):
    return getBatch(test, w2vTestFeatures, size)

In [None]:
%%time
saver = tf.train.Saver()
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

preds = []

with tf.Session() as sess:
    sess.run(init)
    saver.restore(sess, "./models/pretrained_lstm.ckpt-74000")
    
    for i in range(int(test.shape[0]/batchSize)):
        if i % 100 == 0:
            print(i)
        nextBatch, nextBatchLabels = getTestBatch(batchSize)

        preds.append((sess.run(prediction, {data: nextBatch}), nextBatchLabels[:, 0]))

In [None]:
prd, labs = zip(*preds)

In [None]:
prdProbVec = np.concatenate(prd)

In [None]:
prdVec = np.where(prdProbVec[:, 0] > 0.5, 1, 0)

In [None]:
labsVec = np.concatenate(labs, axis=0)

In [None]:
%%time
saver = tf.train.Saver()
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

logloss = []
accuracy = []
auroc = []

with tf.Session() as sess:
    sess.run(init)
    saver.restore(sess, "./models/pretrained_lstm.ckpt-74000")
    
    for i in range(int(test.shape[0]/batchSize)):
        if i % 100 == 0:
            print(i)
        nextBatch, nextBatchLabels = getTestBatch(batchSize)
        logloss.append(sess.run(loss, {data: nextBatch, labels: nextBatchLabels}))
        accuracy.append(sess.run(acc, {data: nextBatch, labels: nextBatchLabels}))
        auroc.append(sess.run(auc, {data: nextBatch, labels: nextBatchLabels}))

In [None]:
def lavg(l):
    return sum(l) / len(l)

In [None]:
print(f"Loss: {lavg(logloss)}")
print(f"Accuracy: {lavg(accuracy)}")
print(f"AUC: {lavg(auroc)}")

In [None]:
from evaluate_predictions import evaluatePredictions

evaluatePredictions(pd.Series(labsVec), prdVec, 1 - prdProbVec)

## Model from paper

In [12]:
from __future__ import print_function
import os
import logging
import sys
from neuralnets.BiLSTM import BiLSTM
from util.preprocessing import perpareDataset, loadDatasetPickle

In [14]:
# :: Change into the working dir of the script ::
abspath = os.getcwd()
dname = os.path.dirname(abspath)
os.chdir(dname)

# :: Logging level ::
loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

In [7]:
biLSTM = BiLSTM()

NameError: name 'BiLSTM' is not defined