# Word2Vec then RNN

## Load data

In [None]:
import pandas as pd
from os import path

# Directory that holds the preprocessed CSV splits.
importDirectory = "./data/preprocessed-train-test/"

# Read every split into its own DataFrame; the file order matches the order
# of the names being unpacked.
train, test, data, contestTest = (
    pd.read_csv(path.join(importDirectory, filename))
    for filename in ("train.csv", "test.csv", "all.csv", "contest-test.csv")
)

In [None]:
# Report the shape of each loaded DataFrame.
print(
    "train: {}, test: {}, all: {}, contestTest: {}".format(
        *(frame.shape for frame in (train, test, data, contestTest))
    )
)

In [None]:
def splitSentences(dataset):
    """Tokenize the ``comment_text`` column into lists of lowercase words.

    Every character that is neither an ASCII letter nor whitespace is removed,
    the text is lowercased, and the result is split on runs of whitespace.

    Args:
        dataset: DataFrame with a ``comment_text`` column.

    Returns:
        A Series with the same index as ``dataset`` whose values are lists of
        tokens. Missing comments yield an empty list.
    """
    return (dataset.comment_text
        # Missing comments become empty strings so every row yields a list.
        .fillna("")
        # regex=True is explicit because pandas >= 2.0 treats the pattern as a
        # literal string by default; the raw string avoids the invalid "\s" escape.
        .str.replace(r"[^A-Za-z\s]", "", regex=True)
        .str.lower()
        .str.split())

In [None]:
%%time
# Tokenize the comments of the train and test splits.
splitTrain = splitSentences(train)
splitTest = splitSentences(test)

## Sentence lengths

In [None]:
# Number of tokens in each training comment, followed by summary statistics.
sentenceLengths = splitTrain.apply(len)
sentenceLengths.describe()

In [None]:
import matplotlib.pyplot as plt

# Box plot of the per-comment token counts.
sentenceLengths.plot.box()
plt.show()

In [None]:
# Histogram (100 bins) of the per-comment token counts.
sentenceLengths.hist(bins=100)
plt.show()

In [None]:
# Number of word positions every comment is padded or truncated to
# (used by padWithZeros and by the shape of the input placeholder).
maxSeqLength = 250

## Load Word2Vec dictionary

In [None]:
%%time
import gensim

# Load pretrained word vectors stored in word2vec text format.
model = gensim.models.KeyedVectors.load_word2vec_format("./external-models/glove.6B/w2v.glove.6B.50.txt")  

In [None]:
# Width of one word vector as used by padWithZeros and the input placeholder.
# Presumably matches the dimensionality of the loaded embedding file — confirm.
numDimensions = 50

In [None]:
%%time
# Set of known words, used for fast membership tests in wordsToVector.
# gensim >= 4.0 exposes the vocabulary as `key_to_index` and no longer has
# `vocab` or the `wv` alias on KeyedVectors; older releases only have `vocab`.
vocabulary = set(model.key_to_index if hasattr(model, "key_to_index") else model.vocab)

## Transform sentences to sequences of vectors

In [None]:
%%time
import numpy as np

# If possible, vectorize this transformation
def wordsToVector(words):
    """Look up the embedding of every in-vocabulary word in ``words``.

    Args:
        words: iterable of tokens.

    Returns:
        The result of indexing ``model`` with the list of known tokens
        (presumably a 2-D array with one row per token — confirm). When no
        token is in ``vocabulary`` the placeholder word "hello" is looked up
        instead, so the lookup never receives an empty list.
    """
    knownWords = [word for word in words if word in vocabulary]
    if not knownWords:
        knownWords = ["hello"]

    # `model` is already a KeyedVectors object; its `wv` alias is deprecated
    # (and gone in gensim 4.0), so index the model directly.
    return model[knownWords]

In [None]:
%%time
# Word vectors for every tokenized training comment.
w2vTrainFeatures = splitTrain.apply(wordsToVector)

In [None]:
%%time
# Word vectors for every tokenized test comment.
w2vTestFeatures = splitTest.apply(wordsToVector)

## Oversample

In [None]:
def oversample(dataset, features, label):
    """Duplicate the positive rows to counter class imbalance.

    The rows with ``dataset[label] == 1`` and their features are appended
    ``#negatives // #positives`` times to the original data.

    Args:
        dataset: DataFrame containing the 0/1 column ``label``.
        features: Series aligned with ``dataset`` by index label.
        label: name of the column to balance on.

    Returns:
        ``(datasetOversampled, featuresOversampled)``, both re-indexed from 0
        so that positions line up. The dataset keeps its previous index in an
        ``index`` column; the features Series keeps its name.
    """
    isPositive = dataset[label] == 1
    positiveCount = int(isPositive.sum())
    negativeCount = int((dataset[label] == 0).sum())

    # Without positive rows there is nothing to replicate, and the ratio
    # would otherwise divide by zero.
    # NOTE(review): the original rows are kept too, so positives end up with
    # multiples + 1 copies — confirm this slight over-representation is intended.
    multiples = negativeCount // positiveCount if positiveCount else 0

    datasetPositive = dataset[isPositive]
    # .loc makes the label-based alignment with the dataset rows explicit.
    featuresPositive = features.loc[datasetPositive.index]

    datasetOversampled = pd.concat([dataset] + multiples * [datasetPositive]).reset_index()
    # drop=True returns the re-indexed Series directly, whatever its name is.
    featuresOversampled = pd.concat([features] + multiples * [featuresPositive]).reset_index(drop=True)

    return datasetOversampled, featuresOversampled

In [None]:
# Oversample the training rows (and their word-vector features) on the "toxic" label.
trainOversampled, w2vTrainFeaturesOversampled = oversample(train, w2vTrainFeatures, "toxic")

## Helper function to get batches

In [None]:
# Is padding the words till position 250 with 0s the right approach?
def padWithZeros(array, seqLength=None, dimensions=None):
    """Pad with zero rows, or truncate, a 2-D array to a fixed number of rows.

    Args:
        array: 2-D array with one row per word.
        seqLength: number of rows in the result; defaults to the module-level
            ``maxSeqLength``.
        dimensions: number of columns in the result; defaults to the
            module-level ``numDimensions``.

    Returns:
        A new ``(seqLength, dimensions)`` float array holding the first
        ``seqLength`` rows of ``array`` followed by zero rows.
    """
    seqLength = maxSeqLength if seqLength is None else seqLength
    dimensions = numDimensions if dimensions is None else dimensions

    # Rows beyond seqLength are dropped; missing rows stay zero.
    keptRows = min(array.shape[0], seqLength)
    fullArray = np.zeros([seqLength, dimensions])
    fullArray[:keptRows, :] = array[:keptRows, :]
    return fullArray

In [None]:
# always remember to reset the seed before using getTrainBatch
# (getBatch draws its indices from NumPy's global random state, seeded here)
np.random.seed(4324)

# Check if vectorizing this one can improve performance
def getBatch(dataset, features, size, label="toxic"):
    """Draw a random batch of padded features and one-hot labels.

    Rows are sampled uniformly with replacement using NumPy's global random
    state.

    Args:
        dataset: DataFrame holding the label column.
        features: Series of per-comment word-vector arrays, positionally
            aligned with ``dataset``.
        size: number of examples to draw.
        label: name of the label column; defaults to "toxic".

    Returns:
        ``(features, labels)``: the padded feature arrays stacked along a new
        first axis, and a ``(size, 2)`` float array in which a zero label is
        encoded as ``[0., 1.]`` and any other label as ``[1., 0.]``.
    """
    indices = np.random.randint(0, len(features), size)

    # The drawn numbers are positions, so select with .iloc; plain [] would
    # treat them as index labels and only works for a default 0..n-1 index.
    batchFeatures = np.array(features.iloc[indices].apply(padWithZeros).tolist())

    # Vectorized one-hot encoding: column 0 flags non-zero labels, column 1 zero labels.
    isPositive = (dataset[label].iloc[indices] != 0).values.astype(float)
    labels = np.stack([isPositive, 1.0 - isPositive], axis=1)
    return batchFeatures, labels

def getTrainBatch(size):
    """Return a random batch of ``size`` examples from the oversampled training data."""
    return getBatch(
        dataset=trainOversampled,
        features=w2vTrainFeaturesOversampled,
        size=size,
    )

## Defining the RNN

In [None]:
# Number of units passed to the LSTM cell.
lstmUnits = 64
# Width of the one-hot label vectors and of the output layer.
numClasses = 2
# Fixed batch size baked into the placeholder shapes.
batchSize = 32

In [None]:
import tensorflow as tf

tf.reset_default_graph()

# Inputs: one-hot labels (getBatch encodes toxic as [1, 0] and non-toxic as
# [0, 1]) and a batch of padded word-vector sequences.
labels = tf.placeholder(tf.float32, [batchSize, numClasses], name="labels")
data = tf.placeholder(tf.float32, [batchSize, maxSeqLength, numDimensions], name="data")

# Single LSTM layer with dropout applied to its outputs.
# NOTE(review): the keep probability is a constant, so dropout stays active
# whenever this graph is run, including on test data.
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCellWithDropout = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCellWithDropout, data, dtype=tf.float32)

# Project the LSTM output at the final time step onto the class logits.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Streaming metrics: index [1] selects the update op, whose value is the
# metric accumulated over every batch it has been run on.
# tf.metrics.auc expects scores in [0, 1], so it receives the softmax
# probability of the toxic class together with the toxic indicator column,
# rather than raw logits scored against the inverted class index.
auc = tf.metrics.auc(labels[:, 0], tf.nn.softmax(prediction)[:, 0], name="auc")[1]
acc = tf.metrics.accuracy(tf.argmax(labels,1), tf.argmax(prediction, 1), name="accuracy")[1]

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels), name="loss")
optimizer = tf.train.AdamOptimizer().minimize(loss)

## Persist for tensorboard

In [None]:
import datetime

# Register the scalars to log and merge them into a single summary op.
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Area_under_roc', auc)
tf.summary.scalar("Accuracy", acc)
merged = tf.summary.merge_all()
# Every run logs to its own timestamped directory under tensorboard/.
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

## Training the model

In [None]:
%%time
saver = tf.train.Saver()
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
np.random.seed(4324)

epochs = 10
# Number of batches drawn per epoch. Batches are sampled at random, so an
# epoch is not an exact pass over the data.
samplesPerEpoch = int(trainOversampled.shape[0] / batchSize)

with tf.Session() as sess:
    sess.run(init)
    writer = tf.summary.FileWriter(logdir, sess.graph)

    # try/finally so the event file is closed even if training is interrupted.
    try:
        for epoch in range(epochs):
            print("Epoch #{}".format(epoch))
            for i in range(samplesPerEpoch):
                nextBatch, nextBatchLabels = getTrainBatch(batchSize)
                feed = {data: nextBatch, labels: nextBatchLabels}

                sess.run(optimizer, feed)

                #Save the network every 1,000 training iterations
                iteration = epoch * samplesPerEpoch + i

                if (iteration % 1000 == 0 and iteration != 0):
                    save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=iteration)
                    print("saved to %s" % save_path)

                #Write summary to Tensorboard
                if (iteration % 100 == 0 and iteration != 0):
                    # Fetch everything in one run: a single forward pass (one
                    # dropout mask), and the streaming acc/auc update ops are
                    # executed once per logged batch instead of twice.
                    summary, batchLoss, runningAcc, runningAuc = sess.run(
                        [merged, loss, acc, auc], feed)
                    print(f"Loss: {batchLoss}")
                    print(f"Acc: {runningAcc}")
                    print(f"Auc: {runningAuc}")
                    writer.add_summary(summary, iteration)
    finally:
        writer.close()

In [None]:
def getTestBatch(size):
    """Return a random batch of ``size`` examples from the test split."""
    return getBatch(
        dataset=test,
        features=w2vTestFeatures,
        size=size,
    )

In [None]:
%%time
saver = tf.train.Saver()
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

# `prediction` holds raw logits. The values collected here are later
# thresholded at 0.5 and complemented with `1 - ...`, which only makes sense
# for probabilities, so convert once with softmax (outside the loop, so the
# graph does not grow per batch).
probabilities = tf.nn.softmax(prediction)

preds = []

with tf.Session() as sess:
    sess.run(init)
    saver.restore(sess, "./models/pretrained_lstm.ckpt-74000")

    # NOTE(review): getTestBatch samples rows at random with replacement, so
    # these batches are not an exact pass over the test set — confirm intended.
    for i in range(int(test.shape[0]/batchSize)):
        if i % 100 == 0:
            print(i)
        nextBatch, nextBatchLabels = getTestBatch(batchSize)

        # Keep the class probabilities together with the toxic indicator (label column 0).
        preds.append((sess.run(probabilities, {data: nextBatch}), nextBatchLabels[:, 0]))

In [None]:
# Unzip the per-batch (predictions, labels) pairs into two tuples.
prd, labs = zip(*preds)

In [None]:
# Concatenate the per-batch prediction arrays along the first axis.
prdProbVec = np.concatenate(prd)

In [None]:
# Predict toxic (1) when the toxic score (column 0) beats the non-toxic score
# (column 1). Unlike a fixed 0.5 cut-off, this is correct whether the rows
# hold raw logits or softmax probabilities.
prdVec = np.where(prdProbVec[:, 0] > prdProbVec[:, 1], 1, 0)

In [None]:
# Concatenate the per-batch label arrays into one flat vector.
labsVec = np.concatenate(labs, axis=0)

In [None]:
%%time
saver = tf.train.Saver()
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

logloss = []
accuracy = []
auroc = []

with tf.Session() as sess:
    # Initializing also zeroes the local variables behind the streaming
    # acc/auc metrics before the checkpoint weights are restored.
    sess.run(init)
    saver.restore(sess, "./models/pretrained_lstm.ckpt-74000")

    for i in range(int(test.shape[0]/batchSize)):
        if i % 100 == 0:
            print(i)
        nextBatch, nextBatchLabels = getTestBatch(batchSize)
        # One run per batch: a single forward pass, so loss, accuracy and AUC
        # are all computed from the same predictions (one dropout mask)
        # instead of three independent passes.
        batchLoss, runningAcc, runningAuc = sess.run(
            [loss, acc, auc], {data: nextBatch, labels: nextBatchLabels})
        logloss.append(batchLoss)
        accuracy.append(runningAcc)
        auroc.append(runningAuc)

In [None]:
def lavg(l):
    """Return the arithmetic mean of the numbers in ``l``.

    Args:
        l: any iterable of numbers (lists, tuples, generators, ...).

    Returns:
        The mean, or NaN when ``l`` is empty (instead of raising
        ZeroDivisionError).
    """
    # Materialize once so generators work and the length is known.
    values = list(l)
    if not values:
        return float("nan")
    return sum(values) / len(values)

In [None]:
# The loss is computed per batch, so its mean over the batches is reported.
print(f"Loss: {lavg(logloss)}")
# acc and auc are streaming tf.metrics update ops: every recorded value already
# covers all batches seen so far, so the last entry is the final figure.
# Averaging the running values would over-weight the early batches.
print(f"Accuracy: {accuracy[-1]}")
print(f"AUC: {auroc[-1]}")

In [None]:
from evaluate_predictions import evaluatePredictions

# Score the collected test predictions with the project's evaluation helper.
# NOTE(review): `1 - prdProbVec` is only a complementary probability if
# prdProbVec holds values in [0, 1] — confirm what the prediction cell collects
# and which argument order evaluatePredictions expects.
evaluatePredictions(pd.Series(labsVec), prdVec, 1 - prdProbVec)