# Word2Vec then RNN

## Load data

In [None]:
import pandas as pd
from os import path

importDirectory = "./data/preprocessed-train-test/"

# Read the four preprocessed CSV files from importDirectory, one DataFrame
# per file, in the order the names are unpacked on the left-hand side.
train, test, data, contestTest = (
    pd.read_csv(path.join(importDirectory, csvName))
    for csvName in ("train.csv", "test.csv", "all.csv", "contest-test.csv")
)

In [None]:
# Show the (rows, columns) shape of every loaded frame on one line.
shapeReport = "train: {}, test: {}, all: {}, contestTest: {}"
print(shapeReport.format(train.shape, test.shape, data.shape, contestTest.shape))

In [None]:
def splitSentences(dataset):
    """Tokenise the ``comment_text`` column of *dataset*.

    Every character that is neither an ASCII letter nor whitespace is
    removed, the remainder is lower-cased and split on whitespace.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a string column named ``comment_text``.

    Returns
    -------
    pandas.Series
        One list of lower-case word tokens per row.
    """
    return (dataset.comment_text
            # regex=True is required: pandas >= 2.0 treats the pattern as a
            # literal string by default, which would strip nothing. The raw
            # string avoids the invalid "\s" escape in a normal literal.
            .str.replace(r"[^A-Za-z\s]", "", regex=True)
            .str.lower()
            .str.split())

In [None]:
%%time
# Tokenise the comments of the train and test frames with splitSentences.
splitTrain, splitTest = map(splitSentences, (train, test))

## Sentence lengths

In [None]:
# Number of tokens in every training comment, then its summary statistics.
sentenceLengths = splitTrain.map(len)
sentenceLengths.describe()

In [None]:
import matplotlib.pyplot as plt

# Box plot of the per-comment token counts.
sentenceLengths.plot(kind="box")
plt.show()

In [None]:
# Histogram of the per-comment token counts, using 100 bins.
sentenceLengths.hist(bins=100)
plt.show()

In [None]:
# Fixed number of time steps per comment: padWithZeros truncates longer
# sequences to this many rows and zero-pads shorter ones up to it.
maxSeqLength = 250

## Load Word2Vec dictionary

In [None]:
%%time
import gensim

# GloVe vectors stored in word2vec text format, loaded as gensim KeyedVectors.
embeddingPath = "./external-models/glove.6B/w2v.glove.6B.50.txt"
model = gensim.models.KeyedVectors.load_word2vec_format(embeddingPath)

In [None]:
# Width of one word vector; sizes the zero-padding buffer in padWithZeros and
# the last dimension of the input placeholder.
# NOTE(review): presumably has to equal the dimensionality of the embedding
# file loaded above ("...6B.50.txt") - confirm if the file is swapped.
numDimensions = 50

In [None]:
%%time
# Set of all words that have an embedding, for O(1) membership tests.
# gensim >= 4.0 exposes the word -> index mapping as ``key_to_index`` and
# removed both ``KeyedVectors.vocab`` and the ``.wv`` alias on KeyedVectors;
# older releases only have ``vocab``. Support both.
vocabIndex = getattr(model, "key_to_index", None)
if vocabIndex is None:
    vocabIndex = model.vocab
vocabulary = set(vocabIndex)

## Transform sentences to sequences of vectors

In [None]:
%%time
import numpy as np

def wordsToVector(words):
    """Look up the embeddings of the in-vocabulary words of one sentence.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single sentence.

    Returns
    -------
    The result of indexing ``model`` with the list of words that are in
    ``vocabulary`` (original order kept). If none of the words is known,
    the single word "hello" is looked up instead so the result is never
    empty.
    """
    knownWords = [word for word in words if word in vocabulary]
    if not knownWords:
        # Placeholder token for sentences without any known word.
        knownWords = ["hello"]

    # Index the KeyedVectors object directly: the ``.wv`` alias on
    # KeyedVectors is deprecated in gensim 3.x and removed in gensim 4.
    return model[knownWords]

In [None]:
%%time
# Map every tokenised training comment to the array of its word vectors.
w2vTrainFeatures = splitTrain.apply(wordsToVector)

In [None]:
%%time
# Map every tokenised test comment to the array of its word vectors.
w2vTestFeatures = splitTest.apply(wordsToVector)

## Helper function to get batches

In [None]:
def padWithZeros(array):
    """Fit a 2-D array into a fixed (maxSeqLength, numDimensions) frame.

    The first ``maxSeqLength`` rows of *array* are copied to the top of a
    zero-filled matrix; surplus rows are dropped and missing rows stay zero.
    """
    keptRows = min(array.shape[0], maxSeqLength)
    padded = np.zeros([maxSeqLength, numDimensions])
    padded[:keptRows, :] = array[:keptRows, :]
    return padded

In [None]:
np.random.seed(4324) # always remember to reset the seed before using getTrainBatch

def getTrainBatch(size):
    """Draw a random training batch (sampling with replacement).

    Parameters
    ----------
    size : int
        Number of examples to draw.

    Returns
    -------
    features : numpy.ndarray
        One padWithZeros-padded matrix per drawn example, stacked along
        the first axis.
    labels : numpy.ndarray
        Float array of shape (size, 2): ``[0., 1.]`` where ``train.toxic``
        equals 0 and ``[1., 0.]`` otherwise.
    """
    indices = np.random.randint(0, len(w2vTrainFeatures), size)

    # The drawn numbers are row positions, so select with .iloc; plain
    # ``series[indices]`` is a label lookup and only works by accident when
    # the index happens to be the default 0..n-1 range.
    features = np.array(
        [padWithZeros(vectors) for vectors in w2vTrainFeatures.iloc[indices]])

    # Vectorised one-hot encoding: column 0 = toxic, column 1 = not toxic.
    isClean = train.toxic.iloc[indices].values == 0
    labels = np.column_stack((~isClean, isClean)).astype(np.float64)
    return features, labels

## Defining the RNN

In [None]:
# Examples per training step; also the fixed first dimension of the
# `labels` and `data` placeholders.
batchSize = 100
# Unit count passed to BasicLSTMCell.
lstmUnits = 64
# Width of the one-hot label vectors built in getTrainBatch/getTestBatch.
numClasses = 2
# Number of training steps (one batch per step).
iterations = 10000
# Re-declares the value already assigned earlier in the notebook (same 250).
maxSeqLength = 250

In [None]:
import tensorflow as tf

# Clear the default graph before (re)building the model in this cell.
tf.reset_default_graph()

# Inputs. Labels are one-hot rows as produced by getTrainBatch:
# [1, 0] = toxic, [0, 1] = not toxic.
# NOTE: `data` rebinds the name that previously held the "all.csv" DataFrame.
labels = tf.placeholder(tf.float32, [batchSize, numClasses], name="labels")
data = tf.placeholder(tf.float32, [batchSize, maxSeqLength, numDimensions], name="data")

# Single LSTM layer with dropout applied to its outputs.
# NOTE(review): the keep probability is a constant, so dropout is also active
# whenever this same graph is run for evaluation/prediction.
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCellWithDropout = tf.contrib.rnn.DropoutWrapper(
    cell=lstmCell, 
    output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCellWithDropout, data, dtype=tf.float32)

# Output projection from the LSTM output to the class logits.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# Swap the first two axes (batch-major -> time-major, assuming dynamic_rnn's
# default output layout) so that the last time step can be picked with one
# gather along the first axis.
# NOTE(review): inputs are zero-padded at the end (padWithZeros) and no
# sequence_length is passed to dynamic_rnn, so for short comments `last` is
# the output after the padding steps.
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

# Fraction of examples whose arg-max logit matches the arg-max of the label.
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1), name="correctedPrediction")
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32), name="accuracy")

# Mean softmax cross-entropy over the batch (`prediction` holds raw logits),
# minimised with Adam at its default settings.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels), name="loss")
optimizer = tf.train.AdamOptimizer().minimize(loss)

## Persist for tensorboard

In [None]:
import datetime

# Register loss and accuracy as scalar summaries and merge them into one op.
for summaryTag, summaryTensor in (("Loss", loss), ("Accuracy", accuracy)):
    tf.summary.scalar(summaryTag, summaryTensor)
merged = tf.summary.merge_all()

# One log directory per run, named after the current timestamp.
runStamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = "tensorboard/" + runStamp + "/"

## Training the model

In [None]:
%%time
import os

saver = tf.train.Saver()

# Saver.save() does not create missing parent directories; without this the
# first checkpoint (after 2,000 steps of training) would fail.
os.makedirs("models", exist_ok=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(logdir, sess.graph)

    try:
        for i in range(iterations):
            nextBatch, nextBatchLabels = getTrainBatch(batchSize)
            feed = {data: nextBatch, labels: nextBatchLabels}
            sess.run(optimizer, feed)

            # Every 100 steps: print the batch accuracy and write the merged
            # summary to TensorBoard. Both are fetched in a single run so
            # they come from the same forward pass (same dropout mask).
            if i % 100 == 0:
                print(i)
                batchAccuracy, summary = sess.run([accuracy, merged], feed)
                print(batchAccuracy)
                writer.add_summary(summary, i)

            # Save the network every 2,000 training iterations
            if i % 2000 == 0 and i != 0:
                save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
                print("saved to %s" % save_path)
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.close()

In [None]:
def getTestBatch(i):
    """Return the block of ``batchSize`` consecutive test rows starting at row *i*.

    Parameters
    ----------
    i : int
        Position of the first test row of the batch (a row offset, not a
        batch number).

    Returns
    -------
    features : numpy.ndarray
        One padWithZeros-padded matrix per row, stacked along the first axis.
    labels : numpy.ndarray
        Float array with two columns: ``[0., 1.]`` where ``test.toxic``
        equals 0 and ``[1., 0.]`` otherwise.
    """
    # batchSize instead of a literal 100: the batch must match the first
    # dimension of the input placeholders, which is defined by batchSize.
    rows = slice(i, i + batchSize)

    features = np.array(
        [padWithZeros(vectors) for vectors in w2vTestFeatures.iloc[rows]])

    # Vectorised one-hot encoding: column 0 = toxic, column 1 = not toxic.
    isClean = test.toxic.iloc[rows].values == 0
    labels = np.column_stack((~isClean, isClean)).astype(np.float64)
    return features, labels

In [None]:
%%time
with tf.Session() as sess:
    saver.restore(sess, "./models/pretrained_lstm.ckpt-8000")

    def accFromBatch(i):
        """Return the logits (`prediction`) for the i-th test batch."""
        if i % 100 == 0:
            print(i)
        # getTestBatch expects a row offset. Passing the batch number itself
        # produced windows overlapping by 99 rows, so only rows 0..398 were
        # ever scored. The labels are not needed to compute `prediction`.
        testBatch, _ = getTestBatch(i * batchSize)
        return sess.run(prediction, {data: testBatch})

    # At most 300 batches, and only complete ones: the input placeholder has
    # a fixed first dimension of batchSize.
    # NOTE(review): the graph applies dropout with a constant keep
    # probability, so these predictions are computed with dropout active.
    numTestBatches = min(300, len(w2vTestFeatures) // batchSize)
    preds = [accFromBatch(i) for i in range(numTestBatches)]