# Word2Vec then RNN

## Load data

In [None]:
import pandas as pd
from os import path

# Directory holding the preprocessed CSV splits.
importDirectory = "./data/preprocessed-train-test/"

# Read the four CSV files in a fixed order and bind each frame to its own name.
train, test, data, contestTest = [
    pd.read_csv(path.join(importDirectory, csvName))
    for csvName in ("train.csv", "test.csv", "all.csv", "contest-test.csv")
]

In [None]:
# Report the shape of every loaded frame, in loading order.
print("train: {}, test: {}, all: {}, contestTest: {}".format(
    *(frame.shape for frame in (train, test, data, contestTest))))

In [None]:
def splitSentences(dataset):
    """Tokenize the ``comment_text`` column into lists of lowercase words.

    Every character that is neither an ASCII letter nor whitespace is
    removed, the remaining text is lowercased and split on whitespace.

    Args:
        dataset: DataFrame with a ``comment_text`` column.

    Returns:
        Series aligned with ``dataset`` holding one list of tokens per row;
        a missing comment yields an empty list.
    """
    return (dataset.comment_text
            # Missing comments would otherwise stay NaN and break len() or
            # iteration over the tokens later on.
            .fillna("")
            # regex=True is required: pandas >= 2.0 matches the pattern
            # literally by default, which would strip nothing.
            .str.replace(r"[^A-Za-z\s]", "", regex=True)
            .str.lower()
            .str.split())

In [None]:
%%time
# Tokenize both splits with the same cleaning rules.
splitTrain, splitTest = map(splitSentences, (train, test))

## Sentence lengths

In [None]:
# Number of tokens in each tokenized training comment.
sentenceLengths = splitTrain.apply(len)
# Last expression of the cell: the notebook displays the summary statistics.
sentenceLengths.describe()

In [None]:
import matplotlib.pyplot as plt

# Box plot of the per-comment token counts.
sentenceLengths.plot.box()
plt.show()

In [None]:
# Histogram of the per-comment token counts, using 100 bins.
sentenceLengths.hist(bins=100)
plt.show()

## Load Word2Vec dictionary

In [None]:
%%time
import gensim

# Load pretrained word vectors from a file in the word2vec text format.
# NOTE(review): the file name suggests 50-dimensional GloVe 6B vectors that
# were converted to word2vec format beforehand — confirm.
model = gensim.models.KeyedVectors.load_word2vec_format("./external-models/glove.6B/w2v.glove.6B.50.txt")

In [None]:
%%time
# Set of every word that has a vector, used for fast membership tests.
# KeyedVectors exposes its vocabulary as `key_to_index` in gensim >= 4 and as
# `vocab` in gensim 3.x. The deprecated `.wv` self-alias is avoided because it
# no longer exists on KeyedVectors in gensim 4.
try:
    vocabulary = set(model.key_to_index)
except AttributeError:
    vocabulary = set(model.vocab)

## Transform sentences to sequences of vectors

In [None]:
# Displayed by the notebook: shape of the tokenized training Series.
splitTrain.shape

In [None]:
# Sanity check: looking up several words at once is expected to stack one
# vector per word, so the displayed shape shows (words, dimensions).
# The KeyedVectors object is indexed directly; its `.wv` self-alias is
# deprecated in gensim 3.x and gone in gensim 4.
model["hello", "there"].shape

In [None]:
%%time
import numpy as np

def wordsToVector(words):
    """Look up the embedding of every known word in a tokenized sentence.

    Words that are not in ``vocabulary`` are dropped. If none remain, the
    vector of "hello" is used as a placeholder so the lookup is never made
    with an empty word list.

    Args:
        words: Iterable of word strings.

    Returns:
        The result of indexing ``model`` with the list of kept words
        (for gensim KeyedVectors, one stacked vector per word).
    """
    knownWords = [word for word in words if word in vocabulary]
    if not knownWords:
        knownWords = ["hello"]

    # Index the KeyedVectors directly: the `.wv` self-alias is deprecated in
    # gensim 3.x and no longer exists on KeyedVectors in gensim 4.
    return model[knownWords]

In [None]:
%%time
# Convert every tokenized training comment with wordsToVector (one result per comment).
w2vTrainFeatures = splitTrain.apply(wordsToVector)

In [None]:
%%time
# Convert every tokenized test comment with wordsToVector (one result per comment).
w2vTestFeatures = splitTest.apply(wordsToVector)

## Helper to get batches

In [None]:
# Seed NumPy's global random state. getTrainBatch draws from it whenever no
# explicit random_state is passed, so reset the seed before a run that must be
# reproducible.
np.random.seed(4324)

def getTrainBatch(size, random_state=None):
    """Return a random sample of ``size`` entries from ``w2vTrainFeatures``.

    Args:
        size: Number of training examples to draw (without replacement).
        random_state: Optional seed or random generator forwarded to
            ``Series.sample``. When left as ``None`` the sample is drawn from
            NumPy's global random state, exactly as before this parameter
            existed.

    Returns:
        Series with the sampled entries, keeping their original index labels.
    """
    return w2vTrainFeatures.sample(size, random_state=random_state)