# Word2Vec then RNN

## Imports and setting up TensorBoard

In [1]:
from data_preparation import *

## Loading data and setting up preprocessing tools

In [2]:
# Load the four data frames used throughout the notebook; loadData prints
# their shapes (see the cell output). loadData is presumably provided by the
# data_preparation star import above - confirm.
train, test, allData, contestTest = loadData()

Loading datasets...
train: (127656, 8), test: (31915, 8), allData: (159571, 8), contestTest: (153164, 2)


### Split sentences and apply word correction & splitting

In [5]:
%%time
from sklearn.pipeline import Pipeline

maxSeqLength = 250

splitter = SentenceSplitter("comment_text")
wordEmbedding = loadWordEmbedding("../../state/external-models/glove.6B/saved-2300.bin")

allWords = splitter.transform(pd.concat([allData, contestTest]))
textWords = set([word for sentence in allWords for word in sentence])
missingWords = textWords - set(list(wordEmbedding.vocab.keys()))

missingWordsResolutionDict = loadObject("./missingWordsResolution.plk")
missingWordsResolver = MissingWordsResolver(missingWordsResolutionDict, missingWords)

splittingPipeline = Pipeline(steps=[
    ("split", splitter),
    ("resolveMissingWords", missingWordsResolver) ])

Splitting sentences...
CPU times: user 18 s, sys: 720 ms, total: 18.8 s
Wall time: 18.8 s


### W2I and zero padding

In [6]:
%%time
resolvedWords = missingWordsResolver.transform(allWords)

w2i = Word2Int(resolvedWords)
zeroPadder = ZeroPadder(maxSeqLength)

indexingPipeline = Pipeline(steps=[
    ("w2i", w2i),
    ("zeroPadding", zeroPadder)
])

Resolving missing words...
Loading w2i and i2w dictionaries...
CPU times: user 7.9 s, sys: 68 ms, total: 7.97 s
Wall time: 8.44 s


In [7]:
# End-to-end preparation: the splitting stage followed by the indexing stage.
preparationPipeline = Pipeline(
    steps=[("splittingPipeline", splittingPipeline), ("indexingPipeline", indexingPipeline)]
)

### Word2Vec (W2V) embedding lookup

In [8]:
%%time
w2v = Word2Vec(wordEmbedding, 300, w2i.i2w, seed=4324)

Loading word2vec dictionary...
CPU times: user 88 ms, sys: 4 ms, total: 92 ms
Wall time: 88.5 ms


In [9]:
# Unpack the embedding matrix and i2v; s2v later indexes i2v with the integer
# ids stored in w2i.w2i (so i2v presumably maps word index -> vector).
embeddingMatrix, i2v = w2v.embeddingMatrix()

## Sentence to average vector 

In [87]:
def s2v(sentence):
    """Return the average embedding vector of the words in ``sentence``.

    Each word is mapped to its integer id through ``w2i.w2i`` and then to its
    vector through ``i2v``; the vectors are summed and divided by the number
    of words, so the result does not grow with sentence length.

    Parameters
    ----------
    sentence : iterable of str
        Words of a single comment. Every word must be a key of ``w2i.w2i``.

    Returns
    -------
    The element-wise mean of the word vectors, or ``0`` for an empty sentence
    (the value the previous sum-based implementation returned in that case).

    Raises
    ------
    KeyError
        If a word is missing from ``w2i.w2i`` (or its id from ``i2v``).
    """
    # NOTE(review): assumes the values in i2v support `+` and `/` (e.g. numpy
    # arrays) - confirm against Word2Vec.embeddingMatrix().
    vectors = [i2v[w2i.w2i[word]] for word in sentence]
    if not vectors:
        # Nothing to average; keep the historical result for empty input.
        return 0
    return sum(vectors) / len(vectors)

In [88]:
# Run only the splitting stage (split + resolve missing words) on the training
# set; the indexing/padding stage is not applied because s2v works on words.
trainF = splittingPipeline.transform(train)

Splitting sentences...
Resolving missing words...


In [105]:
# Run only the splitting stage (split + resolve missing words) on the test
# set; the indexing/padding stage is not applied because s2v works on words.
testF = splittingPipeline.transform(test)

Splitting sentences...
Resolving missing words...


In [92]:
# Put the `toxic` label column next to the split training comments.
trainFull = pd.concat([trainF, train["toxic"]], axis="columns")

In [104]:
# Keep only rows whose comment_text has a non-zero length, so s2v always
# receives at least one word.
trainNonEmpty = trainFull.loc[trainFull["comment_text"].str.len().ne(0)]

In [106]:
# Put the `toxic` label column next to the split test comments.
testFull = pd.concat([testF, test["toxic"]], axis="columns")

In [107]:
# Keep only rows whose comment_text has a non-zero length, so s2v always
# receives at least one word.
testNonEmpty = testFull.loc[testFull["comment_text"].str.len().ne(0)]

In [114]:
# One s2v vector per non-empty training comment, collected into a single array.
trainSentences = np.array([s2v(words) for words in trainNonEmpty["comment_text"]])

In [115]:
# One s2v vector per non-empty test comment, collected into a single array.
testSentences = np.array([s2v(words) for words in testNonEmpty["comment_text"]])

## Checkpoint: ExtraTrees classifier on sentence vectors

In [None]:
%%time
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier(n_estimators=300, n_jobs=-1)

model.fit(trainSentences, trainNonEmpty.toxic)

predictions = model.predict_proba(testSentences)

from evaluate_predictions import evaluatePredictions

binaryPredictions = np.where(predictions[:, 1] > 0.5, 1, 0)
labels = testNonEmpty.toxic

evaluatePredictions(pd.Series(labels), binaryPredictions, predictions)

In [None]:
# Score the model on its own training data, for comparison with the test-set
# evaluation. NOTE: `labels` is rebound here from the test labels to the
# training labels.
labels = trainNonEmpty.toxic

predTrain = model.predict_proba(trainSentences)

# Hard 0/1 predictions from column 1 of the probabilities at a 0.5 threshold.
binaryPredictionsTrain = np.where(predTrain[:, 1] > 0.5, 1, 0)

evaluatePredictions(pd.Series(labels), binaryPredictionsTrain, predTrain)