# Word2Vec using TF-IDF weights to sum word vectors

## Load data

In [None]:
import pandas as pd
from os import path

# Read every preprocessed CSV split from the import directory; the file names
# are listed in the same order as the variables they are unpacked into.
importDirectory = "./data/preprocessed-train-test/"
csvFilenames = ["train.csv", "test.csv", "all.csv", "contest-test.csv"]

train, test, data, contestTest = (
    pd.read_csv(path.join(importDirectory, csvName))
    for csvName in csvFilenames
)

In [None]:
# Report the (rows, columns) shape of each loaded frame as a quick sanity check.
frameShapes = (train.shape, test.shape, data.shape, contestTest.shape)
print("train: {}, test: {}, all: {}, contestTest: {}".format(*frameShapes))

In [None]:
def cleanSentences(dataset):
    return (dataset.comment_text
    .str.replace("[^A-Za-z\s]", "")
    .str.lower())

In [None]:
%%time
# Apply the same text normalisation to the train and test splits.
cleanTrain, cleanTest = (cleanSentences(split) for split in (train, test))

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF statistics from the cleaned training text only.
tfidf = TfidfVectorizer()
vectorizer = tfidf.fit(cleanTrain)

In [None]:
# Map each vocabulary term of the fitted vectorizer to its learned IDF weight.
# ``get_feature_names`` was removed from scikit-learn in favour of
# ``get_feature_names_out``; use the new accessor when available and fall back
# to the old one on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = vectorizer.get_feature_names_out()
else:
    featureNames = vectorizer.get_feature_names()

word2idf = dict(zip(featureNames, vectorizer.idf_))

## Word2Vec

In [None]:
# Whitespace-tokenise every cleaned comment into a list of words.
splitTrain, splitTest = (
    cleaned.str.split() for cleaned in (cleanTrain, cleanTest)
)

In [None]:
%%time
import gensim

# Load pre-trained word vectors stored in word2vec text format (the file name
# suggests GloVe 6B, 300 dimensions, converted to that format).
embeddingsPath = "./external-models/glove.6B/w2v.glove.6B.300.txt"
model = gensim.models.KeyedVectors.load_word2vec_format(embeddingsPath)

In [None]:
%%time
# Set of every word that has an embedding, for O(1) membership tests.
# ``model`` is already a KeyedVectors instance, so the ``.wv`` indirection is
# unnecessary, and ``.vocab`` was replaced by ``key_to_index`` in gensim 4.
# Prefer the new attribute and fall back to ``.vocab`` on gensim 3.x.
wordIndex = getattr(model, "key_to_index", None)
if wordIndex is None:
    wordIndex = model.vocab

vocabulary = set(wordIndex)

In [None]:
%%time
import numpy as np

def wordsToVector(words):
    """Collapse a tokenised comment into one IDF-weighted sum of word vectors.

    Tokens absent from the module-level ``vocabulary`` are dropped.  Each
    remaining token contributes its embedding scaled by its IDF weight from
    ``word2idf`` (0 when the TF-IDF vectorizer never saw the word).  Repeated
    tokens contribute once per occurrence.

    Args:
        words: Iterable of word strings for a single comment.

    Returns:
        A 1-D numpy array: the weighted sum of the selected word vectors.
    """
    knownWords = [word for word in words if word in vocabulary]
    # With no known word there is nothing to sum, so a fixed placeholder token
    # is embedded instead to keep the output shape consistent.
    # NOTE(review): this assumes "hello" exists in the embedding model, and it
    # gives such comments a non-zero vector whenever "hello" has an IDF weight.
    leftWords = knownWords if knownWords else ["hello"]
    idfWeights = [word2idf.get(word, 0.) for word in leftWords]

    # Index the KeyedVectors directly: its ``.wv`` alias was removed in
    # gensim 4, while direct indexing works on both old and new releases.
    return np.matmul(idfWeights, model[leftWords])

In [None]:
%%time
# Embed each tokenised training comment, then stack the per-comment vectors
# into a single array with one row per comment.
trainVectors = splitTrain.apply(wordsToVector)
w2vTrainFeatures = np.array(trainVectors.tolist())

In [None]:
%%time
# Embed each tokenised test comment, then stack the per-comment vectors into a
# single array with one row per comment.
testVectors = splitTest.apply(wordsToVector)
w2vTestFeatures = np.array(testVectors.tolist())

## Evaluate with logistic regression

In [None]:
from feature_evaluation_logistic_regression import evaluateFeaturesWithLogisticRegression

# Pass the feature matrices and the ``toxic`` label column of both splits to
# the project's logistic-regression evaluation helper.
trainLabels, testLabels = train["toxic"], test["toxic"]
evaluateFeaturesWithLogisticRegression(
    w2vTrainFeatures, w2vTestFeatures, trainLabels, testLabels)