In [5]:
import pickle
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np

In [2]:
# Load the cleaned dataset produced earlier in the pipeline.
# NOTE(review): pickle.load can execute arbitrary code - only use it on files
# you created yourself or otherwise trust.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open('../data/clean-data.pkl', 'rb') as fp:
    df = pickle.load(fp)

In [7]:
# Pull the review body and the review summary columns out as plain Python lists.
text, summary = df.Text.tolist(), df.Summary.tolist()

### Tokenizing:
***

In [8]:
# Whitespace-tokenize every review and summary.
# Entries that are already token lists are passed through unchanged, so this
# cell can be re-run without raising AttributeError on the second execution.
text = [doc if isinstance(doc, list) else doc.split() for doc in text]
summary = [doc if isinstance(doc, list) else doc.split() for doc in summary]

### Mean-weighted and TF-IDF-weighted embedding vectorizers:
***

In [16]:
class MeanEmbeddingVectorizer(object):
    """Turn tokenized documents into vectors by averaging their word embeddings.

    Parameters
    ----------
    word2vec : mapping
        Maps a token to its embedding vector. Must contain at least one entry;
        the embedding size is read from the first value (assumes every vector
        in the mapping has that same length).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        first_vector = next(iter(word2vec.values()), None)
        if first_vector is None:
            # Without this guard an empty mapping leaks a bare StopIteration.
            raise ValueError("word2vec must contain at least one word vector")
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """No-op kept for scikit-learn-style usage; returns ``self``.

        ``y`` is accepted and ignored, and is optional so that ``fit(X)`` works.
        """
        return self

    def transform(self, X):
        """Return a numpy array with one averaged embedding per document in ``X``.

        ``X`` is an iterable of token sequences. Tokens missing from
        ``word2vec`` are skipped; a document with no known tokens becomes a
        zero vector of length ``self.dim``.
        """
        return np.array([self._embed(words) for words in X])

    def _embed(self, words):
        """Average the embeddings of the known tokens of a single document."""
        known = [self.word2vec[w] for w in words if w in self.word2vec]
        if not known:
            return np.zeros(self.dim)
        return np.mean(known, axis=0)

In [17]:
class TfidfEmbeddingVectorizer(object):
    """Turn tokenized documents into vectors by averaging IDF-scaled word embeddings.

    Parameters
    ----------
    word2vec : mapping
        Maps a token to its embedding vector. Must contain at least one entry;
        the embedding size is read from the first value (assumes every vector
        in the mapping has that same length).

    Attributes
    ----------
    word2weight : defaultdict or None
        Token -> IDF weight, populated by ``fit``; ``None`` until then.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        first_vector = next(iter(word2vec.values()), None)
        if first_vector is None:
            # Without this guard an empty mapping leaks a bare StopIteration.
            raise ValueError("word2vec must contain at least one word vector")
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """Learn per-token IDF weights from the tokenized documents ``X``.

        ``y`` is accepted and ignored, and is optional so that ``fit(X)`` works.
        Returns ``self``.
        """
        # The identity analyzer hands each (already tokenized) document to
        # TfidfVectorizer as-is instead of letting it re-tokenize.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return a numpy array with one IDF-weighted mean embedding per document.

        ``X`` is an iterable of token sequences. Tokens missing from
        ``word2vec`` are skipped; a document with no known tokens becomes a
        zero vector of length ``self.dim``. The fitted ``word2weight`` mapping
        is only read, never modified.
        """
        return np.array([self._embed(words) for words in X])

    def _weight(self, word):
        """Look up the weight of ``word`` without inserting it into ``word2weight``.

        Indexing a defaultdict with a missing key stores the default under that
        key; going through ``default_factory`` directly yields the same value
        while leaving the fitted mapping untouched.
        """
        weights = self.word2weight
        if word in weights:
            return weights[word]
        factory = getattr(weights, "default_factory", None)
        if factory is None:
            raise KeyError(word)
        return factory()

    def _embed(self, words):
        """Average the weight-scaled embeddings of the known tokens of one document."""
        scaled = [self.word2vec[w] * self._weight(w)
                  for w in words if w in self.word2vec]
        if not scaled:
            return np.zeros(self.dim)
        return np.mean(scaled, axis=0)

### Loading pre-trained model:
***

In [9]:
# Load the pre-trained GoogleNews vectors from the binary word2vec-format file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [10]:
# Preparing to convert to vectors: build a plain {word: vector} dict.
# gensim 4 dropped `KeyedVectors.wv` and renamed `index2word` -> `index_to_key`
# and `syn0` -> `vectors`, so try the current names first and fall back to the
# legacy ones on older releases.
_keyed = getattr(model, "wv", model)
_vocab = getattr(_keyed, "index_to_key", None)
if _vocab is None:
    _vocab = _keyed.index2word
_vectors = getattr(_keyed, "vectors", None)
if _vectors is None:
    _vectors = _keyed.syn0
w2v = dict(zip(_vocab, _vectors))

In [18]:
# Both vectorizers share the same word -> vector lookup table.
mean_vectorizer = MeanEmbeddingVectorizer(word2vec=w2v)
tfidf_vectorizer = TfidfEmbeddingVectorizer(word2vec=w2v)

In [20]:
# Fit the mean-embedding vectorizer on the review text (scores are passed as y).
mean_vectorizer.fit(X=text, y=df.Score)

<__main__.MeanEmbeddingVectorizer at 0x1aeb550a320>

In [25]:
# One averaged embedding per review text.
mean_vector_text = mean_vectorizer.transform(X=text)

In [24]:
# Fit the TF-IDF-weighted vectorizer on the review text (scores are passed as y).
tfidf_vectorizer.fit(X=text, y=df.Score)

<__main__.TfidfEmbeddingVectorizer at 0x1aeb550a2e8>

In [27]:
# One IDF-weighted embedding per review text.
tfidf_vector_text = tfidf_vectorizer.transform(X=text)

In [29]:
# Saving for modeling.
# Each `with` block closes its file on exit, so no explicit close() is needed.
with open('../data/mean_weighted_vector_text.pkl', 'wb') as fp:
    pickle.dump(mean_vector_text, fp)

with open('../data/tfidf_weighted_vector_text.pkl', 'wb') as fp:
    pickle.dump(tfidf_vector_text, fp)

In [30]:
# Drop the references to the text vectors now that they are saved to disk.
del mean_vector_text
del tfidf_vector_text

In [31]:
# Fit the mean-embedding vectorizer on the review summaries (scores are passed as y).
mean_vectorizer.fit(X=summary, y=df.Score)

<__main__.MeanEmbeddingVectorizer at 0x1aeb550a320>

In [32]:
# One averaged embedding per review summary.
mean_vector_summary = mean_vectorizer.transform(X=summary)

In [33]:
# Re-fit the TF-IDF-weighted vectorizer on the review summaries
# (this replaces the weights learned from the review text).
tfidf_vectorizer.fit(X=summary, y=df.Score)

<__main__.TfidfEmbeddingVectorizer at 0x1aeb550a2e8>

In [34]:
# One IDF-weighted embedding per review summary.
tfidf_vector_summary = tfidf_vectorizer.transform(X=summary)

In [35]:
# Saving for modeling.
# Each `with` block closes its file on exit, so no explicit close() is needed.
with open('../data/mean_weighted_vector_summary.pkl', 'wb') as fp:
    pickle.dump(mean_vector_summary, fp)

with open('../data/tfidf_weighted_vector_summary.pkl', 'wb') as fp:
    pickle.dump(tfidf_vector_summary, fp)

***