In [8]:
from FeatureGenerator import *
import pandas as pd
import numpy as np
import pickle
import gensim
from sklearn.preprocessing import normalize
from helpers import *

class Word2VecFeatureGenerator(FeatureGenerator):
    """Generate and cache word2vec-based features for headline/body pairs.

    ``process`` builds, for every row of the input frame, an L2-normalised
    sum of the word vectors of the headline, the same for the article body,
    and a similarity score between the two (via ``cosine_sim``). The results
    are pickled into ``train.*.word2vec.pkl`` / ``test.*.word2vec.pkl`` files
    in the current working directory. ``read`` loads those files back.
    """

    def __init__(self, name='word2vecFeatureGenerator'):
        super(Word2VecFeatureGenerator, self).__init__(name)

    @staticmethod
    def _sum_word_vectors(model, tokens, dim):
        """Return the element-wise sum of the vectors of all in-vocabulary tokens.

        Tokens missing from ``model`` are skipped. When no token is in the
        vocabulary the result is a zero vector of length ``dim``.
        """
        total = np.zeros(dim)
        for token in tokens:
            if token in model:
                total = total + model[token]
        return total

    @staticmethod
    def _save_split(features, n_train, n_test, file_tag, label):
        """Pickle the first ``n_train`` rows as train and, if any, the rest as test.

        Files are named ``train.<file_tag>.word2vec.pkl`` and
        ``test.<file_tag>.word2vec.pkl``; ``label`` is only used in log lines.
        """
        train_path = "train.%s.word2vec.pkl" % file_tag
        with open(train_path, "wb") as outfile:
            pickle.dump(features[:n_train], outfile, -1)
        print('%s features of training set saved in %s' % (label, train_path))

        if n_test > 0:
            # test set is available
            test_path = "test.%s.word2vec.pkl" % file_tag
            with open(test_path, "wb") as outfile:
                pickle.dump(features[n_train:], outfile, -1)
            print('%s features of test set saved in %s' % (label, test_path))

    def process(self, df):
        """Compute word2vec features for every row of ``df`` and pickle them.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the columns ``"Headline"``, ``"articleBody"`` and
            ``"target"``. Rows with a non-null ``target`` are counted as
            training rows, rows with a null ``target`` as test rows. Two
            columns (``Headline_unigram_vec``, ``articleBody_unigram_vec``)
            are added to ``df`` in place.

        Returns
        -------
        int
            Always ``1``.

        Notes
        -----
        The train/test split is positional: the first ``n_train`` rows are
        written as train and the remainder as test.
        NOTE(review): this assumes all labelled rows precede the unlabelled
        ones in ``df`` -- confirm against the caller.
        """
        print('generating word2vec features')
        # presumably preprocess_data returns a list of tokens -- it is iterated
        # token by token below; verify against helpers.
        df["Headline_unigram_vec"] = df["Headline"].map(lambda x: preprocess_data(x, exclude_stopword=False, stem=False))
        df["articleBody_unigram_vec"] = df["articleBody"].map(lambda x: preprocess_data(x, exclude_stopword=False, stem=False))

        has_target = ~df['target'].isnull()
        n_train = df[has_target].shape[0]
        print('Word2VecFeatureGenerator: n_train:',n_train)
        n_test = df[~has_target].shape[0]
        print('Word2VecFeatureGenerator: n_test:',n_test)

        # 1). document vector built by summing all the word vectors
        # using Google's pre-trained word vectors
        model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        print('model loaded')
        # Use the model's own dimensionality rather than a hard-coded width;
        # fall back to 300 (the width of the GoogleNews vectors) if unavailable.
        dim = getattr(model, 'vector_size', 300)

        Headline_unigram_array = df['Headline_unigram_vec'].values
        print('Headline_unigram_array:')
        print(Headline_unigram_array)
        print(Headline_unigram_array.shape)
        print(type(Headline_unigram_array))

        headlineVec = np.array([self._sum_word_vectors(model, tokens, dim)
                                for tokens in Headline_unigram_array])
        print('headlineVec:')
        print(headlineVec)
        print('type(headlineVec)')
        print(type(headlineVec))
        print(headlineVec.shape)
        # Row-wise normalisation (sklearn's default norm is L2).
        headlineVec = normalize(headlineVec)
        print('headlineVec')
        print(headlineVec)
        print(headlineVec.shape)

        self._save_split(headlineVec, n_train, n_test, 'headline', 'headline word2vec')
        print('headine done')

        Body_unigram_array = df['articleBody_unigram_vec'].values
        print('Body_unigram_array:')
        print(Body_unigram_array)
        print(Body_unigram_array.shape)
        bodyVec = np.array([self._sum_word_vectors(model, tokens, dim)
                            for tokens in Body_unigram_array])
        print(bodyVec)
        print(bodyVec.shape)

        bodyVec = normalize(bodyVec)
        print('bodyVec')
        print(bodyVec)
        print(bodyVec.shape)

        self._save_split(bodyVec, n_train, n_test, 'body', 'body word2vec')
        print('body done')

        # compute cosine similarity between headline/body word2vec features,
        # one score per row actually present (no hard-coded row count)
        res = [cosine_sim(h_vec, b_vec) for h_vec, b_vec in zip(headlineVec, bodyVec)]
        simVec = np.asarray(res)[:, np.newaxis]
        print('simVec.shape:')
        print(simVec.shape)

        self._save_split(simVec, n_train, n_test, 'sim', 'word2vec sim.')

        return 1

    def read(self, header='train'):
        """Load the pickled features for ``header`` (``'train'`` or ``'test'``).

        Reads ``<header>.headline.word2vec.pkl``, ``<header>.body.word2vec.pkl``
        and ``<header>.sim.word2vec.pkl`` from the current working directory,
        prints their shapes, and additionally saves the headline and body
        arrays with ``np.save`` under ``word2vec_headline_body_<header>``.

        Returns
        -------
        list
            ``[headlineVec, bodyVec, simVec]``.
        """
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # files produced by process() or from an otherwise trusted source.
        loaded = []
        for file_tag in ('headline', 'body', 'sim'):
            filename = "%s.%s.word2vec.pkl" % (header, file_tag)
            with open(filename, "rb") as infile:
                loaded.append(pickle.load(infile))
        headlineVec, bodyVec, simVec = loaded

        print('headlineVec.shape:')
        print(headlineVec.shape)
        print('bodyVec.shape:')
        print(bodyVec.shape)
        print('simVec.shape:')
        print(simVec.shape)
        # Name the dump after the split being read so that reading the test
        # split does not overwrite the train dump.
        np.save('word2vec_headline_body_%s' % header, [headlineVec, bodyVec])
        return [headlineVec, bodyVec, simVec]



In [9]:
# Create the feature generator with its default name ('word2vecFeatureGenerator').
wv = Word2VecFeatureGenerator()

In [10]:
# Load the pickled training-set features (train.*.word2vec.pkl); the returned
# [headlineVec, bodyVec, simVec] list is echoed as this cell's output.
wv.read('train')

headlineVec.shape:
(49972, 300)
bodyVec.shape:
(49972, 300)
simVec.shape:
(49972, 1)


[array([[  5.35259352e-02,   1.32955500e-02,   9.87021049e-02, ...,
          -5.21376449e-02,   7.17105366e-02,  -4.89572585e-03],
        [  7.50404004e-02,   3.59492218e-02,  -9.60042341e-05, ...,
          -5.12837163e-02,   4.00992231e-02,   1.19219803e-02],
        [  9.64843133e-02,  -1.04772821e-02,  -5.52340203e-02, ...,
          -2.13550719e-02,   5.99560063e-02,  -6.44642214e-03],
        ..., 
        [  7.17788898e-02,  -1.76894229e-02,   7.37191665e-02, ...,
          -1.04984853e-02,   9.00299718e-02,  -7.46042050e-03],
        [ -1.37780814e-02,   5.00387097e-02,   1.75331500e-02, ...,
          -4.76819626e-02,   5.50542048e-02,  -4.90198838e-02],
        [  2.13743771e-02,   8.42956669e-02,   4.20932043e-03, ...,
          -1.33208890e-01,  -1.60494717e-02,   4.23692253e-02]]),
 array([[ 0.03247191,  0.03348987,  0.04878961, ..., -0.03635329,
          0.04725184, -0.00169143],
        [ 0.04067834,  0.06328456,  0.03503142, ..., -0.04174063,
          0.06769959, -0