In [8]:
from FeatureGenerator import *
from TfidfFeatureGenerator import *
import pandas as pd
import numpy as np
from scipy.sparse import vstack
import pickle
from sklearn.decomposition import TruncatedSVD
from helpers import *

class SvdFeatureGenerator(FeatureGenerator):
    """Derive truncated-SVD features from cached headline/body TF-IDF matrices.

    ``process`` fits a single ``TruncatedSVD`` on the stacked headline and body
    TF-IDF rows, projects both matrices, computes a row-wise ``cosine_sim``
    between the two projections and pickles every result into the current
    working directory. ``read`` loads those pickles back for one split.
    """

    def __init__(self, name='svdFeatureGenerator', n_components=50, n_iter=15,
                 random_state=None):
        """Store the generator name and the SVD hyper-parameters.

        Args:
            name: generator name forwarded to ``FeatureGenerator.__init__``.
            n_components: number of SVD components to keep.
            n_iter: iteration count handed to ``TruncatedSVD``.
            random_state: optional seed for ``TruncatedSVD``. The default
                ``None`` leaves the estimator unseeded, so results may differ
                between runs; pass an int for reproducible features.
        """
        super(SvdFeatureGenerator, self).__init__(name)
        self.n_components = n_components
        self.n_iter = n_iter
        self.random_state = random_state

    def _save_split(self, features, n_train, n_test, file_tag, label):
        """Pickle the first ``n_train`` rows of ``features`` and, when
        ``n_test > 0``, the remaining rows, as ``<split>.<file_tag>.svd.pkl``."""
        train_path = "train.%s.svd.pkl" % file_tag
        with open(train_path, "wb") as outfile:
            pickle.dump(features[:n_train], outfile, -1)
        print('%s features of training set saved in %s' % (label, train_path))

        if n_test > 0:
            # test set is available
            test_path = "test.%s.svd.pkl" % file_tag
            with open(test_path, "wb") as outfile:
                pickle.dump(features[n_train:], outfile, -1)
            print('%s features of test set saved in %s' % (label, test_path))

    def process(self, df):
        """Compute and persist SVD and SVD-similarity features.

        Rows of ``df`` whose ``target`` is null are counted as test rows, all
        others as training rows. The TF-IDF matrices themselves are not taken
        from ``df``; they are loaded through ``TfidfFeatureGenerator.read``.

        Args:
            df: DataFrame with a ``target`` column.

        Returns:
            Always ``1``.
        """
        is_test = df['target'].isnull()
        n_train = df[~is_test].shape[0]
        print('SvdFeatureGenerator, n_train:',n_train)
        n_test = df[is_test].shape[0]
        print('SvdFeatureGenerator, n_test:',n_test)

        tfidfGenerator = TfidfFeatureGenerator('tfidf')
        featuresTrain = tfidfGenerator.read('train')
        xHeadlineTfidf, xBodyTfidf = featuresTrain[0], featuresTrain[1]
        if n_test > 0:
            # test set is available: append its rows below the training rows
            featuresTest = tfidfGenerator.read('test')
            xHeadlineTfidf = vstack([xHeadlineTfidf, featuresTest[0]])
            xBodyTfidf = vstack([xBodyTfidf, featuresTest[1]])

        # One decomposition is fitted on headlines and bodies together so both
        # are projected into the same space before being compared.
        svd = TruncatedSVD(n_components=self.n_components, n_iter=self.n_iter,
                           random_state=self.random_state)
        # fit to the combined train-test set (or the full training set for cv process)
        svd.fit(vstack([xHeadlineTfidf, xBodyTfidf]))

        print('xHeadlineTfidf.shape:')
        print(xHeadlineTfidf.shape)
        xHeadlineSvd = svd.transform(xHeadlineTfidf)
        print('xHeadlineSvd.shape:')
        print(xHeadlineSvd.shape)
        self._save_split(xHeadlineSvd, n_train, n_test, 'headline', 'headline svd')

        xBodySvd = svd.transform(xBodyTfidf)
        print('xBodySvd.shape:')
        print(xBodySvd.shape)
        self._save_split(xBodySvd, n_train, n_test, 'body', 'body svd')

        # Compare every headline with its own body. The row count comes from
        # the data itself rather than a fixed dataset size, so train-only runs
        # and differently sized datasets work too.
        n_rows = xHeadlineSvd.shape[0]
        res = [cosine_sim(xHeadlineSvd[i], xBodySvd[i]) for i in range(n_rows)]
        simSvd = np.asarray(res)[:, np.newaxis]
        print('simSvd.shape:')
        print(simSvd.shape)
        self._save_split(simSvd, n_train, n_test, 'sim', 'svd sim.')
        return 1

    def read(self, header='train'):
        """Load the pickled SVD features for one split.

        Besides loading, this writes ``svd_headline_body_<header>.npy`` (the
        headline and body matrices together) and prints the three shapes.

        Args:
            header: split prefix of the pickle files, e.g. ``'train'`` or
                ``'test'``.

        Returns:
            ``[xHeadlineSvd, xBodySvd, simSvd]`` with ``simSvd`` reshaped to a
            single column.
        """
        # NOTE: pickle.load executes arbitrary code from the file; only read
        # pickles that this class wrote itself.
        loaded = []
        for file_tag in ('headline', 'body', 'sim'):
            with open("%s.%s.svd.pkl" % (header, file_tag), "rb") as infile:
                loaded.append(pickle.load(infile))
        xHeadlineSvd, xBodySvd, simSvd = loaded

        # Name the dump after the split that was read so reading 'test' does
        # not overwrite the training dump.
        np.save('svd_headline_body_%s' % header, [xHeadlineSvd, xBodySvd])
        print('xHeadlineSvd.shape:')
        print(xHeadlineSvd.shape)
        print ('xBodySvd.shape:')
        print(xBodySvd.shape)
        print ('simSvd.shape:')
        print (simSvd.shape)

        return [xHeadlineSvd, xBodySvd, simSvd.reshape(-1, 1)]




In [9]:
# Build the generator with its default name ('svdFeatureGenerator').
svd = SvdFeatureGenerator()

In [10]:
# Load the pickled SVD features of the 'train' split (train.*.svd.pkl in the
# working directory). read() also writes svd_headline_body_train.npy and prints
# the three shapes; as the cell's last expression, the returned list
# [xHeadlineSvd, xBodySvd, simSvd] is echoed below.
svd.read('train')

xHeadlineSvd.shape:
(49972, 50)
xBodySvd.shape:
(49972, 50)
simSvd.shape:
(49972, 1)


[array([[ 0.03207111, -0.02211176, -0.00768814, ...,  0.05651077,
         -0.03286519, -0.02382299],
        [ 0.00991723, -0.0061924 ,  0.00474473, ...,  0.00326889,
         -0.01427453,  0.02269052],
        [ 0.03529908, -0.04601441, -0.03071824, ..., -0.00258664,
          0.00229425,  0.00051807],
        ..., 
        [ 0.03538133, -0.01916267, -0.00156872, ...,  0.06806871,
         -0.03641366, -0.02122546],
        [ 0.07735895,  0.01317191, -0.00416012, ...,  0.0245615 ,
         -0.00448776, -0.01109989],
        [ 0.05258799, -0.08666438, -0.08722353, ..., -0.0205316 ,
         -0.01476017, -0.00853244]]),
 array([[ 0.04377182, -0.07088275, -0.05191007, ...,  0.00635997,
          0.01904262, -0.02497177],
        [ 0.04661704, -0.0279718 ,  0.00061669, ..., -0.00045492,
         -0.00764089,  0.00479087],
        [ 0.04197071, -0.03094536, -0.00197641, ...,  0.00991218,
         -0.01961413, -0.0346405 ],
        ..., 
        [ 0.02570851, -0.00936859,  0.0008985 , ...,