In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.test.utils import common_texts, get_tmpfile
import warnings
warnings.filterwarnings('ignore')

In [8]:
test_clean = pd.DataFrame(pd.read_csv(os.getcwd() + '/' + 'test_clean.csv', sep=','))

In [9]:
test_clean.head()

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review,length,numbers,caps,...,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WP$,WRB
0,6,929,0,4.0,,2009-08-25,let start shoutout everyone boosting mint lemo...,67,0,0.0,...,0.074627,0.044776,0.014925,0.029851,0.059701,0.044776,0.0,0.0,0.0,0.0
1,9,932,0,5.0,,2014-05-09,stopped lunch today could nt believe delicious...,27,0,0.0,...,0.037037,0.148148,0.0,0.0,0.037037,0.0,0.0,0.0,0.0,0.0
2,14,937,0,4.0,,2014-10-15,tiny little place good food pastitsio especial...,8,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,22,945,0,5.0,,2014-04-10,food delicious service great good atmosphere q...,11,0,0.0,...,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,23,946,0,5.0,,2014-03-29,awesome hole wall place grab quick bite great ...,24,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Doc2Vec

In [6]:
#tokenized data required for Word2Vec & Doc2Vec
def tokenize(data):
    
    data = data.replace(np.nan, " ")
    #tokenize reviews and store in separate column for vectorizers
    data['tokens'] = pd.Series()
    for i in range(len(data['review'])):
        data['tokens'][i] = word_tokenize(data['review'][i])
    
    return data

In [10]:
test = tokenize(test_clean)

In [7]:
def Doc2Vec_input(data):
    
    #returns appropriate input data for Doc2Vec model
    #tag reviews with tokens and labels
    train_doc =[]
    for i in range(0,data.shape[0]):
        train_doc.append(TaggedDocument(data['tokens'][i],str(data['label'][i]))) 
    
    return train_doc

In [11]:
test_doc = Doc2Vec_input(test)

In [14]:
#example input
test_doc[0]

TaggedDocument(words=['let', 'start', 'shoutout', 'everyone', 'boosting', 'mint', 'lemonade', 'stuff', 'super', 'tasty', 'refuse', 'believe', 'get', 'fresher', 'delicious', 'greek', 'food', 'dollar', 'sport', 'lunch', 'regularly', 'huge', 'sandwiches', 'filled', 'marinated', 'vegetables', 'succulent', 'meats', 'lamb', 'particularly', 'choice', 'floury', 'ciabatta', 'ideal', 'crustcrunch', 'softinnerwomb', 'cold', 'meze', 'options', 'favorites', 'know', 'anyone', 'take', 'good', 'feta', 'cover', 'salt', 'pepper', 'olive', 'oil', 'dish', 'olives', 'tomatoes', 'quality', 'proportion', 'ingredients', 'makes', 'special', 'heaven', 'rock', 'sandwich', 'grab', 'bigger', 'meals', 'midteens', 'get', 'done'], tags=' ')

In [15]:
def Doc2Vec_vectorize(model, data, epochs=1):
    
    #creates feature vectors for each review
    targets, feature_vectors = zip(*[(doc.tags[0], dv_model.infer_vector(doc.words, epochs =epochs)) for doc in data])
    return np.array(feature_vectors)

In [16]:
#load saved model
dv_model = Doc2Vec.load('dv_model')

In [17]:
#create feature vectors for test data using trained model
dv_test = Doc2Vec_vectorize(dv_model, test_doc)

In [18]:
np.savetxt("dv_test.csv", dv_test, delimiter=",")