# Implementing Doc2Vec method.

In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [4]:
df = pd.read_csv("cleaned_reviews.csv")

In [5]:
df.head()

Unnamed: 0,cleaned_reviews,rating
0,excellent,5
1,dog become healthy thank,5
2,quality service everyone happy,5
3,went top dog raw food morning first time amazi...,5
4,very stylish site enough information make choi...,5


In [10]:
tagged_data = [TaggedDocument(words=word_tokenize(review), tags=[str(i)]) for i, review in enumerate(df['cleaned_reviews'])]

In [16]:
tagged_data[323323]

TaggedDocument(words=['flexoplex', 'not', 'give', 'relief'], tags=['323323'])

In [17]:
max_epochs = 20
vec_size = 50
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)



In [18]:
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

iteration 0


  """


iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19


In [19]:
model.docvecs[0]

array([ 0.10943171,  0.1570775 ,  0.20438488, -0.16720575, -0.37034696,
       -0.5873601 ,  0.07812168, -0.02868595,  0.36529353,  0.0283529 ,
       -0.04595437, -0.35873258,  0.2885365 , -0.02118153, -0.05538253,
       -0.0610761 , -0.02582747, -0.31046742,  0.13521093,  0.22571318,
        0.0691458 , -0.16867654,  0.34231612, -0.35911196,  0.25636002,
       -0.25272492, -0.0488746 ,  0.03874098, -0.2978058 , -0.11610639,
       -0.46691853, -0.2936468 ,  0.07877075, -0.34024963, -0.17031549,
        0.22852756,  0.22664908, -0.12792124, -0.26333746, -0.31538224,
       -0.00811125, -0.03767433, -0.19063088,  0.06531884,  0.1402551 ,
        0.05964357,  0.04522478,  0.05232068,  0.06674878, -0.21663778],
      dtype=float32)

In [36]:
model.most_similar('good')

  """Entry point for launching an IPython kernel.


[('awesome', 0.7871295213699341),
 ('superb', 0.7782031893730164),
 ('splendid', 0.7761396765708923),
 ('phenomenal', 0.7667189836502075),
 ('excellent', 0.7609777450561523),
 ('exceptional', 0.7431771755218506),
 ('extraordinary', 0.7259469032287598),
 ('perfect', 0.7233240008354187),
 ('remarkable', 0.7226216197013855),
 ('satisfactory', 0.7190631031990051)]

In [38]:
model.most_similar(positive=['excellent', 'awesome'], negative=['poor'])

  """Entry point for launching an IPython kernel.


[('wonderful', 0.7700765132904053),
 ('terrific', 0.7518953084945679),
 ('fantastic', 0.7238051891326904),
 ('incredible', 0.708552896976471),
 ('brilliant', 0.6851599216461182),
 ('phenomenal', 0.6848448514938354),
 ('remarkable', 0.6812764406204224),
 ('fabulous', 0.6726204752922058),
 ('splendid', 0.6590040922164917),
 ('exceptional', 0.6573909521102905)]

In [39]:
model.save("embedded_vec.model")