# Implementing Doc2Vec method.

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [4]:
df = pd.read_csv("cleaned_reviews.csv")

In [5]:
df.head()

Unnamed: 0,cleaned_reviews,rating
0,excellent,5
1,dog become healthy thank,5
2,quality service everyone happy,5
3,went top dog raw food morning first time amazi...,5
4,very stylish site enough information make choi...,5


In [10]:
tagged_data = [TaggedDocument(words=word_tokenize(review), tags=[str(i)]) for i, review in enumerate(df['cleaned_reviews'])]

In [16]:
tagged_data[323323]

TaggedDocument(words=['flexoplex', 'not', 'give', 'relief'], tags=['323323'])

In [17]:
max_epochs = 20
vec_size = 50
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)



In [18]:
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

iteration 0


  """


iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19


In [23]:
model.docvecs[233456]

array([-0.06608222, -2.9659534 ,  1.1775397 , -0.1807955 , -0.6115066 ,
        0.79215455, -1.6854904 ,  0.65099   ,  1.7038072 , -0.40928   ,
        3.294647  , -4.6069345 ,  1.1898072 ,  1.7132146 ,  1.4515322 ,
        0.65683514, -3.3786561 ,  1.9196206 , -0.18748984,  2.4246464 ,
       -0.8803402 ,  0.06538036,  2.2928007 , -3.3835707 , -1.6942369 ,
       -2.2992566 ,  3.6652408 ,  3.291941  , -0.38673323, -0.94967914,
        1.0113062 , -4.3947244 ,  1.7185047 , -1.4101114 ,  2.2943168 ,
        4.1129565 , -4.0442286 ,  0.81051105, -1.5362293 ,  1.0131965 ,
        1.1682564 ,  0.34744444,  4.36991   , -0.75983876,  0.48169515,
        1.1773525 ,  1.8778418 , -1.1470038 ,  4.646028  ,  1.2060422 ],
      dtype=float32)

In [24]:
model.wv.most_similar('good')

[('awesome', 0.7871295213699341),
 ('superb', 0.7782031893730164),
 ('splendid', 0.7761396765708923),
 ('phenomenal', 0.7667189836502075),
 ('excellent', 0.7609777450561523),
 ('exceptional', 0.7431771755218506),
 ('extraordinary', 0.7259469032287598),
 ('perfect', 0.7233240008354187),
 ('remarkable', 0.7226216197013855),
 ('satisfactory', 0.7190631031990051)]

In [25]:
model.wv.most_similar(positive=['excellent', 'awesome'], negative=['poor'])

[('wonderful', 0.7700765132904053),
 ('terrific', 0.7518953084945679),
 ('fantastic', 0.7238051891326904),
 ('incredible', 0.708552896976471),
 ('brilliant', 0.6851599216461182),
 ('phenomenal', 0.6848448514938354),
 ('remarkable', 0.6812764406204224),
 ('fabulous', 0.6726204752922058),
 ('splendid', 0.6590040922164917),
 ('exceptional', 0.6573909521102905)]

In [39]:
model.save("embedded_vec.model")