## Train a Word2vec Model

### Read the cleaned data

In [33]:
import pandas as pd
df = pd.read_csv('df_clean2.csv',header = 0)

In [36]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2EFCYXHNK06IS,5555991584,"Abigail Perkins ""Abby &#34;Reads Too Much&#34...","[4, 5]",anthemic title track begin quotthe memory tree...,5.0,Enya Experiments And Succeeds,978480000,"01 3, 2001"
1,A1WR23ER5HMAA9,5555991584,AKB,"[1, 1]",thought enya couldnt possibly get better found...,5.0,How to improve upon perfection.,953424000,"03 19, 2000"
2,A2IR4Q0GPAFJKW,5555991584,Alexander,"[0, 0]",nice cd easy listening husband like quite bit ...,4.0,Good listening.,1393545600,"02 28, 2014"
3,A2V0KUVAB9HSYO,5555991584,Alison Hight,"[0, 1]",really liked cd especially different language ...,4.0,Loved It,966124800,"08 13, 2000"
4,A1J0GL9HCA7ELW,5555991584,"Al the Pal ""Al the Pal""","[3, 3]",enyas richly chorded style smitten little bit ...,5.0,Another Hauntingly Beautiful Collection of Songs,1007683200,"12 7, 2001"


In [35]:
del df['Unnamed: 0']

In [37]:
df = df[(df['reviewText'].isna() == False)].reset_index(drop=True)

In [38]:
df = df.groupby(['asin'])['reviewText'].apply(lambda x: ','.join(x.astype(str))).reset_index()

### Train the word2vec model

In [39]:
import logging
import re
from io import open
from random import shuffle
#import os
import numpy as np
import pandas as pd
from gensim import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

epoch_num = 1


class ReviewDataset(object):
    def __init__(self, review_text, product_id):
        self.review_text = review_text
        self.product_id = product_id
        self.sentences = self.to_array()
        

    def to_array(self):
        sentences = []
        for i in range(len(df)):
            each_sentences = TaggedDocument(words = self.review_text[i].split(),tags=[str(self.product_id[i])])
            sentences.append(each_sentences)

        return sentences

    def sentences_perm(self):
        # start_time = time.time()
        shuffle(self.sentences)
        # print("Time used to shuffle data: %f seconds."
        #       % (time.time() - start_time))
        return self.sentences

    # def next_batch(self, batch_size=100):
    #     return np.random.choice(self.sentences, batch_size)



In [40]:
# batch_size = 100
logging.info("Loading dataset...")
model_file = 'my_model.doc2vec'

review_dataset = ReviewDataset(df['reviewText'],df['asin'])
logging.info("Building vocabularies...")
model = Doc2Vec(min_count=1, window=15, size=100,
                sample=1e-4, negative=5, workers=8)
#if os.path.exists(model_file):
#    model = model.load(model_file)
model.build_vocab(review_dataset.sentences)
logging.info("Training...")
for epoch in range(epoch_num):
    logging.info("==================== Training @ epoch "
                 "[{:02d}] ====================".format(epoch + 1))
    sent = review_dataset.sentences_perm()
    model.train(sent,total_examples=model.corpus_count,epochs=model.iter)
    # model.train(weibo_dataset.next_batch(batch_size))
logging.info("Training completed.")
logging.info("Dumping trained model...")
model.save(model_file)

2018-12-02 22:09:53,650 : INFO : Loading dataset...
2018-12-02 22:10:06,208 : INFO : Building vocabularies...
2018-12-02 22:10:06,227 : INFO : collecting all words and their counts
2018-12-02 22:10:06,228 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-12-02 22:10:11,465 : INFO : PROGRESS: at example #10000, processed 16612300 words (3172085/s), 645821 word types, 10000 tags
2018-12-02 22:10:11,578 : INFO : PROGRESS: at example #20000, processed 17040736 words (3895749/s), 659367 word types, 20000 tags
2018-12-02 22:10:11,707 : INFO : PROGRESS: at example #30000, processed 17559314 words (4037652/s), 675061 word types, 30000 tags
2018-12-02 22:10:11,819 : INFO : PROGRESS: at example #40000, processed 18006496 words (4010330/s), 688372 word types, 40000 tags
2018-12-02 22:10:11,978 : INFO : PROGRESS: at example #50000, processed 18437452 words (2728397/s), 700806 word types, 50000 tags
2018-12-02 22:10:12,098 : INFO : PROGRESS: at example #60000, pro

2018-12-02 22:11:07,399 : INFO : EPOCH 1 - PROGRESS: at 94.22% examples, 661598 words/s, in_qsize 15, out_qsize 0
2018-12-02 22:11:08,401 : INFO : EPOCH 1 - PROGRESS: at 97.11% examples, 656308 words/s, in_qsize 15, out_qsize 0
2018-12-02 22:11:09,136 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-12-02 22:11:09,141 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-12-02 22:11:09,148 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-12-02 22:11:09,154 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-12-02 22:11:09,155 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-12-02 22:11:09,157 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-02 22:11:09,165 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-12-02 22:11:09,169 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-12-02 22:11:09,170 : INFO : EPOCH - 1 :

2018-12-02 22:12:06,194 : INFO : EPOCH 3 - PROGRESS: at 93.09% examples, 679843 words/s, in_qsize 15, out_qsize 0
2018-12-02 22:12:07,199 : INFO : EPOCH 3 - PROGRESS: at 97.08% examples, 680347 words/s, in_qsize 15, out_qsize 0
2018-12-02 22:12:07,955 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-12-02 22:12:07,960 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-12-02 22:12:07,973 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-12-02 22:12:07,976 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-12-02 22:12:07,979 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-12-02 22:12:07,981 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-02 22:12:07,988 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-12-02 22:12:07,994 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-12-02 22:12:07,995 : INFO : EPOCH - 3 :

2018-12-02 22:13:04,571 : INFO : EPOCH 5 - PROGRESS: at 92.56% examples, 677558 words/s, in_qsize 15, out_qsize 0
2018-12-02 22:13:05,572 : INFO : EPOCH 5 - PROGRESS: at 96.27% examples, 675840 words/s, in_qsize 15, out_qsize 0
2018-12-02 22:13:06,485 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-12-02 22:13:06,487 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-12-02 22:13:06,496 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-12-02 22:13:06,501 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-12-02 22:13:06,502 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-12-02 22:13:06,503 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-02 22:13:06,511 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-12-02 22:13:06,517 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-12-02 22:13:06,518 : INFO : EPOCH - 5 :

In [41]:
model = Doc2Vec.load(model_file)

2018-12-02 22:13:11,570 : INFO : loading Doc2Vec object from my_model.doc2vec
2018-12-02 22:13:15,396 : INFO : loading vocabulary recursively from my_model.doc2vec.vocabulary.* with mmap=None
2018-12-02 22:13:15,397 : INFO : loading trainables recursively from my_model.doc2vec.trainables.* with mmap=None
2018-12-02 22:13:15,397 : INFO : loading syn1neg from my_model.doc2vec.trainables.syn1neg.npy with mmap=None
2018-12-02 22:13:15,869 : INFO : loading wv recursively from my_model.doc2vec.wv.* with mmap=None
2018-12-02 22:13:15,870 : INFO : loading vectors from my_model.doc2vec.wv.vectors.npy with mmap=None
2018-12-02 22:13:16,518 : INFO : loading docvecs recursively from my_model.doc2vec.docvecs.* with mmap=None
2018-12-02 22:13:16,519 : INFO : loading vectors_docs from my_model.doc2vec.docvecs.vectors_docs.npy with mmap=None
2018-12-02 22:13:16,730 : INFO : loaded my_model.doc2vec


In [43]:
model.docvecs['5555991584']

array([ 8.5690087e-01,  1.3259938e+00, -2.4659967e+00,  8.3682680e-01,
        3.6604449e-01, -1.6435783e-01,  8.4744728e-01, -2.1925848e+00,
        6.5339440e-01,  6.3344407e-01, -2.6030960e+00, -4.6304309e-01,
       -4.1871506e-01, -8.0010414e-02, -3.2070062e+00, -1.5998273e+00,
       -9.5453018e-01,  2.5520689e+00, -1.7978915e+00, -2.1716087e+00,
        5.9448814e-01, -2.3796809e-01,  1.1782731e+00, -6.9557585e-02,
       -2.4261441e+00, -2.1347103e-01, -1.7526522e+00,  5.6135841e-02,
        4.2183173e-01, -1.2340851e+00,  2.1188443e+00, -2.1873865e+00,
       -8.3848470e-01,  1.3081435e+00,  1.2462344e+00,  1.0634818e+00,
       -5.0451136e-01,  1.4557089e-01,  1.1138166e+00, -1.5496886e+00,
        9.2197639e-01, -1.0914167e+00,  8.7023991e-01,  1.8620198e+00,
       -1.8575892e+00, -1.7521975e+00, -4.1948085e+00, -4.9386668e-01,
       -2.4069560e+00, -1.6085173e-01,  7.9135798e-02, -5.5755907e-01,
        3.7224126e-01, -1.9844030e+00,  6.8588609e-01, -1.7959115e-01,
      

In [45]:
logging.info("Extracting Embedding Vectors ...")
asin_list = df['asin'].tolist()
d2v = dict()
for i in asin_list:
    d2v[i] = model.docvecs[i]
logging.info("Saving it to .csv ...")
d2v_df = pd.DataFrame(data = d2v)
d2v_df = d2v_df.T
logging.info("Already save to dataframe ...")

2018-12-02 22:20:55,078 : INFO : Extracting Embedding Vectors ...
2018-12-02 22:20:55,800 : INFO : Saving it to .csv ...
2018-12-02 22:20:57,763 : INFO : Already save to dataframe ...


### Save the d2v.csv

In [46]:
d2v_df.to_csv("d2v.csv")

In [47]:
d2v_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
5555991584,0.856901,1.325994,-2.465997,0.836827,0.366044,-0.164358,0.847447,-2.192585,0.653394,0.633444,...,-0.940333,-1.192315,0.047115,1.625511,-0.777654,1.607038,-0.960912,0.057814,-0.304665,-3.425489
6308051551,0.013385,0.044744,-0.062915,0.012726,0.034638,0.037249,0.032704,-0.006384,0.036424,0.027895,...,-0.041097,-0.050401,0.029213,-0.026003,0.118155,-0.116907,-0.146966,-0.042619,-0.007872,-0.018278
7901622466,0.052706,-0.360461,0.404728,0.625779,0.137144,0.121133,0.030336,0.398719,0.070142,0.139160,...,-0.197806,0.267863,0.327463,-0.008311,-0.401309,-0.658073,-1.154109,0.152930,0.063122,-0.082938
B0000000ZW,0.153720,0.140829,0.128890,0.287718,-0.047790,0.502433,0.491238,-0.090354,-0.034453,-0.046929,...,0.060190,-0.108006,0.386246,-0.100744,0.086392,-0.079423,-0.832502,-0.012997,-0.062597,0.353099
B00000016T,0.932325,-0.041272,-0.781207,-1.290578,-1.340093,1.198445,1.172556,-1.180809,0.695114,0.642097,...,-0.264124,1.154754,1.169049,-1.370199,-0.100498,-0.004180,-3.549084,-0.319167,-0.264281,-1.457054
B00000016W,0.062712,0.254967,-4.225322,-1.152805,-0.242148,0.529600,0.975017,-2.171337,-0.320130,2.351677,...,-0.535599,-0.554342,-0.321522,0.745093,-1.776738,-2.516270,-1.745387,-0.785131,0.920068,0.001680
B00000017R,1.288461,1.286532,-0.331059,-0.643395,0.785544,0.769907,1.291752,-0.002383,-0.668457,0.777876,...,-1.363663,-0.355453,0.297260,-0.590984,0.044610,0.147935,-1.747488,-0.775731,-0.078345,-0.509090
B0000001BA,-0.271556,0.071410,-0.453854,-0.181439,0.168012,0.000079,0.190388,-0.378686,-0.083943,0.206707,...,0.019655,0.056627,-0.215798,-0.387817,-0.537260,0.134323,-0.086659,-0.024782,-0.645430,-0.209026
B0000001BO,-0.018004,0.031539,0.070069,0.035387,-0.032159,-0.045577,0.039654,-0.031884,0.067932,0.016287,...,0.012101,-0.021503,0.016247,-0.007068,-0.001342,-0.007907,-0.079067,0.004191,-0.008880,0.007148
B0000001O0,-0.004363,-0.003236,0.048788,0.021607,0.009397,0.000497,0.063735,-0.004311,-0.033734,0.000572,...,-0.013296,0.003079,0.049077,-0.016710,-0.030483,0.004298,-0.046386,0.010951,-0.064939,0.007731
