# _Practice Project - Word2Vec Model Creation for Text Data_
***
_The meaning of a word can be inferred from the company it keeps._


## _Import Libraries and Load the Data_

In [15]:
#data manipulation
import numpy as np
import pandas as pd
import os
import sys
assert sys.version_info >= (3,5)
#visualization import
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#handle unwanted warnings
import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)
#text pre-processing
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
#regular expressions
import re
import string
#unzip using python
from zipfile import ZipFile

#import gensim library
import gensim

In [3]:
#load the data
filename = 'reviews_data.zip'

#the handle is named `archive` rather than `zip` so that the built-in zip()
#is not shadowed for every later cell running in the same kernel
with ZipFile(filename,'r') as archive:
    #list the contents of the archive
    archive.printdir()
    #extract all the files into the current working directory
    archive.extractall()

File Name                                             Modified             Size
reviews_data.txt                               2019-02-16 01:13:24    240342787


In [22]:
#name of the text file extracted from the archive
file = 'reviews_data.txt'

#open in binary mode and collect the raw lines; every element of `text` is a bytes object
with open(file, mode='rb') as fh:
    text = list(fh)

In [23]:
#number of raw lines read from the reviews file
len(text)

255404

In [24]:
#inspect the second raw line (still bytes, fields separated by tab characters)
text[1]

b'Sep 25 2009 \tGreat Budget Hotel!\tStayed two nights at Aloft on the most recent trip to China. The hotel was very modern and clean. The room was spotless and a comfortable king sized bed (as far as soft beds go in China). The staff was very punctual and went out of the way to help my every need, including going to a store across the street to purchase a China Mobile SIM card for me. The buffet breakfast was okay, nothing to write home about. The 42\x94 LCD screen had movies on demand for 20RMB and had a good selection of western channels including HBO, CNN, BBC, Star World etc\x85 The gym was small, had a selection of basic weights and one cable machine, there was however 6 new TechnoGym cardio machines with built in LCD TVs which were very good. The location is a bit out of the way to the central areas of Beijing, but it is better suited for my needs as I need to be in the Haidian district. Being SPG Platinum there were no upgrades to a better room, because Aloft has a policy of no

In [25]:
#show only the first two raw lines, then stop scanning
for i, line in enumerate(text):
    if i >= 2:
        break
    print(f'Line {i} text is {line}')

Line 0 text is b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the histo

In [29]:
#tokenize every raw line into a list of word tokens (one token list per line);
#the enumerate() index was never used, so a comprehension replaces the append loop
documents = [gensim.utils.simple_preprocess(line) for line in text]
    

In [34]:
#first five tokens of the third tokenized line
documents[2][:5]

['aug', 'excellent', 'value', 'location', 'not']

In [38]:
#Word2Vec is the gensim class that learns the word embeddings
from gensim.models import Word2Vec
#fit the embeddings on the tokenized lines;
#min_count=1 keeps every token in the vocabulary, even those seen only once
model = Word2Vec(sentences=documents, min_count=1)

In [39]:
#summarize the trained model (its repr reports the vocab count, vector size and alpha)
print(model)

Word2Vec(vocab=150059, size=100, alpha=0.025)


In [41]:
#summarize the vocabulary
#gensim 4 removed `wv.vocab` in favour of `wv.key_to_index`; use whichever
#mapping the installed version provides so the cell runs on both
keyed_vectors = model.wv
words = list(
    keyed_vectors.key_to_index
    if hasattr(keyed_vectors, 'key_to_index')
    else keyed_vectors.vocab
)
#show only the first ten entries instead of dumping the whole vocabulary
print(words[:10])

['oct', 'nice', 'trendy', 'hotel', 'location', 'not', 'too', 'bad', 'stayed', 'in']


In [42]:
#access the vector for one of the words through model.wv;
#indexing the model object itself is deprecated in gensim 3.x and removed in gensim 4
print(model.wv['nice'])

[ 0.14901423 -1.7512064  -1.121937   -0.8137081  -3.2777586   0.3872739
  0.62606287 -2.1407642  -3.0497634  -2.013671    0.41758108 -0.4762054
 -3.8974872  -4.1876006  -3.0922625   2.107958   -0.29541084  1.3170369
 -1.4061233   0.34048057 -0.31713316  3.1194968   0.07426205  3.7516215
  0.68503714 -2.232105    1.7703934   1.7308425  -2.2828295  -2.1673117
 -0.7317405   0.39877585  2.952547   -0.7672046  -1.0284679   4.152369
 -3.6084955  -0.47560102  0.55415845 -0.7084167  -0.37781385 -1.0723218
  1.0506369   0.8700196  -0.8030182   0.6738116   1.2882427   0.5892732
  0.19673459  3.340832   -0.85394603 -2.3220007   1.5470397  -3.9314516
  1.1457063   0.8710716  -0.87436104 -2.140911    0.26735255 -1.3005157
  2.4954903  -0.61551195 -1.8254633  -0.5203502   1.8170469  -0.713162
 -3.6717227   1.0981166   1.9974068  -3.5702205  -0.06482992 -0.7826211
  0.8667963   1.3024031   3.265877    0.9902576   3.0346484   0.743197
  0.01959011 -2.6919003  -0.31016755  3.5573955   0.6438287  -1.909

In [43]:
#look the vector up via model.wv (direct model['word'] indexing is deprecated in gensim 3.x and removed in gensim 4)
print(model.wv['trendy'])

[-0.14610225 -0.43602616  0.6979314   2.0773368  -1.2628345  -0.24177982
  0.593253   -0.78505814  0.3949093  -1.814445   -1.5204215   1.6803021
 -0.81844175 -3.043637   -0.363252   -1.1253523  -1.1124251   1.2499716
 -1.8235354  -0.36981133 -1.9372826   0.2573762  -0.6359617   2.0587568
 -1.8213345  -0.70086116  0.10602593  0.74291253  2.195751   -2.268869
 -1.9405046   0.9048939   2.352095    1.6577314  -1.2566677   3.6979244
 -0.9246681   1.3801439   3.3256998   0.4605271  -1.140972   -2.5812707
  1.9799682   0.7611882   0.19489534 -2.8636615  -2.6182141  -0.24993128
  0.40325385  0.9463937  -2.1133382  -0.4730558  -0.13074866 -3.9722035
  1.0631346   1.0221469   1.250692   -0.13217042 -0.10303911 -1.5192246
  3.4587572   0.3449336  -0.8219983   0.5549843   2.4422438   0.5181697
 -0.46307725  1.539461    1.4379178  -1.472322   -1.2654153   0.76117086
  0.7569897  -1.0101581   0.6734058  -1.211317    0.53180754  1.7498472
  0.5060066  -2.3239696  -0.7375885   0.4274293  -1.5428138  -

In [46]:
#check the dimension --> size of the vector embeddings
#the lookup goes through model.wv because direct model['word'] indexing is deprecated in gensim 3.x and removed in gensim 4
model.wv['trendy'].shape

(100,)

In [47]:
#save the full trained model to disk
#NOTE(review): Word2Vec.save presumably writes gensim's own format, not the word2vec C binary format, despite the '.bin' suffix - confirm
model.save('model.bin')

In [48]:
#load the saved file back into a separate object to confirm the save/load round trip
new_model = Word2Vec.load('model.bin')

In [49]:
#the reloaded model should print the same summary as the original model
print(new_model)

Word2Vec(vocab=150059, size=100, alpha=0.025)


In [50]:
#vector lookup on the reloaded model, via .wv (direct model['word'] indexing is deprecated in gensim 3.x and removed in gensim 4)
new_model.wv['trendy']

array([-0.14610225, -0.43602616,  0.6979314 ,  2.0773368 , -1.2628345 ,
       -0.24177982,  0.593253  , -0.78505814,  0.3949093 , -1.814445  ,
       -1.5204215 ,  1.6803021 , -0.81844175, -3.043637  , -0.363252  ,
       -1.1253523 , -1.1124251 ,  1.2499716 , -1.8235354 , -0.36981133,
       -1.9372826 ,  0.2573762 , -0.6359617 ,  2.0587568 , -1.8213345 ,
       -0.70086116,  0.10602593,  0.74291253,  2.195751  , -2.268869  ,
       -1.9405046 ,  0.9048939 ,  2.352095  ,  1.6577314 , -1.2566677 ,
        3.6979244 , -0.9246681 ,  1.3801439 ,  3.3256998 ,  0.4605271 ,
       -1.140972  , -2.5812707 ,  1.9799682 ,  0.7611882 ,  0.19489534,
       -2.8636615 , -2.6182141 , -0.24993128,  0.40325385,  0.9463937 ,
       -2.1133382 , -0.4730558 , -0.13074866, -3.9722035 ,  1.0631346 ,
        1.0221469 ,  1.250692  , -0.13217042, -0.10303911, -1.5192246 ,
        3.4587572 ,  0.3449336 , -0.8219983 ,  0.5549843 ,  2.4422438 ,
        0.5181697 , -0.46307725,  1.539461  ,  1.4379178 , -1.47

In [55]:
#query word whose nearest neighbours we want to inspect
w1 = 'value'
#positive = list of words that contribute positively to the similarity query
similar_words = new_model.wv.most_similar(positive=[w1])
print(similar_words)

[('bargain', 0.6336358189582825), ('location', 0.6113249659538269), ('loaction', 0.6072180867195129), ('prosposition', 0.598526120185852), ('deal', 0.5955886244773865), ('valuefor', 0.5490619540214539), ('price', 0.5376212000846863), ('refelection', 0.5352514982223511), ('position', 0.5312513113021851), ('values', 0.5288784503936768)]


In [None]:
#printing the words closest to the *opposite* direction of the query word
w1 = 'value'
#negative = list of words that contribute negatively to the similarity query;
#the keyword was misspelled `negativ`, which raises TypeError, and the word is
#wrapped in a list because gensim 3.x iterates a bare string character by character
print(new_model.wv.most_similar(negative=[w1]))