# Similar Words

In [2]:
import pandas as pd

In [30]:
df = pd.read_excel(r"C:\Users\Tanya\Desktop\Capstone\Reviews.xlsx")
df.shape

(2506, 1)

In [31]:
import bs4 as bs
import urllib.request
import re
import nltk
# imports needed and logging
import gzip
import gensim 
import logging

In [32]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [33]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [49]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['reviews'])

In [50]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.04 mins


In [51]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(1663, 1)

In [52]:
from gensim.models.phrases import Phrases, Phraser


In [53]:
sent = [row.split() for row in df_clean['clean']]


In [54]:
phrases = Phrases(sent, min_count=30, progress_per=10000)


INFO - 17:30:22: collecting all words and their counts
INFO - 17:30:22: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 17:30:22: collected 12635 token types (unigram + bigrams) from a corpus of 17799 words and 1663 sentences
INFO - 17:30:22: merged Phrases<12635 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 17:30:22: Phrases lifecycle event {'msg': 'built Phrases<12635 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.05s', 'datetime': '2021-04-24T17:30:22.080576', 'gensim': '4.0.1', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [55]:
bigram = Phraser(phrases)


INFO - 17:30:22: exporting phrases from Phrases<12635 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 17:30:22: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<3 phrases, min_count=30, threshold=10.0> from Phrases<12635 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.06s', 'datetime': '2021-04-24T17:30:22.872553', 'gensim': '4.0.1', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [56]:
sentences = bigram[sent]


In [57]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

2202

In [58]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]


['good',
 'product',
 'br',
 'installation',
 'bed',
 'quality',
 'buy',
 'bad',
 'time',
 'nice']

In [59]:
import multiprocessing

from gensim.models import Word2Vec

In [60]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [61]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

INFO - 17:30:27: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2021-04-24T17:30:27.979349', 'gensim': '4.0.1', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [62]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 17:30:28: collecting all words and their counts
INFO - 17:30:28: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:30:28: collected 2202 word types from a corpus of 17561 raw words and 1663 sentences
INFO - 17:30:28: Creating a fresh vocabulary
INFO - 17:30:28: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 155 unique words (7.03905540417802%% of original 2202, drops 2047)', 'datetime': '2021-04-24T17:30:28.844722', 'gensim': '4.0.1', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 17:30:28: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 11804 word corpus (67.2171288650988%% of original 17561, drops 5757)', 'datetime': '2021-04-24T17:30:28.847717', 'gensim': '4.0.1', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INF

Time to build vocab: 0.0 mins


In [63]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))


INFO - 17:30:29: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 155 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2', 'datetime': '2021-04-24T17:30:29.812107', 'gensim': '4.0.1', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 17:30:29: worker thread finished; awaiting finish of 6 more threads
INFO - 17:30:29: worker thread finished; awaiting finish of 5 more threads
INFO - 17:30:29: worker thread finished; awaiting finish of 4 more threads
INFO - 17:30:29: worker thread finished; awaiting finish of 3 more threads
INFO - 17:30:29: worker thread finished; awaiting finish of 2 more threads
INFO - 17:30:29: worker thread finished; awaiting finish of 1 more threads
INFO - 17:30:29: worker thread finished; awaiting finish of 0 more threads
INFO - 17:30:29: EPOCH - 1 : training on 17561 raw words (1050 effective words) took 0.0s, 46364 effective

Time to train the model: 0.02 mins


In [64]:
w2v_model.init_sims(replace=True)


  w2v_model.init_sims(replace=True)


In [54]:
w2v_model.wv.most_similar(positive=["bed"])


[('guy', 0.9992009401321411),
 ('storage', 0.9991886615753174),
 ('water', 0.9991698861122131),
 ('team', 0.9991599321365356),
 ('shift', 0.9991491436958313),
 ('furniture', 0.9991310238838196),
 ('support', 0.999117910861969),
 ('inch', 0.9991121888160706),
 ('br', 0.9991108775138855),
 ('look', 0.9991108775138855)]

In [55]:
w2v_model.wv.most_similar(positive=["design"])


[('durable', 0.9993487000465393),
 ('little', 0.9993336796760559),
 ('storage', 0.9993126392364502),
 ('nt', 0.9992791414260864),
 ('furniture', 0.9992734789848328),
 ('room', 0.9992492198944092),
 ('fit', 0.999237060546875),
 ('corner', 0.9992243647575378),
 ('fix', 0.999223530292511),
 ('guy', 0.9992228150367737)]

In [56]:
w2v_model.wv.most_similar(positive=["installation"])


[('kind', 0.9991629123687744),
 ('fix', 0.9991552829742432),
 ('properly', 0.9991474151611328),
 ('proper', 0.9991389513015747),
 ('floor', 0.9991124272346497),
 ('strong', 0.9991062879562378),
 ('assemble', 0.9991021156311035),
 ('awesome', 0.9990903735160828),
 ('home', 0.9990839958190918),
 ('team', 0.9990763068199158)]

In [57]:
w2v_model.wv.most_similar(positive=["product"])


[('support', 0.9991557002067566),
 ('receive', 0.9991464614868164),
 ('get', 0.9991289973258972),
 ('recommend', 0.9991154670715332),
 ('ply', 0.9991132616996765),
 ('guy', 0.9991123676300049),
 ('customer', 0.9991041421890259),
 ('small', 0.9990973472595215),
 ('wise', 0.9990789890289307),
 ('year', 0.9990785121917725)]

In [58]:
w2v_model.wv.most_similar(positive=["quality"])


[('price_range', 0.9991963505744934),
 ('little', 0.9991540312767029),
 ('people', 0.9991416335105896),
 ('k', 0.9991183876991272),
 ('storage', 0.9991040229797363),
 ('feel', 0.9991003274917603),
 ('team', 0.9990830421447754),
 ('want', 0.9990777373313904),
 ('finish', 0.9990522265434265),
 ('board', 0.999052107334137)]

In [59]:
w2v_model.wv.most_similar(positive=["size"])


[('strong', 0.999234676361084),
 ('awesome', 0.9992081522941589),
 ('give', 0.9992024302482605),
 ('problem', 0.9991983771324158),
 ('ok', 0.9991768598556519),
 ('mr', 0.9991752505302429),
 ('space', 0.9991693496704102),
 ('low', 0.9991616606712341),
 ('plywood', 0.9991534352302551),
 ('guy', 0.9991498589515686)]