In [None]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
data_file="reviews_data.txt.gz"

with gzip.open ('reviews_data.txt.gz', 'rb') as f:
    for i,line in enumerate (f):
        print(line)
        break

In [None]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""
    
    logging.info("reading file {0}...this may take a while".format(input_file))
    
    with gzip.open (input_file, 'rb') as f:
        for i, line in enumerate (f): 

            if (i%10000==0):
                logging.info ("read {0} reviews".format (i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess (line)

documents = list (read_input (data_file))
logging.info ("Done reading data file")

In [None]:
model = gensim.models.Word2Vec (documents,  window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

In [None]:
w1 = "dirty"
model.wv.most_similar (positive=w1)

In [None]:
# look up top 6 words similar to 'polite'
w1 = ["polite"]
model.wv.most_similar (positive=w1,topn=6)

In [None]:
# similarity between two different words
model.wv.similarity(w1="dirty",w2="smelly")

### Small Word Model

In [1]:
from gensim.models import word2vec as w2v
import multiprocessing

sentences = [['car'], ['insurance'],['best'], ['car'], ['service']]
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
model = w2v.Word2Vec(min_count=1,window=2,sample=6e-5,alpha=0.03,min_alpha=0.0007, negative=20, workers=cores-1)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=30, report_delay=1)

(4, 150)

In [2]:
print(model.wv.most_similar(positive=["car"]))

[('service', 0.220316082239151), ('insurance', 0.04292996972799301), ('best', 0.03810085356235504)]


### My Corpus Model

In [3]:
import numpy as np
import random
import re
from collections import defaultdict

In [4]:
path = 'dataset/test1.txt'
with open(path, encoding='utf8') as f:
    text = f.read()

In [5]:
import gensim
tokenized_text = list(gensim.utils.simple_preprocess (text))
print(tokenized_text)

['the', 'winner', 'of', 'this', 'auction', 'will', 'receive', 'two', 'paikka', 'human', 'visibility', 'raincoat', 'camo', 'xs', 'the', 'paikka', 'human', 'visibility', 'raincoat', 'is', 'made', 'with', 'smart', 'reflective', 'surface', 'that', 'ensures', 'you', 'll', 'stay', 'safe', 'on', 'nighttime', 'walks', 'the', 'seams', 'are', 'all', 'taped', 'to', 'make', 'this', 'jacket', 'waterproof', 'with', 'adjustable', 'sleeves', 'and', 'hood', 'this', 'raincoat', 'makes', 'sure', 'you', 'll', 'be', 'safe', 'and', 'comfortable', 'in', 'all', 'conditions', 'there', 'are', 'too', 'many', 'poor', 'quality', 'dog', 'products', 'in', 'the', 'world', 'we', 'are', 'here', 'to', 'make', 'change', 'we', 'understand', 'the', 'common', 'pain', 'points', 'of', 'dogs', 'and', 'their', 'owners', 'we', 'create', 'uncompromised', 'human', 'grade', 'solutions', 'to', 'make', 'every', 'moment', 'better', 'you', 'will', 'find', 'our', 'answer', 'revealed', 'in', 'every', 'paikka', 'product', 'free', 'shippin

In [6]:
docs = []
for i in range(len(tokenized_text)):
    docs.append([tokenized_text[i]])
print(docs)

[['the'], ['winner'], ['of'], ['this'], ['auction'], ['will'], ['receive'], ['two'], ['paikka'], ['human'], ['visibility'], ['raincoat'], ['camo'], ['xs'], ['the'], ['paikka'], ['human'], ['visibility'], ['raincoat'], ['is'], ['made'], ['with'], ['smart'], ['reflective'], ['surface'], ['that'], ['ensures'], ['you'], ['ll'], ['stay'], ['safe'], ['on'], ['nighttime'], ['walks'], ['the'], ['seams'], ['are'], ['all'], ['taped'], ['to'], ['make'], ['this'], ['jacket'], ['waterproof'], ['with'], ['adjustable'], ['sleeves'], ['and'], ['hood'], ['this'], ['raincoat'], ['makes'], ['sure'], ['you'], ['ll'], ['be'], ['safe'], ['and'], ['comfortable'], ['in'], ['all'], ['conditions'], ['there'], ['are'], ['too'], ['many'], ['poor'], ['quality'], ['dog'], ['products'], ['in'], ['the'], ['world'], ['we'], ['are'], ['here'], ['to'], ['make'], ['change'], ['we'], ['understand'], ['the'], ['common'], ['pain'], ['points'], ['of'], ['dogs'], ['and'], ['their'], ['owners'], ['we'], ['create'], ['uncomprom

In [7]:
from gensim.models import word2vec as w2v
import multiprocessing

In [8]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
model = w2v.Word2Vec(min_count=1,window=10,sample=6e-5,alpha=0.03,min_alpha=0.0007, negative=20, workers=cores-1)
model.build_vocab(docs)
model.train(docs, total_examples=model.corpus_count, epochs=30, report_delay=1)

(265, 3480)

In [9]:
print(model.wv.most_similar(positive=["reflective"]))

[('here', 0.2867913246154785), ('jacket', 0.20972684025764465), ('of', 0.1798821985721588), ('our', 0.1537526249885559), ('that', 0.13904783129692078), ('create', 0.126688614487648), ('product', 0.11935709416866302), ('dogs', 0.11709446460008621), ('too', 0.11341460794210434), ('made', 0.10040757805109024)]
