# Word2Vec using Amazon product review dataset 
http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [1]:
import gensim 
import pandas as pd 

In [3]:
# Load the cell-phone reviews; lines=True tells pandas the file holds one JSON object per line
df = pd.read_json('data/video42/reviews_Cell_Phones_and_Accessories_5.json', lines=True)

In [5]:
# Inspect 10 random rows; no random_state is passed, so a re-run shows different rows
df.sample(10)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
145671,AUUQIXCA8CQDS,B00AYPEL56,JoPfef,"[0, 0]",Got these so I could have additional space on ...,5,Great adapters!,1384473600,"11 15, 2013"
171403,A35UN8P5NBNU2K,B00DZ0J74W,IanA,"[0, 0]",This cable is great length and its quality is ...,5,Fantastic cable!,1393718400,"03 2, 2014"
170506,A2QI4VIO4MRX8A,B00DUR2GZM,Stephen Davidson,"[0, 0]",Love this product! Never had any problem with ...,5,Great Product,1390176000,"01 20, 2014"
171345,A2MENIMGJCIIKK,B00DY9LMRE,R. Wesson,"[0, 1]",ordered pink and got purple. ordered 2 since t...,2,well,1392249600,"02 13, 2014"
145115,A1KEV5E0PWU9LV,B00AX7T65U,John Andrews,"[0, 0]","This case is slim, has a great grip, and looks...",5,Good case,1396224000,"03 31, 2014"
177079,AS65L3A9I2ES6,B00F27CY4G,sunnyflowers1212,"[0, 0]",nice case. easy to use and works well. i would...,4,Four Stars,1405555200,"07 17, 2014"
71844,A1GHPJB2N0D0RL,B006OT2UQS,C.R.U,"[17, 18]",**skip to the end if you just want the Alurate...,5,The winner of the budget bluetooths...and I've...,1349049600,"10 1, 2012"
183057,A1S33E945IW6TW,B00G5VQIMM,JessPete,"[0, 0]","Works great and has a low profile. Again, I'v...",5,Another awesome charger that my husband has s...,1400457600,"05 19, 2014"
10238,A1S0D037M0D71X,B002BH3I9U,Steven,"[0, 0]",bought these for my wife after she liked using...,5,she likes them,1327795200,"01 29, 2012"
181221,ACQK1VZ94OLVB,B00FOFY1J2,"Steve DeGregorio ""flyers2114""","[0, 1]",See my YouTube review at Steve DeGregorio (fly...,4,Excellent for your power needs!,1387584000,"12 21, 2013"


In [6]:
# (number of reviews, number of columns)
df.shape

(194439, 9)

In [7]:
# The reviewText column holds the text we want to train Word2Vec on; look at the first review
df.reviewText[0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [8]:
# The reviews are mixed case and contain punctuation, so tokenize them with gensim's
# simple_preprocess. As the output below shows, tokens come back lowercased, punctuation
# is stripped ("don't" becomes 'don') and single-character tokens such as "I" are dropped.
gensim.utils.simple_preprocess(df.reviewText[0])

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [10]:
# Tokenize every review: each row of reviewText becomes a list of tokens
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)
review_text[:10]

0    [they, look, good, and, stick, good, just, don...
1    [these, stickers, work, like, the, review, say...
2    [these, are, awesome, and, make, my, phone, lo...
3    [item, arrived, in, great, time, and, was, in,...
4    [awesome, stays, on, and, looks, great, can, b...
5    [these, make, using, the, home, button, easy, ...
6    [came, just, as, described, it, doesn, come, u...
7    [it, worked, for, the, first, week, then, it, ...
8    [good, case, solid, build, protects, phone, al...
9    [this, is, fantastic, case, very, stylish, and...
Name: reviewText, dtype: object

In [11]:
# Initialize the gensim Word2Vec model; no corpus is passed here, the vocabulary
# is built and the model is trained in the following cells
model = gensim.models.Word2Vec(
    window=10,                               # maximum distance between the target word and a context word
    min_count=2,                             # ignore words that occur fewer than 2 times in the whole corpus
    workers=4,                               # train with 4 worker threads
)


In [12]:
# Build the vocabulary from the tokenized reviews
model.build_vocab(
    review_text, 
    progress_per=100                         # controls how often scanning progress is logged
)

In [13]:
# epochs was not passed to the constructor, so this shows the library default (5)
model.epochs

5

In [14]:
# Size of the corpus recorded by build_vocab; it equals the number of reviews in df
model.corpus_count

194439

## Build the Word2Vec model

In [15]:
# Train on the tokenized reviews for the configured number of epochs.
# total_examples is the review count that build_vocab stored in corpus_count.
# The returned tuple is (effective words trained on, total raw words).
model.train(review_text, epochs=model.epochs, total_examples=model.corpus_count)

(61506423, 83868975)

In [16]:
# Save the trained model so that other examples can load it instead of retraining.
# NOTE(review): assumes the output/video42/ directory already exists — confirm.
model.save('output/video42/amazon-review.model')

## Experiment with the model

In [18]:
# Nearest neighbours of "bad" in the learned vector space.
# Note that 'good' ranks third in the output below — presumably because
# antonyms tend to appear in the same contexts.
model.wv.most_similar("bad")

[('shabby', 0.6609269380569458),
 ('terrible', 0.6519754528999329),
 ('good', 0.586584210395813),
 ('horrible', 0.578879714012146),
 ('legit', 0.5500673651695251),
 ('awful', 0.5285909175872803),
 ('okay', 0.5146781802177429),
 ('ok', 0.5139726400375366),
 ('crappy', 0.5087110996246338),
 ('disappointing', 0.50858473777771)]

In [20]:
# Similarity score between the vectors of two near-synonyms
model.wv.similarity(w1="cheap", w2="inexpensive")

0.5236083

In [22]:
# An adjective and a noun that often co-occur but are not interchangeable:
# the score in the output below is close to zero
model.wv.similarity(w1="great", w2="product")

-0.05084209

# Exercise

Get the data from http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz

In [31]:
# Load the Sports and Outdoors reviews (one JSON object per line, like the
# cell-phone file) and inspect 10 random rows
df_sports = pd.read_json('data/video42/reviews_Sports_and_Outdoors_5.json', lines=True)
df_sports.sample(10)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
138131,A1APWHOUZBGMR2,B001RJ4Q2G,"Stacy Brown ""Olystacy""","[0, 0]",We bought this for our son on his 13th birthda...,5,FUN GAME FOR EVERYONE ABLE TO THROW A DISC,1400716800,"05 22, 2014"
80414,ADH7NHBYIKSTP,B000XHBKOU,NB,"[9, 9]","Instead of buying a bunch of locks and cables,...",5,Just what I needed for city riding,1322611200,"11 30, 2011"
94134,A3EWW8FFOIDXJM,B00162Q17W,clock buyer.,"[0, 2]",I thought I was going to be able to take this ...,4,Needs lots of maintenance?,1378512000,"09 7, 2013"
201614,A1LYDMJ55BBQJB,B0040J7Y5Q,"Hatchling001 ""Hatchling""","[0, 0]",I've seen he same set on other sites go for mo...,5,GREAT BUY,1360368000,"02 9, 2013"
47699,A9RTRKZM3IEOY,B000HR95NO,"M. Ross ""Bacchuskitty""","[88, 92]",I got this stove to serve as a backup for my c...,3,Decent Backup Stove,1280966400,"08 5, 2010"
24687,A16JCH8VSM5OYM,B000AO7NRY,"James Zampino ""jimmy zamp""","[2, 2]",After reading the reviews I expected this sadd...,4,GOOD VALVE,1393891200,"03 4, 2014"
84681,A128X2E3HD9XKK,B0011ZH312,M. Russo,"[1, 2]",I really like these Power Grip straps; they wo...,4,Bang for the Buck,1326067200,"01 9, 2012"
289172,A1BYMKR6E2EXEJ,B00D7H9LIA,A Dogged shopper,"[0, 0]",Works as promised and really stays warm for ho...,4,hot stuff,1359676800,"02 1, 2013"
98794,AFB7MSIPJIW68,B0017KUSCC,Seph,"[0, 0]",Razor sharp from factory. Quick and easy open/...,5,Great knife.,1353715200,"11 24, 2012"
40571,A3N37IWYXIEFL0,B000FIE4AE,Brandon,"[1, 1]",This is a great and must have tool for any bic...,5,Great bike tool,1396310400,"04 1, 2014"


In [32]:
# (number of reviews, number of columns)
df_sports.shape

(296337, 9)

In [33]:
# Tokenize every sports review with the same preprocessing used for the cell-phone reviews
review_text_sports = df_sports.reviewText.apply(gensim.utils.simple_preprocess)
review_text_sports[:10]

0    [this, came, in, on, time, and, am, veru, happ...
1    [had, factory, glock, tool, that, was, using, ...
2    [if, you, don, have, punch, or, would, like, t...
3    [this, works, no, better, than, any, punch, yo...
4    [purchased, this, thinking, maybe, need, speci...
5    [needed, this, tool, to, really, break, down, ...
6    [if, don, have, it, get, it, all, you, need, t...
7    [this, light, will, no, doubt, capture, the, a...
8    [light, and, laser, torch, work, well, very, b...
9    [does, everything, it, says, it, will, do, wou...
Name: reviewText, dtype: object

In [34]:
# Initialize a second Word2Vec model for the sports reviews, with the same
# hyperparameters as the cell-phone model
model_sports = gensim.models.Word2Vec(
    window=10,                               # maximum distance between the target word and a context word
    min_count=2,                             # ignore words that occur fewer than 2 times in the whole corpus
    workers=4,                               # train with 4 worker threads
)


In [35]:
# Build the vocabulary from the tokenized sports reviews
model_sports.build_vocab(
    review_text_sports, 
    progress_per=1000                        # controls how often scanning progress is logged
)

In [36]:
# Train on the tokenized sports reviews; epochs and total_examples come from the
# model itself (corpus_count was recorded by build_vocab).
# The returned tuple is (effective words trained on, total raw words).
model_sports.train(review_text_sports, epochs=model_sports.epochs, total_examples=model_sports.corpus_count)

(91341377, 121496535)

In [37]:
# Save the trained sports model so it can be reloaded without retraining.
# NOTE(review): assumes the output/video42/ directory already exists — confirm.
model_sports.save('output/video42/amazon-sports-review.model')

In [38]:
# Nearest neighbours of "awful" in the sports-review vector space
model_sports.wv.most_similar('awful')

[('horrible', 0.7129591703414917),
 ('terrible', 0.6943297982215881),
 ('ugly', 0.6103164553642273),
 ('unuseable', 0.5989765524864197),
 ('unpleasant', 0.591776430606842),
 ('overwhelming', 0.5912571549415588),
 ('overpowering', 0.5837376117706299),
 ('horrendous', 0.5829005837440491),
 ('utter', 0.5512279868125916),
 ('funny', 0.5479702353477478)]

In [39]:
# Two positive adjectives used interchangeably in reviews: expect a high score
model_sports.wv.similarity(w1="good", w2="great")

0.78164023

In [40]:
# A looser pair of related words, for comparison with the score above
model_sports.wv.similarity(w1="slow", w2="steady")

0.39291072