In [1]:
import gensim
import pandas as pd

## dataset link: 
http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz

In [4]:
df = pd.read_json("Sports_and_Outdoors_5.json", lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [5]:
df.shape

(296337, 9)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296337 entries, 0 to 296336
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   reviewerID      296337 non-null  object
 1   asin            296337 non-null  object
 2   reviewerName    294935 non-null  object
 3   helpful         296337 non-null  object
 4   reviewText      296337 non-null  object
 5   overall         296337 non-null  int64 
 6   summary         296337 non-null  object
 7   unixReviewTime  296337 non-null  int64 
 8   reviewTime      296337 non-null  object
dtypes: int64(2), object(7)
memory usage: 20.3+ MB


In [7]:
df.describe()

Unnamed: 0,overall,unixReviewTime
count,296337.0,296337.0
mean,4.393451,1364133000.0
std,0.986905,35345510.0
min,1.0,1015459000.0
25%,4.0,1353802000.0
50%,5.0,1371427000.0
75%,5.0,1388966000.0
max,5.0,1406074000.0


## Simple Preprocessing & Tokenization
- converting all the words to lower case
- trimming spaces
- removing punctuations
- remove stop words like 'and', 'or', 'is', 'the', 'a', 'an'
- convert words to root forms like 'running' to 'run'.

In [10]:
df.reviewText[0]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

In [13]:
gensim.utils.simple_preprocess(df.reviewText[0],)

['this',
 'came',
 'in',
 'on',
 'time',
 'and',
 'am',
 'veru',
 'happy',
 'with',
 'it',
 'haved',
 'used',
 'it',
 'already',
 'and',
 'it',
 'makes',
 'taking',
 'out',
 'the',
 'pins',
 'in',
 'my',
 'glock',
 'very',
 'easy']

In [11]:
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [74]:
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shubham/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [79]:
# new_review_text=[]

# for sentence in review_text:
#     print(sentence)
#     new_senetece = [word for word in sentence if word not in stop_words]
#     new_review_text.append(new_review_text)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Training the Word2Vec Model


In [16]:
model = gensim.models.Word2Vec(window=10,
                               min_count=2)

## Build Vocabulary


In [19]:
model.build_vocab(review_text)

## Train the Word2Vec Model

In [20]:
model.epochs

5

In [21]:
model.corpus_count

296337

In [22]:
model.vector_size

100

In [23]:
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(91339307, 121496535)

## Check Similar Words


In [36]:
model.wv.most_similar("bad")

[('terrible', 0.6956256031990051),
 ('horrible', 0.6848190426826477),
 ('shabby', 0.6655095815658569),
 ('greatest', 0.5339285731315613),
 ('good', 0.5174598097801208),
 ('funny', 0.517078161239624),
 ('lame', 0.5094628930091858),
 ('upset', 0.5083186626434326),
 ('poor', 0.49905991554260254),
 ('spectacular', 0.49051475524902344)]

In [37]:
model.wv.most_similar("good")

[('decent', 0.8643445372581482),
 ('great', 0.7861530184745789),
 ('nice', 0.7386904954910278),
 ('fantastic', 0.716711699962616),
 ('reasonable', 0.6301575899124146),
 ('terrific', 0.6268160939216614),
 ('excellent', 0.6212188005447388),
 ('superb', 0.6079545021057129),
 ('awesome', 0.6011907458305359),
 ('wonderful', 0.5676171183586121)]

## Similarity between words

In [34]:
model.wv.similarity(w1="good", w2="great")

0.7861529

In [35]:
model.wv.similarity(w1="good", w2="bad")

0.51745975

In [28]:
model.wv.similarity(w1="slow", w2="fast")

0.544945

In [38]:
model.wv.similarity(w1="yes", w2="no")

0.18570927

## Odd Man Out

In [41]:
model.wv.doesnt_match(["man","woman","girl","boy","monkey"])

'monkey'

In [44]:
model.wv.doesnt_match(["man","water","woman","girl","boy","monkey",])

'water'

In [53]:
model.wv.doesnt_match(["Football","Cricket","Hockey","monkey"])

'monkey'

In [54]:
model.wv.doesnt_match(["bat","ball","stick","shoes"])

'shoes'

In [59]:
model.wv.doesnt_match(["shoes","helmet","gloves","bat"])

'bat'

## Check vector shape

In [60]:
model.wv["cricket"]

array([ 6.85842335e-02, -3.28962058e-02,  1.13055706e-01, -8.80391076e-02,
        2.15301644e-02, -8.92385095e-02, -1.77110478e-01,  2.45463744e-01,
        7.22914487e-02,  7.16788694e-02, -1.63633108e-01, -3.20149846e-02,
        7.32564032e-02, -4.33639972e-04,  1.02927513e-01, -4.25332338e-02,
        6.44274727e-02, -5.03031313e-02,  3.87843810e-02,  3.97912487e-02,
       -2.20517829e-01, -7.90867135e-02, -1.67399570e-02, -2.18250719e-03,
       -5.56909107e-02,  4.18169983e-02, -3.19030851e-01, -4.41530973e-01,
        2.38790419e-02, -1.83072731e-01,  2.93621328e-02, -1.24943547e-01,
       -2.99461111e-02,  1.47645921e-02,  7.61857703e-02,  3.41494232e-02,
        1.35593712e-01, -8.88454989e-02, -6.20736033e-02, -1.45303130e-01,
       -1.25636503e-01, -1.44416943e-01, -2.25140810e-01, -9.77870524e-02,
       -4.14099872e-01, -9.71712731e-03, -1.18797980e-01,  2.94372112e-01,
       -6.31308779e-02,  7.96763375e-02, -1.81690037e-01, -1.33500725e-01,
       -5.51025234e-02, -

In [62]:
model.wv["bat"]

array([ 2.2948425 ,  0.526114  , -0.6732951 , -0.33091363, -1.6623371 ,
        3.0956216 , -0.64146584,  2.1122074 , -1.6500059 ,  1.5365368 ,
       -0.9235963 , -1.2684898 , -0.01061213,  1.5176959 ,  1.6249678 ,
        0.3282571 ,  1.0423957 , -0.9532238 , -1.3097931 , -1.0739422 ,
        0.23510012, -0.98886704, -1.2247269 , -2.6018019 , -2.0065677 ,
       -0.00525048,  0.9527548 ,  1.7160814 , -0.14275678, -2.9274027 ,
       -1.3319973 , -2.240692  , -1.1651824 , -0.24061438,  3.386044  ,
        0.30302316, -2.6883929 ,  0.7880923 ,  0.0094882 ,  3.9116902 ,
        1.2037975 ,  1.5011297 ,  1.3766896 ,  0.09592403, -1.5119945 ,
        2.0241165 , -1.470567  ,  0.7000056 ,  0.89275974, -2.1287968 ,
       -3.2609394 ,  0.78231734, -1.0618527 ,  0.7821663 ,  0.27127382,
       -1.3543626 ,  2.0986319 , -1.779901  , -3.049952  ,  2.8613675 ,
       -1.2915593 ,  0.19855657,  2.879675  , -1.6712861 ,  2.2224708 ,
       -0.89319277, -2.6474202 , -1.2715802 ,  0.53480077,  1.32

In [63]:
model.wv.get_normed_vectors().shape

(53233, 100)

In [64]:
model.wv.get_normed_vectors()

array([[ 0.08280678, -0.18233357,  0.18295455, ..., -0.00072405,
         0.13484894, -0.02581779],
       [ 0.05883162, -0.06710979, -0.05279383, ..., -0.19814937,
        -0.00053101,  0.02517627],
       [-0.03445325,  0.01406901,  0.0087341 , ...,  0.0112638 ,
        -0.04579254,  0.02023666],
       ...,
       [ 0.22199908,  0.00476774, -0.199237  , ...,  0.07107933,
         0.03882146, -0.07948396],
       [-0.13405459, -0.02162415,  0.01085088, ..., -0.18782485,
         0.12993535,  0.15114996],
       [-0.17251235,  0.00070891,  0.02797342, ..., -0.07939284,
         0.11023235,  0.01787119]], dtype=float32)

In [71]:
y = model.wv.index_to_key
len(y)

53233

In [80]:
y

['the',
 'and',
 'it',
 'to',
 'is',
 'of',
 'for',
 'this',
 'in',
 'my',
 'you',
 'on',
 'that',
 'with',
 'but',
 'have',
 'not',
 'as',
 'are',
 'was',
 'be',
 'so',
 'they',
 'very',
 'one',
 'if',
 'or',
 'can',
 'great',
 'these',
 'use',
 'like',
 'well',
 'at',
 'just',
 'good',
 'will',
 'up',
 'out',
 'your',
 'would',
 'when',
 'all',
 'more',
 'get',
 'them',
 'from',
 'an',
 'knife',
 'had',
 'than',
 'has',
 'me',
 'no',
 'about',
 'do',
 'some',
 'little',
 'only',
 'other',
 'there',
 'really',
 'also',
 'don',
 'easy',
 'what',
 'much',
 'time',
 'price',
 'used',
 'fit',
 'product',
 'which',
 'made',
 'light',
 'nice',
 'work',
 'off',
 'too',
 'quality',
 'works',
 'after',
 'bought',
 'because',
 'any',
 'need',
 'bike',
 'even',
 'its',
 'better',
 'back',
 'bag',
 'does',
 'small',
 've',
 'put',
 'am',
 'by',
 'over',
 'two',
 'buy',
 'enough',
 'got',
 'first',
 'still',
 'could',
 'way',
 'into',
 'size',
 'down',
 'recommend',
 'make',
 'using',
 'now',
 'th

In [81]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

In [82]:
x = pca.fit_transform(model.wv.get_normed_vectors())

In [83]:
x

array([[ 0.57165885,  0.28144214,  0.12690541],
       [ 0.47565535,  0.16409336,  0.1110385 ],
       [ 0.6883061 ,  0.00506638,  0.08113156],
       ...,
       [ 0.38179034, -0.02260034,  0.08597445],
       [-0.19586895, -0.18410781,  0.00814727],
       [-0.01783826,  0.03663317, -0.02925671]], dtype=float32)

In [84]:
x.shape

(53233, 3)

In [96]:
import plotly.express as px
fig = px.scatter_3d(x[:500],x=0,y=1,z=2,color=y[:500])
fig.show()