<a href="https://colab.research.google.com/github/setthawut8/ai/blob/main/%5BEmbedding%20Layer%5D%20tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embedding Layer
(Inspiration https://youtu.be/Fuw0wv3X-0o)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers 
from tensorflow.keras.layers import Dense, Flatten, Embedding

In [None]:
# Toy corpus: five positive reviews followed by five negative ones
positive_reviews = ['nice food',
                    'amazing restaurant',
                    'too good',
                    'just loved it',
                    'will go again']
negative_reviews = ['horrible food',
                    'never go there',
                    'poor service',
                    'poor quality',
                    'needs improvement']
reviews = positive_reviews + negative_reviews

# Labels aligned with `reviews`: 1 for the positive half, 0 for the negative half
sentiments = np.array([1] * len(positive_reviews) + [0] * len(negative_reviews))

## 1) One Hot Vector

In [None]:
# one_hot(text, n) encodes each word of the text as an integer index;
# n (here 30) is the size of the index space, i.e. the "vocabulary size"
one_hot("amazing restaurant", n=30)

[25, 2]

In [None]:
# Size of the index space the words are encoded into
vocab_size = 50
# Encode every review as a list of integer word indices.
# NOTE: distinct words can end up sharing an index (in the recorded output,
# 'food' and 'will' are both encoded as 19).
encoded_reviews = []
for review in reviews:
    encoded_reviews.append(one_hot(review, vocab_size))
encoded_reviews

[[22, 19],
 [38, 42],
 [4, 31],
 [41, 27, 9],
 [19, 37, 31],
 [40, 19],
 [35, 37, 32],
 [7, 44],
 [7, 1],
 [27, 6]]

## 2) Padding

In [None]:
# Reviews contain 2 or 3 words, so the shorter ones are zero-padded at the end
# ('post') to give every sequence the same length
maxlen = 3
padded_reviews = pad_sequences(
    encoded_reviews,
    maxlen=maxlen,
    padding='post',
)
padded_reviews

array([[22, 19,  0],
       [38, 42,  0],
       [ 4, 31,  0],
       [41, 27,  9],
       [19, 37, 31],
       [40, 19,  0],
       [35, 37, 32],
       [ 7, 44,  0],
       [ 7,  1,  0],
       [27,  6,  0]], dtype=int32)

## 3) Embedded Vector Size (features of each word) & Model

In [None]:
# Embedding vector size = number of features learned for each word
embeded_vector_size = 4

# Embedding -> Flatten -> single sigmoid unit (binary sentiment output)
model = Sequential([
    Embedding(vocab_size, embeded_vector_size, input_length=maxlen, name='embedding'),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Features are the padded index sequences; targets are the sentiment labels
X, y = padded_reviews, sentiments

In [None]:
# Binary classification setup: Adam optimiser, cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 4)              200       
                                                                 
 flatten_4 (Flatten)         (None, 12)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 13        
                                                                 
Total params: 213
Trainable params: 213
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 50 epochs on the ten reviews; verbose=0 silences the per-epoch log
model.fit(
    X,
    y,
    epochs=50,
    verbose=0,
)

<keras.callbacks.History at 0x7f4265cac910>

In [None]:
# Scored on the same data the model was fitted on, so this is training accuracy
loss, accuracy = model.evaluate(x=X, y=y)
accuracy



1.0

In [None]:
# Learned embedding matrix of the layer named 'embedding'
embedding_layer = model.get_layer('embedding')
weights = embedding_layer.get_weights()[0]
len(weights)  # number of rows equals vocab_size

50

In [None]:
# Show the learned embedding vectors stored at indices 22 and 3
for word_index in (22, 3):
    print(weights[word_index])

[ 0.09223264 -0.00312139 -0.01635139  0.01045673]
[ 0.01558313 -0.01015184 -0.04265321  0.01204729]


In [None]:
#follow this link for downloading weights for the embedding layer
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/#:~:text=4.%20Example%20of%20Using%20Pre%2DTrained%20GloVe%20Embedding

# Word2Vec (Self-supervised)
(https://youtu.be/hQwFeIupNP0)

For example, with word vectors a computer can compute King - Man + Woman ≈ Queen.

1) **CBOW**: given the context, predict the target

2) **Skip Gram**: given the target, predict the context

In [None]:
!pip install gensim
!pip install python-Levenshtein

#Data: a subset of Amazon reviews
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz
#.gz = zip file, so you need to unzip first with gunzip
!gunzip '/content/reviews_Cell_Phones_and_Accessories_5.json.gz'

In [None]:
import gensim
import pandas as pd

In [None]:
# lines=True: the file is read as one JSON record per line
reviews_path = '/content/reviews_Cell_Phones_and_Accessories_5.json'
df = pd.read_json(reviews_path, lines=True)
# roughly 190k records and 9 columns
print(df.shape)
df.head()

(194439, 9)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [None]:
#Word tokenisation
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [None]:
#Build a model
model = gensim.models.Word2Vec(
    window=10, #= the number of words before and after the target word to be applied
    min_count=2, #= use at least 2 words for the training
    workers=4, #= CPUs to be used
)

In [None]:
# Build the model's vocabulary from the tokenised reviews (must run before training)
model.build_vocab(review_text, progress_per=1000)

In [None]:
# Train word2vec on the tokenised reviews, using the model's own
# corpus_count and epochs settings
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

# Persist the trained model to disk
model.save('first_word2vec_amazon_reviews.model')

In [None]:
# Words whose vectors are most similar to that of 'bad'
word_vectors = model.wv
word_vectors.most_similar('bad')

[('terrible', 0.6782885193824768),
 ('shabby', 0.6595362424850464),
 ('good', 0.5917364358901978),
 ('horrible', 0.5639103651046753),
 ('mad', 0.5362191200256348),
 ('poor', 0.5289582014083862),
 ('legit', 0.5195903778076172),
 ('sad', 0.5142112970352173),
 ('crappy', 0.51366126537323),
 ('awful', 0.5130081176757812)]

In [None]:
# Similarity score between the vectors of two words
model.wv.similarity('great', 'hello')

0.0035293866

## Word2Vec exercise

https://github.com/codebasics/deep-learning-keras-tf-tutorial/blob/master/42_word2vec_gensim/42_word2vec_gensim.ipynb

## **Exercise**
Train a word2vec model on the Sports & Outdoors Reviews Dataset. Once you have trained the model, find the words most similar to 'awful' and find the similarities between the following word tuples: ('good', 'great'), ('slow', 'steady')
[click for the solution](https://github.com/codebasics/deep-learning-keras-tf-tutorial/blob/master/42_word2vec_gensim/42_word2vec_gensim_exercise_solution.ipynb)

In [None]:
!wget "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz"
!gunzip "/content/reviews_Sports_and_Outdoors_5.json.gz"

--2022-06-27 08:48:02--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 68275834 (65M) [application/x-gzip]
Saving to: ‘reviews_Sports_and_Outdoors_5.json.gz’


2022-06-27 08:48:05 (19.9 MB/s) - ‘reviews_Sports_and_Outdoors_5.json.gz’ saved [68275834/68275834]



In [None]:
import gensim
import pandas as pd

In [None]:
# lines=True: the file is read as one JSON record per line
sports_reviews_path = '/content/reviews_Sports_and_Outdoors_5.json'
df = pd.read_json(sports_reviews_path, lines=True)

In [None]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [None]:
# Word tokenisation: turn every review text into a list of tokens
review_text = df['reviewText'].apply(gensim.utils.simple_preprocess)
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [None]:
# Word2Vec model for the Sports & Outdoors reviews
model = gensim.models.Word2Vec(
    window=15,  # maximum distance between the target word and its context words
    min_count=5,  # ignore words that appear fewer than 5 times in the corpus
    workers=4  # number of worker threads used for training
)

In [None]:
# Build the model's vocabulary from the tokenised reviews (must run before training)
model.build_vocab(review_text)

In [None]:
# Train on the tokenised reviews; the value returned by train() is the cell output
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91023400, 121496535)

In [125]:
# Words whose vectors are most similar to that of 'awful'
word_vectors = model.wv
word_vectors.most_similar('awful')

[('horrible', 0.6582611799240112),
 ('terrible', 0.6495241522789001),
 ('ugly', 0.6117993593215942),
 ('exaggeration', 0.5812787413597107),
 ('overpowering', 0.5501402616500854),
 ('horrendous', 0.5473026633262634),
 ('pathetic', 0.5470247268676758),
 ('funny', 0.5463792681694031),
 ('idiot', 0.5439237952232361),
 ('eminent', 0.5434597134590149)]

In [None]:
# Similarity score between the vectors of 'good' and 'great'
model.wv.similarity('good', 'great')

0.77129304

In [None]:
# Similarity score between the vectors of 'slow' and 'steady'
model.wv.similarity('slow', 'steady')

0.3828448