# Sentiment Analysis of Product Reviews using Pre-trained Embeddings

In [None]:
%tensorflow_version 2.x

### We'll use this text-classification data file for the example; you can substitute any other dataset and choose a different problem.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model

### Load Data from File - Amazon Product Reviews

In [None]:
data=pd.read_csv('amazon_cells_labelled.txt',sep='\t',header=None, names=['review','sentiment'])

In [None]:
data

Unnamed: 0,review,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
...,...,...
995,The screen does get smudged easily because it ...,0
996,What a piece of junk.. I lose more calls on th...,0
997,Item Does Not Match Picture.,0
998,The only thing that disappoint me is the infra...,0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1000 non-null   object
 1   sentiment  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


### Tokenize Data

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Number of NLTK word tokens in each review; used below to choose a padding length.
sent_lens = [len(word_tokenize(review)) for review in data['review']]

In [None]:
max(sent_lens)

36

In [None]:
np.quantile(sent_lens,0.95)

26.0

In [None]:
# Fixed sequence length: a little above the 95th-percentile token count (26) measured above.
max_len = 30

# Word-level Keras tokenizer that splits on spaces; its vocabulary is fitted on every review.
tok = Tokenizer(split=' ', char_level=False)
tok.fit_on_texts(data['review'])

In [None]:
sequences = tok.texts_to_sequences(data['review'])

In [None]:
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [None]:
vocab_size=len(tok.word_index)

In [None]:
vocab_size

1878

### Build the RNN Model for Classification

In [None]:
def RNN(embedding_matrix, embed_dim, max_len, vocab_size):
    """Build and compile an LSTM binary classifier on top of frozen pre-trained embeddings.

    Args:
        embedding_matrix: Pre-trained weights loaded into the embedding layer;
            it has to match the layer's (vocab_size + 1, embed_dim) table.
        embed_dim: Size of each embedding vector.
        max_len: Length of the integer sequences fed to the model.
        vocab_size: Number of words in the vocabulary; the embedding table gets
            one extra row because index 0 is masked out (``mask_zero=True``).

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, binary cross-entropy loss,
        accuracy metric) producing one sigmoid output per input sequence.
    """
    token_ids = Input(name='inputs', shape=[max_len])

    # Frozen lookup table initialised from the pre-trained vectors; zeros in the
    # input are masked so the layers after it ignore those positions.
    embedded = Embedding(
        vocab_size + 1,
        embed_dim,
        input_length=max_len,
        mask_zero=True,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)

    encoded = LSTM(64)(embedded)

    # Fully connected head: Dense -> ReLU -> Dropout -> single sigmoid unit.
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    score = Dense(1, name='out_layer')(hidden)
    probability = Activation('sigmoid')(score)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
    return classifier

# Look at Pre-trained Embeddings

# Word2Vec

In [None]:
# Embeddings with different vocabulary sizes and source corpora are available here;
# many non-English embeddings are available as well:
# http://vectors.nlpl.eu/repository/
# Choose any of them [consider the compute resources and model size your team can afford].

In [None]:
!wget http://vectors.nlpl.eu/repository/20/0.zip

--2021-05-07 16:16:49--  http://vectors.nlpl.eu/repository/20/0.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 344050746 (328M) [application/zip]
Saving to: ‘0.zip’


2021-05-07 16:17:05 (21.7 MB/s) - ‘0.zip’ saved [344050746/344050746]



In [None]:
# name of the file will change if you download a different embedding file
!unzip 0.zip

Archive:  0.zip
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               
  inflating: README                  


In [None]:
!head -10 model.txt
# If the download does not include a .txt file [only a .bin file is present], you'll have to
# use the gensim package to load and convert it.

163473 300
say_VERB -0.008861 0.097097 0.100236 0.070044 -0.079279 0.000923 -0.012829 0.064301 -0.029405 -0.009858 -0.017753 0.063115 0.033623 0.019805 0.052704 -0.100458 0.089387 -0.040792 -0.088936 0.110212 -0.044749 0.077675 -0.017062 -0.063745 -0.009502 -0.079371 0.066952 -0.070209 0.063761 -0.038194 -0.046252 0.049983 -0.094985 -0.086341 0.024665 -0.112857 -0.038358 -0.007008 -0.010063 -0.000183 0.068841 0.024942 -0.042561 -0.044576 0.010776 0.006323 0.088285 -0.062522 0.028216 0.088291 0.033231 -0.033732 -0.002995 0.118994 0.000453 0.158588 -0.044475 -0.137629 0.066080 0.062824 -0.128369 -0.087959 0.028080 0.070063 0.046700 -0.083278 -0.118428 0.071118 0.100757 0.017944 0.026296 0.017282 -0.082127 -0.006148 0.002967 -0.032857 -0.076493 -0.072842 -0.055179 -0.081703 0.011437 -0.038698 -0.062540 -0.027899 0.087635 0.031870 0.029164 0.000524 -0.039895 -0.055559 0.024582 -0.030595 0.003942 -0.034500 0.003012 -0.023863 0.033831 0.061476 -0.090183 -0.039206 -0.026586 -0.042763 0.049835

In [None]:
# Map each word to its pre-trained vector, read from the word2vec text file.
embeding_index = {}

# `with` guarantees the file handle is closed even if a line fails to parse.
with open('/content/model.txt', encoding='utf-8') as f:
    # The first line is the "<vocabulary size> <dimension>" header, not a vector.
    next(f, None)
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # Entries look like "say_VERB": keep only the text before the first underscore.
        # Words that appear with several tags collapse to one key; the last one read wins.
        word = values[0].split('_')[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeding_index[word] = coefs

In [None]:
len(embeding_index)

152494

In [None]:
list(embeding_index.keys())[:10]

['say', 'go', 'make', 'get', 'one', 'see', 'time', 'take', 'know', 'year']

In [None]:
# Row i receives the pre-trained vector of the word the tokenizer mapped to index i.
# Rows that are never assigned (words missing from the embeddings) stay all-zero.
embedding_matrix = np.zeros((vocab_size + 1, 300))
words_not_available = 0
for word, i in tok.word_index.items():
    embed_vector = embeding_index.get(word)
    if embed_vector is None:
        # No pre-trained vector for this word: count it and leave its row as zeros.
        words_not_available += 1
        continue
    embedding_matrix[i] = embed_vector

In [None]:
words_not_available
# If this number is too high, you should look into alternative embeddings.
# A high number does not necessarily mean that model performance will be low;
# that will only be the case if these words matter in the context of the response.

575

In [None]:
# now we are set to use the embedding matrix in our model and fit it
# we'll look at other embeddings now 

In [None]:
!rm -rf 0.zip README meta.json model.bin model.txt

# fastText

In [None]:
# english embeddings are available here : https://fasttext.cc/docs/en/english-vectors.html
# growing repo of non-english embeddings : https://fasttext.cc/docs/en/crawl-vectors.html

# To get representations of out-of-vocabulary words, download
# the .bin file; you will find it via the second link.

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
# It's a big file [approx. 10 GB] and will take a while to download.
# Just a reminder: the download goes from the website straight to Colab;
# it does not use your own network data.

--2021-05-07 16:17:25--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10356881291 (9.6G) [application/zip]
Saving to: ‘wiki.en.zip’


2021-05-07 16:23:43 (26.2 MB/s) - ‘wiki.en.zip’ saved [10356881291/10356881291]



In [None]:
!unzip wiki.en.zip
# this will take some time 

Archive:  wiki.en.zip
  inflating: wiki.en.vec             
  inflating: wiki.en.bin             


In [None]:
!rm /content/wiki.en.zip

In [None]:
from gensim.models import FastText

In [None]:
ft_model = FastText.load_fasttext_format('/content/wiki.en.bin')
# this will again take a while 

In [None]:
# Look the vector up through `.wv`: indexing the model object directly is deprecated
# in gensim and triggers a warning.
len(ft_model.wv['random'])
# The embedding dimension here is also 300.
# fastText, however, also gives results for OOV words by using character n-grams.

  """Entry point for launching an IPython kernel.


300

In [None]:
#ft_model['hello']

In [None]:
# Rebuild the embedding matrix, this time from the fastText vectors.
# Row i receives the vector of the word the tokenizer mapped to index i.
embedding_matrix = np.zeros((vocab_size + 1, 300))
words_not_available = 0
for word, i in tok.word_index.items():
    try:
        # Use `.wv` for lookups; indexing the model object directly is deprecated in gensim.
        embed_vector = ft_model.wv[word]
    except KeyError:
        # Only a failed lookup counts as "not available". Any other exception is a real
        # problem and is allowed to surface instead of being silently counted.
        words_not_available += 1
    else:
        embedding_matrix[i] = embed_vector
words_not_available

  """


1

In [None]:
# we can now use this embedding matrix in the same way as others 

In [None]:
del ft_model
!rm -rf wiki.en.bin wiki.en.vec

# Embeddings from Transformer models

In [None]:
!pip install transformers 

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 3.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 13.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 17.4MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

In [None]:
'''BERT base model (uncased) Pretrained model on English language using a masked language modeling (MLM) objective. 
It was introduced in this paper and first released in this repository. 
This model is uncased: it does not make a difference between english and English
'''
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## How to get the Word Embeddings from Pre-trained BERT Model?

## Here since we are not 'loading' any embeddings, the model will look different without the embedding layer. 

## Input to our LSTM model will be simply output of embeddings from the Bert model to be obtained from the output of the last hidden-layer.

## Let's check it with a review/sentence.

In [None]:
data['review'][0]

'So there is no way for me to plug it in here in the US unless I go by a converter.'

In [None]:
# Encoding the words with integer indices
input_ids = tf.constant(tokenizer.encode(data['review'][0]))[None, :]  
print(input_ids)

tf.Tensor(
[[  101  2061  2045  2003  2053  2126  2005  2033  2000 13354  2009  1999
   2182  1999  1996  2149  4983  1045  2175  2011  1037 10463  2121  1012
    102]], shape=(1, 25), dtype=int32)


In [None]:
# Model produces the word embeddings from the integer encoded words
outputs = model(input_ids)
last_hidden_states = outputs[0]
print(last_hidden_states)

tf.Tensor(
[[[-0.01708518  0.2510941   0.1905702  ... -0.21773112  0.3104601
    0.8522914 ]
  [ 0.44042927 -0.2644225   0.42681867 ... -0.03187766  1.1223497
    0.86067444]
  [ 0.19325572 -0.23666668  0.03661226 ... -0.29324228  0.99495
    0.0513375 ]
  ...
  [ 0.91823876 -0.3364673  -0.11607603 ... -0.2613984   0.20164701
   -0.15987952]
  [ 0.6025457   0.3619916  -0.13388878 ...  0.02161847 -0.24598178
   -0.47703624]
  [ 0.5443777   0.5844305   0.38803414 ...  0.3234829  -0.34467596
   -0.48030332]]], shape=(1, 25, 768), dtype=float32)


In [None]:
# Word vectors of all the words in the sentence/review
outputs[0][0]

<tf.Tensor: shape=(25, 768), dtype=float32, numpy=
array([[-0.01708518,  0.2510941 ,  0.1905702 , ..., -0.21773112,
         0.3104601 ,  0.8522914 ],
       [ 0.44042927, -0.2644225 ,  0.42681867, ..., -0.03187766,
         1.1223497 ,  0.86067444],
       [ 0.19325572, -0.23666668,  0.03661226, ..., -0.29324228,
         0.99495   ,  0.0513375 ],
       ...,
       [ 0.91823876, -0.3364673 , -0.11607603, ..., -0.2613984 ,
         0.20164701, -0.15987952],
       [ 0.6025457 ,  0.3619916 , -0.13388878, ...,  0.02161847,
        -0.24598178, -0.47703624],
       [ 0.5443777 ,  0.5844305 ,  0.38803414, ...,  0.3234829 ,
        -0.34467596, -0.48030332]], dtype=float32)>

In [None]:
# Embedding word vector
outputs[0][0][0]

<tf.Tensor: shape=(768,), dtype=float32, numpy=
array([-1.70851834e-02,  2.51094103e-01,  1.90570205e-01, -5.41369617e-02,
       -1.45467803e-01, -5.28701663e-01,  2.92222500e-01,  3.06810081e-01,
        1.01796195e-01,  2.31384719e-03,  1.36285007e-01,  3.86215270e-01,
        2.53703594e-01,  1.87108606e-01,  1.03689492e-01,  5.09814136e-02,
        4.75011170e-02,  2.08284348e-01,  2.16687635e-01, -2.18838364e-01,
       -4.59863879e-02, -7.37543330e-02,  2.97137439e-01,  3.75841111e-01,
       -1.29290968e-01, -2.09120139e-01,  2.04872176e-01, -6.01111837e-02,
       -2.72874475e-01,  1.68415755e-01,  9.67503414e-02,  3.43628041e-03,
       -1.65506199e-01, -1.90017819e-01,  4.30524111e-01, -2.70733774e-01,
       -2.26994772e-02, -1.15615651e-02,  1.98417053e-01, -2.45184943e-01,
       -2.89101750e-01,  1.75559536e-01,  2.32616201e-01,  4.37124848e-01,
       -5.03930449e-03, -2.17674583e-01, -3.09909558e+00, -1.52949579e-02,
       -2.41659701e-01, -8.77405778e-02, -7.64964521

#### You can experiment with different reviews and you'll notice that the second dimension of the tensor changes with the length of the review [here it is 25 for the first review (id 0)].


## Let's find out the Word Embedding for all the Reviews

### We'll have to make sure to trim embeddings for all reviews to make them of same length or pad inputs to the same size [ which is definitely easier ].

In [None]:
tokenized = data['review'].apply(lambda x: tokenizer.encode(x))

In [None]:
tokenized.values

array([list([101, 2061, 2045, 2003, 2053, 2126, 2005, 2033, 2000, 13354, 2009, 1999, 2182, 1999, 1996, 2149, 4983, 1045, 2175, 2011, 1037, 10463, 2121, 1012, 102]),
       list([101, 2204, 2553, 1010, 6581, 3643, 1012, 102]),
       list([101, 2307, 2005, 1996, 5730, 14417, 1012, 102]),
       list([101, 5079, 2000, 3715, 2099, 2005, 11450, 9879, 2062, 2084, 3429, 2781, 1012, 2350, 3471, 999, 999, 102]),
       list([101, 1996, 23025, 2003, 2307, 1012, 102]),
       list([101, 1045, 2031, 2000, 10147, 24679, 1996, 13354, 2000, 2131, 2009, 2000, 2240, 2039, 2157, 2000, 2131, 11519, 3872, 1012, 102]),
       list([101, 2065, 2017, 2031, 2195, 6474, 2030, 2195, 3634, 10402, 1010, 2059, 5674, 1996, 4569, 1997, 6016, 2169, 1997, 2068, 2028, 2011, 2028, 1012, 102]),
       list([101, 2065, 2017, 2024, 10958, 2480, 2099, 3954, 1012, 1012, 1012, 2017, 2442, 2031, 2023, 999, 102]),
       list([101, 17044, 2015, 2000, 2360, 1010, 1045, 13842, 2026, 2769, 1012, 102]),
       list([101, 2054, 103

In [None]:
# Call the method: without parentheses the cell only displays the bound-method object.
tokenized.keys()

<bound method Series.keys of 0      [101, 2061, 2045, 2003, 2053, 2126, 2005, 2033...
1         [101, 2204, 2553, 1010, 6581, 3643, 1012, 102]
2        [101, 2307, 2005, 1996, 5730, 14417, 1012, 102]
3      [101, 5079, 2000, 3715, 2099, 2005, 11450, 987...
4              [101, 1996, 23025, 2003, 2307, 1012, 102]
                             ...                        
995    [101, 1996, 3898, 2515, 2131, 15488, 27066, 40...
996    [101, 2054, 1037, 3538, 1997, 18015, 1012, 101...
997       [101, 8875, 2515, 2025, 2674, 3861, 1012, 102]
998    [101, 1996, 2069, 2518, 2008, 4487, 3736, 9397...
999    [101, 2017, 2064, 2025, 3437, 4455, 2007, 1996...
Name: review, Length: 1000, dtype: object>

### Set the maximum length to 30 and pad the sequences

In [None]:
# Bring every token-id list to exactly `max_len` entries so they stack into one matrix.
# `max_len` is used throughout (the original mixed it with a literal 30).
padded = []
for ids in tokenized.values:
    if len(ids) >= max_len:
        # Truncate, but keep the final id: `tokenizer.encode` ends every sequence with
        # the [SEP] marker (id 102), which a plain slice would cut off.
        padded.append(ids[:max_len - 1] + ids[-1:])
    else:
        # Pad on the RIGHT with id 0, the value the attention-mask cell treats as padding.
        # BERT uses absolute position embeddings, and a later cell reads position 0 of the
        # output as the sentence feature, so [CLS] (id 101) must stay in the first slot.
        padded.append(ids + [0] * (max_len - len(ids)))

In [None]:
padded = np.array(padded)

In [None]:
np.array(padded).shape


(1000, 30)

### Prepare an Attention Mask required to let the BERT model know which words to ignore 

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1000, 30)

In [None]:
attention_mask
# this is to tell bert to ignore padding 

array([[0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]])

### Get the Word Embeddings from the BERT model for all the words (tokens) present in the product reviews

In [None]:
last_hidden_states = model(padded, attention_mask=attention_mask)

#### You can now use this as input to any kind of model. 
#### It can be a simple classifier from scikit learn or a deep learning model, no need to use embedding layer.

In [None]:
features = last_hidden_states[0][:,0,:].numpy()


In [None]:
features.shape

(1000, 768)


## Finally, any one of these embeddings can be used as the input to the RNN/LSTM network we have created to perform the sentiment classification.


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Sentiment Classification with Pre-trained Embeddings

## Let's Train the Model with **fastText** Embeddings

In [None]:
# `embedding_matrix` was last rebuilt from the fastText vectors, so those are the weights used.
# Take the dimensions from the objects themselves rather than repeating the literals 300 and 30:
# `max_len` is the length `sequences_matrix` was padded to.
model = RNN(
    embedding_matrix=embedding_matrix,
    embed_dim=embedding_matrix.shape[1],
    max_len=max_len,
    vocab_size=vocab_size,
)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(sequences_matrix, data['sentiment'], test_size=0.3, random_state=101, shuffle=True)

In [None]:
model.fit(x_train, y_train, validation_data =(x_test, y_test), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fa19e6aaf10>

### Evaluate the Model

In [None]:
loss, acc = model.evaluate(x_test, y_test)



In [None]:
print(loss)

1.020675778388977


In [None]:
print(acc)

0.8166666626930237
