In [1]:
# To ignore warning messages when filtering data
from warnings import filterwarnings
filterwarnings('ignore')

# References

1. https://techvidvan.com/tutorials/python-sentiment-analysis/
2. https://asperbrothers.com/blog/sentiment-analysis-in-python/
3. https://www.tensorflow.org/api_docs/python/tf/keras/Sequential
4. https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

# Basic dataset preparation

## Columns of the full dataset

In [27]:
import pandas as pd
# The whole data set
data = pd.read_csv("../data/amazonConsumerReviews.csv")
print("COLUMN NAMES\n------------")
for c in data.columns: print(c)

COLUMN NAMES
------------
id
dateAdded
dateUpdated
name
brand
categories
primaryCategories
manufacturer
manufacturerNumber
reviews.date
reviews.doRecommend
reviews.numHelpful
reviews.rating
reviews.text
reviews.title


## Only keeping relevant columns

In [28]:
# Only selecting relevant columns
reviewsData = data[['id',
                  'reviews.doRecommend',
                  'reviews.rating',
                  'reviews.text',
                  'reviews.title']]
reviewsData.head(3)

Unnamed: 0,id,reviews.doRecommend,reviews.rating,reviews.text,reviews.title
0,AVqVGZNvQMlgsOJE6eUY,False,3,I thought it would be as big as small paper bu...,Too small
1,AVqVGZNvQMlgsOJE6eUY,True,5,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach
2,AVqVGZNvQMlgsOJE6eUY,True,4,Didnt know how much i'd use a kindle so went f...,Great for the price


## Converting ratings into sentiment labels

In [29]:
print("REVIEWS RATING INFO\n------------")
ratings = reviewsData['reviews.rating']
print("Minimum:", min(ratings))
print("Maximum:", max(ratings))
print("Mean:", sum(ratings)/len(ratings))

REVIEWS RATING INFO
------------
Minimum: 1
Maximum: 5
Mean: 4.5968


For our purposes, let rating < 3 mean negative, rating > 3 mean positive, and rating = 3 be neutral.

In [30]:
# Converting ratings to sentiment labels
sentiment = []
for r in ratings:
    if r < 3: sentiment.append(0)   # Negative
    elif r > 3: sentiment.append(1) # Positive
    else: sentiment.append('n')     # Neutral
reviewsData['sentiment'] = sentiment
try: del(reviewsData['reviews.rating'])
except: pass
reviewsData.head(3)

Unnamed: 0,id,reviews.doRecommend,reviews.text,reviews.title,sentiment
0,AVqVGZNvQMlgsOJE6eUY,False,I thought it would be as big as small paper bu...,Too small,n
1,AVqVGZNvQMlgsOJE6eUY,True,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach,1
2,AVqVGZNvQMlgsOJE6eUY,True,Didnt know how much i'd use a kindle so went f...,Great for the price,1


## Removing neutral sentiment rows

In [31]:
# Removing neutral rows
reviewsData = reviewsData[reviewsData['sentiment'] != 'n']
# Hence we can see the first row (which has neutral sentiment label) will be removed
reviewsData.head(3)

Unnamed: 0,id,reviews.doRecommend,reviews.text,reviews.title,sentiment
1,AVqVGZNvQMlgsOJE6eUY,True,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach,1
2,AVqVGZNvQMlgsOJE6eUY,True,Didnt know how much i'd use a kindle so went f...,Great for the price,1
3,AVqVGZNvQMlgsOJE6eUY,True,I am 100 happy with my purchase. I caught it o...,A Great Buy,1


# Text processing and tokenization

The processes are elaborated and explained in the notebook on text mining and tokenization.

## Tokenization & lemmatization

In [32]:
# Tokenization
#------------
"""
Lemmatization (along with entity extraction and POS tagging)
are also performed simultaneously.
"""
from spacy import load
# Loading the language model
tokenizer = load("en_core_web_sm")
tokenizedDocs = []

# Tokenizing each review
for r in reviewsData['reviews.text']:
    tokenizedDocs.append(tokenizer(r))
#========================
# Lemmatization
#------------
lemmatizedDocs = []

# Iterating through each tokenized text
for doc in tokenizedDocs:
    lemmatizedDoc = []
    
    # Iterating through each token in the tokenized text
    for token in doc:
        lemmatizedDoc.append(token.lemma_)
    
    # Adding the lemmatized text to the list
    lemmatizedDocs.append(lemmatizedDoc)

In [33]:
reviewsData['reviews.text (new)'] = lemmatizedDocs
reviewsData.head(3)

Unnamed: 0,id,reviews.doRecommend,reviews.text,reviews.title,sentiment,reviews.text (new)
1,AVqVGZNvQMlgsOJE6eUY,True,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach,1,"[this, kindle, be, light, and, easy, to, use, ..."
2,AVqVGZNvQMlgsOJE6eUY,True,Didnt know how much i'd use a kindle so went f...,Great for the price,1,"[do, not, know, how, much, I, would, use, a, k..."
3,AVqVGZNvQMlgsOJE6eUY,True,I am 100 happy with my purchase. I caught it o...,A Great Buy,1,"[I, be, 100, happy, with, my, purchase, ., I, ..."


# Machine learning approach

## Need for machine learning & deep learning

Natural language is extremely diverse in terms of concepts and expressions, and varies greatly depending on context. Furthermore, the number of permutations and combinations in which concepts and expressions may appear are practically infinite, and the effects of these permutations and combinations on meaning are often significant.
<br><br>
However, language is based on rules, which may be clearly defined or implicit in usage. Usage of language usually follows discernible patterns that human speakers learn to identify over time, through experience and education.
<br><br>
Due to the necessity to handle unstructured data (i.e. natural language texts), and due to the complexity of natural language that makes it difficult to structure it meaningfully using hard computing instructions, we will use **deep learning**, which uses artificial neural networks with multiple layers.

**Deep learning**: Deep learning is a subfield of machine learning. It refers to the usage of an ANN using more than one layer, hence the term 'deep', which refers to depth of network layers. Multiple network layers enhances the ANN's ability to adapt to more diverse and complex problems, since there are many more nodes, hence many more link weights and link structures to modify according to requirements.

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding
#------------------------
vocabularySize = len(tokenizer.word_index)
embeddingVectorLength = 32
#------------------------
model = Sequential()
model.add(Embedding(vocabularySize,
                    embeddingVectorLength,
                    input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50,
          dropout=0.5,
          recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 32)           141888    
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 200, 32)          0         
 lDropout1D)                                                     
                                                                 
 lstm_3 (LSTM)               (None, 50)                16600     
                                                                 
 dropout_3 (Dropout)         (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 51        
                                                                 
Total params: 158,539
Trainable params: 158,539
Non-trainable params: 0
________________________________________________

**_Embedding_**

The class 'Embedding' inherits from the classes 'Layer' and 'Module'. The arguments used above are
1. **input_dim (int ≥ 1)**: size of vocabulary
2. **output_dim (int ≥ 1)**: dimension of dense embedding
3. **input_lenght**: Length of input sequences, when it is constant. This argument is required if you are going to connect Flatten then Dense layers upstream (without it, the shape of the dense outputs cannot be computed).

In [37]:
import numpy as np
def intstr(x):
    return str(int(x))
reviewsData['sentiment'] = list(map(intstr, reviewsData['sentiment']))
sentimentLabels = np.array(reviewsData['sentiment'])
from tensorflow.keras.preprocessing.sequence import pad_sequences
paddedSequence = pad_sequences(encodedDocs, maxlen=200)
history = model.fit(paddedSequence,
                    sentimentLabels,
                    validation_split=0.2,
                    epochs=2,
                    batch_size=32)

Epoch 1/2


UnimplementedError:  Cast string to float is not supported
	 [[node binary_crossentropy/Cast
 (defined at /Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/losses.py:1797)
]] [Op:__inference_train_function_8100]

Errors may have originated from an input operation.
Input Source operations connected to node binary_crossentropy/Cast:
In[0] ExpandDims (defined at /Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/engine/compile_utils.py:677)

Operation defined at: (most recent call last)
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/runpy.py", line 194, in _run_module_as_main
>>>     return _run_code(code, main_globals, None,
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/runpy.py", line 87, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
>>>     app.start()
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 612, in start
>>>     self.io_loop.start()
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 149, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
>>>     self._run_once()
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
>>>     handle._run()
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/asyncio/events.py", line 81, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/tornado/ioloop.py", line 690, in <lambda>
>>>     lambda f: self._run_callback(functools.partial(callback, future))
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/tornado/ioloop.py", line 743, in _run_callback
>>>     ret = callback()
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/tornado/gen.py", line 787, in inner
>>>     self.run()
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/tornado/gen.py", line 748, in run
>>>     yielded = self.gen.send(value)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 365, in process_one
>>>     yield gen.maybe_future(dispatch(*args))
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/tornado/gen.py", line 209, in wrapper
>>>     yielded = next(result)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
>>>     yield gen.maybe_future(handler(stream, idents, msg))
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/tornado/gen.py", line 209, in wrapper
>>>     yielded = next(result)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 543, in execute_request
>>>     self.do_execute(
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/tornado/gen.py", line 209, in wrapper
>>>     yielded = next(result)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 306, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2877, in run_cell
>>>     result = self._run_cell(
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2923, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3146, in run_cell_async
>>>     has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3338, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "<ipython-input-37-9997704d280f>", line 8, in <module>
>>>     history = model.fit(paddedSequence,
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 1216, in fit
>>>     tmp_logs = self.train_function(iterator)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 878, in train_function
>>>     return step_function(self, iterator)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 867, in step_function
>>>     outputs = model.distribute_strategy.run(run_step, args=(data,))
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 860, in run_step
>>>     outputs = model.train_step(data)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 809, in train_step
>>>     loss = self.compiled_loss(
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/engine/compile_utils.py", line 201, in __call__
>>>     loss_value = loss_obj(y_t, y_p, sample_weight=sw)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/losses.py", line 141, in __call__
>>>     losses = call_fn(y_true, y_pred)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/losses.py", line 245, in call
>>>     return ag_fn(y_true, y_pred, **self._fn_kwargs)
>>> 
>>>   File "/Users/pranav/opt/anaconda3/lib/python3.8/site-packages/keras/losses.py", line 1797, in binary_crossentropy
>>>     y_true = tf.cast(y_true, y_pred.dtype)
>>> 