<a href="https://colab.research.google.com/github/romeshb/NLP-Review-Classification/blob/main/NLP_Classification_of_Amazon_Music_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [165]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import keras
import os
import cv2
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense, MaxPool2D
from keras.layers import Embedding

In [17]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/amazonreviews.tsv',sep='\t' )

# In the NLP model building process will follow these steps
- Converting Reviews to lowercase
- Remove Stopwords from Reviews
- Remove Punctuations from Reviews
- Get TDM (Term Document Frequency)
- Sampling 
- Training 
- Evaluation

In [20]:
# We'll create a copy of the original dataset
df1 = df.copy()

In [21]:
df1.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [38]:
df1.label.replace({"neg":0, "pos":1}, inplace = True)

In [121]:
df1.label.value_counts() # there's not much class imbalance. So we'll not handle the class imbalance for now.

0    5097
1    4903
Name: label, dtype: int64

In [39]:
# Now we remove the stop words and punctuations

# We'll lowercase all the text
df1.review = df1.review.str.lower()

# Remove Stopwords and Punctuations
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
def text_process(text_df_in):
    """
    removes punctions
    removes stopwords
    returns a clean list of text_df_out
    """
    nopunc = [char for char in text_df_in if char not in string.punctuation] # removes punctutions
    nopunc = ''.join(nopunc)
    
    text_df_out = [char for char in nopunc.split() if char not in stopwords.words('english')] # removes stopwords
    return text_df_out


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
# commmented out for running the second time
from sklearn.feature_extraction.text import CountVectorizer
bow_transformer = CountVectorizer(analyzer = text_process).fit(df1.review)

In [41]:
len(bow_transformer.vocabulary_)

40040

In [42]:
# commmented out for running the second time
reviews_bow = bow_transformer.transform(df1.review) # creating the TDM

## so ou TDM will act as my X  Variables

In [43]:
# sampling the data
from sklearn.model_selection import train_test_split

x_train,x_test, y_train,y_test=  train_test_split(reviews_bow, df1.label, test_size= 0.2)

### Random Forrest

In [44]:
# Random Forrest Classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(x_train,y_train) # Training the model

pred_rd = rf.predict(x_test) # making predictions


In [48]:
from sklearn.metrics import confusion_matrix , accuracy_score

In [53]:
rf_conf = confusion_matrix(y_test , pred_rd)
rf_conf

array([[862, 154],
       [162, 822]])

In [73]:
rf_score = rf.score(x_test, y_test)
rf_score

0.842

In [98]:
print(classification_report(y_test, pred_rd))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      1016
           1       0.84      0.84      0.84       984

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



### Naive Bayes

In [58]:
from sklearn.naive_bayes import MultinomialNB

In [60]:
nb = MultinomialNB()
nb.fit(x_train, y_train)

MultinomialNB()

In [65]:
pred_nb = nb.predict(x_test)

In [66]:
nb_conf = confusion_matrix(y_test , pred_nb)
nb_conf

array([[870, 146],
       [172, 812]])

In [63]:
nb.score(x_train,y_train)

0.948125

In [64]:
nb_score = nb.score(x_test,y_test)
nb_score

0.841

In [96]:
from sklearn.metrics import classification_report

In [99]:
print(classification_report(y_test, pred_nb))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1016
           1       0.85      0.83      0.84       984

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



### Logistic Regression

In [81]:
from sklearn.linear_model import LogisticRegression

In [82]:
lr =  LogisticRegression()

In [83]:
lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [86]:
print(f"Accurancy on Train Data, using Logistic Regeression {lr.score(x_train,y_train)*100}")

Accurancy on Train Data, using Logistic Regeression 99.8125


In [102]:
lr_score = lr.score(x_test, y_test)
lr_score

0.8485

In [87]:
lr_conf = confusion_matrix(lr.predict(x_test),y_test)
lr_conf

array([[864, 151],
       [152, 833]])

In [92]:
pred_lr = lr.predict(x_test)

In [100]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1016
           1       0.85      0.85      0.85       984

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



In [103]:
from prettytable import PrettyTable

model_tab = PrettyTable(['Model','Score'])

model_tab.add_row(['RandomForestClassifier',rf_score])
model_tab.add_row(['naive_bayes',nb_score])
model_tab.add_row(['LogisticRegression',lr_score])
print(model_tab)

+------------------------+--------+
|         Model          | Score  |
+------------------------+--------+
| RandomForestClassifier | 0.842  |
|      naive_bayes       | 0.841  |
|   LogisticRegression   | 0.8485 |
+------------------------+--------+


# We'll check the polarity Score using pretrained sentiment classifiers such as Vader Lexicon, and Text Blob

In [104]:
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [105]:
# Lowercase the text
df.review = df.review.str.lower()

### Vader - Lexicon (Sentiment Intensity Analyzer)

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically designed to detect sentiments expressed in social media.


pos, neu, and neg scores are ratios for proportions of text that fall in each category (so these should all add up to be 1 or close to it with float operation).

The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and 1 (most extreme positive). This is the most useful metric if you want a single measure of sentiment for a given sentence.

In [106]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




True

In [107]:
sent = SentimentIntensityAnalyzer()
sent.polarity_scores(" this drink was awesome: I had a heavenly feeling by taking a sip") # example text

{'compound': 0.8625, 'neg': 0.0, 'neu': 0.422, 'pos': 0.578}

In [108]:
score_com = []
score_pos = []
score_neg = []


from tqdm import tqdm
for i in tqdm(range(0,df1.shape[0])):
    score = sent.polarity_scores(df1.iloc[i][1])
    score_com.append(score['compound'])
    score_pos.append(score['pos'])
    score_neg.append(score['neg'])

100%|██████████| 10000/10000 [00:14<00:00, 713.01it/s]


In [109]:
df['Positive_Score'] = score_pos
df['Negative_Score'] = score_neg
df['Compound_Score'] = score_com

In [110]:
df.head()

Unnamed: 0,label,review,Positive_Score,Negative_Score,Compound_Score
0,pos,stuning even for the non-gamer: this sound tra...,0.243,0.088,0.9454
1,pos,the best soundtrack ever to anything.: i'm rea...,0.145,0.018,0.8957
2,pos,amazing!: this soundtrack is my favorite music...,0.268,0.04,0.9858
3,pos,excellent soundtrack: i truly like this soundt...,0.295,0.09,0.9814
4,pos,"remember, pull your jaw off the floor after he...",0.254,0.0,0.9781


### Text blob

TextBlob is a simple Python library for processing textual data and performing tasks such as sentiment analysis, text pre-processing, etc.

The sentiment property provides of tuple with polarity and subjectivity scores. 

Polarity, in simple terms, means emotions expressed in a sentence – negative vs. positive, The polarity score is a float within the range [-1.0, 1.0]

Subjectivity expresses some personal feelings, views, or beliefs – objective vs. subjective, subjectivity is a float within the range [0.0, 1.0], where 0 is very objective and 1 is very subjective.

In [127]:
from textblob import TextBlob

In [131]:
# Example using TextBlob
# earth is round is fact "is it correct?" context is required to answer , ie. subjective
text = " okay, good" 
output = TextBlob(text)
output.sentiment

Sentiment(polarity=0.6, subjectivity=0.55)

In [180]:
score_pol = []
score_sub = []
from tqdm import tqdm
for i in tqdm(range(0,df.shape[0])):
    output = TextBlob(df.iloc[i][1]).sentiment
    
    score_pol.append(output[0])
    score_sub.append(output[1])

100%|██████████| 10000/10000 [00:09<00:00, 1024.01it/s]


In [134]:
df['Polarity']= score_pol
df["Subjectivity"]= score_sub

In [135]:
df.head()

Unnamed: 0,label,review,Positive_Score,Negative_Score,Compound_Score,Polarity,Subjectivity
0,pos,stuning even for the non-gamer: this sound tra...,0.243,0.088,0.9454,-0.021875,0.55
1,pos,the best soundtrack ever to anything.: i'm rea...,0.145,0.018,0.8957,0.261111,0.51746
2,pos,amazing!: this soundtrack is my favorite music...,0.268,0.04,0.9858,0.274691,0.545988
3,pos,excellent soundtrack: i truly like this soundt...,0.295,0.09,0.9814,0.272727,0.463636
4,pos,"remember, pull your jaw off the floor after he...",0.254,0.0,0.9781,0.324802,0.520317


In [136]:
# We'll analyse the sentiment scores calculatd using SentimentIntensityAnalyzer from Vader Lexicon

df.Compound_Score.mean()

0.3350300499999999

In [137]:
df.describe()

Unnamed: 0,Positive_Score,Negative_Score,Compound_Score,Polarity,Subjectivity
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.151041,0.076004,0.33503,0.144995,0.532418
std,0.101915,0.073066,0.675263,0.247205,0.149731
min,0.0,0.0,-0.9945,-1.0,0.0
25%,0.079,0.016,-0.29765,0.0,0.447619
50%,0.135,0.0615,0.6697,0.15,0.533864
75%,0.207,0.115,0.9137,0.298359,0.621055
max,0.634,0.611,0.9987,1.0,1.0


In [138]:
df.label.value_counts() # there's not much class imbalance. So we'll not handle the class imbalance for now.

neg    5097
pos    4903
Name: label, dtype: int64

In [139]:
len(df.Compound_Score[df.Compound_Score > 0]) # Pos

6793

In [140]:
len(df.Compound_Score[df.Compound_Score < 0]) # Neg

3066

In [141]:
len(df.Polarity[df.Polarity > 0]) # Positive

7410

In [142]:
len(df.Polarity[df.Polarity < 0]) # Negative

2428

As we can check sentiment Intensity Analyser is not very good in classifing if our data is Positive or Negative
- Though this pretrained sentiment analysis techniques are usefull for unlabled data.

# Now as we know LSTM's are good for long sequecial text data.
# We'll build model using LSTM.

In [143]:
df.head()

Unnamed: 0,label,review,Positive_Score,Negative_Score,Compound_Score,Polarity,Subjectivity
0,pos,stuning even for the non-gamer: this sound tra...,0.243,0.088,0.9454,-0.021875,0.55
1,pos,the best soundtrack ever to anything.: i'm rea...,0.145,0.018,0.8957,0.261111,0.51746
2,pos,amazing!: this soundtrack is my favorite music...,0.268,0.04,0.9858,0.274691,0.545988
3,pos,excellent soundtrack: i truly like this soundt...,0.295,0.09,0.9814,0.272727,0.463636
4,pos,"remember, pull your jaw off the floor after he...",0.254,0.0,0.9781,0.324802,0.520317


In [144]:
df.label.value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [145]:
df.label.replace({'neg': 0,'pos':1}, inplace = True)

In [146]:
df.label.value_counts()

0    5097
1    4903
Name: label, dtype: int64

In [147]:
df_x = df.iloc[:,1]
df_y = df.iloc[:,0]

In [148]:
df_x

0       stuning even for the non-gamer: this sound tra...
1       the best soundtrack ever to anything.: i'm rea...
2       amazing!: this soundtrack is my favorite music...
3       excellent soundtrack: i truly like this soundt...
4       remember, pull your jaw off the floor after he...
                              ...                        
9995    a revelation of life in small town america in ...
9996    great biography of a very interesting journali...
9997    interesting subject; poor presentation: you'd ...
9998    don't buy: the box looked used and it is obvio...
9999    beautiful pen and fast delivery.: the pen was ...
Name: review, Length: 10000, dtype: object

In [149]:
df_x[2] # checking how a random review looks like

'amazing!: this soundtrack is my favorite music of all time, hands down. the intense sadness of "prisoners of fate" (which means all the more if you\'ve played the game) and the hope in "a distant promise" and "girl who stole the star" have been an important inspiration to me personally throughout my teen years. the higher energy tracks like "chrono cross ~ time\'s scar~", "time of the dreamwatch", and "chronomantique" (indefinably remeniscent of chrono trigger) are all absolutely superb as well.this soundtrack is amazing music, probably the best of this composer\'s work (i haven\'t heard the xenogears soundtrack, so i can\'t say for sure), and even if you\'ve never played the game, it would be worth twice the price to buy it.i wish i could give it 6 stars.'

In [150]:
from sklearn.model_selection import train_test_split

In [151]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.2)

In [152]:
print(x_train.shape,
      x_test.shape,
      y_train.shape,
      y_test.shape)

(8000,) (2000,) (8000,) (2000,)


In [153]:
from tensorflow.keras.utils import to_categorical 

In [154]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [155]:
y_test

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [156]:
from keras.preprocessing.text import Tokenizer # used for tokenization
from keras.preprocessing.sequence import pad_sequences # used for 

In [157]:
max_num_words = 10000
seq_len = 50
embedding_size = 100

In [158]:
tokenizer = Tokenizer(num_words= max_num_words)
tokenizer.fit_on_texts(df.review)

In [159]:
x_train = tokenizer.texts_to_sequences(x_train) # will convert the text to sequences of IDs
x_train = pad_sequences(x_train, maxlen = seq_len)

In [160]:
x_test  = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen = seq_len)

In [172]:
model = Sequential() # initialize the network

model.add(Embedding (input_dim = max_num_words,
                     input_length = seq_len,
                     output_dim = embedding_size))

In [173]:
from keras.layers import LSTM

model.add(LSTM(10))
model.add(Dense(2,activation ='softmax'))
model.compile(optimizer= 'adam',loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [174]:
model.fit(x_train, y_train, epochs= 5, batch_size= 32, validation_split =0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3dc3262ad0>

In [184]:
pred = model.predict(x_test)
pred_class = np.argmax(pred,1)
pred_class

array([0, 0, 1, ..., 1, 1, 1])

In [185]:
from sklearn.metrics import  confusion_matrix
tab = confusion_matrix(pred_class, np.argmax(y_test,1))
tab

array([[764, 168],
       [238, 830]])

In [186]:
from sklearn.metrics import accuracy_score
lstm_score = accuracy_score(pred_class, np.argmax(y_test,1))
lstm_score

0.797

In [178]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 100)           1000000   
                                                                 
 lstm_3 (LSTM)               (None, 10)                4440      
                                                                 
 dense_1 (Dense)             (None, 2)                 22        
                                                                 
Total params: 1,004,462
Trainable params: 1,004,462
Non-trainable params: 0
_________________________________________________________________


In [187]:
model_tab.add_row(['LSTM',lstm_score])
print(model_tab)

+------------------------+--------+
|         Model          | Score  |
+------------------------+--------+
| RandomForestClassifier | 0.842  |
|      naive_bayes       | 0.841  |
|   LogisticRegression   | 0.8485 |
|          LSTM          | 0.797  |
+------------------------+--------+


# We can check that Random Forrest was little more accurate on our data,
- Considering Computation time to build the random forrest model was less, but preprocessing task such as generating bag of words was computational heavy.
- As LSTM are good for long sequence data, if the Reviews are too lengthy we can try using LSTM. Where bagofwords vocabulary is very large.

- Random forrest considering a good model from our data, we can further hypertune the parameters and get more better model for this data.

- Some conclusions were refered in this project from the study done at
https://kth.diva-portal.org/smash/get/diva2:1334069/FULLTEXT01.pdf

-- END of Notebook --