# RNN Training - Vader Labels

Training using the labels generated by the Vader Sentiment Analyzer.

In [287]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import pandas as pd
import string
import nltk
import re

In [278]:
dataset = pd.read_csv('/Users/sankeerthana/Documents/NTU/YEAR_4/SEM_2/CZ4034/IR-Sentiment-Analysis/IR-Sentiment-Analysis/skincare_dataset/product_reviews_labels.csv')
dataset.drop(columns='Unnamed: 0', axis=1, inplace=True)
dataset.head()

Unnamed: 0,product_id,review,vader_label
0,4669755719749,This makes my skin smooth and soft and is ligh...,0
1,4669755719749,Love the silky texture. It's very lightweight ...,0
2,4669755719749,I’ve been trying to find a moisturizer that wo...,0
3,4669755719749,HYRAM made me buy it and I’m on my second bott...,0
4,4669755719749,It's a nice moisturizer I personally will use ...,0


In [188]:
dataset.columns

Index(['product_id', 'review', 'vader_label'], dtype='object')

### Data Exploration - Class Balance

In [189]:
dataset['vader_label'].value_counts()

 0    14271
 1      751
-1       18
Name: vader_label, dtype: int64

From the above, we can say that the model is pretty skewed, and favours the neutral class as opposed to positive or negative. This means we might need to relook into the labels to see if they are actually of 'neutral' nature or add some of the crawled reviews that have the label as 1 and -1.

### Data Processing

In [190]:
#downloading the stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sankeerthana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

There are 179 words in the stopwords list provided by NLTK, which we will be using.

In [197]:
def processing_pipeline(data):
    #remove new line characters
    data = re.sub('\s+', ' ', data)

    #remove distracting single quotes
    data = str(re.sub("\'", "", data))

    #converting the text to lowercase
    data = data.lower()
    
    return data
    

In [273]:
#returns a list
def remove_punctuation(data):
    ans = []
    for word in data.split():
        x = word.strip(string.punctuation)
        ans.append(x)

    return ans

In [227]:
#Function to remove stopwords
def remove_stopwords(data):
    #as the input is already a list
    review = data
    filtered_words = []

    for word in review:
        if word not in stopwords.words('english'):
            filtered_words.append(word)

    return filtered_words

In [279]:
#Processing the dataset
for row in dataset.itertuples():
    data = row.review
   
    #run the processed_pipeline
    data = processing_pipeline(data)

    #remove punctuations
    data = remove_punctuation(data)
    
    #remove stopwords
    data = remove_stopwords(data)

    dataset.at[row.Index,'review'] = ' '.join(data)

In [286]:
dataset

Unnamed: 0,product_id,review,vader_label
0,4669755719749,makes skin smooth soft lightweight absorbs qui...,0
1,4669755719749,love silky texture lightweight hydrating leave...,0
2,4669755719749,i’ve trying find moisturizer would dry skin mo...,0
3,4669755719749,hyram made buy i’m second bottle love product ...,0
4,4669755719749,nice moisturizer personally use gets cold outs...,0
...,...,...,...
15035,260851073033,i’ve used lot creams stuff works wonders i’ve ...,0
15036,260851073033,wasn’t sure product first days skin loves goes...,0
15037,260851073033,nice,1
15038,260851073033,really like doesnt fragrance goes smooth stays...,0


### Splitting to Train-Test-Validation 

In [288]:
x = dataset['review']
y = dataset['vader_label']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.20)

In [289]:
print(f"Shape of Train: {len(x_train)}")
print(f"Shape of Test: {len(x_test)}")

Shape of Train: 12032
Shape of Test: 3008


### Comverting Sentences to Word Embeddings

In [282]:
MAX_WORDS = 5000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)




[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 

In [284]:
embedding_layer = Embedding(1000, 64)

### Building an RNN

In [None]:
model1 = Sequential()
model1.add(layers.Embedding(MAX_WORDS, 20)) #The embedding layer
model1.add(layers.LSTM(15,dropout=0.5)) #Our LSTM layer
model1.add(layers.Dense(3,activation='softmax'))


model1.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])

checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model1.fit(X_train, y_train, epochs=70,validation_data=(X_test, y_test),callbacks=[checkpoint1])