# DATA CLEANSING
---
##   PART1: DATASET HANDLING

In [17]:
import pandas as pd
import numpy as np
# jupyter run .\DataHandlerWord2Vec.ipynb

''' Importing dataset using pandas '''

dataset = pd.read_csv('amazon_reviews.csv')
# Getting "reviewText" column
reviewText = dataset['reviewText']

dataset.columns

Index(['Unnamed: 0', 'reviewerName', 'overall', 'reviewText', 'reviewTime',
       'day_diff', 'helpful_yes', 'helpful_no', 'total_vote',
       'score_pos_neg_diff', 'score_average_rating', 'wilson_lower_bound'],
      dtype='object')

In [5]:
import nltk
from nltk.corpus import stopwords
from string import punctuation

''' Set punctuations and stop words '''

# Downloading stopwords from nltk library
nltk.download('stopwords')

# Remove the NaN values from the dataset
reviewText = reviewText.dropna()

# Setting stopwords
stop_words = set(stopwords.words('english'))
# Except "no" from the stopwords
stop_words.remove('no')

# Set punctuations
punctuations = set(punctuation)
# Exceptions on punctuations
punctuations.remove('.' and ',' and '!' and '?')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
''' Remove punctuations and stop words from the reviews '''

# Applying removed punctuations
reviewText = reviewText.apply(lambda x: ''.join([word for word in x 
                                                  if word not in punctuations
                                                  and word not in stop_words]))
dataset['reviewText'] = reviewText

In [11]:
# run the pip command on terminal or here
# %pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz --user
import spacy

lem = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def spacy_text_lemmatizer(text):
    ''' Spacy Lemmatizer '''
    text = str(text).lower()
    text = lem(text)
    # convert spacy.tokens.doc.Doc to str
    text = " ".join([token.lemma_ for token in text])
    
    return text

In [10]:
import tensorflow_hub as hub
''' Load pretrained Word2Vec from tensorflow_hub '''

word2vecPretrained = hub.load("https://tfhub.dev/google/Wiki-words-250/2")

In [149]:
def get_word2vec_enc(reviews):
    """ get word2vec value for each word in sentence and join so we use as RNN input """
    encoded_reviews = []
    for review in reviews:
        if type(review) == float or review == "": # if review is empty, make it " "
            review = " " 
        tokens = review.split(" ")
        word2vec_embedding = word2vecPretrained(tokens) # use word2vec model
        encoded_reviews.append(word2vec_embedding)
    return encoded_reviews
        
def get_padded_encoded_reviews(encoded_reviews):
    """ make all the encoded sentences same length (50)"""
    max_length = 50 # max number of words in a sentence
    padded_reviews_encoding = []
    for enc_review in encoded_reviews:
        if len(enc_review) > max_length: # if length is bigger than 50 just truncate
            enc_review = enc_review[:max_length]
        zero_padding_cnt = max_length - enc_review.shape[0]
        pad = np.zeros((1, 250))
        for i in range(zero_padding_cnt):
            enc_review = np.concatenate((pad, enc_review), axis=0)
        padded_reviews_encoding.append(enc_review)
    return padded_reviews_encoding

def rating_encode(rating):
    """ return one hot encoding for rating value """
    if rating == '5.0':
        return [1,0]
    else: # rating == '1.0'
        return [0,1]
    
def getTrainAndTestData(dataset):
    ''' Split the data to train and test'''
    # 90% of the dataset
    train = dataset.sample(frac=0.9, random_state=100)
    # 10% of the dataset
    test = dataset.drop(train.index)
    return train, test

In [151]:
def preprocess(df):
    """ encode text value to numeric value (except 2.0 3.0 and 4.0 rating) """
    df = df[df['overall'] != 2.0]
    df = df[df['overall'] != 3.0]
    df = df[df['overall'] != 4.0]
    # shape of dataframe is (3750, 12)

    # apply spacy_text_lemmatizer to every review
    df['reviewText'] = df['reviewText'].apply(spacy_text_lemmatizer)
    reviews = df['reviewText'].tolist()
    
    # apply word2vec encoder to every review
    encoded_reviews = get_word2vec_enc(reviews)
    # apply padding to every encoded review to make them same length
    padded_encoded_reviews = get_padded_encoded_reviews(encoded_reviews)
    
    # encoded rating
    rates = df['overall'].tolist()
    # make rates list of string to use it in rating_encode()
    rates = [str(rate) for rate in rates]
    
    encoded_rating = [rating_encode(rate) for rate in rates]
    X = np.array(padded_encoded_reviews)
    Y = np.array(encoded_rating)
    
    return X, Y

trainData, testData = getTrainAndTestData(dataset)
train_X, train_Y = preprocess(trainData)

---
## PART2: DEEP LEARNING MODEL

In [152]:
import tensorflow
from tensorflow.python.keras.layers import Dense, LSTM
from tensorflow.python.keras.engine.sequential import Sequential

''' Build model using RNN + LSTM '''

model = Sequential()
model.add(LSTM(32))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Train
print('Fitting Train Data')
model.fit(train_X, train_Y, epochs=5)
model.summary()

# Test
print('Evaluating Test Data')
test_X, test_Y = preprocess(testData)
score, acc = model.evaluate(test_X, test_Y, verbose=2)
print('Test score:' , score, "\nTest Accuracy:", acc)

Fitting Train Data
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_36 (LSTM)               (None, 32)                36224     
_________________________________________________________________
dense_36 (Dense)             (None, 2)                 66        
Total params: 36,290
Trainable params: 36,290
Non-trainable params: 0
_________________________________________________________________
Evaluating Test Data
13/13 - 1s - loss: 0.1872 - accuracy: 0.9543
Test score: 0.18716658651828766 
Test Accuracy: 0.9543269276618958
