In [1]:
import pandas as pd
import string
from random import random
import re
from collections import Counter
from bs4 import BeautifulSoup
import json
from sklearn.utils import class_weight
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.optimizers import Adam
from keras.losses import binary_crossentropy
from keras.metrics import binary_accuracy

Using TensorFlow backend.


In [2]:
data = pd.read_csv("./Reviews.csv").fillna("")

In [3]:
print(data.columns)

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


In [4]:
imp_cols = set(data.columns)-{'Id','ProductId'}
print ('Number of duplicates:',data.duplicated(subset=imp_cols).sum())
data = data.drop_duplicates(subset=imp_cols)
print("Size of data : {}".format(data.shape))

Number of duplicates: 172145
Size of data : (396309, 10)


In [5]:
full_text = []
for s, t in zip(list(data["Summary"]), list(data["Text"])):
    temp = BeautifulSoup("{} {}".format(s,t).lower(), "lxml").getText()
    for punctuation in string.punctuation:
        if "\'" != punctuation:
            temp = temp.replace(punctuation, " {} ".format(punctuation))
    full_text.append(re.sub(" +", " ", temp))   

In [6]:
full_text[0]

'good quality dog food i have bought several of the vitality canned dog food products and have found them all to be of good quality . the product looks more like a stew than a processed meat and it smells better . my labrador is finicky and she appreciates this product better than most . '

In [7]:
vocab = list(Counter(" ".join(full_text).split()).items())

vocab = sorted(vocab, reverse=True, key=lambda x:x[1])

In [8]:
top_words = 5000
req_vocab = vocab[:top_words]

In [9]:
req_vocab = {item[0]:index+1 for index, item in enumerate(req_vocab)}

In [10]:
# lets save the dictionary 
with open("required_vocabulary.txt", "w", encoding="utf8") as f:
    f.write(json.dumps(req_vocab, indent=4, sort_keys=True))

In [11]:
# change the words to integers 

full_text_int = [[req_vocab[word] for word in reveiw.split() if word in req_vocab.keys()]\
             for reveiw in full_text]

In [12]:
# sanity check, the lengths will be different because we have take taken top "top_words"
# for our model

print(full_text[0])
print(len(full_text[0]))
print(full_text_int[0])
print(len(full_text_int[0]))

good quality dog food i have bought several of the vitality canned dog food products and have found them all to be of good quality . the product looks more like a stew than a processed meat and it smells better . my labrador is finicky and she appreciates this product better than most . 
288
[30, 169, 120, 65, 4, 22, 135, 341, 9, 2, 567, 120, 65, 215, 5, 22, 123, 36, 47, 7, 37, 9, 30, 169, 1, 2, 43, 675, 54, 28, 6, 2658, 62, 6, 1185, 485, 5, 8, 663, 91, 1, 15, 10, 2160, 5, 127, 11, 43, 91, 62, 156, 1]
52


In [13]:
# Here we can see that the word vitality is not found
req_vocab.get("vitality", "NOT FOUND!!")

'NOT FOUND!!'

In [14]:
# now we will make the sequence lenght equal 

default_seq_len = 600
full_text_int = sequence.pad_sequences(full_text_int, maxlen=default_seq_len)

In [15]:
# now we fill fix the labels
labels = [1 if item>3 else 0 for item in list(data["Score"].values)]

In [16]:
# Now we would prepare train and test data
whole_data = list(zip(full_text_int, labels))
class_0 = []; class_1 = []
for index, item in enumerate(whole_data):
    if item[1] == 0:
        class_0.append(item)
    else:
        class_1.append(item)

XY_train = []; XY_test = []
XY_train.extend(class_0[ : int(len(class_0)*.8)])
XY_train.extend(class_1[ : int(len(class_1)*.8)])
XY_test.extend(class_0[int(len(class_0)*.8) : ])
XY_test.extend(class_1[int(len(class_1)*.8) : ])

X_train = [item[0] for item in XY_train]
y_train = [item[1] for item in XY_train]
X_test = [item [0] for item in XY_test]
y_test = [item[1] for item in XY_test]

In [17]:
cw = class_weight.compute_class_weight("balanced", np.array([0, 1]), y_train)

In [18]:
X_train

[array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

# Model

In [19]:
embedding_size = 64

In [20]:
model = Sequential()

model.add(Embedding(top_words, embedding_size, input_length=default_seq_len))

model.add(LSTM(100, return_sequences=True))
model.add(Dropout(.2))

model.add(LSTM(100))
model.add(Dropout(.2))

model.add(Dense(1))
model.add(Activation("sigmoid"))

In [21]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 600, 64)           320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 600, 100)          66000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 600, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total para

In [23]:
adam = Adam(lr=3e-4)

model.compile(loss=binary_crossentropy, optimizer=adam, metrics=[binary_accuracy])

In [None]:
history = model.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test),\
                    class_weight=cw, verbose=1)