# Preprocessing
##### The necessary preprocessing steps were carried out

In [1]:
# Importing the relevant libraries,classes,etc
import os
import sys
import json
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
# Some more importing
from keras import backend as K
from sklearn.metrics import precision_recall_fscore_support

In [3]:
# Load the positive reviews; the JSON payload keeps its records under the 'root' key.
with open("pos_amazon_cell_phone_reviews.json") as f:
    data = json.load(f)
data = data['root']
df_pos = pd.DataFrame(data)
# Keep the raw summary and review text available as plain lists as well.
summaries_positive = [i['summary'] for i in data]
text_positive = [i['text'] for i in data]
# Positive sentiment is encoded as 1. The label list is sized from the data
# itself so the cell keeps working if the number of reviews in the file changes.
label = [1] * len(data)
df_pos['label'] = label

In [4]:
# Keep the first 1625 positive reviews: half of the 3250-review sample
# (roughly 3000 for training and 250 for testing) that is assembled later.
df_pos2 = df_pos.iloc[:1625]

In [5]:
# Load the negative reviews the same way as the positive ones:
# the records sit under the 'root' key of the JSON payload.
with open("neg_amazon_cell_phone_reviews.json") as f:
    data = json.load(f)['root']
df_neg = pd.DataFrame(data)
# Plain-list copies of the summary and full review text.
summaries_negative = [review['summary'] for review in data]
text_negative = [review['text'] for review in data]
# Negative sentiment is encoded as 0, one label per record.
label = [0] * len(data)
df_neg['label'] = label

In [6]:
# Keep the first 1625 negative reviews: the other half of the 3250-review
# sample that is later split into training and test sets.
df_neg2 = df_neg.iloc[:1625]

In [7]:
# Stack the positive and negative samples into one frame for the train/test split.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; as with
# append's default, each frame's original row index is kept.
df = pd.concat([df_pos2, df_neg2])

In [8]:
# Fixed seed so the shuffle produces the same split on every run.
seed = 123
# Every column except the last is a feature; the last column is the label.
X, Y = df.iloc[:, :-1], df.iloc[:, -1]
# Randomised split of roughly 3000 training rows and 250 test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.076,
    random_state=seed,
)

In [9]:
# Turn the review summaries into integer sequences for the LSTM.
# The tokenizer is fitted on the training summaries only and then applied
# separately to the training and test summaries.
summary = X_train['summary'].tolist()
summary_test = X_test['summary'].tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(summary)
vocab, vocab_counts = tokenizer.word_index, tokenizer.word_counts

sequences, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (summary, summary_test)
)

In [11]:
# Pad to the longest tokenised summary found in either the training or the test
# set. The value is computed from the data instead of being hard-coded, so it
# stays correct if the sample or the tokenizer changes.
longest_train = max(map(len, sequences), default=0)
longest_test = max(map(len, sequences_test), default=0)
pad_length = max(longest_train, longest_test)

In [12]:
# Pad every sequence to the common length pad_length so that the training and
# test inputs share the same number of columns.
new_X_train, new_X_test = (
    pad_sequences(batch, maxlen=pad_length) for batch in (sequences, sequences_test)
)

In [16]:
# One-hot encode the 0/1 sentiment labels. num_classes is given explicitly so
# both arrays always have two columns (matching the 2-unit output layer), even
# if a split happened to contain only one of the classes.
new_Y_train = to_categorical(y_train, num_classes=2)
new_Y_test = to_categorical(y_test, num_classes=2)

In [13]:
def precision(y_true, y_pred):
    """Batch-wise precision: the share of predicted positives that are correct.

    Both tensors are clipped to [0, 1] and rounded, so a prediction counts as
    positive once it rounds to 1. The value is computed over the current batch
    only; K.epsilon() in the denominator avoids division by zero when nothing
    is predicted positive.
    """
    correct_hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(correct_hits) / (K.sum(flagged) + K.epsilon())
 
def recall(y_true, y_pred):
    """Batch-wise recall: the share of actual positives that were predicted.

    Both tensors are clipped to [0, 1] and rounded before counting. The value
    is computed over the current batch only; K.epsilon() in the denominator
    avoids division by zero when the batch has no positive targets.
    """
    correct_hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(correct_hits) / (K.sum(actual_positives) + K.epsilon())

# Basic LSTM Model
#### A basic LSTM architecture was constructed. It uses the learned embedding layer provided by the Keras API, so the embeddings are specific to our data. The embedding layer takes as input the vocabulary size and the size of the output vector; the latter is a hyperparameter that needs to be tuned. We have used 32 (a fairly arbitrary choice).
#### There are 128 hidden units (LSTM cells); this is a hyperparameter that can be tuned.
#### The final output layer has 2 units(because we passed the data as a one hot encoding), one for each class negative(0) or positive(1)

In [20]:
model = Sequential([
    # Learned embedding: vocabulary size (+1 for the padding index 0) -> 32-dim vectors.
    Embedding(len(vocab) + 1, 32, input_length=pad_length),
    # 128 LSTM units with dropout on both the inputs and the recurrent state.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Two sigmoid outputs, one per one-hot class, each in the range 0 to 1.
    Dense(2, activation='sigmoid'),
])
# Accuracy plus the custom precision and recall metrics are reported every epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', precision, recall],
)

In [21]:
# Train for 10 epochs, holding out 16% of the training data for validation.
model.fit(
    new_X_train,
    new_Y_train,
    epochs=10,
    validation_split=0.16,
)

Train on 2522 samples, validate on 481 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20138f5be48>

In [22]:
# Score the trained model on the held-out test set.
scores = model.evaluate(new_X_test, new_Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
y_pred = model.predict(new_X_test)

# Collapse the one-hot targets and the two-unit predictions to class indices
# before computing precision, recall and F-score for the positive class.
true_classes = np.argmax(new_Y_test, axis=1)
predicted_classes = np.argmax(y_pred, axis=1)
p, r, f, _ = precision_recall_fscore_support(
    true_classes, predicted_classes, average='binary'
)

print("Precision: %.2f%%" % (p*100))
print("Recall: %.2f%%" % (r*100))
print("F-score: %.2f%%" % (f*100))

Accuracy: 89.07%
Precision: 83.61%
Recall: 93.58%
F-score: 88.31%


In [19]:
# Print the layer-by-layer architecture of the model with its output shapes
# and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 22, 32)            65760     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 148,450
Trainable params: 148,450
Non-trainable params: 0
_________________________________________________________________
