In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint

import csv
import numpy as np
import re
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


<h1>Important Functions</h1>  

The code cells in this section define the helper functions for data loading, model construction and evaluation metrics.  
**Run the import cell and every function-definition cell in this section before running Part 1 or Part 2**  
  
**Read review dataset**  
Fields: ID, Product_ID, User_ID, Profile, HN, HD, Score, Time, Summary, Text  
Fields needed: Score, Summary, text  
  
**Tokenization**  
Build a vocabulary of max_words = 10000 most used words  
Clip sentences to maxlen = 200 words  
Encode each word as its integer index in the vocabulary (shorter sequences are left-padded with 0)

In [2]:
# load review data
# text_type: 1 -> full review text; any other value (the notebook passes 2) -> generated summaries
# max_words is the vocabulary size shared by the tokenizer and the Embedding layers
max_words = 10000

def data_loader_reviews(text_type, maxlen = 200):
    """Read Reviews_summaries.csv and convert the chosen text column to padded index sequences.

    Args:
        text_type: 1 selects the full review text (CSV column 10); any other
            value selects the generated summary (CSV column 11).
        maxlen: length that full-text sequences are padded/truncated to.
            Summaries are padded to the longest summary instead, so this
            argument is ignored when text_type != 1.

    Returns:
        A tuple (classes, score, sequences):
            classes   -- list with 1 where the score is >= 3 and 0 otherwise
            score     -- list with the integer score of every row (CSV column 7)
            sequences -- the array returned by pad_sequences for the tokenized text
    """
    text = []
    classes = []
    score = []
    # newline='' is what the csv module expects; without it, line breaks inside
    # quoted fields (common in free-text reviews) may be interpreted incorrectly.
    with open("Reviews_summaries.csv", 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)  # skip the header row (no-op on an empty file)
        for row in csvreader:
            rating = int(row[7])
            classes.append(1 if rating >= 3 else 0)
            text.append(row[10] if text_type == 1 else row[11])
            score.append(rating)

    # Use the module-level max_words (no local shadow) so the tokenizer vocabulary
    # always matches the input dimension of the Embedding layers built from it.
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    if text_type == 1:
        sequences = sequence.pad_sequences(sequences, maxlen=maxlen)
    else:
        sequences = sequence.pad_sequences(sequences)

    # Sanity-check sample; guarded so a file with fewer than two rows does not crash.
    if len(sequences) > 1:
        print(sequences[1])
    return classes, score, sequences

**Define Model**  
Define two RNN models  
  
*Model 1:*  
    a) sequential model with first layer as embedding layer followed by an LSTM layer and a sigmoid output for classifying review as positive-negative.  
    b) sequential model with first layer as embedding layer followed by an LSTM layer and a softmax output with 2 nodes for classifying review as positive-negative.  
  
*Model 2:*  
sequential model with first layer as embedding layer followed by an LSTM layer and a softmax output with 5 nodes for classifying review on a scale of 1-5.  

In [3]:
def pos_neg_model(case):
    """Build and compile the Embedding -> LSTM positive/negative classifier.

    Args:
        case: 1 builds a single sigmoid output trained with binary
            cross-entropy; any other value builds a two-node softmax output
            trained with categorical cross-entropy.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    # Pick the output head configuration first, then assemble the network once.
    if case == 1:  # sigmoid output
        out_units, out_activation, loss_name = 1, 'sigmoid', 'binary_crossentropy'
    else:
        out_units, out_activation, loss_name = 2, 'softmax', 'categorical_crossentropy'

    net = Sequential()
    net.add(Embedding(max_words, 32))
    net.add(LSTM(32))
    net.add(Dense(out_units, activation=out_activation))
    net.compile(optimizer='rmsprop', loss=loss_name, metrics=['acc'])
    net.summary()
    return net

def rating_model():
    """Build and compile the Embedding -> LSTM model with a 5-node softmax output.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    net = Sequential([
        Embedding(max_words, 32),
        LSTM(32),
        Dense(5, activation='softmax'),
    ])
    net.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
    net.summary()
    return net

In [4]:
def precision_recall(predicted, actual):
    """Compute precision, recall and accuracy for binary predictions.

    Args:
        predicted: iterable of predicted labels or probabilities; each value is
            rounded with the built-in round() before comparison.
        actual: iterable of ground-truth labels (1 = positive, 0 = negative).

    Returns:
        A tuple (precision, recall, accuracy). A ratio whose denominator is
        zero (no predicted positives, no actual positives, or empty input) is
        reported as 0.0 instead of raising ZeroDivisionError.
    """
    tp = tn = fp = fn = 0
    for p, a in zip(predicted, actual):
        p = round(p)

        if a == 1 and p == 1:
            tp += 1
        elif a == 1 and p == 0:
            fn += 1
        elif a == 0 and p == 1:
            fp += 1
        else:
            # Every remaining combination is counted as a true negative.
            tn += 1

    def _ratio(numerator, denominator):
        # Undefined ratios (0/0) are reported as 0.0 so evaluation never crashes
        # on a degenerate model, e.g. one that predicts only the negative class.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    return precision, recall, accuracy

def MAE(predicted, actual):
    """Return the mean absolute error between predicted and actual values.

    The absolute differences of the zipped pairs are summed and divided by
    len(predicted).
    """
    total_error = sum(abs(a - p) for p, a in zip(predicted, actual))
    return total_error / len(predicted)

def accuracy(predicted, actual):
    """Return the fraction of predictions that exactly equal the actual value.

    Matches are counted over the zipped pairs and divided by len(predicted).
    """
    hits = sum(1 for p, a in zip(predicted, actual) if a == p)
    return hits / len(predicted)

___
<h1>Part1: Classifying reviews as positive/negative</h1>

**Train** model for predicting pos/neg  
Steps:  
1. Load and tokenize data
2. Split data into training and testing set in ratio 4:1
3. Select the model type: `model_type = 1` (default, preferred) uses a sigmoid output layer; `model_type = 2` uses a two-node softmax output layer.
4. Train the model
5. Save the weights
6. If not training, then load the weights. Model weights are provided for type = 1 and are selected by default
7. Predict class for test data and evaluate

*Complete Part1 takes around 2.5 hours to execute on a 1070TI GPU*

In [19]:
# Split dataset to test and train
review_classes, review_rating, review_text = data_loader_reviews(text_type = 1)
# A fixed random_state makes the 4:1 split reproducible. Without it, restarting the
# kernel and loading saved weights would evaluate on a "test" set that overlaps the
# data those weights were trained on. Weights trained before this seed was introduced
# should be retrained for a clean evaluation.
text_train, text_test, class_train, class_test = train_test_split(
    review_text, review_classes, test_size=0.20, random_state=42)
print("Train Set:", len(text_train))
print("Test Set:", len(text_test))

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
   38  376 2210   23 5763 1961 1083    1 1083   78  252  193 1054 3613
   18  209   36    9   21   71 3190   31   36    1 1593 2218    5 7946
    1 

In [46]:
# model_type: 1 -> sigmoid output, 2 -> softmax with two nodes output
model_type = 1
model = pos_neg_model(model_type)

# Keep only the weights of the epoch with the best validation accuracy.
filepath="models/weights-best.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

if model_type != 1:
    # Softmax head: targets must be one-hot rows, built by indexing a 2x2 identity matrix.
    print("One Hot Encode classes")
    classes = np.eye(2, dtype=int)[class_train]
    print("Start Training")
    history = model.fit(text_train, classes, epochs=10, batch_size=128, validation_split=0.33, callbacks=callbacks_list)
else:
    # Sigmoid head: the 0/1 labels are used directly.
    history = model.fit(text_train, class_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=callbacks_list)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 66        
Total params: 328,386
Trainable params: 328,386
Non-trainable params: 0
_________________________________________________________________
One Hot Encode classes
Start Training
Train on 304691 samples, validate on 150072 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.91637, saving model to weights-best.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.91637 to 0.92867, saving model to weights-best.h5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.92867
Epoch 4/10

Epoch 00004: val_acc improved from 0.9286

In [47]:
# SAVE WEIGHTS
# The file name records which output head (sigmoid or softmax) the weights belong to.
model.save_weights(
    "models/rnn-pos-neg-model-sigmoid.h5" if model_type == 1
    else "models/rnn-pos-neg-model-softmax.h5"
)

In [48]:
# LOAD WEIGHTS
# Rebuild the architecture matching model_type, then restore its saved weights.
model = pos_neg_model(1 if model_type == 1 else 2)
model.load_weights(
    "models/rnn-pos-neg-model-sigmoid.h5" if model_type == 1
    else "models/rnn-pos-neg-model-softmax.h5"
)

**EVALUATION** of pos-neg classification

In [57]:
if model_type != 1:
    # Softmax head: evaluate against one-hot targets (rows of a 2x2 identity matrix).
    print("One Hot Encode classes")
    classes = np.eye(2, dtype=int)[class_test]
    scores = model.evaluate(text_test, classes)
else:
    # Sigmoid head: evaluate against the raw 0/1 labels.
    scores = model.evaluate(text_test, class_test)

# scores holds the values returned by model.evaluate (loss followed by the 'acc' metric).
print(scores)

One Hot Encode classes
[0.14725378104959091, 0.9409715808691633]


In [55]:
pred = model.predict(text_test)
print(len(pred))
if model_type == 1:
    # Sigmoid head: one probability per sample. argmax over a single column is
    # always 0, so threshold instead. '> 0.5' matches the round() applied inside
    # precision_recall (round(0.5) == 0).
    preds_classes = (pred.ravel() > 0.5).astype(int)
else:
    # Softmax head: the predicted class is the index of the larger of the two outputs.
    # The message is kept so the cell output matches the recorded run; the labels
    # themselves are compared as plain 0/1 values, no one-hot targets are needed.
    print("One Hot Encode classes")
    preds_classes = np.argmax(pred, axis=-1)

# Single call with a consistently named variable (the original sigmoid branch
# referenced an undefined name and raised NameError).
results = precision_recall(preds_classes, class_test)

print("Precision:", results[0])
print("Recall:", results[1])
print("Accuracy:", results[2])

113691
One Hot Encode classes
Precision: 0.9491320305525246
Recall: 0.9837248753405644
Accuracy: 0.9409715808639206


**RESULTS** for LSTM with sigmoid activation in last layer  
Precision: 0.9709278222295925  
Recall: 0.9631316506451447  
Accuracy: 0.9437862275817787  

**RESULTS** for LSTM with softmax last layer  
Precision: 0.9491320305525246  
Recall: 0.9837248753405644  
Accuracy: 0.9409715808639206  

___
<h1>Part2: Rating prediction for reviews (1 - 5)</h1>

**Train** model for predicting ratings  
Steps:  
1. Load and tokenize data. text_type = 1 (default, preferred) for full text reviews, 2 for summaries
2. Split data into training and testing set in ratio 4:1
3. Select the model type.
4. Train the model
5. Save the weights
6. If not training, then load the weights. Model weights are provided and are selected by default
7. Predict class for test data and evaluate

*Complete Part2 takes around 4 hours to execute on a 1070TI GPU*

In [31]:
# Split dataset to test and train
text_type = 2  # 1 -> full length review, 2 -> short summaries
# NOTE: maxlen only applies when text_type == 1; summaries are padded to the longest summary.
review_classes, review_rating, review_text = data_loader_reviews(text_type = text_type, maxlen = 100)
# A fixed random_state makes the 4:1 split reproducible. Without it, restarting the
# kernel and loading saved weights would evaluate on a "test" set that overlaps the
# data those weights were trained on. Weights trained before this seed was introduced
# should be retrained for a clean evaluation.
text_train, text_test, class_train, class_test = train_test_split(
    review_text, review_rating, test_size=0.20, random_state=42)
print("Train Set:", len(text_train))
print("Test Set:", len(text_test))

[  0   0   0   0   0   0 176]
Train Set: 454763
Test Set: 113691


In [22]:
model = rating_model()

# Keep only the weights of the epoch with the best validation accuracy.
filepath="rating-weights-best.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# one hot encode classes: rating r (1-5) becomes row r-1 of a 5x5 identity matrix
print("One Hot Encode classes")
classes = np.eye(5, dtype=int)[np.asarray(class_train) - 1]
print("Start Training")
history = model.fit(text_train, classes, epochs=10, batch_size=128, validation_split=0.2, callbacks=callbacks_list)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 165       
Total params: 328,485
Trainable params: 328,485
Non-trainable params: 0
_________________________________________________________________
One Hot Encode classes
Start Training
Train on 363810 samples, validate on 90953 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.72591, saving model to rating-weights-best.h5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.72591
Epoch 3/10

Epoch 00003: val_acc improved from 0.72591 to 0.74158, saving model to rating-weights-best.h5
Epoch 4/10

Epoch 00004: val_acc improve

In [None]:
# SAVE WEIGHTS
# Writes the current rating model's weights to models/rnn-rating-model.h5,
# the same file the LOAD MODEL cell reads back.
model.save_weights("models/rnn-rating-model.h5")

In [32]:
# LOAD MODEL
# Rebuild the rating architecture (rating_model() also prints its summary),
# then restore the weights saved in models/rnn-rating-model.h5.
model = rating_model()
model.load_weights("models/rnn-rating-model.h5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
lstm_6 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 165       
Total params: 328,485
Trainable params: 328,485
Non-trainable params: 0
_________________________________________________________________


In [33]:
pred = model.predict(text_test)
print(len(pred))

print("One Hot Encode classes")
# argmax gives the 0-based index of the strongest softmax output; +1 maps it back to a 1-5 rating.
pred_classes = np.argmax(pred, axis=-1) + 1
print(pred_classes)

# Ratings are multi-class, so they are scored with MAE and exact-match accuracy
# rather than the binary precision_recall helper.
mae = MAE(pred_classes, class_test)
acc = accuracy(pred_classes, class_test)

print("MAE:", mae)
print("Accuracy:", acc)

113691
One Hot Encode classes
[5 5 5 ... 3 5 1]
MAE: 0.5979453078959637
Accuracy: 0.6821384278438927


**RESULTS** for LSTM with full text reviews  
MAE: 0.3116341662928464  
Accuracy: 0.7860428705878214  
  
**RESULTS** for LSTM with summary review  
MAE: 0.5979453078959637  
Accuracy: 0.6821384278438927  