In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint

import csv
import numpy as np
import re
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [None]:
# IMDB dataset: To make our review dataset compatible
max_features = 10000
maxlen = 500
batch_size = 32
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')
print(input_train[1])

Read review dataset  
Fields: ID, Product_ID, User_ID, Profile, HN, HD, Score, Time, Summary, Text  
Fields needed: Score, Summary, text

In [2]:
# data = []
score = []
# summary = []
text = []
review_class = []
vocabulary = dict()
num_words = 1
maxlen = 0
head_line = True
with open("Reviews.csv", 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        if head_line:
            head_line = False
            continue
        # data.append(row)
        score.append(int(row[6]))
        if(int(row[6]) >= 3):
            review_class.append(1)
        else:
            review_class.append(0)
        
        # summary.append(row[8])
        text.append([])
        review = re.findall('\w+', row[9])
        maxlen = max(maxlen, len(review))
        for w in review:
            if w not in vocabulary:
                vocabulary[w] = num_words
                num_words += 1
            text[-1].append(vocabulary[w])
        del review
        
maxfeat = len(vocabulary)        
print("Number of samples:", len(text))
# print(text[1])
print("maximum length review:", maxlen)
print("vocabulary size:", maxfeat)

Number of samples: 568454
maximum length review: 3529
vocabulary size: 159169


In [2]:
# tokenization
reviews = []
review_class = []
score = []
head_line = True
with open("Reviews.csv", 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        if head_line:
            head_line = False
            continue
        # data.append(row)
        # score.append(row[6])
        if(int(row[6]) >= 3):
            review_class.append(1)
        else:
            review_class.append(0)
            
        reviews.append(row[9])
        score.append(int(row[6]))
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
print(sequences[1])

[38, 376, 2210, 23, 5763, 1961, 1083, 1, 1083, 78, 252, 193, 1054, 3613, 18, 209, 36, 9, 21, 71, 3190, 31, 36, 1, 1593, 2218, 5, 7946, 1, 38, 23, 5763]


In [8]:
print('vocabulary size:',len(tokenizer.word_index))

vocabulary size: 133039


In [8]:
maxlen = 200
sequences = sequence.pad_sequences(sequences, maxlen=maxlen)
text_train, text_test, class_train, class_test = train_test_split(sequences, review_class, test_size=0.20)
print("Train Set:", len(text_train))
print("Test Set:", len(text_test))
print(text_train[0])


Train Set: 454763
Test Set: 113691
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    2  125   13  106   46    7 7700  265  475
   32    1  249 5422    5  101    9  164   57    2   21 1311  602   74
   32  340 3119    4   46    2 1065    9   56   30    4  708  293  164
   44   18   10   10  435    6 3146    1 2379 3607   16  999 8754   13
 1496  288  266   29    3 1527 8140    4 3654  105   15   32    4   69
   48 1615   87    6   21   25 1527    2  185  212   13 1025  483  105
   10   10  435    9   40  196   17    1   42    7    4    2   56  136
  146    9  164   11  233  137 1300    4  203  105    7   40   12    1
  312 1332    6 1426    5 1120    9   40    8  120   11 8248  135    3
   14  849  203    3  453   10   10  677    9   40    8  136  430   32
   13  249 5422   58  261   65   89   16  

Model

In [9]:
def pos_neg_model(case):
    m = Sequential
    m = Sequential()
    m.add(Embedding(max_words, 32))
    m.add(LSTM(32))
    
    if case == 1:  # sigmoid output
        m.add(Dense(1, activation='sigmoid'))
        m.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    else:
        m.add(Dense(2, activation='softmax'))
        m.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
    m.summary()
    return m

def rating_model():
    m = Sequential
    m = Sequential()
    m.add(Embedding(max_words, 32))
    m.add(LSTM(32))
    m.add(Dense(5, activation='softmax'))
    m.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
    m.summary()
    return m

In [46]:
# filepath="weights-{epoch:02d}-{val_acc:.2f}.hdf5"
model_type = 2
model = pos_neg_model(model_type)

filepath="weights-best.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

if model_type == 1:
    history = model.fit(text_train, class_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=callbacks_list)
else:
    # one hot encode classes
    print("One Hot Encode classes")
    classes = []
    for c in class_train:
        temp = [0, 0]
        temp[c] = 1
        classes.append(temp)
    print("Start Training")
    classes = np.array(classes)
    history = model.fit(text_train, classes, epochs=10, batch_size=128, validation_split=0.33, callbacks=callbacks_list)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 66        
Total params: 328,386
Trainable params: 328,386
Non-trainable params: 0
_________________________________________________________________
One Hot Encode classes
Start Training
Train on 304691 samples, validate on 150072 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.91637, saving model to weights-best.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.91637 to 0.92867, saving model to weights-best.h5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.92867
Epoch 4/10

Epoch 00004: val_acc improved from 0.9286

In [47]:
model.save_weights("weights-best.h5")

In [48]:
model.load_weights("weights-best.h5")

In [57]:
if model_type == 1:
    scores = model.evaluate(text_test, class_test)
else:
    print("One Hot Encode classes")
    classes = []
    for c in class_test:
        temp = [0, 0]
        temp[c] = 1
        classes.append(temp)
    classes = np.array(classes)
    scores = model.evaluate(text_test, classes)

print(scores)

One Hot Encode classes
[0.14725378104959091, 0.9409715808691633]


In [50]:
pred = model.predict(text_test)

In [55]:
def precision_recall(predicted, actual):
    precision = 0
    recall = 0
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for p, a in zip(predicted, actual):
        p = round(p)
        
        if a==1 and p==1:
            tp += 1
        elif a==1 and p==0:
            fn += 1
        elif a ==0 and p==1:
            fp += 1
        else:
            tn += 1
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    return precision, recall, accuracy

print(len(pred))
if model_type == 1:
    preds_classes = np.argmax(pred, axis=-1)
    results = precision_recall(pred_classes, class_test)
else:
    print("One Hot Encode classes")
    classes = []
    for c in class_test:
        temp = [0, 0]
        temp[c] = 1
        classes.append(temp)
    preds_classes = np.argmax(pred, axis=-1)
    results = precision_recall(preds_classes, class_test)

print("Precision:", results[0])
print("Recall:", results[1])
print("Accuracy:", results[2])

113691
One Hot Encode classes
Precision: 0.9491320305525246
Recall: 0.9837248753405644
Accuracy: 0.9409715808639206


**RESULTS** for LSTM with sigmoid activation in last layer  
Precision: 0.9709278222295925  
Recall: 0.9631316506451447  
Accuracy: 0.9437862275817787  

**RESULTS** for LSTM with softmax last layer  
Precision: 0.9491320305525246  
Recall: 0.9837248753405644  
Accuracy: 0.9409715808639206  

In [11]:
maxlen = 500
sequences = sequence.pad_sequences(sequences, maxlen=maxlen)
text_train, text_test, rating_train, rating_test = train_test_split(sequences, score, test_size=0.20)
print("Train Set:", len(text_train))
print("Test Set:", len(text_test))
print(text_train[0])


Train Set: 454763
Test Set: 113691
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0  

In [None]:
model = rating_model()

filepath="rating-weights-best.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# one hot encode classes
print("One Hot Encode classes")
classes = []
for c in rating_train:
    temp = [0]*5
    temp[c-1] = 1
    classes.append(temp)
print("Start Training")
classes = np.array(classes)
history = model.fit(text_train, classes, epochs=10, batch_size=128, validation_split=0.33, callbacks=callbacks_list)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 165       
Total params: 328,485
Trainable params: 328,485
Non-trainable params: 0
_________________________________________________________________
One Hot Encode classes
Start Training
Train on 304691 samples, validate on 150072 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.68689, saving model to rating-weights-best.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.68689 to 0.72317, saving model to rating-weights-best.h5
Epoch 3/10

Epoch 00003: val_acc improved from 0.72317 to 0.74523, saving model to rating-weights-bes