In [1]:
import numpy as np
import re, sys
import itertools
from collections import Counter
import pandas as pd
import tensorflow as tf
import os
import pickle
import gensim
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, CSVLogger
from keras.layers.merge import Concatenate
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import load_model
from sklearn.metrics import precision_recall_fscore_support
np.random.seed(0)

Using TensorFlow backend.


# Loading Text reviews data

In [2]:
x_text_review = np.load('text_review.npy')
ratings = np.load('ratings.npy')
vocabulary_inv_text_review = np.load('text_review_vocabulary_inv.npy')
with open('text_review_vocabulary.pkl', 'rb') as f:
    vocabulary_text_review = pickle.load(f)
vocabulary_inv_text_review = {rank: word for rank, word in enumerate(vocabulary_inv_text_review)}

# Loading Generated summary data

In [3]:
x_gen_summary = np.load('predicted_summary_x.npy')
vocabulary_inv_gen_summary = np.load('predicted_summary_vocabulary_inv.npy')
with open('predicted_summary_vocabulary.pkl', 'rb') as f:
    vocabulary_gen_summary = pickle.load(f)
vocabulary_inv_gen_summary = {rank: word for rank, word in enumerate(vocabulary_inv_gen_summary)}

# Splitting into Train and Test sets

In [4]:
train_indices = np.load('train_indices.npy')
test_indices = np.load('test_indices.npy')

In [5]:
x_train_text_review = x_text_review[train_indices]
x_test_text_review = x_text_review[test_indices]

In [6]:
x_train_gen_summary = x_gen_summary[train_indices]
x_test_gen_summary = x_gen_summary[test_indices]

In [7]:
train_ratings = ratings[train_indices]
test_ratings = ratings[test_indices]

In [8]:
sequence_length_text_review = x_test_text_review.shape[1]
sequence_length_gen_summary = x_test_gen_summary.shape[1]

One Hot encoding on labels (ratings)

In [9]:
def convert_to_onehot(y):
    res = [[0 for j in range(5)] for i in y]
    
    for index,i in enumerate(y):
        res[index][i-1] = 1
    return np.array(res)

In [10]:
train_ratings = convert_to_onehot(train_ratings)
test_ratings = convert_to_onehot(test_ratings)

In [34]:
print("x_train_text_review shape:", x_train_text_review.shape)
print("x_train_gen_summary shape:", x_train_gen_summary.shape)
print("train_ratings shape:", train_ratings.shape)
print("x_test_text_review shape:", x_test_text_review.shape)
print("x_test_gen_summary shape:", x_test_gen_summary.shape)
print("test_ratings shape:", test_ratings.shape)
print("Vocabulary Size Text reviews: {:d}".format(len(vocabulary_inv_text_review)))
print("Vocabulary Size Generated Summary: {:d}".format(len(vocabulary_inv_gen_summary)))

x_train_text_review shape: (454741, 3768)
x_train_text_review shape: (454741, 7)
train_ratings shape: (454741, 5)
x_test_text_review shape: (113686, 3768)
x_train_text_review shape: (113686, 7)
test_ratings shape: (113686, 5)
Vocabulary Size Text reviews: 127686
Vocabulary Size Generated Summary: 537


# Loading emebdding vectors of pre-trained GoogleNews model

In [12]:
embedding_dim = 300
filter_sizes = (3,4,5)
num_filters = 50
dropout_prob = (0.5, 0.5)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 40
input_shape = (sequence_length_text_review,)

In [11]:
embedding_weights = {}
pretrained_fpath = os.path.expanduser("GoogleNews-vectors-negative300.bin")
model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_fpath, binary=True)
found_cnt = 0
for id, word in vocabulary_inv_text_review.items():
    if word in model.vocab:
        embedding_weights[id] = model.word_vec(word)
        found_cnt += 1
    else:
        embedding_weights[id] = np.random.uniform(-0.25, 0.25, embedding_dim)
with open('gensim_embedding_weights', 'wb') as f:
    pickle.dump(embedding_weights, f)

# Text Review Model Training

In [17]:
model_input = Input(shape=input_shape)

z = Embedding(len(vocabulary_inv_text_review), embedding_dim, input_length=sequence_length_text_review, name="embedding",trainable=True)(model_input)

z = Dropout(dropout_prob[0])(z)

conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(5, activation="sigmoid")(z)

model = Model(model_input, model_output)


In [18]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy",keras.metrics.Precision(),keras.metrics.Recall()])

In [19]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 3768)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3768, 300)    38305800    input_3[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 3768, 300)    0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 3766, 50)     45050       dropout_1[0][0]                  
____________________________________________________________________________________________

In [15]:
weights = np.array([v for v in embedding_weights.values()])
embedding_layer = model.get_layer("embedding")
embedding_layer.set_weights([weights])

In [17]:
callbacks = [
    ModelCheckpoint(filepath='models/cnn_text_review.hdf5', verbose=1, save_best_only=True, save_weights_only=True),
    
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1),
    
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),

    CSVLogger('./01-metrics.csv')]

In [18]:
# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=1,callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 454741 samples, validate on 113686 samples
Epoch 1/40

Epoch 00001: val_loss improved from inf to 0.23487, saving model to cnn_base_pretrained_1.hdf5
Epoch 2/40

Epoch 00002: val_loss improved from 0.23487 to 0.21643, saving model to cnn_base_pretrained_1.hdf5
Epoch 3/40

Epoch 00003: val_loss improved from 0.21643 to 0.21190, saving model to cnn_base_pretrained_1.hdf5
Epoch 4/40

Epoch 00004: val_loss improved from 0.21190 to 0.20251, saving model to cnn_base_pretrained_1.hdf5
Epoch 5/40
 81600/454741 [====>.........................] - ETA: 19:55 - loss: 0.1731 - accuracy: 0.9305 - precision_1: 0.8716 - recall_1: 0.7656

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00007: val_loss improved from 0.19591 to 0.19384, saving model to cnn_base_pretrained_1.hdf5
Epoch 8/40
 45376/454741 [=>............................] - ETA: 21:49 - loss: 0.1518 - accuracy: 0.9398 - precision_1: 0.8879 - recall_1: 0.7999

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00011: val_loss did not improve from 0.19275
Epoch 12/40

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Epoch 00014: val_loss did not improve from 0.19275
Epoch 00014: early stopping


<keras.callbacks.callbacks.History at 0x7ef3d429fbd0>

# Text Review Evaluation

In [20]:
model.load_weights('models/cnn_text_review.hdf5')

In [21]:
train_predictions = model.predict(x_train_text_review)

In [22]:
test_predictions = model.predict(x_test_text_review)

In [23]:
def calc_mean_prediction_error(predicted_classes,labels):
    error = 0
    for index,i in enumerate(predicted_classes):
        error += abs(labels[index]-i)
    
    return error/float(len(predicted_classes))   

In [24]:
def calc_average(predicted_classes,labels):
    acc = 0
    for index,i in enumerate(predicted_classes):
        if(labels[index]==i):
            acc += 1
    
    return acc/float(len(predicted_classes))

In [26]:
mpe = calc_mean_prediction_error(np.argmax(test_predictions,axis=1),np.argmax(test_ratings,axis=1))
accuracy = calc_average(np.argmax(test_predictions,axis=1),np.argmax(test_ratings,axis=1))
precision_recall = precision_recall_fscore_support(np.argmax(test_ratings,axis=1), np.argmax(test_predictions,axis=1), average='macro')

In [27]:
print("Accuracy for model based on Text reviews:",accuracy)
print("Mean Prediction Error for model based on Text reviews:",mpe)
print("Precision for model based on Text reviews:",precision_recall[0])
print("Recall for model based on Text reviews:",precision_recall[1])

Accuracy for model based on Text reviews: 0.8093784634871488
Mean Prediction Error for model based on Text reviews: 0.2774220220607639
Precision for model based on Text reviews: 0.6814158475214385
Recall for model based on Text reviews: 0.6466022289151423


# Model Training based on Generated summary

In [None]:
embedding_weights = {}
pretrained_fpath = os.path.expanduser("GoogleNews-vectors-negative300.bin")
model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_fpath, binary=True)
found_cnt = 0
for id, word in vocabulary_inv_gen_summary.items():
    if word in model.vocab:
        embedding_weights[id] = model.word_vec(word)
        found_cnt += 1
    else:
        embedding_weights[id] = np.random.uniform(-0.25, 0.25, embedding_dim)
with open('gensim_embedding_weights', 'wb') as f:
    pickle.dump(embedding_weights, f)

In [28]:
input_shape = (sequence_length_gen_summary,)

In [29]:
model_input = Input(shape=input_shape)

z = Embedding(len(vocabulary_inv_gen_summary), embedding_dim, input_length=sequence_length_gen_summary, name="embedding",trainable=True)(model_input)

z = Dropout(dropout_prob[0])(z)

conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(5, activation="sigmoid")(z)

model = Model(model_input, model_output)


In [30]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy",keras.metrics.Precision(),keras.metrics.Recall()])

In [None]:
weights = np.array([v for v in embedding_weights.values()])
embedding_layer = model.get_layer("embedding")
embedding_layer.set_weights([weights])

In [31]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 7)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 7, 300)       161100      input_4[0][0]                    
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 7, 300)       0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 5, 50)        45050       dropout_3[0][0]                  
____________________________________________________________________________________________

In [32]:
callbacks = [
    ModelCheckpoint(filepath='models/cnn_gen_summary.hdf5', verbose=1, save_best_only=True, save_weights_only=True),
    
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1),
    
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),

    CSVLogger('./01-metrics.csv')]

In [None]:
# Train the model
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=1,callbacks=callbacks)

# Model Evaluation - Generated Summary

In [33]:
model.load_weights('models/cnn_gen_summary.hdf5')

In [35]:
train_predictions = model.predict(x_train_gen_summary)
test_predictions = model.predict(x_test_gen_summary)

In [36]:
mpe = calc_mean_prediction_error(np.argmax(test_predictions,axis=1),np.argmax(test_ratings,axis=1))
accuracy = calc_average(np.argmax(test_predictions,axis=1),np.argmax(test_ratings,axis=1))
precision_recall = precision_recall_fscore_support(np.argmax(test_ratings,axis=1), np.argmax(test_predictions,axis=1), average='macro')

  'precision', 'predicted', average, warn_for)


In [37]:
print("Accuracy for model based on Generated Summary:",accuracy)
print("Mean Prediction Error for model based on Generated Summary:",mpe)
print("Precision for model based on Generated Summary:",precision_recall[0])
print("Recall for model based on Generated Summary:",precision_recall[1])

Accuracy for model based on Text reviews: 0.6831448023503334
Mean Prediction Error for model based on Text reviews: 0.5998363914642084
Precision for model based on Text reviews: 0.37091235139847417
Recall for model based on Text reviews: 0.3536162549654081
