In [18]:
import numpy as np
import re, sys
import itertools
from collections import Counter
import pandas as pd
import os
import pickle
import gensim
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import load_model
from sklearn.metrics import precision_recall_fscore_support
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, CSVLogger
# Seed NumPy's global RNG.
# NOTE(review): the deep-learning backend's own RNG is not seeded explicitly
# anywhere in this notebook - confirm whether training runs are reproducible.
np.random.seed(0)

# Loading and pre-processing data

In [2]:
# Encoded reviews and their ratings (presumably written by an earlier
# preprocessing step that is not part of this notebook).
x_text_review = np.load('text_review.npy')
ratings = np.load('ratings.npy')

# index -> word lookup, built straight from the saved word list.
vocabulary_inv_text_review = dict(enumerate(np.load('text_review_vocabulary_inv.npy')))

# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('text_review_vocabulary.pkl', 'rb') as vocab_file:
    vocabulary_text_review = pickle.load(vocab_file)

In [3]:
# Saved index arrays that select the train / test rows of the full dataset.
train_indices, test_indices = (
    np.load(path) for path in ('train_indices.npy', 'test_indices.npy')
)

In [4]:
# Slice the encoded reviews into train and test partitions.
x_train_text_review, x_test_text_review = (
    x_text_review[rows] for rows in (train_indices, test_indices)
)

In [5]:
# Slice the ratings with the same index arrays used for the inputs.
train_ratings, test_ratings = (
    ratings[rows] for rows in (train_indices, test_indices)
)

In [6]:
# Second axis of the input matrix; used below as the model's input length.
sequence_length_text_review = x_test_text_review.shape[1]

In [7]:
def convert_to_twoclass(y, threshold=3):
    """Map ratings to binary sentiment labels.

    Parameters
    ----------
    y : array-like of numbers
        Ratings to convert.
    threshold : number, optional
        Largest rating that is still labelled negative (default 3).

    Returns
    -------
    numpy.ndarray
        Integer array shaped like ``y``: 0 where ``rating <= threshold``,
        1 otherwise.
    """
    # Vectorised comparison instead of a per-element Python loop. np.where with
    # integer literals also yields an integer dtype for empty input.
    return np.where(np.asarray(y) <= threshold, 0, 1)

Create a binary label for each review: positive (1) when the rating is greater than 3, negative (0) when it is less than or equal to 3.

In [8]:
# Derive the labels from the untouched `ratings` array rather than from
# `train_ratings` / `test_ratings`: those names are later rebound to one-hot
# matrices, so this cell stays correct when re-run out of order.
y_pos_neg_train = convert_to_twoclass(ratings[train_indices])
y_pos_neg_test = convert_to_twoclass(ratings[test_indices])

In [9]:
def convert_to_onehot(y, num_classes=5):
    """One-hot encode 1-based integer ratings.

    Parameters
    ----------
    y : 1-D array-like of int
        Ratings in the range ``1..num_classes``.
    num_classes : int, optional
        Number of rating levels, i.e. the width of the result (default 5).

    Returns
    -------
    numpy.ndarray
        Integer matrix of shape ``(len(y), num_classes)`` where row ``k`` has a
        single 1 in column ``y[k] - 1``.

    Raises
    ------
    TypeError
        If the ratings are not integers.
    ValueError
        If ``y`` is not one-dimensional or a rating lies outside
        ``1..num_classes``.
    """
    labels = np.asarray(y)
    if labels.ndim != 1:
        raise ValueError("expected a 1-D sequence of ratings, got shape %r" % (labels.shape,))

    onehot = np.zeros((labels.shape[0], num_classes), dtype=int)
    if labels.size == 0:
        return onehot

    if not np.issubdtype(labels.dtype, np.integer):
        raise TypeError("ratings must be integers, got dtype %s" % labels.dtype)
    # Reject out-of-range values explicitly: with plain indexing a rating of 0
    # would wrap around (index -1) and silently be encoded as the top class.
    if labels.min() < 1 or labels.max() > num_classes:
        raise ValueError("ratings must lie in 1..%d" % num_classes)

    onehot[np.arange(labels.shape[0]), labels - 1] = 1
    return onehot

In [10]:
# Encode from the untouched `ratings` array instead of feeding `train_ratings`
# back into itself, so that re-running this cell does not try to one-hot encode
# an already encoded matrix.
train_ratings = convert_to_onehot(ratings[train_indices])
test_ratings = convert_to_onehot(ratings[test_indices])

In [11]:
# Sanity-check the shapes of every array that feeds the models.
shape_report = [
    ("x_train_text_review shape:", x_train_text_review),
    ("train_ratings shape:", train_ratings),
    ("y_pos_neg_train shape:", y_pos_neg_train),
    ("x_test_text_review shape:", x_test_text_review),
    ("test_ratings shape:", test_ratings),
    ("y_pos_neg_test shape:", y_pos_neg_test),
]
for caption, array in shape_report:
    print(caption, array.shape)

print("Vocabulary Size: {:d}".format(len(vocabulary_inv_text_review)))

x_train_text_review shape: (454741, 3768)
train_ratings shape: (454741, 5)
y_pos_neg_train shape: (454741,)
x_test_text_review shape: (113686, 3768)
test_ratings shape: (113686, 5)
y_pos_neg_test shape: (113686,)
Vocabulary Size: 127686


# Model to predict whether a review is pos/neg

In [None]:
# Model hyper-parameters
# Width of each embedding vector (second argument of the Embedding layer).
embedding_dim = 300
# One convolution branch is built per kernel size listed here.
filter_sizes = (3,5)
# Number of filters in every convolution branch.
num_filters = 50
# Dropout rates: [0] right after the embedding, [1] after the branches are concatenated.
dropout_prob = (0.5, 0.5)
# Units in the fully connected hidden layer.
hidden_dims = 10

# Training parameters
batch_size = 64
num_epochs = 20
# One integer per sequence position.
input_shape = (sequence_length_text_review,)

In [32]:
# Pre-computed embedding vectors (the file name suggests they come from gensim).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('gensim_embedding_weights', 'rb') as weights_file:
    embedding_weights = pickle.load(weights_file)

In [34]:
# Binary (pos/neg) CNN: embedding -> parallel conv branches -> dense -> sigmoid.
model_input = Input(shape=input_shape)

# Frozen embedding (trainable=False); its weights are filled in by a later cell
# through set_weights on the layer named "embedding".
z = Embedding(len(vocabulary_inv_text_review), embedding_dim, input_length=sequence_length_text_review, name="embedding",trainable=False)(model_input)

z = Dropout(dropout_prob[0])(z)

# One Conv1D -> MaxPooling1D(2) -> Flatten branch per kernel size in filter_sizes.
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
# Concatenate needs at least two inputs, hence the single-branch fallback.
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
# Single sigmoid unit, trained against the 0/1 pos/neg labels.
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)


In [35]:
# Stack the stored vectors in the dict's iteration order.
# NOTE(review): this assumes that order matches the vocabulary indices used to
# encode the reviews - verify against the code that produced the pickle.
weights = np.array(list(embedding_weights.values()))

embedding_layer = model.get_layer("embedding")
embedding_layer.set_weights([weights])

In [36]:
# Binary cross-entropy matches the single sigmoid output of this model.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy", keras.metrics.Precision(), keras.metrics.Recall()],
)

In [37]:
# Make sure the checkpoint directory exists up front; otherwise the first
# checkpoint write can fail only after a full (long) epoch has been trained.
os.makedirs('models', exist_ok=True)

callbacks = [
    # Keep only the weights of the epoch with the best validation loss.
    ModelCheckpoint(filepath='models/cnn_pos_neg.hdf5', monitor='val_loss', verbose=1,
                    save_best_only=True, save_weights_only=True),

    # Divide the learning rate by 10 after 2 epochs without val_loss improvement.
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),

    # Stop training after 4 epochs without val_loss improvement.
    EarlyStopping(monitor='val_loss', patience=4, verbose=1),

    # Per-epoch metrics log.
    CSVLogger('./02-metrics.csv')]

In [38]:
# Print the layer-by-layer structure and parameter counts of the pos/neg model.
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 3768)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3768, 300)    38305800    input_5[0][0]                    
__________________________________________________________________________________________________
dropout_7 (Dropout)             (None, 3768, 300)    0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 3766, 50)     45050       dropout_7[0][0]                  
____________________________________________________________________________________________

In [None]:
# Train the model
model.fit(x_train_text_review, y_pos_neg_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test_text_review, y_pos_neg_test), verbose=1,callbacks=callbacks)

In [40]:
# Restore the weights saved by ModelCheckpoint (best validation loss) instead of
# keeping whatever the last training epoch left in the model.
model.load_weights('models/cnn_pos_neg.hdf5')

In [41]:
# Sigmoid outputs of the pos/neg model for every training review.
train_predictions = model.predict(x_train_text_review, batch_size=batch_size,verbose=1)



In [42]:
# Sigmoid outputs of the pos/neg model for every test review.
valid_predictions = model.predict(x_test_text_review, batch_size=batch_size,verbose=1)



In [43]:
# Persist the pos/neg scores so that they can be reloaded without re-running the
# first model.
np.save('train_predictions_pos_neg.npy',train_predictions)
np.save('valid_predictions_pos_neg.npy',valid_predictions)

In [44]:
# NOTE(review): up to this point these two names held the ground-truth 0/1 labels;
# from here on they hold the first model's saved sigmoid outputs, which the second
# model receives as its extra input. Re-running the first model's fit cell after
# this cell would therefore train it on predictions instead of labels.
y_pos_neg_train = np.load('train_predictions_pos_neg.npy')
y_pos_neg_test = np.load('valid_predictions_pos_neg.npy')

# Model that takes the text review and the predicted pos/neg score as inputs

In [None]:
# Hyper-parameters for the rating model (same values as for the pos/neg model above).
# Width of each embedding vector.
embedding_dim = 300
# One convolution branch is built per kernel size listed here.
filter_sizes = (3,5)
# Number of filters in every convolution branch.
num_filters = 50
# Dropout rates: [0] right after the embedding, [1] after the branches are concatenated.
dropout_prob = (0.5, 0.5)
# Units in the first fully connected hidden layer.
hidden_dims = 10

# Training parameters
batch_size = 64
num_epochs = 20
# One integer per sequence position.
input_shape = (sequence_length_text_review,)

In [45]:
# Rating CNN with two inputs: the encoded review text and one scalar per review
# (the pos/neg score).
model_input = Input(shape=input_shape)
model_input2 = Input(shape=(1,))
# NOTE(review): unlike the pos/neg model, this embedding is left trainable and no
# cell in this notebook loads the pre-trained vectors into it - confirm that
# training it from its default initialisation is intended.
z = Embedding(len(vocabulary_inv_text_review), embedding_dim, input_length=sequence_length_text_review, name="embedding")(model_input)

z = Dropout(dropout_prob[0])(z)

# One Conv1D -> MaxPooling1D(2) -> Flatten branch per kernel size in filter_sizes.
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
# Concatenate needs at least two inputs, hence the single-branch fallback.
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)

# Second hidden layer; the scalar input is appended to its output.
z = Dense(10, activation="relu")(z)
merged = Concatenate()([z,model_input2])
# NOTE(review): five independent sigmoid units are used for five mutually
# exclusive rating classes; a softmax output is the more usual choice - confirm.
model_output = Dense(5, activation="sigmoid")(merged)

model = Model([model_input,model_input2], model_output)


In [46]:
# Print the layer-by-layer structure and parameter counts of the rating model.
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 3768)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3768, 300)    38305800    input_6[0][0]                    
__________________________________________________________________________________________________
dropout_9 (Dropout)             (None, 3768, 300)    0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 3766, 50)     45050       dropout_9[0][0]                  
____________________________________________________________________________________________

In [47]:
# The targets are 5-way one-hot ratings. With a binary_crossentropy loss some
# Keras versions resolve the "accuracy" alias to *binary* accuracy, which scores
# each of the five output units separately and therefore looks inflated.
# "categorical_accuracy" (argmax match) is logged explicitly next to it; the
# original "accuracy" entry is kept so existing log columns do not change.
# NOTE(review): the loss itself is left unchanged - categorical_crossentropy with
# a softmax output would be the conventional setup for exclusive classes.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        "accuracy",
        "categorical_accuracy",
        keras.metrics.Precision(),
        keras.metrics.Recall(),
    ],
)

In [48]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, CSVLogger

In [49]:
# Make sure the checkpoint directory exists up front; otherwise the first
# checkpoint write can fail only after a full (long) epoch has been trained.
os.makedirs('models', exist_ok=True)

callbacks = [
    # Keep only the weights of the epoch with the best validation loss.
    ModelCheckpoint(filepath='models/cnn_text_review_modified_arch.hdf5', monitor='val_loss', verbose=1,
                    save_best_only=True, save_weights_only=True),

    # Divide the learning rate by 10 after 2 epochs without val_loss improvement.
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1),

    # Stop training after 4 epochs without val_loss improvement.
    EarlyStopping(monitor='val_loss', patience=4, verbose=1),

    # Per-epoch metrics log.
    CSVLogger('./03-metrics.csv')]

In [None]:
# Train the model
model.fit([x_train_text_review,y_pos_neg_train], train_ratings, batch_size=batch_size, epochs=num_epochs,
          validation_data=([x_test_text_review,y_pos_neg_test], test_ratings), verbose=1,callbacks=callbacks)

In [59]:
# Restore the weights saved by ModelCheckpoint (best validation loss).
model.load_weights('models/cnn_text_review_modified_arch.hdf5')

In [61]:
# Five scores per test review (one per rating class); argmax is taken below.
test_predictions = model.predict([x_test_text_review,y_pos_neg_test])

In [62]:
def calc_mean_prediction_error(predicted_classes, labels):
    """Mean absolute difference between predicted and true class values.

    Parameters
    ----------
    predicted_classes : array-like of numbers
        Predicted class values.
    labels : array-like of numbers
        True class values, same shape as ``predicted_classes``.

    Returns
    -------
    float
        ``mean(|labels - predicted_classes|)``.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    # Convert to float so that unsigned integer inputs cannot wrap around when
    # the difference is negative.
    predicted = np.asarray(predicted_classes, dtype=float)
    actual = np.asarray(labels, dtype=float)

    # A length mismatch would otherwise be silently truncated or surface as an
    # unrelated IndexError.
    if predicted.shape != actual.shape:
        raise ValueError(
            "shape mismatch: predictions %r vs labels %r" % (predicted.shape, actual.shape)
        )
    if predicted.size == 0:
        raise ValueError("cannot compute an error over zero predictions")

    return float(np.mean(np.abs(actual - predicted)))

In [63]:
def calc_average(predicted_classes, labels):
    """Fraction of predictions that equal their label (i.e. accuracy).

    Parameters
    ----------
    predicted_classes : array-like
        Predicted class values.
    labels : array-like
        True class values, same shape as ``predicted_classes``.

    Returns
    -------
    float
        Number of exact matches divided by the number of predictions.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    predicted = np.asarray(predicted_classes)
    actual = np.asarray(labels)

    # A length mismatch would otherwise be silently truncated or surface as an
    # unrelated IndexError.
    if predicted.shape != actual.shape:
        raise ValueError(
            "shape mismatch: predictions %r vs labels %r" % (predicted.shape, actual.shape)
        )
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy over zero predictions")

    return float(np.mean(predicted == actual))

In [64]:
# Collapse score / one-hot rows to class indices once and reuse them.
predicted_classes = np.argmax(test_predictions, axis=1)
true_classes = np.argmax(test_ratings, axis=1)

# Mean absolute distance between predicted and true class index.
mpe = calc_mean_prediction_error(predicted_classes, true_classes)
# Share of exact matches.
accuracy = calc_average(predicted_classes, true_classes)
# Macro average: every class contributes equally, whatever its frequency.
precision_recall = precision_recall_fscore_support(true_classes, predicted_classes, average='macro')

In [65]:
# Report the test-set metrics; precision_recall is (precision, recall, ...).
results_report = [
    ("Accuracy for model based on Text review and pos/neg prediction:", accuracy),
    ("Mean Prediction Error for model based on Text review and pos/neg prediction:", mpe),
    ("Precision for model based on Text review and pos/neg prediction:", precision_recall[0]),
    ("Recall for model based on Text review and pos/neg prediction:", precision_recall[1]),
]
for caption, value in results_report:
    print(caption, value)

Accuracy for model based on Text review and pos/neg prediction: 0.7599792410675017
Mean Prediction Error for model based on Text review and pos/neg prediction: 0.3613989409425963
Precision for model based on Text review and pos/neg prediction: 0.48835704290260573
Recall for model based on Text review and pos/neg prediction: 0.5194668078797433
