In [7]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM, GRU
from keras.datasets import imdb
import pandas as pd
import spacy 
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.layers import Dense, Activation, Dropout, Conv1D, Flatten, MaxPooling1D, AveragePooling1D, Reshape, SimpleRNN, \
TimeDistributed, Bidirectional, BatchNormalization
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
import keras
from sklearn.model_selection import train_test_split
import keras.backend as K
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import string
import re

In [2]:
class LRScheduler(Callback):

    def __init__(self, factor=0.6, loss_trigger=0.11, min_lr=1e-5):
        super(LRScheduler, self).__init__()

        self.factor = factor
        self.loss_trigger = loss_trigger
        self.min_lr = min_lr
        
        self.best = np.Inf

    def on_train_begin(self, logs=None):
        self.best = np.Inf
        
    def _reduce_lr(self):
        old_lr = float(K.get_value(self.model.optimizer.lr))
        if old_lr > self.min_lr:
            new_lr = old_lr * self.factor
            new_lr = max(new_lr, self.min_lr)
            K.set_value(self.model.optimizer.lr, new_lr)
            print(' $ LRScheduler reducing learning rate to %s.' % (new_lr))

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get("val_loss")
        
        if not np.less(current, self.best) or np.less(current, self.loss_trigger):
            self._reduce_lr()
        
        if np.less(current, self.best):
            self.best = current
            
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True
    
def load_glove_embeddings(fp, embedding_dim, include_empty_char=True):
    """
    Loads pre-trained word embeddings (GloVe embeddings)
        Inputs: - fp: filepath of pre-trained glove embeddings
                - embedding_dim: dimension of each vector embedding
                - generate_matrix: whether to generate an embedding matrix
        Outputs:
                - word2coefs: Dictionary. Word to its corresponding coefficients
                - word2index: Dictionary. Word to word-index
                - embedding_matrix: Embedding matrix for Keras Embedding layer
    """
    # First, build the "word2coefs" and "word2index"
    word2coefs = {} # word to its corresponding coefficients
    word2index = {} # word to word-index
    with open(fp) as f:
        for idx, line in enumerate(f):
            try:
                data = [x.strip().lower() for x in line.split()]
                word = data[0]
                coefs = np.asarray(data[1:embedding_dim+1], dtype='float32')
                word2coefs[word] = coefs
                if word not in word2index:
                    word2index[word] = len(word2index)
            except Exception as e:
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
        # End of for loop.
    # End of with open
    if include_empty_char:
        word2index[''] = len(word2index)
    # Second, build the "embedding_matrix"
    # Words not found in embedding index will be all-zeros. Hence, the "+1".
    vocab_size = len(word2coefs)+1 if include_empty_char else len(word2coefs)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        if embedding_vec is not None and embedding_vec.shape[0]==embedding_dim:
            embedding_matrix[idx] = np.asarray(embedding_vec)
    # return word2coefs, word2index, embedding_matrix
    return word2index, np.asarray(embedding_matrix)

def custom_tokenize(docs):
    output_matrix = []
    for d in docs:
        indices = []
        for w in d.split():
            try:
                indices.append(word2index[re.sub(r'[^\w\s]','',w).lower()])
            except:
                pass
        output_matrix.append(indices)
    return output_matrix

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Data Preprocessing

In [3]:
data = pd.read_csv("reviews.csv")
data=data[['Review Text','Rating']]
data = data[data['Review Text'].isnull()==False]
data = data.reset_index()
Eindex = list(filter(lambda x:isEnglish(data['Review Text'][x]),range(data.shape[0])))
data = data.iloc[Eindex,:]
X_train, X_test, y_train, y_test = train_test_split(data['Review Text'], data['Rating'], random_state=1)

In [4]:
data['Rating'].value_counts()

5    231919
4     46797
1     27369
3     23279
2      9293
Name: Rating, dtype: int64

In [5]:
word2index, embedding_matrix = load_glove_embeddings('glove.6B.200d.txt', embedding_dim=200)
X_train_oh = custom_tokenize(X_train)
X_test_oh = custom_tokenize(X_test)
maxlen = 40
X_train_oh = pad_sequences(X_train_oh, maxlen=maxlen, padding='post')
X_test_oh = pad_sequences(X_test_oh, maxlen=maxlen, padding='post')

# "Bag of words" random forest

In [9]:
vect = CountVectorizer(stop_words = 'english',min_df=20,token_pattern='[a-z]+')
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [10]:
rf = RandomForestRegressor()
rf.fit(X_train_dtm, y_train)
y_pred_class = rf.predict(X_test_dtm)

In [11]:
print('RMSE: ', metrics.mean_squared_error(y_test,y_pred_class))
print('AMPE: ', mean_absolute_percentage_error(y_test, y_pred_class))

RMSE:  0.948403134168
AMPE:  25.8583564754


# Text RNN

In [24]:
model1 = Sequential()
model1.add(Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1], 
                            input_length=maxlen,
                            weights=[embedding_matrix], 
                            trainable=False, 
                            name='embedding_layer'))
model1.add(LSTM(128, dropout=0.2,recurrent_dropout=0.2,return_sequences=True))
model1.add(Flatten())
model1.add(Dropout(0.3))
model1.add(Dense(1, activation='relu'))

In [34]:
early_stop = EarlyStopping(monitor="val_loss", min_delta=0, patience=10, verbose=1)
reduce_lr = LRScheduler(factor=0.7, min_lr=5e-5)
adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,decay=0.01)
model1.compile(loss='mean_squared_error', optimizer=adam, metrics=['mape'])
model1.fit(X_train_oh, y_train, batch_size=200, epochs=50,validation_data = (X_test_oh,y_test),callbacks=[early_stop,reduce_lr])

Train on 253992 samples, validate on 84665 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4fd02eaf98>