# Predict the food rating based on customer reviews (LSTM)

In [5]:
# Import the necessary libraries, modules
import pandas as pd # Pandas library for reading '.csv' files as dataframes
import numpy as np  # Numpy library for creating and modifying arrays.
from keras.layers import Dense, SimpleRNN, GRU, LSTM, Embedding # Import layers from Keras
from keras.models import Sequential

In [6]:
# Load the training and test sets; both files are read with latin-1 encoding
raw_data = pd.read_csv('train.csv', encoding='latin-1')
raw_test_data = pd.read_csv('test.csv', encoding='latin-1')

# Report dimensions and column names, first for the training frame, then for the test frame
for frame in (raw_data, raw_test_data):
    print(frame.shape)
    print(frame.columns)
print('\n')
raw_data.head(5)  # Preview the first few training records

(100000, 2)
Index(['Rating', 'Review'], dtype='object')
(50000, 2)
Index(['Rating', 'Review'], dtype='object')




Unnamed: 0,Rating,Review
0,3,Not sure why there are such bad reviews for th...
1,5,This is Jersey Boys as in Frankie Valli and th...
2,1,I am curious know of how much they have paid f...
3,3,Wynn oh how I want to love you so... with sple...
4,2,"I took my kid in for wash/deep cond, she has V..."


In [7]:
# Class-balance check: the distinct rating values and how many reviews carry each one
classes = np.unique(raw_data['Rating'], return_counts=True)  # (values, counts) tuple
rating_values, rating_counts = classes
print(rating_values)  # Distinct ratings, in sorted order
print(rating_counts)  # Number of reviews per rating, aligned with the line above

[1 2 3 4 5]
[20194 20106 20013 19969 19718]


In [8]:
max_num_words = 10000 # Vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding input_dim
seq_len = 200 # Fixed number of token ids per review; used as maxlen for pad_sequences and as the Embedding input_length
embedding_size = 64 # Length of the vector each token id is embedded into (Embedding output_dim)

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training reviews only; the same fitted tokenizer encodes the test set
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(raw_data['Review'])

# Encode each review as a list of token ids, then pad (with 0s) to exactly seq_len entries per review
x_train = pad_sequences(tokenizer.texts_to_sequences(raw_data['Review']), maxlen=seq_len)
x_test = pad_sequences(tokenizer.texts_to_sequences(raw_test_data['Review']), maxlen=seq_len)

x_train.shape, x_test.shape  # Both should be (number of reviews, seq_len)

((100000, 200), (50000, 200))

In [12]:
# Distinct ratings in order of first appearance in the training data.
# The position of a rating in this list is the class index the network is trained on.
unique_labels = list(pd.unique(raw_data['Rating']))
print(unique_labels)

[3, 5, 1, 2, 4]


In [13]:
from keras.utils import to_categorical # This convers the labels to one-hot vectors(Dummies)

# Encode every rating as its position in `unique_labels`, then one-hot encode that index.
# `num_classes` is pinned to the number of known labels so the train and test matrices always
# have the same width; without it `to_categorical` infers the width from the largest index
# present, which would yield a narrower `y_test` if the test set lacks the last class.
num_classes = len(unique_labels)

y_train = np.array([unique_labels.index(i) for i in raw_data.Rating])  # Rating -> class index
y_train = to_categorical(y_train, num_classes=num_classes)  # Class index -> one-hot row
y_test = np.array([unique_labels.index(i) for i in raw_test_data.Rating])
y_test = to_categorical(y_test, num_classes=num_classes)

In [36]:
import keras.backend as K # This 'K' can be used to create user defined functions in keras #backend-tensorflow function
# Define a custom function in keras to compute recall.
# Arguments:
# y_true - Actual labels
# y_pred - Predicted labels
def recall(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    Arguments:
        y_true: actual labels.
        y_pred: predicted scores.

    Returns:
        The count of rounded `y_true * y_pred` hits divided by the count of
        rounded `y_true` positives; a backend epsilon is added to the
        denominator so an input without positives does not divide by zero.
    """
    # An entry counts as a hit when the (clipped) product of truth and prediction rounds to 1
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    # Every entry whose (clipped) true label rounds to 1 is a possible positive
    positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(positives) + K.epsilon())

In [14]:
# Build the LSTM classifier as a single stack of layers
model = Sequential([
    # Represent each of the max_num_words token ids as a trainable vector of length embedding_size
    Embedding(input_dim=max_num_words, output_dim=embedding_size, input_length=seq_len),
    # First LSTM emits its output at every timestep so the next LSTM receives a full sequence
    LSTM(10, return_sequences=True),
    # Second LSTM returns only its final output: one vector per review
    LSTM(10, return_sequences=False),
    # Output layer: softmax over 5 units, one per rating class
    Dense(5, activation='softmax'),
])

In [15]:
# Print each layer's output shape and parameter count to sanity-check the architecture
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 64)           640000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 10)           3000      
_________________________________________________________________
lstm_2 (LSTM)                (None, 10)                840       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 55        
Total params: 643,895
Trainable params: 643,895
Non-trainable params: 0
_________________________________________________________________


In [16]:
from keras.optimizers import Adam
# The learning rate is passed positionally on purpose: the keyword was renamed from `lr` to
# `learning_rate` in newer Keras releases (where `lr` is no longer accepted), but it is the
# first constructor argument under either name, so this form works across versions.
adam = Adam(0.001)

In [17]:
# Configure training: the Adam optimizer defined earlier, cross-entropy loss for the one-hot
# multi-class targets, and accuracy as the metric reported during training.
compile_kwargs = {
    'optimizer': adam,
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_kwargs)

# Train for a single epoch, holding out 25% of the training rows for validation
model.fit(x=x_train, y=y_train, validation_split=0.25, epochs=1)

Train on 75000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0xeb392f07b8>

In [18]:
# Score the padded test sequences with the trained model: one row per review, one softmax score per class
test_prob = model.predict(x_test)
test_prob.shape

(50000, 5)

In [16]:
# Inspect the class scores of the first five test reviews.
# Column k corresponds to the rating stored at unique_labels[k], not to rating k.
test_prob[:5]

array([[0.05237984, 0.0479962 , 0.7144758 , 0.15819609, 0.02695202],
       [0.01390082, 0.0046335 , 0.89898443, 0.07946791, 0.00301333],
       [0.24441177, 0.07773126, 0.20653117, 0.34573495, 0.12559089],
       [0.06733017, 0.01398574, 0.63380396, 0.2731845 , 0.01169559],
       [0.03034748, 0.00644709, 0.7924411 , 0.16599083, 0.00477352]],
      dtype=float32)

In [19]:
# Pick the most probable class index for every test review.
# `Sequential.predict_classes` has been removed from newer Keras/TensorFlow releases; taking the
# argmax over the softmax scores yields the same indices and works on old and new versions.
test_classes = np.argmax(model.predict(x_test), axis=-1)
test_classes.shape

(50000,)

In [20]:
# The network predicts class *indices* (positions in `unique_labels`), not ratings.
# `unique_labels` is in order of first appearance, so the indices must be translated back
# to the actual rating values before they are stored in the 'Rating' column.
raw_test_data['Rating'] = np.asarray(unique_labels)[test_classes]

In [22]:
# Save the test DataFrame, whose 'Rating' column now holds the model's predictions, to disk.
# NOTE(review): to_csv is called without index=False, so the row index is presumably written
# as an extra leading column — confirm that this is the expected output format.
raw_test_data.to_csv('testresults.csv')