# Predict the food rating based on Customer review(1dcnn)

In [25]:
# Import the necessary libraries, modules
import pandas as pd # Pandas library for reading '.csv' files as dataframes
import numpy as np  # Numpy library for creating and modifying arrays.
import string
import re
from keras.layers import Dense, SimpleRNN, GRU, LSTM, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D # Import layers from Keras
from keras.models import Sequential

In [2]:
import os
os.getcwd()
os.chdir("F:\\Insofe\\Cute_DeepLearning\\PS2")

### Reading the data

In [3]:
raw_data = pd.read_csv('Train.csv', encoding='latin-1') # Read the data as a DataFrame using Pandas
raw_test_data = pd.read_csv('Test.csv', encoding='latin-1')

print(raw_data.shape) # Print the dimensions of train DataFrame
print(raw_data.columns) # Print the column names of the DataFrame
print('\n')
raw_data.head(5) # Print the top few records
print(raw_test_data.shape) # Print the dimensions of train DataFrame

(100000, 2)
Index(['Rating', 'Review'], dtype='object')


(50000, 2)


In [4]:
X = raw_data.Review
X.shape

(100000,)

In [5]:
X_test = raw_test_data.Review
X_test.shape

(50000,)

In [6]:
X[:2]

0    Not sure why there are such bad reviews for th...
1    This is Jersey Boys as in Frankie Valli and th...
Name: Review, dtype: object

In [7]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    output = re.sub(r'\d+', '', txt)
    p = re.compile("(\.|\!|\?)-'")
    txt = p.sub("", output)
    return txt

In [8]:
corpus = [clean_text(x) for x in X]
corpus[:4]

['not sure why there are such bad reviews for this location as far as starbucks go it is pretty average not especially bad they get points knocked off due to the small size and lack of sittinglounging space you can only walk up or drive thru at this location  but the drive thru is quick enough ive never had any issues with my order getting mixed up or attitude from the baristas i mostly visit in evenings or afternoon and it is never busy  ive also been during morning hours but not super early so i cant comment on early morning rushnnseems like this low rating is partially skewed from people who dont like starbucks coffee in general understood but i wish we could just filter out those reviews  after all we all know what starbucks is and isnt by now some of us need an accurate review of the location without coffee snobbery polluting the rating p nsee there  i included the smiley so my snobby coffee brethren cant take offense',
 'this is jersey boys as in frankie valli and the  seasons  i

In [9]:
test_corpus = [clean_text(x) for x in X_test]

In [10]:
test_corpus[:2]

['i got new tires from them and within two weeks got a flat i took my car to a local mechanic to see if i could get the hole patched but they said the reason i had a flat was because the previous patch had blown  wait what i just got the tire and never needed to have it patched this was supposed to be a new tire ni took the tire over to flynns and they told me that someone punctured my tire then tried to patch it so there are resentful tire slashers i find that very unlikely after arguing with the guy and telling him that his logic was far fetched he said hed give me a new tire this time ni will never go back to flynns bc of the way this guy treated me and the simple fact that they gave me a used tire',
 'dont waste your time  we had two different people come to our house to give us estimates for a deck one of them the owner  both times we never heard from them  not a call not the estimate nothing']

In [11]:
# Summarize number of words
print("Number of words: ")
print(len(np.unique(np.hstack(corpus))))

Number of words: 
99991


In [12]:
# Summarize review length
import matplotlib.pyplot as plt
print("Review length: ")
result = [len(x) for x in X]
print("Mean %.2f words (%f)" % (np.mean(result), np.std(result)))
# plot review length
plt.boxplot(result)
plt.show()

Review length: 
Mean 735.48 words (666.675728)


<Figure size 640x480 with 1 Axes>

### Check the labels and their frequencies

In [13]:
# Print the unique classes and their counts/frequencies
classes = np.unique(raw_data['Rating'], return_counts=True) # np.unique returns a tuple with class names and counts
print(classes[0]) #Print the list of unique classes
print(classes[1]) #Print the list of frequencies of the above classes

[1 2 3 4 5]
[20194 20106 20013 19969 19718]


In [14]:
pd.value_counts(raw_data['Rating'])

1    20194
2    20106
3    20013
4    19969
5    19718
Name: Rating, dtype: int64

### Converting unstructured text to structured numeric form
This includes:
1. Tokenizing
2. Converting sequence of words to sequence of word indeces
3. Converting varing length sequences to fixed length sequences through padding

In [15]:
max_num_words = 20000
seq_len = 400
embedding_size = 32

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_num_words) #Tokenizer is used to tokenize text
tokenizer.fit_on_texts(corpus) #Fit this to our corpus

x_train = tokenizer.texts_to_sequences(corpus) #'text to sequences converts the text to a list of indices
x_train = pad_sequences(x_train, maxlen=seq_len) #pad_sequences makes every sequence a fixed size list by padding with 0s 

x_test = tokenizer.texts_to_sequences(test_corpus) 
x_test = pad_sequences(x_test, maxlen=seq_len)

x_train.shape, x_test.shape # Check the dimensions of x_train and x_test  

((100000, 400), (50000, 400))

In [17]:
len(tokenizer.word_counts.keys())

209586

In [70]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'i': 4,
 'to': 5,
 'was': 6,
 'of': 7,
 'it': 8,
 'for': 9,
 'in': 10,
 'is': 11,
 'that': 12,
 'my': 13,
 'but': 14,
 'we': 15,
 'with': 16,
 'this': 17,
 'they': 18,
 'on': 19,
 'you': 20,
 'not': 21,
 'have': 22,
 'had': 23,
 'were': 24,
 'at': 25,
 'so': 26,
 'are': 27,
 'food': 28,
 'be': 29,
 'good': 30,
 'place': 31,
 'there': 32,
 'as': 33,
 'me': 34,
 'like': 35,
 'just': 36,
 'if': 37,
 'out': 38,
 'all': 39,
 'very': 40,
 'our': 41,
 'here': 42,
 'get': 43,
 'one': 44,
 'its': 45,
 'or': 46,
 'when': 47,
 'from': 48,
 'would': 49,
 'time': 50,
 'up': 51,
 'great': 52,
 'service': 53,
 'their': 54,
 'about': 55,
 'back': 56,
 'go': 57,
 'an': 58,
 'really': 59,
 'no': 60,
 'what': 61,
 'some': 62,
 'which': 63,
 'he': 64,
 'been': 65,
 'only': 66,
 'your': 67,
 'more': 68,
 'she': 69,
 'will': 70,
 'us': 71,
 'because': 72,
 'dont': 73,
 'can': 74,
 'by': 75,
 'got': 76,
 'even': 77,
 'other': 78,
 'them': 79,
 'didnt': 80,
 'do': 81,
 'also': 

### Prepare the target vectors for the network

In [71]:
x_train[:2]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [18]:
unique_labels = list(raw_data.Rating.unique())
print(unique_labels)

[3, 5, 1, 2, 4]


In [19]:
from keras.utils import to_categorical # This convers the labels to one-hot vectors(Dummies)

y_train = np.array([unique_labels.index(i) for i in raw_data.Rating]) # Convert the word labels to indeces
y_train = to_categorical(y_train) # Dummify the labels
y_test = np.array([unique_labels.index(i) for i in raw_test_data.Rating])
y_test = to_categorical(y_test)

In [20]:
import keras.backend as K # This 'K' can be used to create user defined functions in keras

# Define a custom function in keras to compute recall.
# Arguments:
# y_true - Actual labels
# y_pred - Predicted labels
def recall(y_true, y_pred):
    TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    PP = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = TP / (PP + K.epsilon())
    return recall

### Building and training an LSTM model

In [30]:
# Building an LSTM model
model = Sequential() # Call Sequential to initialize a network
model.add(Embedding(input_dim = max_num_words, 
                    input_length = seq_len, 
                    output_dim = embedding_size)) # Add an embedding layer which represents each unique token as a vector
#model.add(LSTM(10, return_sequences=True)) # Add an LSTM layer
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
#model.add(MaxPooling1D(5))
#model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(5, activation='softmax')) # Add an ouput layer. Since classification, 3 nodes for 3 classes.

In [31]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 400, 32)           640000    
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 396, 128)          20608     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 79, 128)           0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 75, 128)           82048     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 645       
Total para

In [32]:
from keras.optimizers import Adam
adam = Adam(lr=0.01)

In [None]:
# Mention the optimizer, Loss function and metrics to be computed
model.compile(optimizer=adam,                  # 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', # categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            # These metrics are computed for evaluating and stored in history

hist = model.fit(x_train, y_train, epochs=3, batch_size=64, validation_split=0.5)

Train on 50000 samples, validate on 50000 samples
Epoch 1/3
Epoch 2/3

### Prediction and evaluation on test data
1. Check the network output on test data. What do these values represent?
2. Predict the class labels on test data
2. Evaluate the model on test data

Hint: Check model.predict, model.predict_classes, model.evaluate in keras

In [83]:
test_prob = model.predict(x_test)
test_prob.shape

(50000, 5)

In [84]:
test_prob[:5]

array([[1.1927897e-02, 4.6400461e-02, 9.0910357e-01, 2.6089994e-02,
        6.4780507e-03],
       [2.2966247e-03, 1.2917294e-03, 9.5121670e-01, 4.4544749e-02,
        6.5017806e-04],
       [1.5728736e-02, 1.0053492e-03, 8.1425256e-01, 1.6738100e-01,
        1.6323308e-03],
       [2.4099227e-02, 3.5055433e-04, 5.5162960e-01, 4.2284080e-01,
        1.0798245e-03],
       [1.8477701e-02, 1.7427143e-03, 8.4170526e-01, 1.3513272e-01,
        2.9415824e-03]], dtype=float32)

In [85]:
test_classes = model.predict_classes(x_test)
test_classes.shape

(50000,)

In [15]:
test_classes = np.argmax(test_prob, axis=1)
test_classes.shape

(1000,)

### Understanding an intermediate layer in keras
Please understand the below code to get output from an intermediate layer in keras. you can do this for every layer to fully understand how the tensors/arrays are flowing through the layers.

In [16]:
model.layers

[<keras.layers.embeddings.Embedding at 0x7fec4e179ef0>,
 <keras.layers.recurrent.LSTM at 0x7fed09776978>,
 <keras.layers.core.Dense at 0x7fec4e18d1d0>]

In [17]:
import keras.backend as K 
# Create a user defined function in keras, where we mention the input and output
# This function returns a list
eo = K.function([model.layers[0].input],
                  [model.layers[0].output, model.layers[1].output])

out = eo([x_train[0:5]]) 
print(type(out))
print(len(out))
print(out[0].shape)

<class 'list'>
2
(5, 50, 100)


In [18]:
print(out[1].shape)

(5, 10)


In [87]:
type(raw_test_data['Rating'])

pandas.core.series.Series

In [88]:
raw_test_data['Rating']

0        1
1        1
2        1
3        1
4        1
5        3
6        2
7        4
8        4
9        3
10       2
11       4
12       1
13       2
14       4
15       4
16       3
17       1
18       1
19       3
20       2
21       3
22       2
23       3
24       3
25       1
26       2
27       4
28       5
29       2
        ..
49970    4
49971    1
49972    5
49973    2
49974    3
49975    5
49976    1
49977    5
49978    2
49979    2
49980    1
49981    2
49982    5
49983    1
49984    5
49985    5
49986    3
49987    4
49988    4
49989    5
49990    5
49991    3
49992    1
49993    1
49994    1
49995    1
49996    5
49997    4
49998    2
49999    1
Name: Rating, Length: 50000, dtype: int64

In [89]:
raw_test_data['Rating'].shape

(50000,)

In [90]:
raw_test_data['Rating'] = test_classes

In [93]:
raw_test_data.head(7)

Unnamed: 0,Rating,Review
0,2,I got 'new' tires from them and within two wee...
1,2,Don't waste your time. We had two different p...
2,2,All I can say is the worst! We were the only 2...
3,2,I have been to this restaurant twice and was d...
4,2,Food was NOT GOOD at all! My husband & I ate h...
5,0,This is a tiny Starbucks and it locations like...
6,3,Typical Starbucks coffee chain. 2 things I don...


In [92]:
test_classes[:10]

array([2, 2, 2, 2, 2, 0, 3, 4, 0, 0], dtype=int64)

In [95]:
raw_test_data.to_csv("F:\\Insofe\\Cute_DeepLearning\\PS2\\test_preds.csv")