In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import re

%matplotlib inline

**Text Preprocessing**

In [None]:
# Convert csv files to input and labels lists
def read_csv(filename = 'data/emojify_data.csv'):
    text = []
    label = []

    with open (filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        next(csvReader, None)
        for row in csvReader:
            text.append(row[1])
            label.append(re.search("[1-5]",row[2]).group(0))

    X = list(text)
    Y = np.array(label, dtype=int)

    return X, Y

In [None]:
# returning input and output lists
X_train, Y_train = read_csv('sentiment_dataset_train.csv')
X_eval, Y_eval = read_csv('sentiment_dataset_dev.csv')

In [None]:
# sample review with rating
print(X_train[0], "Rating :",Y_train[0])
len(X_train[0])

Arrived about 10pm and check in was painless.   The only downside to this hotel is if you are looking for a city centre location. If you don't mind some walking and want to be out of the noise of the city then this place is ideal.   Hotel has a bar and restaurant, decent size gym and roof terrace with sun loungers.   The rooms are a good size, especially when traveling with a large teenager. Good sized lounge with double sofa bed, kitchen area and dining table. Main bedroom is a good size with double wardrobes and safe. Shower room is well sized with plenty of towels, good supply of toiletries, and hairdryer.   Fridge comes stocked with bottles of water to get you started and you can get more at the hotel bar  A 5 minute walk takes you to Marina metro station and your access toâ€¦ Rating : 4


789

In [None]:
# Convert label to one-hot tensors
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [None]:
# Convert label to one-hot tensors
Y_oh_train = convert_to_one_hot(Y_train, C = 6)
Y_oh_eval = convert_to_one_hot(Y_eval, C =6)

In [None]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import string
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# preprocess each review to remove hyperlinks, duplicate letters (i.e, sooo), punctuation marks
def pre_process(tweet):
  tweet = tweet.strip().lower()
  tweet = re.sub(r'http[s]?://\S+','',tweet)  #/https?\:(\\\\|\/\/)(www.)?/,'' re.sub('http[s]?://\S+', '', text)
  tweet = re.sub(r'(\w)\1+', r'\1',tweet)
  tweet = re.sub(r'[!.?]+','',tweet)
  return str(tweet)


X_train_words = []
X_eval_words = []

for row in X_train:
  X_train_words.append(pre_process(str(row)))

for row in X_eval:
  X_eval_words.append(pre_process(str(row)))
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Building the vocabulary with the train set         (this might take a minute)
from collections import defaultdict
x_train = []

vocab = defaultdict(lambda: 0)

for idx in range(len(X_train_words)):
    li = nltk.word_tokenize(X_train_words[idx])
    x_train.append(li)

    for word in li:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
print('The length of the vocabulary is: ', len(vocab))

The length of the vocabulary is:  51619


In [None]:
#Split eval sequence to tokens
x_eval = []
for idx in range(len(X_eval_words)):
    li = nltk.word_tokenize(X_eval_words[idx])
    x_eval.append(li)

In [None]:
# convert tokens to tensors

for i in range(len(x_train)):
    x_train[i] = [vocab[word] for word in x_train[i]]

for i in range(len(x_eval)):
    x_eval[i] = [vocab[word] for word in x_eval[i]]


In [None]:
print('Second review:')
print(X_train_words[1], '\n') 
print('encoded version:')
print(x_train[1],'\n')

first review:

i checked in at 4pm even tough rom was not ready  and the staf are busy with their mobiles instead of making it fast to provide me my suite rom and in bathrom hair dryer was not kept  bathrob was not kept totaly bad experience i faced  the rates for hotel fod is 5 times more then outisde fod the rom service boys are beter then receptionist people very wel trained with quick service the receptionist people should welxome guest with smile they are seing us like we are staying for fre 

encoded version:
[98, 99, 6, 87, 100, 101, 102, 71, 7, 103, 104, 4, 9, 105, 18, 106, 47, 107, 108, 109, 33, 110, 111, 112, 12, 113, 114, 115, 116, 71, 4, 6, 117, 118, 119, 7, 103, 120, 121, 7, 103, 120, 122, 123, 124, 98, 125, 9, 126, 20, 14, 127, 15, 88, 128, 86, 35, 129, 127, 9, 71, 130, 131, 18, 132, 35, 133, 134, 135, 72, 136, 47, 137, 130, 9, 133, 134, 138, 139, 140, 47, 141, 142, 18, 143, 144, 145, 146, 18, 147, 20, 148] 



**Model Training**

In [None]:
import numpy as np
np.random.seed(0)
from keras import models
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, LSTM, Activation, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

In [None]:
# finding out max length to make all input sequences fixed
max_len = 0
for i in x_train:
  if len(i)>max_len:
    max_len = len(i)

In [None]:
# padding sequences with zeros to make input length fixed
X = sequence.pad_sequences(x_train , maxlen=max_len )  # change to max_len
x_eval_final = sequence.pad_sequences(x_eval , maxlen=max_len )

In [None]:
# model defination
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(len(vocab), embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 926, 128)          7371136   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 926, 128)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 6)                 1182      
Total params: 7,627,118
Trainable params: 7,627,118
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# training model
batch_size = 64
model.fit(X, Y_oh_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
547/547 - 2466s - loss: 1.2091 - accuracy: 0.4452
Epoch 2/7
547/547 - 2477s - loss: 0.8271 - accuracy: 0.6413
Epoch 3/7
547/547 - 2488s - loss: 0.6255 - accuracy: 0.7465
Epoch 4/7
547/547 - 2466s - loss: 0.4900 - accuracy: 0.8089
Epoch 5/7
547/547 - 2461s - loss: 0.3917 - accuracy: 0.8519
Epoch 6/7
547/547 - 2506s - loss: 0.3141 - accuracy: 0.8840
Epoch 7/7
547/547 - 2488s - loss: 0.2526 - accuracy: 0.9067


<tensorflow.python.keras.callbacks.History at 0x7f12a8138748>

In [None]:
# model aved for later reference
model.save('/SOTA_Model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /SOTA_Model/assets


In [None]:
model = models.load_model('/SOTA_Model');



In [None]:
# evaluate model with dev dataset
score,acc = model.evaluate(x_eval_final, Y_oh_eval, verbose = 2, batch_size = batch_size)
print("Score",score, "Accuracy",acc)

118/118 - 44s - loss: 0.7978 - accuracy: 0.7313
Score 0.7977930903434753 Accuracy 0.731297492980957


**Testing/prediction**

In [None]:
X_test = np.asarray(pd.read_csv('sentiment_dataset_test.csv',usecols=['review']))
X_test = X_test.tolist()

In [None]:
 # convert test set - text to tokens
 x_test= []
for row in X_test:
  element = pre_process(str(row))
  li = nltk.word_tokenize(element)
  x_test.append(li)

# convert tokens to index sequences
for i in range(len(x_test)):
    x_test[i] = [vocab[word] for word in x_test[i]]

In [None]:
x_test_final = sequence.pad_sequences(x_test , maxlen=max_len )

In [None]:
# single prediction
sentiment = model.predict(x_test_final,batch_size=1,verbose = 2)[0]
y_pred = np.argmax(sentiment)
print("Rating for first record in test set", y_pred)

6500/6500 - 1928s
Rating for first record in test set 2


In [None]:
# batch prediction
sentiment = model.predict(x_test_final,verbose = 2)
sentiment

204/204 - 72s


array([[8.9533132e-06, 3.1808585e-02, 9.1258073e-01, 5.4758497e-02,
        7.5182790e-04, 9.1400550e-05],
       [2.2048758e-04, 1.3305760e-03, 2.5415553e-03, 1.0439120e-02,
        3.6296847e-01, 6.2249976e-01],
       [5.7646641e-05, 9.8178530e-01, 1.7116940e-02, 8.4301515e-04,
        1.4563967e-04, 5.1472311e-05],
       ...,
       [7.5304764e-05, 2.5466131e-03, 3.1420842e-03, 9.3044508e-03,
        6.4698911e-01, 3.3794239e-01],
       [7.3185993e-07, 8.1595109e-04, 9.9576449e-01, 3.3504518e-03,
        5.4701246e-05, 1.3682134e-05],
       [3.6244073e-05, 9.9979781e-02, 8.9124590e-01, 8.3362563e-03,
        2.9282694e-04, 1.0902197e-04]], dtype=float32)

In [None]:
def predictlabel(ypred):
  predictedLabels = []
  for i in range(len(x_test_final)):
    num = np.argmax(sentiment[i])
    predictedLabels.append(num)
  return predictedLabels

In [None]:
y_pred = predictlabel(sentiment)

In [None]:
data = {'review':X_test,'predicted_rating':y_pred}
df_test = pd.DataFrame(data)
df_test.head()

Unnamed: 0,review,predicted_rating
0,[Not at all what expected. Our mountain view...,2
1,[Good location as we needed to head to Reims t...,5
2,[Me and my son just returned from Broadmoor Mi...,1
3,[The place was filthy and full of stoned backp...,1
4,[The hotel itself is really nice and modern wh...,4
