In [168]:
import numpy as np
import os
import shutil
import tensorflow as tf

from sklearn.metrics import accuracy_score, confusion_matrix

In [169]:
def download_and_read(url):
 local_file = url.split('/')[-1]
 local_file = local_file.replace("%20", " ")
 p = tf.keras.utils.get_file(local_file, url,extract=True, cache_dir=".")
 local_folder = os.path.join("datasets", local_file.split('.')[0])
 labeled_sentences = []
 for labeled_filename in os.listdir(local_folder):
  if labeled_filename.endswith("_labelled.txt"):
    with open(os.path.join(local_folder, labeled_filename), "r") as f:
      for line in f:
        sentence, label = line.strip().split('\t')
        labeled_sentences.append((sentence, label))
 return labeled_sentences

In [170]:
labeled_sentences = download_and_read(
 "https://archive.ics.uci.edu/ml/machine-learning-databases/" +
 "00331/sentiment%20labelled%20sentences.zip")

In [171]:
sentences = [s for (s,l) in labeled_sentences]
labels = [int(l) for (s,l) in labeled_sentences]

In [172]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)

In [173]:
vocab_size = len(tokenizer.word_counts)
vocab_size

5271

In [174]:
word2idx = tokenizer.word_index
idx2word = {i:p for (p,i) in word2idx.items()}

In [175]:
max_seqlen = 64

sentences_as_ints = tokenizer.texts_to_sequences(sentences)
sentences_as_ints = tf.keras.preprocessing.sequence.pad_sequences(sentences_as_ints,maxlen=max_seqlen)
labels_as_ints = np.array(labels)
dataset = tf.data.Dataset.from_tensor_slices((sentences_as_ints,labels_as_ints))


In [176]:
dataset = dataset.shuffle(10000)
test_size = len(sentences) // 3
val_size = (len(sentences) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

In [177]:
batch_size = 64
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [178]:
class SentimentAnalysisModel(tf.keras.Model):
  def __init__(self,vocab_size,max_seqlen, **kwargs):
    super(SentimentAnalysisModel, self).__init__(**kwargs)
    self.embedding = tf.keras.layers.Embedding(vocab_size,max_seqlen)
    self.bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))
    self.dense = tf.keras.layers.Dense(64,activation="relu")
    self.out = tf.keras.layers.Dense(1,activation='sigmoid')
  def call(self,x):
    x = self.embedding(x)
    x = self.bilstm(x)
    x = self.dense(x)
    x = self.out(x)
    return x

In [179]:
model = SentimentAnalysisModel(vocab_size+1,max_seqlen)
model.build((batch_size,max_seqlen))
model.summary()

Model: "sentiment_analysis_model_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     multiple                  337408    
_________________________________________________________________
bidirectional_18 (Bidirectio multiple                  197632    
_________________________________________________________________
dense_36 (Dense)             multiple                  16448     
_________________________________________________________________
dense_37 (Dense)             multiple                  65        
Total params: 551,553
Trainable params: 551,553
Non-trainable params: 0
_________________________________________________________________


In [180]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics = ['acc'])

In [181]:
data_dir = './data'
logs_dir = os.path.join('./logs')
best_model = os.path.join(data_dir,'best_model.h5')

checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model,save_weights_only=True,save_best_only=True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logs_dir)

num_epochs = 10
history = model.fit(train_dataset,epochs=num_epochs,validation_data=val_dataset,callbacks=[checkpoint,tensorboard])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [182]:
best = SentimentAnalysisModel(vocab_size+1, max_seqlen)
best.build(input_shape=(batch_size, max_seqlen))
best.load_weights(best_model)
best.compile(
 loss="binary_crossentropy",
 optimizer="adam",
 metrics=["accuracy"]
)

In [183]:
test_loss, test_acc = best.evaluate(test_dataset)
print("test loss: {:.3f}, test accuracy: {:.3f}".format(
 test_loss, test_acc))

test loss: 0.036, test accuracy: 0.991


In [184]:
labels, predictions = [], []
idx2word[0] = "PAD"
is_first_batch = True
for test_batch in test_dataset:
 inputs_b, labels_b = test_batch
 pred_batch = best.predict(inputs_b)
 print(pred_batch.shape)
 predictions.extend([(1 if p > 0.5 else 0) for p in pred_batch])
 labels.extend([l for l in labels_b])
 if is_first_batch:
  # print first batch of label, prediction, and sentence
  for rid in range(inputs_b.shape[0]):
    words = [idx2word[idx] for idx in inputs_b[rid].numpy()]
    words = [w for w in words if w != "PAD"]
    sentence = " ".join(words)
    print("{:d}\t{:d}\t{:s}".format(
    labels[rid], predictions[rid], sentence))
  is_first_batch = False
print("accuracy score: {:.3f}".format(accuracy_score(labels,predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))

(64, 1)
1	1	go watch it
1	1	hawaiian breeze mango magic and pineapple delight are the smoothies that i've tried so far and they're all good
0	0	then our food came out disappointment ensued
1	1	great phone
1	1	in fact it's hard to remember that the part of ray charles is being acted and not played by the man himself
0	0	you never know if you pushed it hard enough or the right number of times for the function you want or not
1	1	this is a great restaurant at the mandalay bay
0	0	avoid at any and all costs
0	0	it's just painful
1	1	the patio seating was very comfortable
0	0	it was not good
1	1	setup couldn't have been simpler
0	0	kind of flops around
1	1	definitely worth checking out
1	1	good food good service
1	1	it was quite comfortable in the ear
0	0	the manager was the worst
0	0	i wouldn't say they're worth 2 hours of your time though
0	0	your servers suck wait correction our server heimer sucked
0	0	it is practically useless and did not add any kind of boost to my reception after i b