# Exercise 4

In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import numpy as np
import pandas as pd

dataset = pd.read_json('reviews.json')

dataset.replace({1:0, 2:0, 3:1, 4:1, 5:1}, inplace=True)
print("After Replacing:\n", dataset)

sentences = dataset['review'].tolist()
labels = dataset['rating'].tolist()



After Replacing:
                                                  review  rating
0                       sir okay armygreen shorts nice        1
1     di pareha yong mga size nila may sobrang liit ...       1
2     super worth it ang ganda Sombra grabi order na...       1
3                                      ganda po salamat       1
4                   maayos pagkadeliver maganda den sya       1
...                                                 ...     ...
996   manipis siya masyado, tapos 9pcs lang yung isa...       0
997   maluwang and sobrang nipis. maluwang and sobra...       0
998   hope hindi tayo manloloko di ba???sa dami ng n...       0
999   Nakaka disappointed lng ng sobra sa seller .. ...       0
1000  salamat po sa courier pero Yung items po nakak...       0

[1001 rows x 2 columns]


In [3]:
# Separate out the sentences and labels into training and test sets
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

# Make labels into numpy arrays for use with the network later
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [4]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,1
1,di pareha yong mga size nila may sobrang liit ...,1
2,super worth it ang ganda Sombra grabi order na...,1
3,ganda po salamat,1
4,maayos pagkadeliver maganda den sya,1


## 1. Tokenize the data

In [5]:
vocab_size = 300
oov_tok = "<OOV>"


tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)



## 2. Sequence the data

In [6]:
sequences = tokenizer.texts_to_sequences(sentences)
print (sequences)

[[1, 80, 1, 1, 66], [42, 1, 1, 63, 62, 219, 46, 89, 1, 1], [94, 126, 11, 9, 55, 1, 1, 24, 3, 100, 42, 100, 1, 1, 55, 24, 167, 54, 14, 9, 168, 38], [55, 21, 104], [119, 1, 23, 1, 22], [9, 1, 1, 1, 1, 1, 1, 5, 104, 44, 6, 41, 1, 9, 1, 180, 27, 15, 21, 220], [23, 22, 153, 127, 120, 12], [], [], [127, 1, 14, 1, 1, 1, 10, 1, 42, 23, 9, 1, 42, 1, 1, 91, 1, 91, 1, 1, 1, 10, 1], [1, 1], [1, 1, 47, 1, 1], [27, 15, 9, 55, 1, 167, 54, 10, 1, 35, 36, 272, 3, 114, 101, 1, 1], [30, 47, 14, 119, 17, 9, 1, 27, 15, 16], [55, 1, 12, 1, 167, 1, 5, 1, 33, 107, 28, 9, 55, 1, 1], [1, 1, 1], [9, 55, 1, 1, 25, 6, 1, 114, 25, 1, 9, 1, 77, 16, 14, 6, 1, 169, 115, 24, 167, 1], [40, 17, 7, 1, 1, 1, 3, 1, 253, 108, 12, 8, 127, 107, 22, 42, 1, 136, 6, 181, 195, 154, 1, 28, 40, 3, 22, 19, 2, 50, 108, 12, 17, 182, 1, 5, 12, 22, 10, 1, 1, 3], [4, 90, 2, 49, 4, 102, 11, 34, 81, 82, 89, 273, 10, 41, 1, 10, 1, 221, 38, 7, 24, 5, 221, 1], [140], [66, 1, 140], [23, 7, 47, 70, 1], [1, 1, 9, 63, 1, 1, 1, 1, 222, 100, 1, 1], 

In [7]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])


## 3. Pad the data

In [8]:
max_length = 150
trunc_type='post'
padding_type='post'

padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type,
                       truncating=trunc_type)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_padded = pad_sequences(testing_sequences,maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

In [9]:
print(decode_review(padded[1]))
print(training_sentences[1])

di <OOV> <OOV> mga size nila may sobrang <OOV> <OOV> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
di pareha yong mga size nila may sobrang liit akjejrjrjjfjjriro4k4jrjrjfjrjrjrjrjjtjrj


## 4. Train a sentiment model

In [10]:
embedding_dim = 16
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 16)           4800      
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 6433 (25.13 KB)
Trainable params: 6433 (25.13 KB)
Non-trainable params: 0 (0.00 Byte)
______________________

In [11]:
num_epochs = 30
history = model.fit(training_padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Get files for visualing the network

In [12]:
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(300, 16)


In [13]:
import io

# Write out the embedding vectors and metadata
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [14]:
# Download the files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 5. Predict sentiment with new reviews

In [15]:
# Use the model to predict a review
fake_reviews = ['hindi ko gusto ang kulay', 'ayoko na bumalik',
                'maganda dahil malamig ang tubig',
                'si koyah mabaho at panget',
                'maganda ang lugar',
                'mabilis lang makaupo',
                'libre ang pagkain',
                'mabait ang mga staff',
                'may baliw sa gilid ayaw ko',
                'gumagana ang lamesa very good']

print(fake_reviews)

# Create the sequences
padding_type='post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')

classes = model.predict(fakes_padded)

# The closer the class is to 1, the more positive the review is deemed to be
for x in range(len(fake_reviews)):
  print(fake_reviews[x])
  print(classes[x])
  print('\n')

# Try adding reviews of your own
# Add some negative words (such as "not") to the good reviews and see what happens
# For example:
# they gave us free chocolate cake and did not charge us

['hindi ko gusto ang kulay', 'ayoko na bumalik', 'maganda dahil malamig ang tubig', 'si koyah mabaho at panget', 'maganda ang lugar', 'mabilis lang makaupo', 'libre ang pagkain', 'mabait ang mga staff', 'may baliw sa gilid ayaw ko', 'gumagana ang lamesa very good']

HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!

hindi ko gusto ang kulay
[0.5533226]


ayoko na bumalik
[0.6148811]


maganda dahil malamig ang tubig
[0.8769839]


si koyah mabaho at panget
[0.6789764]


maganda ang lugar
[0.80231005]


mabilis lang makaupo
[0.7037312]


libre ang pagkain
[0.6903395]


mabait ang mga staff
[0.81484854]


may baliw sa gilid ayaw ko
[0.53315425]


gumagana ang lamesa very good
[0.8844602]


