# Exercise 4

In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import numpy as np
import pandas as pd

dataset = pd.read_json('reviews.json')

dataset.replace({1:0, 2:0,3:1,4:1,5:1}, inplace=True)
print("After Replacement:\n",dataset)

sentences = dataset['review'].tolist()
labels = dataset['rating'].tolist()



training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

After Replacement:
                                                  review  rating
0                       sir okay armygreen shorts nice        1
1     di pareha yong mga size nila may sobrang liit ...       1
2     super worth it ang ganda Sombra grabi order na...       1
3                                      ganda po salamat       1
4                   maayos pagkadeliver maganda den sya       1
...                                                 ...     ...
996   manipis siya masyado, tapos 9pcs lang yung isa...       0
997   maluwang and sobrang nipis. maluwang and sobra...       0
998   hope hindi tayo manloloko di ba???sa dami ng n...       0
999   Nakaka disappointed lng ng sobra sa seller .. ...       0
1000  salamat po sa courier pero Yung items po nakak...       0

[1001 rows x 2 columns]


In [3]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,1
1,di pareha yong mga size nila may sobrang liit ...,1
2,super worth it ang ganda Sombra grabi order na...,1
3,ganda po salamat,1
4,maayos pagkadeliver maganda den sya,1


## 1. Tokenize the data

In [4]:
vocab_size=400
oov_tok ="<OOV>"
tokenizer = Tokenizer(num_words = vocab_size,
                      oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)


## 2. Sequence the data

In [5]:
sequences = tokenizer.texts_to_sequences(training_sentences)
print (sequences)

[[1, 80, 1, 390, 66], [42, 1, 1, 63, 62, 219, 46, 89, 1, 1], [94, 126, 11, 9, 55, 1, 1, 24, 3, 100, 42, 100, 1, 1, 55, 24, 167, 54, 14, 9, 168, 38], [55, 21, 104], [119, 1, 23, 1, 22], [9, 1, 345, 1, 1, 345, 1, 5, 104, 44, 6, 41, 1, 9, 1, 180, 27, 15, 21, 220], [23, 22, 153, 127, 120, 12], [], [], [127, 1, 14, 1, 1, 1, 10, 1, 42, 23, 9, 1, 42, 1, 1, 91, 391, 91, 317, 1, 1, 10, 1], [1, 1], [1, 1, 47, 1, 1], [27, 15, 9, 55, 1, 167, 54, 10, 1, 35, 36, 272, 3, 114, 101, 1, 1], [30, 47, 14, 119, 17, 9, 1, 27, 15, 16], [55, 1, 12, 392, 167, 1, 5, 1, 33, 107, 28, 9, 55, 1, 1], [1, 1, 1], [9, 55, 1, 1, 25, 6, 1, 114, 25, 1, 9, 1, 77, 16, 14, 6, 1, 169, 115, 24, 167, 1], [40, 17, 7, 1, 1, 1, 3, 1, 253, 108, 12, 8, 127, 107, 22, 42, 393, 136, 6, 181, 195, 154, 1, 28, 40, 3, 22, 19, 2, 50, 108, 12, 17, 182, 1, 5, 12, 22, 10, 1, 1, 3], [4, 90, 2, 49, 4, 102, 11, 34, 81, 82, 89, 273, 10, 41, 1, 10, 1, 221, 38, 7, 24, 5, 221, 1], [140], [66, 394, 140], [23, 7, 47, 70, 1], [1, 1, 9, 63, 1, 1, 1, 395,

In [6]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

## 3. Pad the data

In [7]:
max_length = 100
trunc_type='post'
padding_type='post'

padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type,
                       truncating=trunc_type)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

## 4. Train a sentiment model

In [8]:
embedding_dim = 16
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           6400      
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 7489 (29.25 KB)
Trainable params: 7489 (29.25 KB)
Non-trainable params: 0 (0.00 Byte)
______________________

In [9]:
num_epochs = 35
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.src.callbacks.History at 0x7ebe146ab2b0>

## Get files for visualing the network

In [10]:
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(400, 16)


In [11]:
import io

# Write out the embedding vectors and metadata
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [12]:
# Download the files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 5. Predict sentiment with new reviews

In [13]:
fake_reviews = ['basura ung shorts',
                'Ang mahal tas ang panget',
                'lugi sa presyo','nice po ung armygreen',
                'tangina','punit ung shorts',
                'di legit ung seller',
                'tshirt dumating sakin di short',
                'tagal ng driver', 'worth it ung price',
                'bobo nyo', 'maganda ung shorts', 'sira ung box','ganda po salamat']

print(fake_reviews)

padding_type='post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')

classes = model.predict(fakes_padded)


for x in range(len(fake_reviews)):
  print(fake_reviews[x])
  print(classes[x])
  print('\n')

['basura ung shorts', 'Ang mahal tas ang panget', 'lugi sa presyo', 'nice po ung armygreen', 'tangina', 'punit ung shorts', 'di legit ung seller', 'tshirt dumating sakin di short', 'tagal ng driver', 'worth it ung price', 'bobo nyo', 'maganda ung shorts', 'sira ung box', 'ganda po salamat']

HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!

basura ung shorts
[0.34198055]


Ang mahal tas ang panget
[0.55938303]


lugi sa presyo
[0.8242396]


nice po ung armygreen
[0.8825986]


tangina
[0.628571]


punit ung shorts
[0.34198055]


di legit ung seller
[0.2658982]


tshirt dumating sakin di short
[0.43202502]


tagal ng driver
[0.4552013]


worth it ung price
[0.44250584]


bobo nyo
[0.5862216]


maganda ung shorts
[0.5315215]


sira ung box
[0.5067301]


ganda po salamat
[0.9882705]


