# Exercise 4

In [5]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
import numpy as np
import pandas as pd

dataset = pd.read_json('reviews.json')


In [7]:

mask = (dataset['rating'] > 0 ) & (dataset['rating'] <4)
column_name = 'rating'
dataset.loc[mask, column_name] = 0

mask = (dataset['rating'] > 3 ) & (dataset['rating'] < 6)
column_name = 'rating'
dataset.loc[mask, column_name] = 1

dataset.head()



Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,1
1,di pareha yong mga size nila may sobrang liit ...,1
2,super worth it ang ganda Sombra grabi order na...,1
3,ganda po salamat,1
4,maayos pagkadeliver maganda den sya,1


## 1. Tokenize the data

In [8]:
sentences = dataset['review'].tolist()
labels = dataset['rating'].tolist()

# Separate out the sentences and labels into training and test sets
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

# Make labels into numpy arrays for use with the network later
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [9]:
vocab_size = 3000
embedding_dim = 64
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

## 2. Sequence the data

In [10]:
sequences = tokenizer.texts_to_sequences(training_sentences)

## 3. Pad the data

In [11]:
padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type,
                       truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)



## 4. Train a sentiment model

In [12]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           192000    
                                                                 
 flatten (Flatten)           (None, 6400)              0         
                                                                 
 dense (Dense)               (None, 6)                 38406     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 230413 (900.05 KB)
Trainable params: 230413 (900.05 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
num_epochs = 30
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7a2a521c9d50>

## Get files for visualing the network

In [14]:
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)


(3000, 64)


## 5. Predict sentiment with new reviews

In [15]:
# Use the model to predict a review
fake_reviews = ['Mali yung kulay na pinadala niyo',
                'Nasunog yung buhok ko sa product niyo',
                'Hindi legit yung product',
                'Kulang yung pinadala',
                'Hindi maayos ang packaging',
                'Masyadong maliit yung damit',
                'May butas yung damit',
                'Hindi ko nagustuhan yung produkto',
                'Ang ganda nung product',
                'Ok naman yung case']

print(fake_reviews)

# Create the sequences
padding_type='post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')

classes = model.predict(fakes_padded)

# The closer the class is to 1, the more positive the review is deemed to be
for x in range(len(fake_reviews)):
  print(fake_reviews[x])
  print(classes[x])
  print('\n')


['Mali yung kulay na pinadala niyo', 'Nasunog yung buhok ko sa product niyo', 'Hindi legit yung product', 'Kulang yung pinadala', 'Hindi maayos ang packaging', 'Masyadong maliit yung damit', 'May butas yung damit', 'Hindi ko nagustuhan yung produkto', 'Ang ganda nung product', 'Ok naman yung case']

HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!

Mali yung kulay na pinadala niyo
[0.0188629]


Nasunog yung buhok ko sa product niyo
[0.53021467]


Hindi legit yung product
[0.2920726]


Kulang yung pinadala
[0.00237773]


Hindi maayos ang packaging
[0.3477387]


Masyadong maliit yung damit
[0.03950476]


May butas yung damit
[0.00394353]


Hindi ko nagustuhan yung produkto
[0.05033471]


Ang ganda nung product
[0.9940585]


Ok naman yung case
[0.38830626]


