# Análisis de sentimiento con RNN (Redes neuronales recurrentes)

In [38]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

from collections import Counter

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense

import joblib  

In [6]:
# Load the cleaned movie-review table from a CSV file in the working directory.
df = pd.read_csv('movie_data_clean.csv', encoding='utf-8')

In [7]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,review,sentiment
0,in 1974 the teenager martha moxley maggie grac...,1
1,ok so i really like kris kristofferson and his...,0
2,spoiler do not read this if you think about w...,0
3,hi for all the people who have seen this wonde...,1
4,i recently bought the dvd forgetting just how ...,0


In [8]:
## Step 1: create a dataset
# Read the label column WITHOUT removing it from `df`. `df.pop` mutates the
# frame in place, so running this cell a second time raised a KeyError.
# `drop` returns a new frame, which keeps the cell idempotent while passing the
# same feature columns (everything except 'sentiment') to the dataset.
target = df['sentiment']
ds_raw = tf.data.Dataset.from_tensor_slices(
    (df.drop(columns='sentiment').values, target.values))

In [9]:
## inspection: print the first 50 bytes of three reviews next to their labels
for features, label in ds_raw.take(3):
    # Each feature row is indexed at position 0 to reach the review text.
    review_bytes = features.numpy()[0]
    tf.print(review_bytes[:50], label)

b'in 1974 the teenager martha moxley maggie grace mo' 1
b'ok so i really like kris kristofferson and his usu' 0
b' spoiler do not read this if you think about watch' 0


In [11]:
# Fix the global seed so the shuffle below is repeatable.
tf.random.set_seed(1)

# The order is not reshuffled on each iteration; the take/skip splits below
# rely on every pass over `ds_raw` seeing the same order.
ds_raw = ds_raw.shuffle(buffer_size=50000, reshuffle_each_iteration=False)

# First 25000 shuffled examples form the test split; the remainder is divided
# into 20000 training examples and whatever is left for validation.
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_test = ds_raw.take(25000)
ds_raw_valid = ds_raw_train_valid.skip(20000)
ds_raw_train = ds_raw_train_valid.take(20000)

In [14]:
## Step 2: find unique tokens (words)
# Tally of how often each token appears; starts out empty.
token_counts = Counter()
# Tokenizer used to split raw review text into tokens.
tokenizer = tfds.deprecated.text.Tokenizer()

In [15]:
# Count tokens over the training split only.
for review, _label in ds_raw_train:
    token_counts.update(tokenizer.tokenize(review.numpy()[0]))

In [16]:
# Number of distinct tokens counted in the training split.
print('Vocab-size:', len(token_counts))

Vocab-size: 70651


In [17]:
## Step 3: encoding unique tokens to integers
# The encoder's vocabulary comes from the tokens counted on the training split.
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

# Sanity check: encode a short sentence and show the resulting ids.
example_str = 'This is an example!'
example_ids = encoder.encode(example_str)
print(example_ids)

[70652, 9, 258, 1046]


In [18]:
## Step 3-A: define the function for transformation

def encode(text_tensor, label):
    """Encode one raw review with the module-level ``encoder``.

    Calls ``.numpy()`` on its input, so it needs an eager tensor; in this
    notebook it is invoked through ``tf.py_function``.

    Args:
        text_tensor: Tensor whose first element is the review text.
        label: Label for the review; returned unchanged.

    Returns:
        A ``(encoded_text, label)`` tuple, where ``encoded_text`` is the
        result of ``encoder.encode`` applied to the review text.
    """
    raw_review = text_tensor.numpy()[0]
    return encoder.encode(raw_review), label

In [19]:
## Step 3-B: wrap the encode function to a TF Op.

def encode_map_fn(text, label):
    """Wrap ``encode`` as a TensorFlow op so it can be used in ``Dataset.map``.

    Args:
        text: Review text tensor for one example.
        label: Label tensor for the same example.

    Returns:
        The output of ``tf.py_function``: the encoded review and the label,
        both declared as ``tf.int64``.
    """
    output_dtypes = (tf.int64, tf.int64)
    return tf.py_function(func=encode, inp=[text, label], Tout=output_dtypes)

In [20]:
# Apply the integer encoding to each raw split (train, validation, test).
ds_train, ds_valid, ds_test = (
    raw_split.map(encode_map_fn)
    for raw_split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

In [23]:
# Encoded reviews differ in length: print the shape of five shuffled examples.
tf.random.set_seed(1)

shuffled_sample = ds_train.shuffle(1000).take(5)
for encoded_review, _label in shuffled_sample:
    print('Sequence length:', encoded_review.shape)

Sequence length: (24,)
Sequence length: (162,)
Sequence length: (258,)
Sequence length: (521,)
Sequence length: (130,)


In [24]:
## Take a small subset
# Eight examples are enough to illustrate padded batching below.
ds_subset = ds_train.take(8)
for encoded_review, _label in ds_subset:
    print('Individual size:', encoded_review.shape)

Individual size: (119,)
Individual size: (686,)
Individual size: (308,)
Individual size: (204,)
Individual size: (318,)
Individual size: (240,)
Individual size: (127,)
Individual size: (445,)


In [25]:
## Dividing the dataset into batches
# Batches of 4: [-1] pads each sequence up to the longest one in its batch,
# while [] leaves the scalar label untouched.
ds_batched = ds_subset.padded_batch(batch_size=4, padded_shapes=([-1], []))

In [26]:
# Show the padded shape of every batch in the demo dataset.
for padded_reviews, _labels in ds_batched:
    print('Batch dimension:', padded_reviews.shape)

Batch dimension: (4, 686)
Batch dimension: (4, 445)


In [27]:
# Batch every split identically: 32 examples per batch, sequences padded to the
# longest review in the batch, labels kept as scalars.
train_data, test_data, valid_data = (
    encoded_split.padded_batch(32, padded_shapes=([-1], []))
    for encoded_split in (ds_train, ds_test, ds_valid)
)

In [31]:
## Example of Embedding layer
# Toy configuration: 100 possible indices, 6-dimensional vectors, and
# input sequences of length 20.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=100,
        output_dim=6,
        input_length=20,
        name='embed-layer'),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, 20, 6)             600       
                                                                 
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Size of each learned word vector.
embedding_dim = 20
# +2 on top of the counted tokens — presumably one id reserved for padding and
# one for out-of-vocabulary words; confirm against the encoder's id scheme.
vocab_size = len(token_counts) + 2
# Seed set before the layers are created, for repeatable runs.
tf.random.set_seed(1)

## build the model
# NOTE(review): the Embedding layer does not set mask_zero=True, so padded
# positions are passed to the LSTM like real tokens — confirm this is intended.
bi_lstm_model = tf.keras.Sequential([
    # Maps each integer token id to an `embedding_dim`-dimensional vector.
    tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    name='embed-layer'),

    # LSTM with 64 units wrapped in Bidirectional (reads the sequence both ways).
    tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64, name='lstm-layer'),
    name='bidir-lstm'),
    
    # Dropout with rate 0.5 and a fixed seed.
    tf.keras.layers.Dropout(.5, seed = 123),

    # Hidden fully connected layer.
    tf.keras.layers.Dense(64, activation='relu'),
    
     # Single sigmoid output unit for the binary sentiment label.
     tf.keras.layers.Dense(1, activation='sigmoid')
    ])

In [35]:
# Print the layer-by-layer summary of the bidirectional LSTM model.
bi_lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, None, 20)          1413060   
                                                                 
 bidir-lstm (Bidirectional)  (None, 128)               43520     
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,464,901
Trainable params: 1,464,901
Non-trainable params: 0
_________________________________________________________________


In [36]:
## compile and train:
# The model's last layer already applies a sigmoid, hence from_logits=False.
bi_lstm_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

In [37]:
# Train for 10 epochs, passing the validation split to fit().
history = bi_lstm_model.fit(
    x=train_data,
    epochs=10,
    validation_data=valid_data,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [39]:
# Persist the trained model to disk.
# NOTE(review): this pickles the Keras model through joblib. Keras also has its
# own `model.save(...)` API — confirm which format downstream loaders expect.
# Only the model is written here; the text `encoder` is not saved by this cell.
joblib.dump(bi_lstm_model, 'modelo_entrenado_1.pkl') # Save the trained model.



INFO:tensorflow:Assets written to: ram://23ab8223-172a-4526-94f8-f8754dedf1c8/assets


INFO:tensorflow:Assets written to: ram://23ab8223-172a-4526-94f8-f8754dedf1c8/assets


['modelo_entrenado_1.pkl']

In [40]:
## evaluate on the test data
# The model was compiled with a single 'accuracy' metric; index 1 of the
# evaluate() result is reported as the accuracy.
test_results = bi_lstm_model.evaluate(test_data)
test_accuracy_pct = test_results[1] * 100
print('Test Acc.: {:.2f}%'.format(test_accuracy_pct))

Test Acc.: 82.86%
