<img src="Tarjeta.png">

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
from tensorflow import keras as ks
from tensorflow.keras.datasets import imdb
from matplotlib import pyplot as plt
import numpy as np

In [2]:
# Size of the vocabulary; passed as `num_words` when the dataset is loaded.
vocabulary_size = 10_000

In [3]:
# Load the IMDB review dataset as (train, test) pairs of reviews and labels,
# limiting the word indices with `num_words=vocabulary_size`.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=vocabulary_size,
)

In [4]:
# Number of elements held by each split's array.
print(f'Size train {x_train.size}')
print(f'Size test {x_test.size}')

Size train 25000
Size test 25000


In [5]:
# Each element of x_train and x_test is one review.
# Printing any of them shows that it is nothing more than a list of integers.
print(x_train[0]) 

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [6]:
# Pool the original train and test splits into one array so the data can be
# re-split later; the labels are pooled in the same order.
datos = np.concatenate([x_train, x_test])
etiquetas = np.concatenate([y_train, y_test])

In [7]:
# Inspect which distinct values the labels take.
valores_etiquetas = np.unique(etiquetas)
print(valores_etiquetas)
# Only 0s and 1s show up, because the task is a binary classification
# (positive or negative).

[0 1]


In [8]:
# It looks like IMDB reserves 2 positions, 0 and 3, which would explain why
# there are 2 fewer distinct words than the vocabulary size.
todos_los_indices = np.hstack(datos)
print("Palabras unicas:", len(np.unique(todos_los_indices)))

Palabras unicas: 9998


In [9]:
# The reviews can now be handled like ordinary vectors.

# Length (number of word indices) of every review.
longitudes = list(map(len, datos))

print("Longitud media de las frases:", np.mean(longitudes))
print("Desviacion std en las longitudes:", np.std(longitudes))

Longitud media de las frases: 234.75892
Desviacion std en las longitudes: 172.91149458735703


In [10]:
# Show the label of one review together with its content, which at this point
# is still the integer-encoded form (decoding happens further down).
primera_etiqueta, primera_review = etiquetas[0], datos[0]
print("Etiqueta de review 0:", primera_etiqueta)
print("Review 0:", primera_review)

Etiqueta de review 0: 1
Review 0: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [11]:
# As seen above, each review is just the sequence of indices of the words it
# is made of.
indices = imdb.get_word_index()
# `indices` maps word -> index; decoding needs the opposite direction, so
# build the inverse mapping index -> word.
reverse_index = {valor: clave for clave, valor in indices.items()}

In [12]:
# Decode the first review back into text. Every stored index is shifted by 3
# before the lookup, and any index without a matching word is rendered as a
# newline.
review = datos[0]
palabras = (reverse_index.get(indice - 3, "\n") for indice in review)
review_decodificada = ' '.join(palabras)
print(review_decodificada)


 this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert 
 is an amazing actor and now the same being director 
 father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for 
 and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also 
 to the two little boy's that played the 
 of norman and paul they were just brilliant children are often left out of the 
 list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you thi

In [13]:
from tensorflow.keras.preprocessing import sequence

In [14]:
# Bring every review to the same fixed length and store the labels as float32.
# NOTE(review): `maxlen` reuses `vocabulary_size` (10000), although vocabulary
# size and sequence length are unrelated quantities and the reviews measured
# earlier in this notebook average about 235 tokens. A much smaller `maxlen`
# would presumably cut training time and memory considerably; any change must
# stay consistent with the sequence length the model is built for.
datos = sequence.pad_sequences(datos, maxlen=vocabulary_size)
etiquetas = np.asarray(etiquetas, dtype="float32")

In [15]:
# Re-measure the review lengths after padding; they should now all be equal.
longitud = list(map(len, datos))

print("Longitud media:", np.mean(longitud))
print("Desviacion std:", np.std(longitud))

Longitud media: 10000.0
Desviacion std: 0.0


In [16]:
# The zeros are added at the beginning of each review, so that the network
# does not gradually lose its memory of the actual words.
print(datos[0])

[  0   0   0 ...  19 178  32]


In [17]:
# Hold out the last 10000 samples for validation and train on everything
# before them. This 10000 is a sample count; it merely coincides with
# `vocabulary_size`.
# Note that x_train / y_train are rebound here, replacing the raw splits
# loaded at the top of the notebook.
n_val = 10000
x_train, x_val = datos[:-n_val], datos[-n_val:]
y_train, y_val = etiquetas[:-n_val], etiquetas[-n_val:]

In [18]:
# Binary sentiment classifier:
# Embedding -> LSTM -> Dense(relu) -> Dropout -> Dense(sigmoid).
model = ks.Sequential()

# Start with the embedding. Its first argument is the vocabulary size, whereas
# `input_length` is the number of timesteps per sample. The two are unrelated,
# so the sequence length is read from the padded training data instead of
# reusing `vocabulary_size`; the model then always matches the padding length.
sequence_length = x_train.shape[1]
model.add(ks.layers.Embedding(vocabulary_size, output_dim=32, input_length=sequence_length))

# Add a recurrent layer (LSTM) with 100 units.
model.add(ks.layers.LSTM(100))
# Small fully connected head with dropout for regularisation.
model.add(ks.layers.Dense(32, activation='relu', kernel_initializer='he_uniform'))
model.add(ks.layers.Dropout(0.5))
# Single sigmoid unit: the labels are 0/1.
model.add(ks.layers.Dense(1, activation='sigmoid'))

In [19]:
# Print the layer-by-layer summary of the model (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10000, 32)         320000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 32)                3232      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 376,465
Trainable params: 376,465
Non-trainable params: 0
_________________________________________________________________


In [20]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has gone 5 consecutive epochs without
# improving. `restore_best_weights=True` rolls the model back to the weights of
# its best epoch; without it the model keeps the weights from the last step of
# training, i.e. after `patience` epochs without improvement.
callback = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [21]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the metric.
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for at most 20 epochs, evaluating on the validation split after each
# one; `callback` may end the run earlier.
# A batch size that is too large (512) fails, hence 128
# (presumably an out-of-memory error - TODO confirm).
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(x_val, y_val),
    callbacks=[callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
 61/313 [====>.........................] - ETA: 5:39 - loss: 0.1522 - accuracy: 0.9481