In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
import sys
import time
from tensorflow import keras
import os

In [None]:
# Load the IMDB review dataset bundled with Keras.
# `vocab_size` is passed as num_words and `index_from` as the id offset; the same
# offset of 3 is applied to the word index further down in this notebook.
imdb = keras.datasets.imdb
vocab_size, index_from = 10000, 3
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size,
    index_from=index_from,
)

In [None]:
# First encoded review together with its label.
print(train_data[0], train_labels[0])
# Container shapes for the training split.
print(train_data.shape, train_labels.shape)
# Lengths of the first two reviews (printed side by side).
print(*map(len, (train_data[0], train_data[1])))

# Container shapes for the test split.
print(test_data.shape, test_labels.shape)

In [None]:
# Fetch the word -> id mapping that ships with the dataset.
word_index = imdb.get_word_index()
print(len(word_index))
# Show only a handful of entries: printing the whole mapping floods the cell
# output and bloats the saved notebook.
print(list(word_index.items())[:10])

In [None]:
# Shift every word id by `index_from` so the mapping lines up with the reviews
# returned by load_data(index_from=index_from).
# The mapping is rebuilt from a fresh get_word_index() call instead of shifting
# `word_index` in place, so re-running this cell no longer adds the offset a
# second time (or moves special tokens that were already inserted).
word_index = {word: rank + index_from for word, rank in imdb.get_word_index().items()}

In [None]:
# Reserve ids 0-3 for the special tokens.
word_index.update({
    '<PAD>': 0,
    '<START>': 1,
    '<UNK>': 2,
    '<END>': 3,
})

# Inverse mapping (id -> word) used when decoding reviews back to text.
reverse_word_index = {word_id: word for word, word_id in word_index.items()}

def decode_review(text_ids):
    """Turn a sequence of word ids back into a space-separated string.

    Each id is looked up in the module-level ``reverse_word_index``; ids that
    are not present there are rendered as "<UNK>".
    """
    words = []
    for word_id in text_ids:
        words.append(reverse_word_index.get(word_id, "<UNK>"))
    return " ".join(words)

# Decode the first training review as a sanity check (shown as the cell output).
decode_review(train_data[0])

In [None]:
max_length = 500

# Apply the same padding settings to both splits: each input is a list of lists,
# padded with the <PAD> id at the end ('post'; the alternative is 'pre') up to
# max_length entries.
train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        split,
        value=word_index['<PAD>'],
        padding='post',
        maxlen=max_length,
    )
    for split in (train_data, test_data)
)

print(train_data[0])

In [None]:
embedding_dim = 16
batch_size = 128

# Baseline network: Embedding -> one LSTM -> Dense(relu) -> Dense(sigmoid).
single_rnn_model = keras.models.Sequential()

# Embedding notes:
#   1. defines a lookup matrix of shape [vocab_size, embedding_dim]
#   2. one review [1, 2, 3, 4, ...] becomes max_length * embedding_dim
#   3. a whole batch becomes batch_size * max_length * embedding_dim
# NOTE(review): mask_zero is not set here, so the trailing <PAD> ids are fed
# through the LSTM like any other token - confirm this is intended.
single_rnn_model.add(
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
)
# return_sequences=True returns the output of every step; False returns only
# the final output.
single_rnn_model.add(keras.layers.LSTM(units=64, return_sequences=False))
single_rnn_model.add(keras.layers.Dense(64, activation='relu'))
single_rnn_model.add(keras.layers.Dense(1, activation='sigmoid'))

single_rnn_model.summary()
single_rnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the baseline model; validation_split=0.2 sets aside part of the
# training data for validation, and the returned History object feeds the
# learning-curve plots.
history = single_rnn_model.fit(
    train_data,
    train_labels,
    epochs=10,
    batch_size=batch_size,
    validation_split=0.2,
)

In [None]:
def plot_learning_curves(history, label, epochs, min_value, max_value):
    """Plot a training metric next to its validation counterpart.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (here, the value returned by ``fit``).
        label: metric name to plot; ``'val_' + label`` is plotted alongside it.
        epochs: upper bound of the x-axis.
        min_value: lower bound of the y-axis.
        max_value: upper bound of the y-axis.
    """
    val_label = 'val_' + label
    curves = {name: history.history[name] for name in (label, val_label)}
    pd.DataFrame(curves).plot(figsize=(8, 5))
    plt.grid(True)
    # Axis limits are given as [x_min, x_max, y_min, y_max].
    plt.axis([0, epochs, min_value, max_value])
    plt.show()
    
# Size the x-axis from the epochs actually recorded in `history` instead of a
# hard-coded 30, which left most of each chart empty for the 10-epoch run.
epochs_run = len(history.history['loss'])
plot_learning_curves(history, 'accuracy', epochs_run, 0, 1)
plot_learning_curves(history, 'loss', epochs_run, 0, 1)
# Overfitting is observed.

In [None]:
# Score the trained model on the test split; the call's return value is shown
# as the cell output.
single_rnn_model.evaluate(
    test_data,
    test_labels,
    batch_size=batch_size,
)

In [None]:
# Overfitting appears.
embedding_dim = 16
batch_size = 128

# Stacked variant: Embedding -> two bidirectional LSTMs -> Dense(relu) -> Dense(sigmoid).
# NOTE(review): the name `single_rnn_model` is reused although this network
# stacks two bidirectional layers.
single_rnn_model = keras.models.Sequential()

# Embedding notes:
#   1. defines a lookup matrix of shape [vocab_size, embedding_dim]
#   2. one review [1, 2, 3, 4, ...] becomes max_length * embedding_dim
#   3. a whole batch becomes batch_size * max_length * embedding_dim
single_rnn_model.add(
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
)
# return_sequences=True returns the output of every step, which the next
# recurrent layer consumes.
single_rnn_model.add(
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=64, return_sequences=True)
    )
)
# return_sequences=False returns only the final output.
single_rnn_model.add(
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=64, return_sequences=False)
    )
)
single_rnn_model.add(keras.layers.Dense(64, activation='relu'))
single_rnn_model.add(keras.layers.Dense(1, activation='sigmoid'))

single_rnn_model.summary()
single_rnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the stacked bidirectional model with the same settings as before;
# `history` is overwritten with this run's metrics.
history = single_rnn_model.fit(
    train_data,
    train_labels,
    epochs=10,
    batch_size=batch_size,
    validation_split=0.2,
)

In [None]:
# Size the x-axis from the epochs actually recorded in `history` instead of a
# hard-coded 30, which left most of each chart empty for the 10-epoch run.
epochs_run = len(history.history['loss'])
plot_learning_curves(history, 'accuracy', epochs_run, 0, 1)
# The loss axis goes up to 4 for this model.
plot_learning_curves(history, 'loss', epochs_run, 0, 4)

In [None]:
embedding_dim = 16
batch_size = 128

# Lighter variant: Embedding -> one bidirectional LSTM -> Dense(relu) -> Dense(sigmoid).
single_rnn_model = keras.models.Sequential()

# Embedding notes:
#   1. defines a lookup matrix of shape [vocab_size, embedding_dim]
#   2. one review [1, 2, 3, 4, ...] becomes max_length * embedding_dim
#   3. a whole batch becomes batch_size * max_length * embedding_dim
single_rnn_model.add(
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
)
# return_sequences=True returns the output of every step; False returns only
# the final output.
single_rnn_model.add(
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=64, return_sequences=False)
    )
)
single_rnn_model.add(keras.layers.Dense(64, activation='relu'))
single_rnn_model.add(keras.layers.Dense(1, activation='sigmoid'))

single_rnn_model.summary()
single_rnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the single bidirectional model with the same settings as before;
# `history` is overwritten with this run's metrics.
history = single_rnn_model.fit(
    train_data,
    train_labels,
    epochs=10,
    batch_size=batch_size,
    validation_split=0.2,
)

In [None]:
# Size the x-axis from the epochs actually recorded in `history` instead of a
# hard-coded 30, which left most of each chart empty for the 10-epoch run.
epochs_run = len(history.history['loss'])
plot_learning_curves(history, 'accuracy', epochs_run, 0, 1)
# The loss axis goes up to 2 for this model.
plot_learning_curves(history, 'loss', epochs_run, 0, 2)

In [None]:
# Score the most recently trained model on the test split; the call's return
# value is shown as the cell output.
single_rnn_model.evaluate(
    test_data,
    test_labels,
    batch_size=batch_size,
)