In [3]:
import numpy as np
import tensorflow as tf

# The IMDB dataset holds 50,000 reviews, each labelled positive (1) or negative (0).
# Every review is pre-processed and encoded as a sequence of word indices (integers),
# where words are indexed by how often they occur in the dataset -- e.g. the integer 3
# encodes the third most frequent word. The dataset is usually split half/half into
# training and test sets; the Stanford researchers who released it in 2011 reported
# a prediction accuracy of 88.89%.

# Seed Python, NumPy and TensorFlow once, up front, so that weight initialisation and
# batch shuffling in the training cells are repeatable on "Restart & Run All"
# (as far as the backend allows).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Load the data. num_words keeps only the n most frequent words, i.e. it is the
# vocabulary size used throughout this notebook.
MAX_DICT = 1000
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(
    num_words=MAX_DICT
)

# X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [4]:
# Word indices of every token in the first review; run the cell as-is to display them.
np.asarray(X_train[0])
# array([  1,  14,  22,  16,  43, 530, 973,   2,   2,  65, 458,   2,  66,
#          2,   4, 173,  36, 256,   5,  25, 100,  43, 838, 112,  50, 670,
#          2,   9,  35, 480, 284,   5, 150,   4, 172, 112, 167,   2, 336,
#        385,  39,   4, 172,   2,   2,  17, 546,  38,  13, 447,   4, 192,
#         50,  16,   6, 147,   2,  19,  14,  22,   4,   2,   2, 469,   4,
#         22,  71,  87,  12,  16,  43, 530,  38,  76,  15,  13,   2,   4,
#         22,  17, 515,  17,  12,  16, 626,  18,   2,   5,  62, 386,  12,
#          8, 316,   8, 106,   5,   4,   2,   2,  16, 480,  66,   2,  33,
#          4, 130,  12,  16,  38, 619,   5,  25, 124,  51,  36, 135,  48,
#         25,   2,  33,   6,  22,  12, 215,  28,  77,  52,   5,  14, 407,
#         16,  82,   2,   8,   4, 107, 117,   2,  15, 256,   4,   2,   7,
#          2,   5, 723,  36,  71,  43, 530, 476,  26, 400, 317,  46,   7,
#          4,   2,   2,  13, 104,  88,   4, 381,  15, 297,  98,  32,   2,
#         56,  26, 141,   6, 194,   2,  18,   4, 226,  22,  21, 134, 476,
#         26, 480,   5, 144,  30,   2,  18,  51,  36,  28, 224,  92,  25,
#        104,   4, 226,  65,  16,  38,   2,  88,  12,  16, 283,   5,  16,
#          2, 113, 103,  32,  15,  16,   2,  19, 178,  32])

array([  1,  14,  22,  16,  43, 530, 973,   2,   2,  65, 458,   2,  66,
         2,   4, 173,  36, 256,   5,  25, 100,  43, 838, 112,  50, 670,
         2,   9,  35, 480, 284,   5, 150,   4, 172, 112, 167,   2, 336,
       385,  39,   4, 172,   2,   2,  17, 546,  38,  13, 447,   4, 192,
        50,  16,   6, 147,   2,  19,  14,  22,   4,   2,   2, 469,   4,
        22,  71,  87,  12,  16,  43, 530,  38,  76,  15,  13,   2,   4,
        22,  17, 515,  17,  12,  16, 626,  18,   2,   5,  62, 386,  12,
         8, 316,   8, 106,   5,   4,   2,   2,  16, 480,  66,   2,  33,
         4, 130,  12,  16,  38, 619,   5,  25, 124,  51,  36, 135,  48,
        25,   2,  33,   6,  22,  12, 215,  28,  77,  52,   5,  14, 407,
        16,  82,   2,   8,   4, 107, 117,   2,  15, 256,   4,   2,   7,
         2,   5, 723,  36,  71,  43, 530, 476,  26, 400, 317,  46,   7,
         4,   2,   2,  13, 104,  88,   4, 381,  15, 297,  98,  32,   2,
        56,  26, 141,   6, 194,   2,  18,   4, 226,  22,  21, 13

In [7]:
# Fetch the word -> index vocabulary table shipped with the dataset.
index = tf.keras.datasets.imdb.get_word_index()

# Invert it into an index -> word lookup, e.g.
# {34701: 'fawn',
#  52006: 'tsukino'}
reverse_index = {position: word for word, position in index.items()}

In [8]:
# Why subtract 3: indices 0, 1 and 2 are reserved for "padding", "start of sequence"
# and "unknown word", so the encoded reviews are shifted by 3 relative to the
# vocabulary table. Anything without a vocabulary entry is rendered as "#".
# Decodes review #1; X_train is replaced by padded arrays in a later cell, so this
# shows the original text only when run before that cell.
RESERVED_OFFSET = 3
comment = " ".join(
    reverse_index.get(token - RESERVED_OFFSET, "#") for token in X_train[0]
)
comment
# "# this film was just brilliant casting # # story direction # really # the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same # # as myself so i loved the fact there was a real # with this film the # # throughout the film were great it was just brilliant so much that i # the film as soon as it was released for # and would recommend it to everyone to watch and the # # was amazing really # at the end it was so sad and you know what they say if you # at a film it must have been good and this definitely was also # to the two little # that played the # of # and paul they were just brilliant children are often left out of the # # i think because the stars that play them all # up are such a big # for the whole film but these children are amazing and should be # for what they have done don't you think the whole story was so # because it was true and was # life after all that was # with us all"

"# this film was just brilliant casting # # story direction # really # the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same # # as myself so i loved the fact there was a real # with this film the # # throughout the film were great it was just brilliant so much that i # the film as soon as it was released for # and would recommend it to everyone to watch and the # # was amazing really # at the end it was so sad and you know what they say if you # at a film it must have been good and this definitely was also # to the two little # that played the # of # and paul they were just brilliant children are often left out of the # # i think because the stars that play them all # up are such a big # for the whole film but these children are amazing and should be # for what they have done don't you think the whole story was so # because it was true and was # life after all that was # with us all"

In [9]:
# A neural network needs every sample to have the same shape, so bring all reviews
# to one common length before training.
MAX_LEN = 200  # maximum sentence length

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_LEN)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_LEN)

(X_train.shape, X_test.shape)

((25000, 200), (25000, 200))

In [None]:
# Word Embedding
# 词索引特征化手段
# 字典只能单纯将词处理成数值，但 Embedding 却可以让词与词直接产生联系: 向量距离

# tf.keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=False, input_length=None)
# - input_dim：int > 0，词汇表大小。
# - output_dim：int >= 0，词向量的维度。
# - input_length：输入序列的长度，当它是固定的时候。在 Keras 2 中，如果你需要连接 Flatten 和 Dense 层，则这个参数是必须的；Keras 3 已弃用该参数（传入只会产生警告），应改为在模型开头用 tf.keras.Input(shape=(序列长度,)) 指定输入形状。

In [10]:
# Build a simple fully connected network that classifies review sentiment.
model = tf.keras.Sequential()

# Declare the input shape explicitly. Embedding's `input_length` argument is
# deprecated in Keras 3 (it only triggers a warning and is otherwise ignored), which
# leaves the model unbuilt so that summary() cannot report output shapes or parameter
# counts. An Input layer works with both Keras 2 and Keras 3.
model.add(tf.keras.Input(shape=(MAX_LEN,), dtype="int32"))

# Turn integer vocabulary indices into fixed-size dense vectors (16 dimensions each).
model.add(tf.keras.layers.Embedding(MAX_DICT, 16))

# Flatten the (MAX_LEN, 16) embedding output into one vector per review so it can
# feed a fully connected layer.
model.add(tf.keras.layers.Flatten())

# A single sigmoid unit outputs the probability for the binary task:
# is the sentiment positive or negative?
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.summary()



In [11]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
model.compile(
    loss="binary_crossentropy",
    optimizer="Adam",
    metrics=["accuracy"],
)

In [14]:
# Training hyper-parameters, reused by the training cells further down.
BATCH_SIZE = 64
EPOCHS = 1

# X_train holds word indices; the network is trained directly on those indices.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5965 - loss: 0.6513 - val_accuracy: 0.8327 - val_loss: 0.3888


<keras.src.callbacks.history.History at 0x21f3ec6bed0>

In [None]:
# 简单 RNN 神经网络：全连接的 RNN

In [None]:
# `tf.keras.layers.SimpleRNN(units, activation='tanh', use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, dropout=0.0, recurrent_dropout=0.0, return_sequences=False, return_state=False, go_backwards=False, stateful=False, unroll=False)`
# - units: 正整数，输出空间的维度。
# - activation: 要使用的激活函数。如果传入 None，则使用线性激活。
# - use_bias: 布尔值，该层是否使用偏置向量。
# - dropout: 在 0 和 1 之间的浮点数，表示对输入做线性变换时要丢弃的单元比例。
# - return_sequences: 布尔值。是返回输出序列中的最后一个输出，还是全部序列。

# Dropout: 深度学习中经常会接触到的概念，其经常以 tf.keras.layers.Dropout 🔗 这样的网络层出现。Dropout 主要的作用是防止过拟合，实现原理是以一定概率（Dropout 参数值）断开神经元之间的连接

# Simple (fully connected) RNN classifier: Embedding -> SimpleRNN -> sigmoid.
model_RNN = tf.keras.Sequential()

# Declare a variable-length integer input so the model is built immediately and
# summary() can report output shapes and parameter counts (under Keras 3 a
# Sequential model without a declared input stays unbuilt until the first fit call).
model_RNN.add(tf.keras.Input(shape=(None,), dtype="int32"))
model_RNN.add(tf.keras.layers.Embedding(MAX_DICT, 32))

# dropout is applied to the layer's inputs (the connection between layers);
# recurrent_dropout is applied to the recurrent state, i.e. the connection between
# the previous time step and the current one.
model_RNN.add(tf.keras.layers.SimpleRNN(units=32, dropout=0.2, recurrent_dropout=0.2))
model_RNN.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model_RNN.summary()

In [None]:
# Same optimiser, loss and metric as the dense baseline; relies on BATCH_SIZE and
# EPOCHS from the earlier training cell.
model_RNN.compile(
    loss="binary_crossentropy",
    optimizer="Adam",
    metrics=["accuracy"],
)
model_RNN.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)

In [None]:
# LSTM 循环神经网络

In [None]:
# LSTM classifier: Embedding -> LSTM -> sigmoid.
model_LSTM = tf.keras.Sequential()

# Declare a variable-length integer input so the model is built immediately and
# summary() can report output shapes and parameter counts (under Keras 3 a
# Sequential model without a declared input stays unbuilt until the first fit call).
model_LSTM.add(tf.keras.Input(shape=(None,), dtype="int32"))

# Turn integer vocabulary indices into fixed-size dense vectors.
model_LSTM.add(tf.keras.layers.Embedding(MAX_DICT, 32))
model_LSTM.add(tf.keras.layers.LSTM(units=32, dropout=0.2, recurrent_dropout=0.2))
model_LSTM.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model_LSTM.summary()
# Compared with the simple RNN, the LSTM learns additional parameters, but those
# parameters are what let it avoid critical problems such as vanishing gradients.

In [None]:
# Same optimiser, loss and metric as the other models; relies on BATCH_SIZE and
# EPOCHS from the earlier training cell.
model_LSTM.compile(
    loss="binary_crossentropy",
    optimizer="Adam",
    metrics=["accuracy"],
)
model_LSTM.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)
# As a recurrent building block the LSTM is cleverly designed: its forget gate and
# input gate keep updating the memory cell, which removes the vanishing-gradient
# problem that is fatal when training plain recurrent networks.

In [None]:
# GRU

In [None]:
# GRU classifier: Embedding -> GRU -> sigmoid.
model_GRU = tf.keras.Sequential()

# Declare a variable-length integer input so the model is built immediately and
# summary() can report output shapes and parameter counts (under Keras 3 a
# Sequential model without a declared input stays unbuilt until the first fit call).
model_GRU.add(tf.keras.Input(shape=(None,), dtype="int32"))
model_GRU.add(tf.keras.layers.Embedding(MAX_DICT, 32))
model_GRU.add(tf.keras.layers.GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model_GRU.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model_GRU.summary()

In [None]:
# Same optimiser, loss and metric as the other models; relies on BATCH_SIZE and
# EPOCHS from the earlier training cell.
model_GRU.compile(
    loss="binary_crossentropy",
    optimizer="Adam",
    metrics=["accuracy"],
)
model_GRU.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test, y_test),
)