In [1]:
import sys
import sklearn
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os

In [2]:
tf.random.set_seed(42)# to make this notebook's output stable across runs

#      导入数据集

In [3]:
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
#加载IMDb评论数据集，包含50000条英语电影评论，25000用于测试，25000用于训练。
#每条评论有一个标签，负面为0,正面为1.

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [4]:
#展示
X_train[0][:15]
#X_train中每个评论由一个np整数数组表示，每个整数表示一个单词
#这里取训练集中第一条评论的前15个单词显示

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4]

# 可视化评论

#### get_word_index()返回词索引词典。键是字符串，值是它们的索引。

#### enumerate(sequence, [start=0]) 返回 enumerate(枚举) 对象。
#### 参数：sequence为一个序列、迭代器或其他支持迭代对象。start 为下标起始位置的值。

#### str.join(sequence)将序列sequence中的元素与指定的字符str连接生成一个新的字符串。

In [5]:
word_index = keras.datasets.imdb.get_word_index()
#获取评论的标签，返回字典 word_index = {x:y}

id_to_word = {id_ + 3: word for word, id_ in word_index.items()}

#相当于
for word, id_ in word_index.items():
    id_to_word[id_ + 3] = word
    
#给评论中每一个单词分配一个数字id，从第3个开始分配，前三个为"<pad>"（填充符）, "<sos>"（序列开始）, "<unk>"（未知）

#演示
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    #对列表("<pad>", "<sos>", "<unk>")遍历索引和元素
    id_to_word[id_] = token#给每个token（单词）分配id
    
" ".join([id_to_word[id_] for id_ in X_train[0][:10]])#每个单词用空格分割

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


'<sos> this film was just brilliant casting location scenery story'

## 加载原始文本数据集

In [6]:
import tensorflow_datasets as tfds
#以文本（字节字符串）的形式加载原始的IMDb评论

datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete6UY8PU/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete6UY8PU/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete6UY8PU/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [7]:
datasets.keys()#打印数据集的类型

dict_keys(['test', 'train', 'unsupervised'])

In [8]:
train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples

In [9]:
train_size, test_size

(25000, 25000)

In [10]:
#演示
for X_batch, y_batch in datasets["train"].batch(2).take(1):#取出训练集中的2条评论，逐一进行以下循环，
                                #其中X_batch为评论的文本数据, y_batch为评论的标签
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")#只输出处理的评论的前200个单词
        print("Label:", label, "= Positive" if label else "= Negative")#输出处理的评论的类型
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



# 预处理

In [11]:
#定义预处理函数
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)#从截断评论开始，每条评论保留前300个字符
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")#用空格替换<br/>
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")#用空格替换字母和引号以外的所有字符
    X_batch = tf.strings.split(X_batch)#将评论按照空格分割，返回代表代表每条评论转化成的不规则长度的向量
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch#将所有评论用<pad>填充使得所有向量长度相同

In [12]:
#演示
preprocess(X_batch, y_batch)#预处理并打印取出的两条评论

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

# 构建Dictionary

In [13]:
from collections import Counter#使用Counter对单词出现的次数计数

vocabulary = Counter()#构建词汇表
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):#取32条训练集中的评论批量预处理
    for review in X_batch:
        vocabulary.update(list(review.numpy()))#更新词汇表

In [14]:
#演示
vocabulary.most_common()[:3]#三个最常见的词

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [15]:
len(vocabulary)

53893

In [16]:
vocab_size = 10000#截断词表，减小词表长度至10000，保留出现频率前10000的词语
truncated_vocabulary = [
    word for word, count in vocabulary.most_common()[:vocab_size]]

In [17]:
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}#将每个单词替换成该单词对应的id
#演示
for word in b"This movie was faaaaaantastic".split():
    print(word_to_id.get(word) or vocab_size)#在前10000个单词中未出现的词语用10000表示

22
12
11
10000


# 构建查找表

In [18]:
#创建查找表
words = tf.constant(truncated_vocabulary)#用前10000个词语定义词汇表
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)#创建相关索引
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)#为查找表创建初始化程序，将类别列表及其对应索引传递给它
num_oov_buckets = 1000 #1000个存储桶
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)
#当查找词表中不存在的类别时查找表会计算该类别的哈希值，将这个未知类别分配给一个oov桶，它的类别从已知类别开始

In [19]:
#演示用查找表查找几个单词的id
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))
#基于数据样本定义词汇表，为不在数据样本中的其他类别添加一些桶
#faaaaaantastic映射到ID＞=10000的一个oov桶中了

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

# 创建最终训练集和测试集

In [20]:
#创建最终的训练集
def encode_words(X_batch, y_batch):#对单词进行编码
    return table.lookup(X_batch), y_batch


train_set = datasets["train"].batch(32).map(preprocess)#将32条评论转化为一个单词的短序列的[ [] [] ]形式的Tensor
train_set = train_set.map(encode_words).prefetch(1)#将单词短序列转换成单词id短序列

In [21]:
test_set = datasets["test"].batch(32).map(preprocess)#将32条评论转化为一个单词的短序列的[ [] [] ]形式的Tensor
test_set = test_set.map(encode_words).prefetch(1)#将单词短序列转换成单词id短序列

In [22]:
#演示
for X_batch, y_batch in train_set.take(1):#从训练集中取出一条评论
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


# 创建模型 及 在训练集上训练

In [23]:
#创建模型，开始训练
embed_size = 128  #步长
#嵌入维度为128维
model = keras.models.Sequential([
                                 #嵌入矩阵形状为（10000+1000,128）即（11000,128），每一行为一个tensor
                                 #一个tensor包含32条评论，每条评论中显示前300个单词，一次处理128个Tensor（128列）
                                 #将所有索引标号映射到致密的低维向量：[[4],[32],[67]]被映射为[[0.3,0.9,0.2]
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,#随机初始化嵌入矩阵，使用索引调用时返回嵌入矩阵中该索引在的行
                           mask_zero=True, 
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),#使用128个神经元的GRU层（未指定输入的形状）。预测评论的类型
    #由于GRU堆叠使用，故添加return_sequences=True使得下一个GRU层有三维输入
    keras.layers.GRU(128),#返回最后一个时间步长的输出
    keras.layers.Dense(1, activation="sigmoid")#使用sigmoid输出估计为Positive类型概率的单个神经元
])
#Embedding层将单词id转换为嵌入，训练期间嵌入会逐渐改善，梯度下降会使得该评论渐渐靠近其所属类别
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])#计算模型的二元交叉熵作为loss，使用adam优化，计算每一层的ACC
history = model.fit(train_set, epochs=15)#迭代次数，训练的轮数为15
#782*batch_size=25000,782=25000/32，是每轮次处理的tensor数量

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


# 利用测试集测试模型性能

In [24]:
test = model.fit(test_set, epochs=15)#迭代次数，训练的轮数为15
#782*batch_size=25000,782=25000/32，是每轮次处理的tensor数量

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
