In [1]:
import io
import os
import re
import shutil
import string
import datetime
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import TensorBoard

### 第1步：下载数据集

In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

In [3]:
# 参考API：https://www.tensorflow.org/api_docs/python/tf/keras/utils/get_file

dataset_dir = tf.keras.utils.get_file("aclImdb", origin=url, cache_dir='./IMDB', untar=True)

In [4]:
dataset_dir

'./IMDB/datasets/aclImdb'

In [5]:
train_dir = os.path.join(dataset_dir, 'train')

train_dir

'./IMDB/datasets/aclImdb/train'

In [6]:
os.listdir(train_dir)

['urls_pos.txt',
 'neg',
 'pos',
 'urls_unsup.txt',
 'urls_neg.txt',
 'labeledBow.feat',
 'unsupBow.feat']

In [7]:
# 清理unsup文件夹

# remove_dir = os.path.join(train_dir, 'unsup')

# shutil.rmtree(remove_dir)

In [8]:
os.listdir(train_dir)

['urls_pos.txt',
 'neg',
 'pos',
 'urls_unsup.txt',
 'urls_neg.txt',
 'labeledBow.feat',
 'unsupBow.feat']

### 第2步：数据集分离

#### 方式1：基于tensorflow API

In [9]:
# 参考 API : https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text_dataset_from_directory

BATCH_SIZE = 256 # 一批数据的大小
SEED = 666 # 随机种子

train_ds = tf.keras.preprocessing.text_dataset_from_directory(train_dir,
                                                              batch_size=BATCH_SIZE,
                                                              validation_split=0.2,
                                                              subset='training',
                                                              seed=SEED)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [10]:
val_ds = tf.keras.preprocessing.text_dataset_from_directory(train_dir,
                                                            batch_size=BATCH_SIZE,
                                                            validation_split=0.2,
                                                            subset='validation',
                                                            seed=SEED)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [11]:
# 显示一批数据中的前5个

for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        print(label_batch[i].numpy(), text_batch[i].numpy())

1 b"I love this movie, Jouvet, Arletty, Blier, Carn\xc3\xa9... almost everything has already been said about the movie, but there is one detail I'd like to shed some light onto: no footage of the real, still standing, H\xc3\xb4tel du Nord (is it still? I heard it was to be demolished...) has been used for the movie - the whole scene has been rebuilt on set, the main reason being that they could not stop the traffic on the St Martin canal for several weeks."
1 b'Brazilian films often get more positive appraisals than they actually deserve. Rather incredibly, Contra Todos (Against Everybody) (original title, which the producers discarded: God Against Everybody) got very low GPA (grade point average) in this website. It seems to be bluntly rejected by female spectators at large. Actually, it is not so brutal. I mean as far as graphical violence is concerned. Its brutality is intrinsic as it portrays would-be lumpens, I mean underdog citizens who in fact possess high-tech equipment, who co

#### 方式2：手动实现数据集切分

In [12]:
train_dir # 训练集数据集所在路径

'./IMDB/datasets/aclImdb/train'

In [13]:
os.listdir(train_dir) # 文件夹下的内容

['urls_pos.txt',
 'neg',
 'pos',
 'urls_unsup.txt',
 'urls_neg.txt',
 'labeledBow.feat',
 'unsupBow.feat']

In [14]:
# 分别读取 neg和pos文件夹下的文本文件

labels = ['pos', 'neg'] # 标签
texts_list = [] # 保存每一个文本文件中的内容
labels_list = [] # 保存每一个文本文件对应的标签

for lb in labels:
    dir_name = os.path.join(train_dir, lb) # neg 或 pos 的文件夹路径
    # 循环读取neg或pos文件夹下的文本文件
    for fname in os.listdir(dir_name):
        # 判断读取的文本文件的后缀名是否为 .txt
        if fname[-4:] == '.txt':
            file_path = os.path.join(dir_name, fname) # 拼接路径，即文本文件的完整路径
            # 打开文本文件，进行读取
            with open(file_path, 'r') as file:
                content = file.read() # 读取内容
            # 保存到texts_list
            texts_list.append(content)
            # 保存该文本文件对应的标签到labels_list
            if lb == 'pos':
                labels_list.append(1)
            else: # neg
                labels_list.append(0)

In [15]:
# 统计所有文本文件的数量

len(texts_list)

25000

In [16]:
# 统计所有标签的数量

len(labels_list)

25000

In [17]:
# 打乱样本顺序

dataset_size = len(texts_list) # 数据集大小

indices = np.arange(dataset_size) # 生成样本对应的索引

np.random.shuffle(indices) # 打乱顺序

indices[:10] # 显示前10个索引

array([15707,  8280,  6529,  6874, 13584,  6239,  2472, 11465,  4783,
        6795])

In [18]:
# 根据索引，打乱样本

data = np.array(texts_list)[indices]

labels = np.array(labels_list)[indices]

In [19]:
# 第一个样本

data[0]

"Action, violence, sex and coarse language are the things that the characters do during the whole movie. And everything they do is done without reason. Mark L. Lester is (un)known for his violent (without reason)movies (Commando, The Base). The story is weird but stupid. The actors play their stupid characters very well...I'm not telling they are stupid but I mean they are very bad actors. It's another low-budget unknown B series action movie. If you saw something like Operation Delta Force, Drive, The Patriot, Sanctuary or something like these bad movies from the same kind than Misbegotten...don't rent it...and, by the way, don't rent any of the movies I mentioned....I give it 1and a half out of5."

In [20]:
labels[0]

0

In [21]:
# 第二个样本

data[1]

"Eddie Murphy Delirious is by far the funniest thing you will ever see in your life. You can compare it to any movie, and I garuntee you will decide that Delirious is the funniest movie ever! This movie is about 1hr. 45 mins., and throughout that time, there was barely a moment where I wasn't laughing. You will laugh for hours after it is over, replaying the punch lines over and over and over in your head. Eddie Murphy has given so many funny performances over his career (48 Hrs.,Trading Places,Beverly Hills Cop,Raw,Coming To America, The Nutty professor,Shrek,etc.),but this is by far his MOST HILARIOUS moment. I have seen this movie so many times, and it is funnier every time. It never loses its edge. From this day forward, every great stand up performance will be emulated from Delirious. ***** and two thumbs up!"

In [22]:
labels[1]

1

In [23]:
# 数据集分离成 train 和 test

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=666)

In [24]:
X_train.shape

(20000,)

In [25]:
y_train.shape

(20000,)

In [26]:
X_test.shape

(5000,)

In [27]:
y_test.shape

(5000,)

### 第3步：数据加载到内存，提高I/O效率

In [28]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

### 第4步：数据预处理

In [29]:
# 参考 API ： https://www.tensorflow.org/api_docs/python/tf/strings/regex_replace

def preprocessing(input_data):
    # 所有字符串全部转换为小写
    lower_string = tf.strings.lower(input_data)
    # 用 ' ' 替换 '<br />'
    new_string = tf.strings.regex_replace(lower_string, '<br />', ' ')
    # 剔除字符串所有标点符号等，通过string.punctuation获取，用空字符替代
    final_string = tf.strings.regex_replace(new_string, '[%s]'%re.escape(string.punctuation), '')
    # 返回
    return final_string

In [30]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
re.escape(string.punctuation)

'!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~'

In [32]:
# 参考 API ： https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/TextVectorization

# 单词的数量
VOCAB_SIZE = 20000

# 序列长度
SEQ_LEN = 100

# 对数据集进行预处理
vectorize_layer = TextVectorization(standardize = preprocessing, # 标准化处理
                                    max_tokens = VOCAB_SIZE, # 最大单词量
                                    output_mode = 'int', # 每个word对应一个整型索引
                                    output_sequence_length = SEQ_LEN # 输出序列的长度
                                    )

In [33]:
# 接下来针对文本数据进行预处理（注意：不是标签集）

text_ds = train_ds.map(lambda x, y : x) # 仅仅选择文本数据

vectorize_layer.adapt(text_ds)

In [34]:
# 获取一个batch_size的样本数据

text_batch, label_batch = next(iter(train_ds))

In [35]:
# 从batch_size中获取一个sample

text, label = text_batch[0], label_batch[0]

In [36]:
text

<tf.Tensor: shape=(), dtype=string, numpy=b'FORBIDDEN PLANET is the best SF film from the golden age of SF cinema and what makes it a great film is its sense of wonder . As soon as the spaceship lands the audience - via the ships human crew - travels through an intelligent and sometimes terrifying adventure . We meet the unforgetable Robbie , the mysterious Dr Morbuis , his beautiful and innocent daughter Altair and we learn about the former inhabitants of the planet - The Krell who died out overnight . Or did they ? <br /><br />You can nitpick and say the planet is obviously filmed in a movie studio with painted backdrops but that adds to a sense of menace of claustraphobia I feel and Bebe and Louis Barron`s electronic music adds even more atmosphere <br /><br />I`m shocked this film isn`t in the top 250 IMDB films .'>

In [37]:
label

<tf.Tensor: shape=(), dtype=int32, numpy=1>

In [38]:
vectorize_layer(tf.expand_dims(text, -1))

<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[ 3687,  1165,     7,     2,   114,  6816,    19,    36,     2,
         1924,   563,     5,  6816,   427,     3,    48,   159,     9,
            4,    84,    19,     7,    29,   276,     5,   577,    14,
          510,    14,     2,  7871,  5269,     2,   302,  2790,     2,
         3996,   402,  1001,  4033,   138,    33,  1112,     3,   505,
         3130,  1217,    71,   901,     2,     1, 12103,     2,  1278,
          892,     1,    24,   290,     3,  1254,   589,     1,     3,
           71,   814,    42,     2,  1124,  5708,     5,     2,  1165,
            2, 10141,    35,  1109,    45, 10093,    40,   113,    34,
           22,    68, 13704,     3,   131,     2,  1165,     7,   531,
          772,     8,     4,    17,  1144,    15,  4446,  7995,    18,
           12]])>

### 第5步：模型搭建、编译、训练、验证、预测

#### 方式1： 独立搭建模型训练、验证

In [39]:
# Embedding的维度
embedding_dim = 100

model = Sequential([
    vectorize_layer, # 首先，对strings进行转换
    Embedding(VOCAB_SIZE, embedding_dim, name='embedding'), # 通过training学习每个word的embedding vector
    GlobalAveragePooling1D(), # 统一固定的输出vector，也可以用 Flatten()
    Dense(32, activation='relu'), # 全连接层
    Dense(1) # 输出1个值（neg 或 pos）
])

In [40]:
# 模型编译

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), # loss='binary_crossentropy'
              metrics=['accuracy'])

In [41]:
# 定义 tensorboard callback

!rm -rf log_file # 如果存在log_file文件夹，则先删除

log_file = os.path.join('log_file', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

tensorboard_callback = TensorBoard(log_file)

In [42]:
# 模型训练

model.fit(train_ds, # 训练集
          validation_data=val_ds, # 验证集
          epochs=20, # 训练轮数
          callbacks=[tensorboard_callback]) # 保存训练日志

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fabac223c70>

In [43]:
# 显示训练和验证效果

%load_ext tensorboard

%tensorboard --logdir log_file

In [44]:
# 模型验证

loss, accuracy = model.evaluate(val_ds)



In [45]:
print("loss = ", loss)
print("accuracy = ", accuracy)

loss =  0.9428325891494751
accuracy =  0.8140000104904175


In [46]:
# 模型预测

# 样本
examples = [
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."
]

model.predict(examples)

array([[ 0.93476254],
       [-1.300469  ],
       [-3.523317  ]], dtype=float32)

### 方式2：基于Glove预训练word embedding搭建模型，训练、验证

In [47]:
glove_path = './glove.6B/glove.6B.100d.txt' # 文件路径

In [48]:
# 打开读取文件

embedding_dicts = {}

with open(glove_path, 'r') as file:
    temp = file.read().split('\n') # 按照换行符切分
    for t in temp:
        sentence = t.split(' ') # 每一行数据按照空格切分
        word = sentence[0] # 第一个是word
        coefs = sentence[1:] # 剩余部分为embedding数值
        embedding_dicts[word] = coefs # 键 ：值 

In [49]:
# 统计字典中的元素个数

len(embedding_dicts)

400001

In [50]:
# 查看第一个word及其embedding

for k, v in embedding_dicts.items():
    print("key : ", k)
    print("value : ", v)
    break

key :  the
value :  ['-0.038194', '-0.24487', '0.72812', '-0.39961', '0.083172', '0.043953', '-0.39141', '0.3344', '-0.57545', '0.087459', '0.28787', '-0.06731', '0.30906', '-0.26384', '-0.13231', '-0.20757', '0.33395', '-0.33848', '-0.31743', '-0.48336', '0.1464', '-0.37304', '0.34577', '0.052041', '0.44946', '-0.46971', '0.02628', '-0.54155', '-0.15518', '-0.14107', '-0.039722', '0.28277', '0.14393', '0.23464', '-0.31021', '0.086173', '0.20397', '0.52624', '0.17164', '-0.082378', '-0.71787', '-0.41531', '0.20335', '-0.12763', '0.41367', '0.55187', '0.57908', '-0.33477', '-0.36559', '-0.54857', '-0.062892', '0.26584', '0.30205', '0.99775', '-0.80481', '-3.0243', '0.01254', '-0.36942', '2.2167', '0.72201', '-0.24978', '0.92136', '0.034514', '0.46745', '1.1079', '-0.19358', '-0.074575', '0.23353', '-0.052062', '-0.22044', '0.057162', '-0.15806', '-0.30798', '-0.41625', '0.37972', '0.15006', '-0.53212', '-0.2055', '-1.2526', '0.071624', '0.70565', '0.49744', '-0.42063', '0.26148', '-1.53

In [51]:
# 统计原始text数据集中不同的单词

new_data = []

for dt in data:
    res = preprocessing(dt) # 数据预处理（清理）
    # print(res.numpy().decode('utf-8')) # b'' 解码
    new_data.append(res.numpy().decode('utf-8'))

In [52]:
len(new_data)

25000

In [53]:
# 第一个样本

new_data[0]

'action violence sex and coarse language are the things that the characters do during the whole movie and everything they do is done without reason mark l lester is unknown for his violent without reasonmovies commando the base the story is weird but stupid the actors play their stupid characters very wellim not telling they are stupid but i mean they are very bad actors its another lowbudget unknown b series action movie if you saw something like operation delta force drive the patriot sanctuary or something like these bad movies from the same kind than misbegottendont rent itand by the way dont rent any of the movies i mentionedi give it 1and a half out of5'

In [54]:
# 对清理后的文本进行分词

tokenizer = Tokenizer()

tokenizer.fit_on_texts(new_data)

In [55]:
word_index_dict = tokenizer.word_index

word_index_dict

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'in': 7,
 'it': 8,
 'i': 9,
 'this': 10,
 'that': 11,
 'was': 12,
 'as': 13,
 'for': 14,
 'with': 15,
 'movie': 16,
 'but': 17,
 'film': 18,
 'on': 19,
 'not': 20,
 'you': 21,
 'are': 22,
 'his': 23,
 'have': 24,
 'be': 25,
 'he': 26,
 'one': 27,
 'its': 28,
 'at': 29,
 'all': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'from': 34,
 'who': 35,
 'so': 36,
 'like': 37,
 'her': 38,
 'just': 39,
 'or': 40,
 'about': 41,
 'has': 42,
 'if': 43,
 'out': 44,
 'some': 45,
 'there': 46,
 'what': 47,
 'good': 48,
 'more': 49,
 'when': 50,
 'very': 51,
 'even': 52,
 'she': 53,
 'my': 54,
 'up': 55,
 'no': 56,
 'would': 57,
 'time': 58,
 'which': 59,
 'only': 60,
 'really': 61,
 'story': 62,
 'their': 63,
 'were': 64,
 'see': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'than': 69,
 'we': 70,
 'much': 71,
 'well': 72,
 'been': 73,
 'get': 74,
 'will': 75,
 'also': 76,
 'into': 77,
 'bad': 78,
 'other': 79,
 'people': 80,
 'do': 81,
 'because': 82

In [56]:
# 不同的words

diff_words = len(tokenizer.word_index)

diff_words

112530

In [57]:
# 构建 word_embedding

MAX_WORD = 20000 # 假设常见单词20000个
EMBEDDING_DIM = 100 # 100维

embedding_matrix = np.zeros((MAX_WORD, EMBEDDING_DIM)) # 初始化为全零向量

for word, i in word_index_dict.items():
    # 从glove中检索
    word_embedding = embedding_dicts.get(word)
    # 判断是否超过10000的基线
    if i < MAX_WORD:
        # 判断是否检索到embedding
        if word_embedding is not None:
            embedding_matrix[i] = word_embedding

In [58]:
embedding_matrix[2]

array([-0.071953,  0.23127 ,  0.023731, -0.50638 ,  0.33923 ,  0.1959  ,
       -0.32943 ,  0.18364 , -0.18057 ,  0.28963 ,  0.20448 , -0.5496  ,
        0.27399 ,  0.58327 ,  0.20468 , -0.49228 ,  0.19974 , -0.070237,
       -0.88049 ,  0.29485 ,  0.14071 , -0.1009  ,  0.99449 ,  0.36973 ,
        0.44554 ,  0.28998 , -0.1376  , -0.56365 , -0.029365, -0.4122  ,
       -0.25269 ,  0.63181 , -0.44767 ,  0.24363 , -0.10813 ,  0.25164 ,
        0.46967 ,  0.3755  , -0.23613 , -0.14129 , -0.44537 , -0.65737 ,
       -0.042421, -0.28636 , -0.28811 ,  0.063766,  0.20281 , -0.53542 ,
        0.41307 , -0.59722 , -0.38614 ,  0.19389 , -0.17809 ,  1.6618  ,
       -0.011819, -2.3737  ,  0.058427, -0.2698  ,  1.2823  ,  0.81925 ,
       -0.22322 ,  0.72932 , -0.053211,  0.43507 ,  0.85011 , -0.42935 ,
        0.92664 ,  0.39051 ,  1.0585  , -0.24561 , -0.18265 , -0.5328  ,
        0.059518, -0.66019 ,  0.18991 ,  0.28836 , -0.2434  ,  0.52784 ,
       -0.65762 , -0.14081 ,  1.0491  ,  0.5134  , 

In [59]:
# 搭建模型

model = Sequential([
    vectorize_layer, # 首先，对strings进行转换
    Embedding(MAX_WORD, EMBEDDING_DIM, input_length=SEQ_LEN, weights=[embedding_matrix], trainable=False),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [60]:
# 模型编译

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [61]:
# 模型训练
!rm -rf logs

log_file = os.path.join('logs', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

tensorboard = TensorBoard(log_file)

model.fit(train_ds,
          validation_data=val_ds,
          epochs=20,
          callbacks=[tensorboard])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fa884a9bb80>

In [62]:
%load_ext tensorboard

%tensorboard --logdir logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [63]:
# 模型验证

loss, accuracy = model.evaluate(val_ds)



### 第6步：获取训练后的 word embeddings

In [64]:
# 权重

weights = model.get_layer("embedding").get_weights()[0]

In [65]:
weights.shape

(20000, 100)

In [66]:
# 不同的单词数量

vocab = vectorize_layer.get_vocabulary()

vocab # 1000个单词

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'on',
 'not',
 'you',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'its',
 'all',
 'at',
 'by',
 'an',
 'they',
 'who',
 'from',
 'so',
 'like',
 'her',
 'or',
 'just',
 'about',
 'has',
 'if',
 'out',
 'some',
 'there',
 'what',
 'good',
 'more',
 'when',
 'very',
 'even',
 'my',
 'no',
 'she',
 'up',
 'would',
 'time',
 'only',
 'which',
 'really',
 'story',
 'their',
 'had',
 'see',
 'were',
 'can',
 'me',
 'than',
 'we',
 'much',
 'well',
 'get',
 'been',
 'will',
 'other',
 'also',
 'people',
 'into',
 'do',
 'because',
 'bad',
 'great',
 'him',
 'first',
 'most',
 'how',
 'dont',
 'made',
 'then',
 'them',
 'films',
 'movies',
 'way',
 'make',
 'too',
 'could',
 'any',
 'after',
 'characters',
 'think',
 'watch',
 'many',
 'seen',
 'character',
 'two',
 'being',
 'little',
 'never',
 'acting',
 'plot',
 'did',
 'best',
 'lo

In [67]:
# 保存 weights 和 words

out_v = io.open("weights.tsv", 'w', encoding='utf-8')
out_m = io.open("metadata.tsv", 'w', encoding='utf-8')

for index, word in enumerate(vocab):
    if index == 0:
        continue # 跳过第一个字符，这是padding
    vec = weights[index] # 权重
    # 写入
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

# 关闭文件
out_v.close()
out_m.close()