In [9]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [3]:
# Load the IMDB reviews dataset in its 8k-subword configuration.
# as_supervised=True requests (text, label) pairs; with_info=True additionally
# returns the metadata object that carries the text encoder used below.
# NOTE(review): the 'subwords8k' config and its bundled encoder are reported as
# deprecated in newer tensorflow_datasets releases — confirm the installed
# version still provides them.
dataset, info = tfds.load(
    name='imdb_reviews/subwords8k',
    as_supervised=True,
    with_info=True,
)



In [4]:
# Pull the train and test splits out of the loaded dataset dict.
train_dataset = dataset['train']
test_dataset = dataset['test']

# The encoder attached to the 'text' feature converts raw text into subword
# pieces and then into integer ids (and back again).
tokenizer = info.features['text'].encoder
print('vocabulary size: ', tokenizer.vocab_size)

vocabulary size:  8185


In [5]:
# Tokenizer smoke test: encode a sample sentence into a list of subword ids.
sample_string = 'Hello word , Tensorflow'
tokenized_string = tokenizer.encode(sample_string)
print('tokened id: ', tokenized_string)

tokened id:  [4025, 222, 2621, 1199, 6307, 2327, 2934]


In [6]:
# Decode the ids back into text; this should reproduce the sample sentence.
src_string = tokenizer.decode(tokenized_string)
print('original string: ', src_string)

original string:  Hello word , Tensorflow


In [7]:
# Decode every id on its own to show which subword piece it stands for.
for token_id in tokenized_string:
  piece = tokenizer.decode([token_id])
  print(str(token_id) + '->[' + piece + ']')

4025->[Hell]
222->[o ]
2621->[word]
1199->[ , ]
6307->[Ten]
2327->[sor]
2934->[flow]


In [8]:
# Build the batched input pipelines for training and evaluation.
BUFFER_SIZE = 10000  # number of examples held in the shuffle buffer
BATCH_SIZE = 64

# Derive both pipelines from the raw splits in `dataset` instead of rebinding
# `train_dataset` / `test_dataset` from themselves: that way re-running this
# cell never batches data that has already been batched.
# padded_batch pads the variable-length id sequences within each batch so they
# can be stacked into one tensor.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)
test_dataset = dataset['test'].padded_batch(BATCH_SIZE)

## 模型构建

In [10]:
def get_model():
  """Build the (uncompiled) binary sentiment classifier.

  Layers, in order: a 64-dimensional embedding over the tokenizer vocabulary,
  a bidirectional LSTM with 64 units per direction, a 64-unit ReLU dense layer
  and a single sigmoid output unit.

  Reads the notebook-level `tokenizer` for the vocabulary size.

  NOTE(review): the Embedding layer is created without `mask_zero`, so the
  zero padding added by padded_batch is presumably fed to the LSTM like any
  other id — confirm this is intended (the padded / unpadded prediction
  comparison at the end of the notebook appears to rely on it).

  Returns:
    A `tf.keras.Sequential` model.
  """
  layers = tf.keras.layers
  stack = [
    layers.Embedding(tokenizer.vocab_size, 64),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
  ]
  return tf.keras.Sequential(stack)

model = get_model()

In [11]:
# Configure training: Adam optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit) and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs, using the test batches as validation data.
# NOTE(review): the saved output under this cell lists 10 epochs, which does
# not match epochs=5 — re-run the cell to refresh it.
history = model.fit(
    x=train_dataset,
    epochs=5,
    validation_data=test_dataset,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## 可视化

In [None]:
# 查看训练过程
import matplotlib.pyplot as plt
def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against epochs.

  Args:
    history: object whose `.history` mapping holds one list of per-epoch
      values per metric name (as returned by `model.fit`).
    string: metric name, e.g. 'accuracy' or 'loss'; the validation series is
      looked up under 'val_' + string.
  """
  val_key = 'val_' + string
  # Training curve first, validation curve second, matching the legend order.
  for key in (string, val_key):
    plt.plot(history.history[key])
  plt.xlabel('epochs')
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()
# Training vs. validation accuracy per epoch.
plot_graphs(history, string='accuracy')

In [None]:
# Training vs. validation loss per epoch.
plot_graphs(history, string='loss')

## 模型测试

In [None]:
# Final evaluation on the test batches. With the single 'accuracy' metric set
# at compile time, the result unpacks into a loss value and an accuracy value.
test_loss, test_acc = model.evaluate(x=test_dataset)
print('test loss: ', test_loss)
print('test acc: ', test_acc)

In [None]:
def pad_to_size(vec, size):
  """Return a copy of `vec` right-padded with zeros up to length `size`.

  The input is never modified: a new list is always returned, so callers can
  keep using the original sequence afterwards.

  Args:
    vec: sequence of token ids (any iterable with a length, e.g. list/tuple).
    size: target length. If `vec` already has `size` or more elements, no
      padding is added and nothing is truncated.

  Returns:
    A new list holding the elements of `vec` followed by the zero padding.
  """
  padded = list(vec)
  # A negative count yields an empty list, so over-long inputs pass through.
  padded.extend([0] * (size - len(padded)))
  return padded

def sample_predict(sentence, pad=False):
  """Run the trained model on a single raw sentence.

  Args:
    sentence: text to classify; it is encoded with the notebook-level
      `tokenizer`.
    pad: when True, the encoded ids are zero-padded to a length of 64 via
      `pad_to_size` before prediction.

  Returns:
    The output of `model.predict` for a batch holding this one sentence.
  """
  token_ids = tokenizer.encode(sentence)
  if pad:
    token_ids = pad_to_size(token_ids, 64)
  # The model expects a batch dimension, so wrap the single sequence in one.
  batch = tf.expand_dims(token_ids, 0)
  return model.predict(batch)

# One positive sample review, scored twice to compare the effect of padding.
sample_pred_text = ('The movie was cool. The animation and the graphics '
                    'were out of this world. I would recommend this movie.')

# Without padding.
predictions = sample_predict(sample_pred_text, pad=False)
print(predictions)

# With padding.
predictions = sample_predict(sample_pred_text, pad=True)
print(predictions)