# 基于RNN模型进行文本分类任务 

## 导入影评数据

In [1]:
import os
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import numpy as np
import pprint
import logging
import time
from collections import Counter
from pathlib import Path
from tqdm import tqdm

In [2]:
(x_train,y_train),(x_test,y_test) = tf.keras.datasets.imdb.load_data()

In [None]:
x_train.shape

In [None]:
y_train[0]

In [3]:
word2id = tf.keras.datasets.imdb.get_word_index()

In [4]:
np.save("./imdb/word2id.npy",word2id)

In [5]:
word2id = {w:i+3 for w,i in word2id.items()}
word2id['<pad>'] = 0
word2id['<start>'] = 1
word2id['<unk>'] = 2
id2word = {i:w for w,i in word2id.items()}

In [6]:
def sort_by_len(x,y):
    x,y = np.asarray(x),np.asarray(y)
    idx = sorted(range(len(x)),key=lambda i:len(x[i]))
    return x[idx],y[idx]

In [7]:
x_train,y_train = sort_by_len(x_train,y_train)
x_test,y_test = sort_by_len(x_test,y_test)

# 这个word2id是包含test集中的词的
def write_file(path,xs,ys):
    with open(path,'w',encoding='utf-8') as f:
        for x,y in zip(xs,ys):
            f.write(str(y)+'\t'+' '.join([id2word[i] for i in x][1:]) + "\n")

write_file("./imdb/train.txt",x_train,y_train)
write_file("./imdb/test.txt",x_test,y_test)                  

## 构建语料表，基于词频进行统计

In [8]:
# 就是构建词典,去掉不常用的词（词频很低的词）
counter = Counter()
with open("./imdb/train.txt",'r',encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        label,words = line.split('\t')
        words = words.split(' ')
        counter.update(words)

words = ['<pad>'] + [w for w,fre in counter.most_common() if fre >= 10]

with open("./imdb/words.txt",'w',encoding='utf-8') as f:
    for w in words:
        f.write(w+'\n')

## 得到新的word2id

In [9]:
word2id = {}
# 得到新的word2id
with open("./imdb/words.txt",'r',encoding='utf-8') as f:
    for i,line in enumerate(f):
        line = line.strip()
        word2id[line] = i

## embedding层

In [10]:
embeddings = np.zeros((len(word2id)+1,50))
# len(word2id)：表示unk (unk用全零表示？)
# 0：表示pad

with open('./imdb/glove.6B/glove.6B.50d.txt',encoding='utf-8') as f:
    count = 0
    for i,line in enumerate(f):
        if i % 100000 == 0:
            print("-At line{}".format(i))
        line = line.strip()
        sp = line.split(" ")
        word,vec = sp[0],sp[1:]
        if word in word2id:
            count += 1
            embeddings[word2id[word]] = np.asarray(vec,dtype='float32')

-At line0
-At line100000
-At line200000
-At line300000


In [11]:
print(count,len(word2id))

19676 20598


In [12]:
np.save('./imdb/word.npy',embeddings)

## 训练数据

- tf.data.Dataset.from_tensor_slices(tensor)
- tf.data.Dataset.from_generator(data_generator,output_data_type,output_data_shape)

In [13]:
def data_generator(path,params):
    with open(path,encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            label,text = line.split("\t")
            text = text.split(" ")
            x = [word2id.get(w,len(word2id)) for w in text]
            if len(x) >= params['max_len']:
                x = x[:params['max_len']] # 截断
            else:
                x += [0] * (params['max_len']-len(x)) # 补齐
            y = int(label)
            yield x,y

In [14]:
def dataset(is_training,params):
    _shapes = ([params['max_len']],())
    _types = (tf.int32,tf.int32)
    
    if is_training:
        ds = tf.data.Dataset.from_generator(
            lambda:data_generator(params['training_path'],params),
            output_shapes=_shapes,
            output_types=_types)
        ds = ds.shuffle(params['num_samples'])
        ds = ds.batch(params['batch_size'])
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    else:
        ds = tf.data.Dataset.from_generator(
            lambda:data_generator(params['test_path'],params),
            output_shapes=_shapes,
            output_types=_types)
        ds = ds.batch(params['batch_size'])
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
        
    return ds

## 自定义网络模型

In [15]:
class Model(tf.keras.Model):
    def __init__(self,params):
        super().__init__()
        
        self.embedding = tf.Variable(np.load('./imdb/word.npy'),dtype=tf.float32,name='pretrained_embedding',trainable=False)
        
        self.drop1 = tf.keras.layers.Dropout(params['dropout_rate'])
        self.drop2 = tf.keras.layers.Dropout(params['dropout_rate'])
        self.drop3 = tf.keras.layers.Dropout(params['dropout_rate'])
        
        self.rnn1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['rnn_units'],return_sequences=True))
        self.rnn2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['rnn_units'],return_sequences=True))
        self.rnn3 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['rnn_units'],return_sequences=False))
        
        self.drop_fc = tf.keras.layers.Dropout(params['dropout_rate'])
        self.fc = tf.keras.layers.Dense(2*params['rnn_units'],tf.nn.elu)
        
        self.out_linear = tf.keras.layers.Dense(2)
        
    def call(self,inputs,training=False):
        if inputs.dtype != tf.int32:
            inputs = tf.cast(inputs,tf.int32)
        
        batch_sz = tf.shape(inputs)[0] 
        rnn_units = 2*params['rnn_units']
        
        x = tf.nn.embedding_lookup(self.embedding,inputs) # inputs:batch x max_len; x: batch x max_len x 50
        
        x = self.drop1(x,training=training)
        x = self.rnn1(x)
        
        x = self.drop2(x,training=training)
        x = self.rnn2(x)
        
        x = self.drop3(x,training=training)
        x = self.rnn3(x)
        
        x = self.drop_fc(x,training=training)
        x = self.fc(x)
        
        x = self.out_linear(x)
        
        return x

In [None]:
class Model(tf.keras.Model):
    def __init__(self,params):
        super().__init__()
        
        self.embedding = tf.Variable(np.load('./imdb/word.npy'),dtype=tf.float32,name='pretrained_embedding',trainable=False)
        
        self.drop1 = tf.keras.layers.Dropout(params['dropout_rate'])
        self.drop2 = tf.keras.layers.Dropout(params['dropout_rate'])
        self.drop3 = tf.keras.layers.Dropout(params['dropout_rate'])
        
        self.rnn1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['rnn_units'],return_sequences=True))
        self.rnn2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['rnn_units'],return_sequences=True))
        self.rnn3 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['rnn_units'],return_sequences=True))
        
        self.drop_fc = tf.keras.layers.Dropout(params['dropout_rate'])
        self.fc = tf.keras.layers.Dense(2*params['rnn_units'],tf.nn.elu)
        
        self.out_linear = tf.keras.layers.Dense(2)
        
    def call(self,inputs,training=False):
        if inputs.dtype != tf.int32:
            inputs = tf.cast(inputs,tf.int32)
        
        batch_sz = tf.shape(inputs)[0] 
        rnn_units = 2*params['rnn_units']
        
        x = tf.nn.embedding_lookup(self.embedding,inputs) # inputs:batch x max_len; x: batch x max_len x 50
        
        x = tf.reshape(x,(batch_sz*10,10,50)) #(None,10,50)
        x = self.drop1(x,training=training)
        x = self.rnn1(x)
        x = tf.reduce_max(x,1)
        
        x = tf.reshape(x,(batch_sz*10,10,rnn_units))
        x = self.drop2(x,training=training)
        x = self.rnn2(x)
        x = tf.reduce_max(x,1)
        
        x = tf.reshape(x,(batch_sz,10,rnn_units))
        x = self.drop3(x,training=training)
        x = self.rnn3(x)
        x = tf.reduce_max(x,1)
        
        x = self.drop_fc(x,training=training)
        x = self.fc(x)
        
        x = self.out_linear(x)
        
        return x

## 超参数

In [16]:
params = {
    'vocab_path':'./imdb/words.txt',
    'training_path':'./imdb/train.txt',
    'test_path':'./imdb/test.txt',
    'num_samples':25000,
    'num_labels':2,
    'batch_size':32,
    'max_len':300,
    'rnn_units':200,
    'dropout_rate':0.2,
    'clip_norm':10, # 梯度截断
    'num_patience':3,
    'lr':3e-4
}

In [17]:
def is_descending(history):
    history = history[-(params['num_patience']+1):]
    for i in range(1,len(history)):
        if history[i-1] <= history[i]:
            return False
    return True

In [18]:
word2id = {}
with open("./imdb/words.txt",'r',encoding='utf-8') as f:
    for i,line in enumerate(f):
        line = line.strip()
        word2id[line] = i
params['vocab_size'] = len(word2id)+1

model = Model(params)
model.build(input_shape=(None,None))

decay_lr = tf.optimizers.schedules.ExponentialDecay(params['lr'],1000,0.95)
optim = tf.optimizers.Adam(params['lr'])
global_step = 0

history_acc = []
best_acc = 0

t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

In [20]:
while True:
    for texts,labels in dataset(is_training=True,params = params):
        loss = 0
        with tf.GradientTape() as tape:
            logits = model(texts,training=True)
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,logits=logits)
            loss = tf.reduce_mean(loss)
        
        optim.lr.assign(decay_lr(global_step))
        grads = tape.gradient(loss,model.trainable_variables)
        grads,_ = tf.clip_by_global_norm(grads,params['clip_norm'])
        optim.apply_gradients(zip(grads,model.trainable_variables))
        
        if global_step % 50 == 0:
            logger.info('Step {} | loss: {:.4f} | Spent:{:.1f} secs | LR:{:.6f}'.format(global_step,loss.numpy().item(),time.time-t0,optim.lr.numpy().item()))
            t0 = time.time()
        global_step += 1
    
    m = tf.keras.metrics.Accuracy()
    
    for texts,labels in dataset(is_training=False,params=params):
        logits = model(texts,training=False)
        y_pred = tf.argmax(logits,axis=1)
        m.update_state(y_true=labels,y_pred=y_pred)
        
    acc = m.result().numpy()
    logger.info("Evaluation Testing Accuracy:{:.3f}".format(acc))
    history_acc.append(acc)
    
    if acc > best_acc:
        best_acc = acc
    logger.info("Best acc:{:.3f}".format(best_acc))
    
    if len(history_acc) > params['num_patience'] and is_descending(history_acc):
        logger.info("Testing Accuracy not improced over {} epochs, Early stop".format(params['num_patience']))
        break

TypeError: unsupported operand type(s) for -: 'builtin_function_or_method' and 'float'