In [2]:
import torch
import torch.nn as nn
import time
import json
import os
import math
from tqdm import tqdm
from torch.utils.data import  DataLoader
from Exp_DataSet import Corpus
from Exp_Model import BiLSTM_model, Transformer_model
import torch.nn.functional as F
import transformers

In [39]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()

        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
class Transformer_model(nn.Module):
    def __init__(self, vocab_size, ntoken, d_emb=512, d_hid=2048, nhead=8, nlayers=6, dropout=0.2, embedding_weight=None):
        super(Transformer_model, self).__init__()
        # 将"预训练的词向量"整理成 token->embedding 的二维映射矩阵 emdedding_weight 的形式，初始化 _weight
        # 当 emdedding_weight == None 时，表示随机初始化
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_emb, _weight=embedding_weight)

        self.pos_encoder = PositionalEncoding(d_model=d_emb, max_len=ntoken)
        self.encode_layer = nn.TransformerEncoderLayer(d_model=d_emb, nhead=nhead, dim_feedforward=d_hid)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer=self.encode_layer, num_layers=nlayers)
        #-----------------------------------------------------begin-----------------------------------------------------#
        # 请自行设计对 transformer 隐藏层数据的处理和选择方法
        self.dropout = nn.Dropout(dropout)  # 可选
        self.ntoken = ntoken
        self.d_hid = d_hid
        self.d_emb = d_emb
        # 请自行设计分类器
        self.fc = nn.Sequential(
            nn.Linear(self.d_emb, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 15),
        )

        #------------------------------------------------------end------------------------------------------------------#

    def forward(self, x):
       
        x = self.embed(x)     
        x = x.permute(1, 0, 2)          
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)      # [batch_size, ntoken, d_emb]
        #-----------------------------------------------------begin-----------------------------------------------------#
        # 对 transformer_encoder 的隐藏层输出进行处理和选择，并完成分类
        x = self.dropout(x) # 可选
        #x = x.reshape(-1, self.ntoken*self.d_emb) 
        #x = F.avg_pool1d(x.permute(0,2,1), x.size(1)).squeeze()   # 池化并挤压后[batch_size, d_emb]
        #取最后两个时间步的输出，然后作池化，然后分类
        x = torch.cat((x[:, -1, :], x[:, -2, :]), dim=1)
        x = F.max_pool1d(x.permute(0,2,1), x.size(1)).squeeze()
        x = self.fc(x)
        #------------------------------------------------------end------------------------------------------------------#
        return x
    
class BiLSTM_model(nn.Module):
    """
    vocab_size: 词表大小,不是词向量表中词的个数，而是数据集中切出来的所有词的个数
    ntoken: 一个句子中的词的最大个数
    d_emb: 词向量的维度
    d_hid: 隐藏层的维度
    nlayers: lstm层数
    dropout: dropout的比例
    embedding_weight: 预训练的词向量，格式为 token->embedding 的二维映射矩阵
    """
    def __init__(self, vocab_size, ntoken, d_emb=100, d_hid=80, nlayers=1, dropout=0.2, embedding_weight=None):
        super(BiLSTM_model, self).__init__()
        # 将"预训练的词向量"整理成 token->embedding 的二维映射矩阵 emdedding_weight 的形式，初始化 _weight
        # 当 emdedding_weight == None 时，表示随机初始化
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_emb, _weight=embedding_weight)

        self.lstm = nn.LSTM(input_size=d_emb, hidden_size=d_hid, num_layers=nlayers, bidirectional=True, batch_first=True)
        #-----------------------------------------------------begin-----------------------------------------------------#
        # 请自行设计对 bilstm 隐藏层数据的处理和选择方法
        self.dropout = nn.Dropout(dropout)  # 可选
        self.ntoken = ntoken
        self.d_hid = d_hid
        # 请自行设计分类器
        self.classifier = nn.Sequential(
            nn.Linear(self.d_hid*2, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 15),
        )
       
        #------------------------------------------------------end------------------------------------------------------#

    def forward(self, x:torch.Tensor):
        
        # x = x.long()
        #print("输入embed的x:",type(x), x.shape)
        x = self.embed(x)
        x = self.lstm(x)[0]
        #-----------------------------------------------------begin-----------------------------------------------------#
        # 对 bilstm 的隐藏层输出进行处理和选择，并完成分类
        #x = self.dropout(x).reshape(-1, self.ntoken*self.d_hid*2)   # ntoken*nhid*2 (2 means bidirectional)
        #先做池化，然后在分类
        #x = F.max_pool1d(x.permute(0,2,1), x.size(1)).squeeze()
        #取最后两个时间步的输出，然后作池化，然后分类
        x = torch.cat((x[:, -1, :], x[:, -2, :]), dim=1)
        x = F.max_pool1d(x.permute(0,2,1), x.size(1)).squeeze()
        x = self.classifier(x)
        #------------------------------------------------------end------------------------------------------------------#
        return x

In [None]:
if __name__ == '__main__':
    dataset_folder = './data/tnews_public'
    output_folder = './output'

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    #-----------------------------------------------------begin-----------------------------------------------------#
    # 以下为超参数，可根据需要修改
        # 每个词向量的维度
    max_token_per_sent = 50 # 每个句子预设的最大 token 数
    batch_size = 32
    num_epochs = 5
    lr = 1e-4
    #------------------------------------------------------end------------------------------------------------------#

    dataset = Corpus(dataset_folder, max_token_per_sent)

    embedding_dim = dataset.embedding_weight.shape[1] # 词向量维度 

    vocab_size = len(dataset.dictionary.tkn2word) # 词表大小

    data_loader_train = DataLoader(dataset=dataset.train, batch_size=batch_size, shuffle=True)
    data_loader_valid = DataLoader(dataset=dataset.valid, batch_size=batch_size, shuffle=False)
    data_loader_test = DataLoader(dataset=dataset.test, batch_size=batch_size, shuffle=False)

  

In [20]:
  #-----------------------------------------------------begin-----------------------------------------------------#
#     # 可修改选择的模型以及传入的参数
# model = BiLSTM_model(vocab_size=vocab_size,
#                      ntoken=max_token_per_sent,
#                      d_emb=embedding_dim,
#                      embedding_weight=dataset.embedding_weight # 使用预训练的词向量，需传入 embedding_weight
#                      ).to(device)     
    
model = Transformer_model(vocab_size=vocab_size,
                         ntoken=max_token_per_sent,
                         d_emb=embedding_dim,
                         nhead=5,#head需要整除d_emb，300除以5等于60
                         #d_hid=80,
                         embedding_weight=dataset.embedding_weight # 使用预训练的词向量，需传入 embedding_weight
                         ).to(device)         
    #------------------------------------------------------end------------------------------------------------------#
    
    # 设置损失函数
loss_function = nn.CrossEntropyLoss()
    # 设置优化器                                       
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)  


In [40]:
# 进行训练
def valid():
    '''
    进行验证，返回模型在验证集上的 accuracy
    '''
    total_true = []

    model.eval() #这句话在测试之前使用，不启用 BatchNormalization 和 Dropout
    with torch.no_grad():
        for data in tqdm(data_loader_valid, dynamic_ncols=True):
            batch_x, batch_y = data[0].to(device), data[1].to(device)

            y_hat = model(batch_x)
            # 取分类概率最大的类别作为预测的类别
            y_hat = torch.tensor([torch.argmax(_) for _ in y_hat]).to(device)

            total_true.append(torch.sum(y_hat == batch_y).item())

        return sum(total_true) / (batch_size * len(total_true))

def train():

    max_valid_acc = 0
    
    for epoch in range(num_epochs):
        model.train()  #这一句话是为了启用 BatchNormalization 和 Dropout，一定要在训练前调用，否则会有影响

        total_loss = []
        total_true = []

        tqdm_iterator = tqdm(data_loader_train, dynamic_ncols=True, desc=f'Epoch {epoch + 1}/{num_epochs}')

        for data in tqdm_iterator:
            # 选取对应批次数据的输入和标签
            batch_x, batch_y = data[0].to(device), data[1].to(device)

            # 模型预测
            #------------------------------------change------------------------------------#
            y_hat = model(batch_x)
            #------------------------------------endOfChange------------------------------------#
            loss = loss_function(y_hat, batch_y)

            optimizer.zero_grad()   # 梯度清零
            loss.backward()         # 计算梯度
            optimizer.step()        # 更新参数

            y_hat = torch.tensor([torch.argmax(_) for _ in y_hat]).to(device)
            
            total_true.append(torch.sum(y_hat == batch_y).item())
            total_loss.append(loss.item())

            tqdm_iterator.set_postfix(loss=sum(total_loss) / len(total_loss),
                                      acc=sum(total_true) / (batch_size * len(total_true)))
        
        tqdm_iterator.close()

        train_loss = sum(total_loss) / len(total_loss)
        train_acc = sum(total_true) / (batch_size * len(total_true))

        valid_acc = valid()

        #if valid_acc > max_valid_acc:
           # torch.save(model, os.path.join(output_folder, "model.ckpt"))

        print(f"epoch: {epoch}, train loss: {train_loss:.4f}, train accuracy: {train_acc*100:.2f}%, valid accuracy: {valid_acc*100:.2f}%")

train()

    # 对测试集进行预测
def predict():
    '''
    读取训练好的模型对测试集进行预测，并生成结果文件
    '''
    test_ids = [] 
    test_pred = []

    model = torch.load(os.path.join(output_folder, "model.ckpt")).to(device)
    model.eval()
    with torch.no_grad():
        for data in tqdm(data_loader_test, dynamic_ncols=True): 
            batch_x, batch_y = data[0].to(device), data[1]

            y_hat = model(batch_x)
            y_hat = torch.tensor([torch.argmax(_) for _ in y_hat])

            test_ids += batch_y.tolist()
            test_pred += y_hat.tolist()

    # 写入文件
    with open(os.path.join(output_folder, "predict.json"), "w") as f:
        for idx, label_idx in enumerate(test_pred):
            one_data = {}
            one_data["id"] = test_ids[idx]
            one_data["pred_label_desc"] = dataset.dictionary.idx2label[label_idx][1]
            json_data = json.dumps(one_data)    # 将字典转为json格式的字符串
            f.write(json_data + "\n")


            
#predict()


Epoch 1/5: 100%|██████████| 1668/1668 [00:43<00:00, 38.02it/s, acc=0.788, loss=0.707]
100%|██████████| 313/313 [00:01<00:00, 240.70it/s]


epoch: 0, train loss: 0.7071, train accuracy: 78.82%, valid accuracy: 34.44%


Epoch 2/5: 100%|██████████| 1668/1668 [00:42<00:00, 38.91it/s, acc=0.806, loss=0.664]
100%|██████████| 313/313 [00:01<00:00, 245.30it/s]


epoch: 1, train loss: 0.6640, train accuracy: 80.58%, valid accuracy: 36.26%


Epoch 3/5: 100%|██████████| 1668/1668 [00:43<00:00, 38.59it/s, acc=0.819, loss=0.629]
100%|██████████| 313/313 [00:01<00:00, 243.46it/s]


epoch: 2, train loss: 0.6287, train accuracy: 81.87%, valid accuracy: 36.03%


Epoch 4/5: 100%|██████████| 1668/1668 [00:44<00:00, 37.40it/s, acc=0.818, loss=0.624]
100%|██████████| 313/313 [00:01<00:00, 224.43it/s]


epoch: 3, train loss: 0.6242, train accuracy: 81.82%, valid accuracy: 35.81%


Epoch 5/5: 100%|██████████| 1668/1668 [00:43<00:00, 38.14it/s, acc=0.816, loss=0.634]
100%|██████████| 313/313 [00:01<00:00, 242.90it/s]

epoch: 4, train loss: 0.6340, train accuracy: 81.56%, valid accuracy: 36.21%



