### 1. 读取数据

In [1]:
# pandas 适合表格类数据读取
import pandas as pd
import numpy as np

In [2]:
# sep: 分隔符
data = pd.read_csv(filepath_or_buffer="samples.tsv", sep="\t").to_numpy()

In [3]:
# 打乱样本顺序
np.random.shuffle(data)

### 2. 打包数据

In [4]:
# 深度学习框架
import torch
# 深度学习中的封装层
from torch import nn
# 引入数据集
from torch.utils.data import Dataset
# 引入数据集加载器
from torch.utils.data import DataLoader

In [5]:
class SentiDataset(Dataset):
    """
        自定义数据集
    """
    def __init__(self, data):
        """
            初始化
        """
        self.data = data
    
    def __getitem__(self, idx):
        """
            按索引获取单个样本
        """
        x, y = self.data[idx]
        return x, y
    
    def __len__(self):
        """
            返回数据集中的样本个数
        """
        return len(self.data)

In [6]:
# 训练集（前4500个作为训练集）
train_dataset = SentiDataset(data=data[:4500])
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
# 测试集（4500之后的作为测试集）
test_dataset = SentiDataset(data=data[4500:])
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

In [7]:
for x, y in train_dataloader:
    
    # print(x)
    # print(y)
    break

### 3. 构建模型

In [8]:
# 用于加载 BERT 分词器
from transformers import BertTokenizer
# 用于加载 BERT 序列分类器
from transformers import BertForSequenceClassification

In [9]:
# 从 ModelScope 上下载 
from modelscope import snapshot_download
# 设置 模型id model_id
# 设置 cache_dir 缓存目录
model_dir = snapshot_download(model_id='tiansz/bert-base-chinese', cache_dir="./bert")

Downloading Model to directory: ./bert/hub/tiansz/bert-base-chinese


In [10]:
# 模型地址
model_dir

'./bert/tiansz/bert-base-chinese'

In [11]:
# 加载分词器
tokenizer = BertTokenizer.from_pretrained(model_dir)

In [12]:
# tokenizer

In [13]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [14]:
# 二分类分类器
model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=2)
model.to(device = device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert/tiansz/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# 类别字典
label2idx = {"正面": 0, "负面": 1}
idx2label = {0: "正面", 1: "负面"}

### 4. 训练

In [16]:
from torch import nn
# 损失函数
loss_fn = nn.CrossEntropyLoss()
# 优化器
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)
# 定义训练轮次
epochs = 5

In [17]:
def get_acc(dataloader):
    """
        计算准确率
    """
    # 设置为评估模式
    model.eval()
    accs = []
    # 构建一个无梯度的环境
    with torch.no_grad():
        # 逐个批次计算
        for X, y in train_dataloader:
            # 编码
            X = tokenizer.batch_encode_plus(batch_text_or_text_pairs=X, 
                                            padding=True, 
                                            truncation=True,
                                            max_length=100,
                                            return_tensors="pt")
            # 转张量
            y = torch.tensor(data=[label2idx.get(label) for label in y], dtype=torch.long).cuda()
            # 1. 正向传播
            y_pred = model(input_ids=X["input_ids"].to(device=device), 
                           attention_mask=X["attention_mask"].to(device=device))
            # 2. 计算准确率
            acc = (y_pred.logits.argmax(dim=-1) == y).to(dtype=torch.float).mean().item()
            accs.append(acc)
    return sum(accs) / len(accs)

In [18]:
def train():
    """
        训练过程
    """
    # 训练之前：先看看准确率
    train_acc = get_acc(train_dataloader)
    test_acc = get_acc(test_dataloader)
    print(f"初始：Train_Acc: {train_acc}, Test_Acc: {test_acc}")
    # 遍历每一轮
    for epoch in range(epochs):
        model.train()
        # 遍历每个批次
        for X, y in train_dataloader:
            # 编码
            X = tokenizer.batch_encode_plus(batch_text_or_text_pairs=X, 
                                            padding=True, 
                                            truncation=True,
                                            max_length=100,
                                            return_tensors="pt")
            # 转张量
            y = torch.tensor(data=[label2idx.get(label) for label in y], dtype=torch.long).cuda()
            
            # 1. 正向传播
            y_pred = model(input_ids=X["input_ids"].to(device=device), 
                           attention_mask=X["attention_mask"].to(device=device))
                       
            # break
            # 2. 计算损失
            loss = loss_fn(y_pred.logits, y)
            
            # 3. 反向传播
            loss.backward()
            
            # 4. 优化一步
            optimizer.step()
            
            # 5. 清空梯度
            optimizer.zero_grad()
        # 每轮都计算一下准备率
        train_acc = get_acc(train_dataloader)
        test_acc = get_acc(test_dataloader)
        print(f"Epoch: {epoch +1}, Train_Acc: {train_acc}, Test_Acc: {test_acc}")

In [19]:
train()

初始：Train_Acc: 0.5043882981259772, Test_Acc: 0.5039893617021277
Epoch: 1, Train_Acc: 0.9153368794326241, Test_Acc: 0.9157358158564737
Epoch: 2, Train_Acc: 0.9570035460992907, Test_Acc: 0.9570035460992907
Epoch: 3, Train_Acc: 0.9577570921140359, Test_Acc: 0.9578900709219859
Epoch: 4, Train_Acc: 0.988031914893617, Test_Acc: 0.988031914893617
Epoch: 5, Train_Acc: 0.9827127659574468, Test_Acc: 0.982579787149497


### 5. 保存模型

In [27]:
# 保存训练好的模型
model.save_pretrained(save_directory="./sentiment_model")
# 保存分词器
tokenizer.save_pretrained(save_directory="./sentiment_model")

('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')

### 6. 预测

In [28]:
# 加载分词器
tokenizer = BertTokenizer.from_pretrained("./sentiment_model")
# 加载模型
model = BertForSequenceClassification.from_pretrained("./sentiment_model").cuda()

In [29]:
def predict(text="楼梯老化，室内味道很大，没有电梯"):
    # 设置为评估模式
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(text=text,
                           padding=True, 
                           truncation=True,
                           max_length=100,
                           return_tensors="pt")
        y_pred = model(input_ids=inputs["input_ids"].to(device=device), 
                       attention_mask=inputs["attention_mask"].to(device=device))
        y_pred = y_pred.logits.argmax(dim=-1).cpu().numpy()
        result = idx2label.get(y_pred[0])
        return result

In [31]:
predict(text="房价太小，地方有些偏僻，比较吵")

'负面'