### 1. 样本路径和类别读取

In [1]:
"""
    训练数据聚合
"""
import os
train_root = os.path.join("hotel", "train")
train_texts = []
train_labels = []
for label in os.listdir(train_root):
    label_root = os.path.join(train_root, label)
    for file in os.listdir(label_root):
        file_path = os.path.join(label_root, file)
        # 聚合结果
        train_texts.append(file_path)
        train_labels.append(label)
# 打印数据
len(train_texts), len(train_labels)

(5000, 5000)

In [2]:
"""
    测试数据聚合
"""
test_root = os.path.join("hotel", "test")
test_texts = []
test_labels = []
for label in os.listdir(test_root):
    label_root = os.path.join(test_root, label)
    for file in os.listdir(label_root):
        file_path = os.path.join(label_root, file)
        # 聚合结果
        test_texts.append(file_path)
        test_labels.append(label)
# 打印数据
len(test_texts), len(test_labels)

(1000, 1000)

### 2. 构建分词器
- 分词，把句子变 token
- 把所有不同的token聚在一起
- 做 0 ~ N-1 的编码

In [3]:
SEQ_LEN = 85

In [4]:
import jieba
!pip install opencc -U
import opencc





In [5]:
class Tokenizer(object):
    """
        定义一个分词器
    """
    def __init__(self, X, y):
        """
            训练的语料
        """
        self.X = X
        self.y = y
        self.t2s = opencc.OpenCC(config="t2s")
        self._build_dict()

    def _build_dict(self):
        """
            构建字典
        """
        # 1. 获取所有的 token
        words = {"<PAD>", "<UNK>"}
        for file in self.X:
            # 1. 打开文件
            with open(file=file, mode="r", encoding="gbk", errors="ignore") as f:
                text = f.read().replace("\n", "")
                text = self.t2s.convert(text=text)
                words.update(set(jieba.lcut(text)))
        # 2. 构建文本字典
        self.word2idx = {word: idx for idx, word in enumerate(words)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        # 3. 删掉 数据集
        del self.X
        # 4. 构建标签字典
        labels = set(train_labels)
        self.label2idx = {label: idx for idx, label in enumerate(labels)}
        self.idx2label = {idx: label for label, idx in self.label2idx.items()}
        # 5. 删除 数据集
        del self.y
        

    def encode(self, text, seq_len=SEQ_LEN):
        """
            text --> tokens --> ids

            自我扩展:
                - 右侧截断或填充
                - 左边？
                - 随机？
        """
        # 1. 繁体转简体
        text = text.replace("\n", "")
        text = self.t2s.convert(text=text)
        # 2. 分词
        text = jieba.lcut(text)
        # 3. 统一长度
        text = (text + ["<PAD>"] * seq_len)[:seq_len]
        # 4. 转 id
        ids = [self.word2idx.get(word, self.word2idx.get("<UNK>")) for word in text]
        
        return ids
        
    def decode(self, ids):
        """
            ids --> tokens --> text
        """
        text = "".join([self.idx2word.get(_id, "") for _id in ids])
        return text

    def __str__(self):
        """
            输: 分词器基本信息
        """
        return f"""
        Tokenizer Info: 
            --> Num of Tokens: {len(self.word2idx)}
            --> Num of Labels: {len(self.label2idx)}
        """
    def __repr__(self):
        return self.__str__()

### 3. 打包数据

In [6]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch

In [7]:
class HotelCommentDataset(Dataset):
    """
        自定义数据集
    """
    def __init__(self, X, y, seq_len=SEQ_LEN):
        """
            初始化
        """
        self.X = X
        self.y = y
        self.seq_len = seq_len

    def __getitem__(self, idx):
        """
            索引操作
                返回第idx个样本
        """
        # 1. 文本
        file = self.X[idx]
        with open(file=file, mode="r", encoding="gbk", errors="ignore") as f:
            text = f.read()
            ids = tokenizer.encode(text=text, seq_len=self.seq_len)
            ids = torch.tensor(data=ids, dtype=torch.long)
                
        # 2. 标签
        label = self.y[idx]
        label = tokenizer.label2idx.get(label)
        label = torch.tensor(data=label, dtype=torch.long)
        
        return ids, label

    def __len__(self):
        return len(self.X)

In [8]:
# 1. 定义一个分词器
tokenizer = Tokenizer(X=train_texts, y=train_labels)
tokenizer

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\63447\AppData\Local\Temp\jieba.cache
Loading model cost 0.440 seconds.
Prefix dict has been built successfully.



        Tokenizer Info: 
            --> Num of Tokens: 20781
            --> Num of Labels: 2
        

In [9]:
# 打包数据
train_dataset = HotelCommentDataset(X=train_texts, y=train_labels)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=128)
test_dataset = HotelCommentDataset(X=test_texts, y=test_labels)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=256)

In [10]:
for X, y in test_dataloader:
    print(X.shape)
    print(y.shape)
    break

torch.Size([256, 85])
torch.Size([256])


### 4. 搭建模型

In [11]:
import torch
from torch import nn

In [12]:
"""
    每句话65个词，分为2类

        - 解决？
    
"""

'\n    每句话65个词，分为2类\n\n        - 解决？\n    \n'

In [13]:
class TextCNN(nn.Module):
    """
        搭建模型
            - 卷积？
            # [N, C, L]
            nn.Conv1d()
    """
    def __init__(self, dict_len=len(tokenizer.word2idx), embedding_dim = 256):
        super().__init__()
        # 向量化
        self.embed = nn.Embedding(num_embeddings=dict_len, 
                                  embedding_dim=embedding_dim, 
                                  padding_idx=tokenizer.word2idx.get("<PAD>"))
        # 特征抽取
        self.feature_extractor = nn.Sequential(
            nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(num_features=512),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2, padding=0),
            nn.Conv1d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(num_features=1024),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        )
        # 分类
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=1024 * (SEQ_LEN // 2 // 2), out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=2)
        )
        
        
    def forward(self, x):
        # 向量化
        x = self.embed(x)
        x = torch.permute(input=x, dims=(0, 2, 1))
        # 抽特征
        x = self.feature_extractor(x)
        # 做分类
        x = self.classifier(x)
        return x

In [14]:
class TextRNN(nn.Module):
    """
        搭建模型
            - 卷积？
            # [N, C, L]
            nn.Conv1d()
    """
    def __init__(self, dict_len=len(tokenizer.word2idx), embedding_dim = 256):
        super().__init__()
        # 向量化
        self.embed = nn.Embedding(num_embeddings=dict_len, 
                                  embedding_dim=embedding_dim, 
                                  padding_idx=tokenizer.word2idx.get("<PAD>"))
        # 特征抽取
        self.feature_extractor = nn.RNN(input_size=256, hidden_size=512, num_layers=1, bidirectional=False)
        # 分类
        self.classifier = nn.Sequential(
            nn.Linear(in_features=512, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=2)
        )
        
    def forward(self, x):
        # 向量化 [batch_size, seq_len]
        x = self.embed(x)
        # [batch_size, seq_len, embedding_dim]
        x = torch.permute(input=x, dims=(1, 0, 2))
        # 特征抽取 [seq_len, batch_size, embedding_dim]
        out, hn = self.feature_extractor(x)
        # 分类输出
        x = self.classifier(out.sum(dim=0))
        return x

In [15]:
model= TextRNN()
for X, y in train_dataloader:
    y_pred = model(X)
    print(y_pred.shape)
    break

torch.Size([128, 2])


### 5. 训练模型

In [16]:
# 检测设备
device = "cuda" if torch.cuda.is_available() else "cpu"
# 实例化模型
model = TextRNN().to(device=device)
# 损失函数
loss_fn = nn.CrossEntropyLoss()
# 优化器
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# 轮次
epochs = 20
import time

In [17]:
"""
    评估
"""
def get_acc(dataloader):
    model.eval()
    accs = []
    with torch.no_grad():
        for X, y in dataloader:
            # 0. 数据搬家
            X = X.to(device=device)
            y = y.to(device=device)
            # 1. 正向传播
            y_pred = model(X)
            # 2. 计算结果
            y_pred = y_pred.argmax(dim=1)
            # 3. 计算准确率
            acc = (y_pred == y).to(dtype=torch.float32).mean().item()
            # 4. 保存结果
            accs.append(acc)
    final_acc = round(number=sum(accs) / len(accs), ndigits=6)
    return final_acc

In [18]:
def train():
    train_acc = get_acc(dataloader=train_dataloader)
    test_acc = get_acc(dataloader=test_dataloader)
    print(f"初始 Train_acc: {train_acc}, Test_acc: {test_acc}")
    
    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        for X, y in train_dataloader:
            # 0. 数据搬家
            X = X.to(device=device)
            y = y.to(device=device)
            # 1. 正向传播
            y_pred = model(X)
            # 2. 计算误差
            loss = loss_fn(y_pred, y)
            # 3. 反向传播
            loss.backward()
            # 4. 优化一步
            optimizer.step()
            # 5. 清空梯度
            optimizer.zero_grad()
        
        stop_time = time.time()
        # 每轮结束后测试一下
        train_acc = get_acc(dataloader=train_dataloader)
        test_acc = get_acc(dataloader=test_dataloader)
        
        print(f"Epoch: {epoch + 1}, Train_acc: {train_acc}, Test_acc: {test_acc}, Train_time: {stop_time-start_time}")

In [19]:
train()

初始 Train_acc: 0.524023, Test_acc: 0.535123
Epoch: 1, Train_acc: 0.813867, Test_acc: 0.716056, Train_time: 8.453396797180176
Epoch: 2, Train_acc: 0.87207, Test_acc: 0.793777, Train_time: 8.43115758895874
Epoch: 3, Train_acc: 0.930469, Test_acc: 0.78647, Train_time: 8.381408929824829
Epoch: 4, Train_acc: 0.90332, Test_acc: 0.788254, Train_time: 8.417727708816528
Epoch: 5, Train_acc: 0.963672, Test_acc: 0.803509, Train_time: 8.386383056640625
Epoch: 6, Train_acc: 0.981445, Test_acc: 0.80489, Train_time: 8.476622343063354
Epoch: 7, Train_acc: 0.984766, Test_acc: 0.815228, Train_time: 8.406984329223633
Epoch: 8, Train_acc: 0.984766, Test_acc: 0.793204, Train_time: 8.861475229263306
Epoch: 9, Train_acc: 0.986914, Test_acc: 0.815564, Train_time: 8.708907127380371
Epoch: 10, Train_acc: 0.998047, Test_acc: 0.815901, Train_time: 8.603615045547485
Epoch: 11, Train_acc: 0.968359, Test_acc: 0.800815, Train_time: 8.52812123298645
Epoch: 12, Train_acc: 0.958398, Test_acc: 0.816272, Train_time: 8.6310