In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import re
import jieba
from opencc import OpenCC

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### 加载模型权重

In [16]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, embedding_matrix, hidden_dim=64):
        super().__init__()
        # 定义词嵌入层，使用 embedding_matrix 初始化
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix),
            padding_idx=0
        )
        self.embedding.weight.requires_grad = False # True 确保嵌入层的参数可训练
        """
        双向 LSTM 层：输入维度为 embedding_dim，输出维度为 hidden_dim。
        batch_first=True : 输入张量的形状为 (batch_size, sequence_length)。
        bidirectional=True : LSTM 会在两个方向上（正向和反向）处理输入序列，以捕捉更多上下文信息
        (因为 LSTM 是双向的，它的输出将是两个隐藏层的连接, 所以实际输出维度为 hidden_dim * 2)
        """
        self.lstm = nn.LSTM(
            embedding_dim,    # 输入特征的维度
            hidden_dim,       # 隐藏状态的维度
            num_layers=2,     # LSTM的层数
            batch_first=True, # 输入和输出的张量的第一个维度是batch_size
            bidirectional=True, # 使用双向LSTM
            dropout=0.5
        )
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)  # 添加一个额外的全连接层
        self.fc2 = nn.Linear(hidden_dim, 2)  # 二分类任务
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded) # 第二个返回值_是LSTM的隐藏状态和单元状态；lstm_out形状： (batch_size, sequence_length, hidden_dim * 2)
        last_hidden = lstm_out[:, -1, :] # 选择每个批次中的最后一个时刻的输出，形状为 (batch_size, hidden_dim * 2)
        dropped = self.dropout(last_hidden)
        fc1_out = F.relu(self.fc1(dropped))
        fc_out = self.fc2(fc1_out)
        return fc_out


# 加载模型参数和权重
checkpoint = torch.load('../models/3_Chinese_Movie_review_Text_Classification/model_checkpoint.pth')

# 提取模型参数
vocab_size = checkpoint['vocab_size']
embedding_dim = checkpoint['embedding_dim']
embedding_matrix = checkpoint['embedding_matrix']
hidden_dim = checkpoint['hidden_dim']
vocab = checkpoint['vocab']

model = TextClassifier(vocab_size, embedding_dim, embedding_matrix, hidden_dim).to(device)

# 加载模型权重
model.load_state_dict(checkpoint['model_state_dict'])

# 将模型设置为评估模式
model.eval()

  checkpoint = torch.load('../models/3_Chinese_Movie_review_Text_Classification/model_checkpoint.pth')


TextClassifier(
  (embedding): Embedding(276419, 32, padding_idx=0)
  (lstm): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

### 模型推理

In [19]:
# 读取停用词表
with open('../datasets/chinese_movie_reviews/stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.readlines()]

# 初始化 OpenCC 转换器（繁转简）
cc = OpenCC('t2s')  # t2s 表示繁体转简体

# 文本预处理函数
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)
    text = cc.convert(text)
    text = " ".join(jieba.cut(text))
    return " ".join([word for word in text.split() if word not in stopwords])

def word2vector(text, vocab, max_length=100):
    words = text.split()
    word_indices = [vocab.get(word, 0) for word in words]
    if len(word_indices) < max_length:
        word_indices += [0] * (max_length - len(word_indices))
    else:
        word_indices = word_indices[:max_length]
    return torch.tensor([word_indices], dtype=torch.long)


# 推理函数
def predict_sentiment(text, model, vocab):
    input_tensor = word2vector(text, vocab).to(device)
    with torch.no_grad():
        output = model(input_tensor)
        predicted_class = torch.argmax(output, dim=1).item()
    sentiment = 'positive' if predicted_class == 1 else 'negative'
    return sentiment

In [26]:
review = "我特别喜欢这个角色进步！"
print(f'分词后的文本：{preprocess_text(review)}')
print(f'句子向量 : {word2vector(review, vocab)}')
predict_sentiment(review, model, vocab)

分词后的文本：特别 喜欢 角色 进步 ！
句子向量 : tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])


'positive'

In [23]:
vocab

{'<PAD>': 0,
 '看': 1,
 '人': 2,
 '但': 3,
 '好': 4,
 '这': 5,
 '还是': 6,
 '还': 7,
 '啊': 8,
 '故事': 9,
 '对': 10,
 '太': 11,
 '说': 12,
 '喜欢': 13,
 '得': 14,
 '能': 15,
 '没': 16,
 '剧情': 17,
 '真的': 18,
 '自己': 19,
 '中': 20,
 '不是': 21,
 '觉得': 22,
 '而': 23,
 '不错': 24,
 '会': 25,
 '一部': 26,
 '要': 27,
 '感觉': 28,
 '拍': 29,
 '更': 30,
 '这部': 31,
 '有点': 32,
 '着': 33,
 '导演': 34,
 '那': 35,
 '去': 36,
 '想': 37,
 '片子': 38,
 '挺': 39,
 '却': 40,
 '个': 41,
 '这种': 42,
 '好看': 43,
 '像': 44,
 '其实': 45,
 '为': 46,
 '我们': 47,
 '看到': 48,
 '爱': 49,
 '来': 50,
 '才': 51,
 '知道': 52,
 '真是': 53,
 '再': 54,
 '片': 55,
 '人物': 56,
 '还有': 57,
 '角色': 58,
 '小': 59,
 '吗': 60,
 '用': 61,
 '很多': 62,
 '影片': 63,
 '只是': 64,
 '一样': 65,
 '完全': 66,
 '镜头': 67,
 '结尾': 68,
 '现在': 69,
 '演员': 70,
 '完': 71,
 '真': 72,
 '世界': 73,
 '并': 74,
 '呢': 75,
 '一': 76,
 '死': 77,
 '为了': 78,
 '有些': 79,
 '开始': 80,
 '它': 81,
 '应该': 82,
 '情节': 83,
 '比较': 84,
 '实在': 85,
 '音乐': 86,
 '看过': 87,
 '戏': 88,
 '观众': 89,
 '时': 90,
 '只': 91,
 '过': 92,
 '出来': 93,
 '爱情': 94,
 '年': 95,