In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# 加载文件

In [2]:
with open('symptom_BSI.txt', 'r', encoding='utf-8') as file:
  sentences = file.readlines()
  # sentences = file.readlines()[:100] # ⚠️⚠️⚠️测试的时候可以只看前100条数据⚠️⚠️⚠️
print('文本条数: ', len(sentences))
print('预览第一条: ', sentences[0])

文本条数:  18
预览第一条:  头晕或晕倒



In [3]:
# 加载预训练模型和tokenizer
# 模型名字直接写入bert-base-chinese这个简化模型名就可以了，https://huggingface.co/google-bert/bert-base-chinese
# 如果无法用梯子的话，可以本地下载：huggingface-cli download --resume-download bert-base-chinese
model_name = "E:/comorbidity/feature_extraction/MentalBERT/chinese-mentalbert"
# model_name = "hfl/chinese-bert-wwm"

# 也可以试试使用哈工大的模型，model_name = "hfl/chinese-bert-wwm"
# 注意提前需要下载huggingface-cli download --resume-download hfl/chinese-bert-wwm

# 加载模型
# 会从huggingface中下载模型
# 源码：class PreTrainedModel(nn.Module....)
# 所以，这里创建的既是PreTrainedModel类的实例，也是torch.nn.Module的实例
# 对于警告Some weights of the model checkpoint，对与我们的任务，可以不用在意
# 相关讨论：https://blog.csdn.net/PolarisRisingWar/article/details/123974645   https://huggingface.co/google-bert/bert-base-uncased/discussions/4

# 从镜像站下载
# conda activate test
# pip install -U huggingface_hub
# $env:HF_ENDPOINT = "https://hf-mirror.com"
# huggingface-cli download --resume-download bert-base-chinese   或者使用哈工大模型   huggingface-cli download --resume-download hfl/chinese-bert-wwm
model = BertModel.from_pretrained(model_name)

# 加载tokenizer
# 使用Tokenizer，就是为了将输入的句子加工为bert模型可以处理的格式
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of BertModel were not initialized from the model checkpoint at E:/comorbidity/feature_extraction/MentalBERT/chinese-mentalbert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# 将模型放置在GPU上
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 把模型放到cpu或gpu
model.to(device)
# 将模型设置为评估模式，https://blog.csdn.net/weixin_45275599/article/details/131524189
model.eval()

# 切分数据
batch_size = 16  # 批大小
data_loader = DataLoader(sentences, batch_size=batch_size)
for batch in data_loader:
    print(len(batch), batch)

16 ['头晕或晕倒\n', '对事物不感兴趣\n', '神经过敏，心中不踏实\n', '胸痛\n', '感到孤独\n', '感到紧张或容易紧张\n', '恶心或胃部不舒服\n', '感到苦闷\n', '无缘无故地突然感到害怕\n', '呼吸有困难\n', '感到自己没有什么价值\n', '一阵阵恐惧或惊恐\n', '身体发麻或刺痛\n', '感到没有前途没有希望\n', '感到坐立不安心神不定\n', '感到身体的某一部分软弱无力\n']
2 ['想结束自己的生命\n', '感到害怕\n']


In [5]:
# ---- 文本转向量 ----
# 生成的向量存放在这里
cls_embeddings = []

# 使用tqdm显示处理进度
# tqdm b站教程：https://www.bilibili.com/video/BV1ZG411M7Ge/?spm_id_from=333.337.search-card.all.click&vd_source=eace37b0970f8d3d597d32f39dec89d8
for batch_sentences in tqdm(data_loader):
    # tokenizer官方文档：https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
    # truncation=True，对输入句子进行截断，这里确保最大长度不超过512个字
    # max_length：不设置的话，默认会截断到该模型可接受的最大长度
    # padding=True 或 padding='longest': 将所有句子填充到批次中最长句子的长度
    # padding="max_length": 将所有句子填充到由 max_length 参数指定的长度
    inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
    # print(123, inputs.input_ids[0], tokenizer.decode(inputs.input_ids[0]))
    
    # 把编码好的数据，也放在device上，It is necessary to have both the model, and the data on the same device, either CPU or GPU
    # https://huggingface.co/docs/transformers/v4.39.2/en/main_classes/tokenizer#transformers.BatchEncoding.to
    # https://stackoverflow.com/questions/63061779/pytorch-when-do-i-need-to-use-todevice-on-a-model-or-tensor
    inputs.to(device)

    # 设置不要计算梯度
    # 一般来说，如果我们只是用模型进行“预测”，而不涉及对模型进行更新时，就不需要计算梯度，以此来节约内存，增加运算效率
    # with上下文中，对model的调用将遵循torch.no_grad()，即不会计算梯度
    with torch.no_grad():
        outputs = model(**inputs)

    # 把这一批词向量存入cls_embeddings容器中
    # tensor.cpu() 将张量移动到 CPU
    # tensor.numpy() 将 CPU 上的张量转换为 NumPy 数组
    cls_embeddings.append(outputs.last_hidden_state[:, 0].cpu().numpy()) # 只取CLS对应的向量

    # print('pt格式', type(outputs.last_hidden_state[:, 0].shape), outputs.last_hidden_state[:, 0].shape)
    print('numpy格式', type(outputs.last_hidden_state[:, 0].cpu().numpy()), outputs.last_hidden_state[:, 0].cpu().numpy().shape)

# 合并句子向量
print('batch个数：', len(cls_embeddings))
cls_embeddings_np = np.vstack(cls_embeddings)
print('最终生成的词向量', type(cls_embeddings_np), cls_embeddings_np.shape)

# ---- 保存词嵌入向量 ----
# 保存句子向量到npy文件
# 官方文档：https://numpy.org/doc/stable/reference/generated/numpy.save.html
output_file = "emb-chinese-mentalbert_BSI.npy"
np.save(output_file, cls_embeddings_np)
print("词向量存储于: ", output_file)

embeddings = np.load(output_file)
print("加载回来，验证一下：", type(embeddings), embeddings.shape)

100%|██████████| 2/2 [00:00<00:00,  3.75it/s]

numpy格式 <class 'numpy.ndarray'> (16, 768)
numpy格式 <class 'numpy.ndarray'> (2, 768)
batch个数： 2
最终生成的词向量 <class 'numpy.ndarray'> (18, 768)
词向量存储于:  emb-chinese-mentalbert_BSI.npy
加载回来，验证一下： <class 'numpy.ndarray'> (18, 768)





In [6]:
embeddings

array([[-0.81493896,  0.35796282,  0.39984718, ...,  0.22862037,
        -0.66333354, -0.19406016],
       [-0.357309  , -0.00480721,  0.69890904, ..., -0.01500717,
         0.06109452, -0.10706255],
       [ 0.08008588, -0.05150903,  0.32173806, ...,  0.8915945 ,
         0.0101707 ,  0.15663666],
       ...,
       [-0.56677085,  0.35215136, -0.10006391, ...,  0.06081013,
        -0.50333923, -0.3458504 ],
       [-0.01231871,  0.76290154,  0.5574713 , ..., -0.31001216,
        -0.46291214, -0.27651986],
       [-0.07064212,  0.46961373,  0.11719873, ..., -0.30919895,
        -0.57912475,  0.2584027 ]], dtype=float32)