# 获取预训练模型并进行句子编码

In [1]:
from transformers import BertTokenizer, BertModel
from transformers import BertModel, BertConfig
import torch

# Load the Chinese BERT tokenizer and encoder from local files (nothing is downloaded)
def load_pretrained_bert_model(
    model_weights_path=r"E:\obsidian\Master\fund_stream_project\codes\bert\model\pytorch_model.bin",
    model_config_path=r"E:\obsidian\Master\fund_stream_project\codes\bert\model\config.json",
    vocab_file_path=r"E:\obsidian\Master\fund_stream_project\codes\bert\model\vocab.txt",
):
    """Load a BERT tokenizer and a ``BertModel`` from files on the local disk.

    The defaults point at the locally stored pretrained model, so calling the
    function without arguments behaves exactly as before; pass other paths to
    load a different checkpoint without editing this function.

    Args:
        model_weights_path: Path of the ``pytorch_model.bin`` weights file.
        model_config_path: Path of the matching ``config.json``.
        vocab_file_path: Path of the ``vocab.txt`` used to build the tokenizer.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # To fetch the public checkpoint from the Hugging Face Hub instead, call
    # ``from_pretrained("bert-base-chinese")`` on both classes.

    # Build the tokenizer from the vocabulary file.
    tokenizer = BertTokenizer.from_pretrained(vocab_file_path)

    # Read the architecture description.
    config = BertConfig.from_pretrained(model_config_path)

    # Instantiate the encoder and load the local weights into it.
    model = BertModel.from_pretrained(model_weights_path, config=config)
    return tokenizer, model

# Load the tokenizer and the BERT model
tokenizer, model = load_pretrained_bert_model()

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


In [10]:
# Encode a sentence into a fixed-size vector
def encode_sentence(sentence, tokenizer, model):
    """Embed ``sentence`` by mean-pooling the model's last hidden states.

    Note that this is an average over all token positions, not the [CLS]
    vector.

    Args:
        sentence: The text to embed.
        tokenizer: Callable that turns the text into model inputs; it is
            asked for PyTorch tensors and truncation at 512 tokens.
        model: Callable that accepts those inputs as keyword arguments and
            returns an object exposing ``last_hidden_state``.

    Returns:
        The embedding of the first (only) batch entry as a list of floats.
    """
    encoded = tokenizer(sentence, truncation=True, max_length=512, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Average over dim 1 (assumed to be the token axis of a
    # (batch, tokens, hidden) tensor), then take the single batch entry.
    pooled = hidden_states.mean(dim=1)
    return pooled[0].tolist()


In [7]:


# Turn a sentence into its embedding vector
def encode_sentence(sentence, tokenizer, model):
    """Return a mean-pooled sentence embedding as a plain Python list.

    The vector is the average of the model's ``last_hidden_state`` over all
    token positions (it is not the [CLS] vector).

    Args:
        sentence: Text to encode.
        tokenizer: Tokenizer callable; invoked with ``return_tensors="pt"``,
            ``max_length=512`` and ``truncation=True``.
        model: Model callable receiving the tokenizer output as keyword
            arguments; its result must expose ``last_hidden_state``.

    Returns:
        A list of floats: the pooled vector of the first batch entry.
    """
    tokenizer_kwargs = {"return_tensors": "pt", "max_length": 512, "truncation": True}
    model_inputs = tokenizer(sentence, **tokenizer_kwargs)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        model_outputs = model(**model_inputs)

    token_vectors = model_outputs.last_hidden_state
    # Mean over dim 1 (presumably the token axis), then drop the batch axis.
    sentence_vector = torch.mean(token_vectors, dim=1)[0]
    return sentence_vector.tolist()

# Example sentence
example_sentence = "我爱自然语言处理，这是一个很有趣的领域，我希望能够深入学习并取得进步。"

# Load the tokenizer and the BERT model
tokenizer, model = load_pretrained_bert_model()

# Encode the example sentence
encoded_sentence = encode_sentence(example_sentence, tokenizer, model)

# Bare expression so the notebook displays the embedding length (768 in the recorded output)
len(encoded_sentence)




768

In [8]:
encoded_sentence

[0.46283578872680664,
 -0.17855283617973328,
 0.5876517295837402,
 0.2005506455898285,
 0.10617582499980927,
 0.4033048748970032,
 0.03780314326286316,
 0.2060239017009735,
 -0.10630224645137787,
 -0.09003029763698578,
 -0.873487651348114,
 -0.3241155445575714,
 -0.10882815718650818,
 0.24245424568653107,
 -0.2772134840488434,
 0.10021370649337769,
 0.21823550760746002,
 0.18160279095172882,
 0.48677223920822144,
 0.17916251718997955,
 -0.16899532079696655,
 -0.17601926624774933,
 0.41788291931152344,
 -0.06866268068552017,
 0.296078622341156,
 -0.4317943751811981,
 -0.13419754803180695,
 -0.8369889259338379,
 0.42213791608810425,
 -0.5626455545425415,
 0.1345367133617401,
 0.43702465295791626,
 -0.32421091198921204,
 0.0028959610499441624,
 -0.19611358642578125,
 -0.5884252786636353,
 -0.4924357533454895,
 0.15527431666851044,
 -0.5412684082984924,
 -0.5271993279457092,
 -0.05955730751156807,
 -0.6810088753700256,
 -0.11806588619947433,
 -0.0557992123067379,
 -0.11257988959550858,
 0.

# 增量训练BERT

In [10]:
import os
import torch
from transformers import BertTokenizer, BertForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import ConcatDataset


# Select the GPU when one is available (the name is reused by later cells).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Masked-language-model training needs the MLM head: a bare BertModel returns
# no loss and does not accept the `labels` produced by the data collator.
# Load BertForMaskedLM and its tokenizer from the local model directory, which
# holds pytorch_model.bin, config.json and vocab.txt.
pretrained_model_dir = r"E:\obsidian\Master\fund_stream_project\codes\bert\model"
tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)
model = BertForMaskedLM.from_pretrained(pretrained_model_dir)

# Fine-tuning hyper-parameters
output_dir = "./fine_tuned_bert_model"
batch_size = 8
num_train_epochs = 3

# Directory holding the training corpus
data_dir = r"C:\Users\LJH\Desktop\txt存放\飞书"  # your data directory
# One dataset per corpus file is collected here
train_datasets = []

# Build a LineByLineTextDataset for every .txt file in the directory
for filename in os.listdir(data_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(data_dir, filename)
        train_datasets.append(LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=256,  # truncate each line to 256 tokens, well below BERT's 512-token limit
        ))

# Fail early with a readable message instead of ConcatDataset's bare assertion
if not train_datasets:
    raise FileNotFoundError(f"No .txt training files found in {data_dir!r}")

# Merge the per-file datasets into one
merged_train_dataset = ConcatDataset(train_datasets)

# Randomly mask 15% of the tokens for the MLM objective
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Trainer configuration
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Fine-tune with Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=merged_train_dataset,
)

# Run the fine-tuning
trainer.train()

# Save the fine-tuned model
model.save_pretrained(output_dir)




KeyboardInterrupt: 

In [11]:
import os
import torch
from transformers import BertTokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import ConcatDataset

# This cell's own import line does not bring in the MLM model class.
from transformers import BertForMaskedLM

# Select the GPU when one is available (the name is reused by later cells).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Masked-language-model training needs the MLM head: a bare BertModel returns
# no loss and does not accept the `labels` produced by the data collator.
# Load BertForMaskedLM and its tokenizer from the local model directory, which
# holds pytorch_model.bin, config.json and vocab.txt.
pretrained_model_dir = r"E:\obsidian\Master\fund_stream_project\codes\bert\model"
tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)
model = BertForMaskedLM.from_pretrained(pretrained_model_dir)

# Fine-tuning hyper-parameters
output_dir = "./fine_tuned_bert_model"
batch_size = 8
num_train_epochs = 3

# Directory holding the training corpus
data_dir = r"C:\Users\LJH\Desktop\txt存放\飞书"  # your data directory

# One dataset per corpus file is collected here
train_datasets = []

# Build a LineByLineTextDataset for every .txt file in the directory
for filename in os.listdir(data_dir):
    if filename.endswith(".txt"):
        file_path = os.path.join(data_dir, filename)
        train_datasets.append(LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=256,  # truncate each line to 256 tokens, well below BERT's 512-token limit
        ))

# Fail early with a readable message instead of ConcatDataset's bare assertion
if not train_datasets:
    raise FileNotFoundError(f"No .txt training files found in {data_dir!r}")

# Merge the per-file datasets into one
merged_train_dataset = ConcatDataset(train_datasets)

# Randomly mask 15% of the tokens for the MLM objective
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Trainer configuration
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',  # directory for the log files
    logging_steps=500,  # log every 500 training steps
    logging_first_step=True,  # also log the very first step
    log_level="info",  # verbose logging; `logging_verbose` is not a TrainingArguments option
)

# Fine-tune with Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=merged_train_dataset,
)

# Run the fine-tuning
trainer.train()

# Save the fine-tuned model
model.save_pretrained(output_dir)


KeyboardInterrupt: 

In [9]:
# Check whether PyTorch can see a CUDA device (the recorded output is True)
torch.cuda.is_available()

True

# 加载增量训练后的模型

In [9]:
from transformers import BertTokenizer, BertModel
from transformers import BertModel, BertConfig
import torch

# Load the fine-tuned BERT checkpoint and the tokenizer from local files (nothing is downloaded)
def load_pretrained_bert_model(
    model_weights_path=r"E:\obsidian\Master\fund_stream_project\codes\bert\bert_ft_checkpoint-84500\pytorch_model.bin",
    model_config_path=r"E:\obsidian\Master\fund_stream_project\codes\bert\bert_ft_checkpoint-84500\config.json",
    vocab_file_path=r"E:\obsidian\Master\fund_stream_project\codes\bert\model\vocab.txt",
):
    """Load a BERT tokenizer and a ``BertModel`` from files on the local disk.

    The defaults select the fine-tuned checkpoint (step 84500) together with
    the vocabulary of the original model, so calling the function without
    arguments behaves exactly as before; pass other paths to load a different
    checkpoint without editing this function.

    NOTE: the recorded load warning reports that ``bert.pooler.dense.*`` is
    missing from this checkpoint and is newly initialised. The pooler output
    of the returned model is therefore untrained; use ``last_hidden_state``.

    Args:
        model_weights_path: Path of the ``pytorch_model.bin`` weights file.
        model_config_path: Path of the matching ``config.json``.
        vocab_file_path: Path of the ``vocab.txt`` used to build the tokenizer.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # To fetch the public checkpoint from the Hugging Face Hub instead, call
    # ``from_pretrained("bert-base-chinese")`` on both classes.

    # Build the tokenizer from the vocabulary file.
    tokenizer = BertTokenizer.from_pretrained(vocab_file_path)

    # Read the architecture description.
    config = BertConfig.from_pretrained(model_config_path)

    # Instantiate the encoder and load the local weights into it.
    model = BertModel.from_pretrained(model_weights_path, config=config)
    return tokenizer, model

# Load the tokenizer and the BERT model
tokenizer, model = load_pretrained_bert_model()

Some weights of BertModel were not initialized from the model checkpoint at E:\obsidian\Master\fund_stream_project\codes\bert\bert_ft_checkpoint-84500\pytorch_model.bin and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Example sentence
example_sentence = "我爱自然语言处理，这是一个很有趣的领域，我希望能够深入学习并取得进步。"



# Encode the example sentence; the bare name on the last line makes the notebook display the vector
encoded_sentence = encode_sentence(example_sentence, tokenizer, model)
encoded_sentence

[0.717182457447052,
 -0.41662341356277466,
 0.5299432277679443,
 0.2987190783023834,
 0.11066993325948715,
 0.019121956080198288,
 -0.35343948006629944,
 0.4006330966949463,
 0.09625176340341568,
 -0.22542014718055725,
 -1.0740147829055786,
 0.295207679271698,
 0.13141460716724396,
 -0.21347811818122864,
 -0.4536829888820648,
 0.11440753936767578,
 -0.04552561417222023,
 0.652789294719696,
 0.4582778215408325,
 0.06845243275165558,
 -0.11564276367425919,
 -0.06365945935249329,
 -0.12925027310848236,
 -0.3374503552913666,
 0.34227195382118225,
 -0.7340418100357056,
 0.03328901529312134,
 -0.9234859347343445,
 0.31954947113990784,
 -0.4369802176952362,
 0.3526992201805115,
 0.4182863235473633,
 -0.26276642084121704,
 0.15289083123207092,
 -0.04965375363826752,
 -0.5104486346244812,
 -0.08860406279563904,
 0.4965195655822754,
 -0.45352956652641296,
 -0.8987458944320679,
 0.12773944437503815,
 -0.4509150981903076,
 0.0021445131860673428,
 -0.6528545022010803,
 -0.012520529329776764,
 -0.18

In [None]:
# Load the fine-tuned BERT model (with its masked-LM head), move it to the selected device
# and switch it to evaluation mode
fine_tuned_model = BertForMaskedLM.from_pretrained(output_dir).to(device)
fine_tuned_model.eval()

# Encode a sentence with the fine-tuned model
def encode_sentence(sentence):
    """Return the mean-pooled final-layer embedding of ``sentence``.

    Uses the module-level ``tokenizer``, ``fine_tuned_model`` and ``device``.

    ``fine_tuned_model`` is a ``BertForMaskedLM``; its output carries logits
    and (on request) per-layer hidden states, but no ``last_hidden_state``.
    The hidden states are therefore requested explicitly and the final
    layer is pooled.

    Args:
        sentence: The text to embed; input longer than 512 tokens is
            truncated so it fits BERT's position limit.

    Returns:
        A tensor on ``device`` holding the average of the final-layer token
        vectors (expected shape ``(1, hidden_size)``).
    """
    input_ids = tokenizer.encode(
        sentence, return_tensors="pt", max_length=512, truncation=True
    ).to(device)
    with torch.no_grad():
        outputs = fine_tuned_model(input_ids, output_hidden_states=True)
    # hidden_states[-1] is the output of the last encoder layer.
    sentence_embedding = torch.mean(outputs.hidden_states[-1], dim=1)
    return sentence_embedding

# Try the encoder on a test sentence
test_sentence = "这是一个测试句子"
encoded_sentence = encode_sentence(test_sentence)
print(encoded_sentence)

# 对bert训练文本的处理
1. 不能有空行
2. 每一行不能超过512个token的限制

In [12]:
# Remove blank lines from the raw corpus.
# The encoding is stated explicitly: the next processing step reads the output
# file as UTF-8, whereas the platform default encoding may differ.
with open(r"C:\Users\LJH\Desktop\txt存放\飞书 - 副本\total.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Strip each line once and keep only the ones that still contain text
non_empty_lines = [text for text in (line.strip() for line in lines) if text]

# Write the remaining lines to a new file
with open(r"C:\Users\LJH\Desktop\txt存放\飞书 - 副本\total_deleted_blank.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(non_empty_lines))


In [14]:
# Characters that end a sentence (preferred split points)
_SENTENCE_END_CHARS = "。?？！!"
# Characters that end a clause (fallback split points)
_CLAUSE_BREAK_CHARS = ",，"


def _last_index_of_any(text, chars, start, end):
    """Return the largest index in ``[start, end)`` whose character is in ``chars``, or -1."""
    return max(text.rfind(ch, start, end) for ch in chars)


def process_line(line, max_length=450):
    """Split ``line`` into consecutive pieces of at most ``max_length`` characters.

    Each piece is cut, in order of preference, right after the last
    sentence-ending mark inside the current window, right after the last
    comma inside the window, or at exactly ``max_length`` characters when the
    window holds no punctuation. Joining the pieces gives ``line`` back.

    Compared with the earlier recursive version this one never returns a
    piece longer than ``max_length``, never appends an empty trailing piece,
    and does not hit the recursion limit on very long lines.

    Args:
        line: The text to split.
        max_length: Maximum number of characters per piece; must be >= 1.

    Returns:
        A list of pieces. A line that already fits (including the empty
        string) is returned unchanged as a single-element list.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    pieces = []
    start = 0
    total = len(line)
    while total - start > max_length:
        window_end = start + max_length
        cut = _last_index_of_any(line, _SENTENCE_END_CHARS, start, window_end)
        if cut == -1:
            # No sentence end in the window: fall back to a comma.
            cut = _last_index_of_any(line, _CLAUSE_BREAK_CHARS, start, window_end)
        # Keep the punctuation mark with the text before it; hard-cut otherwise.
        end = window_end if cut == -1 else cut + 1
        pieces.append(line[start:end])
        start = end
    # The remainder is non-empty whenever the loop ran.
    pieces.append(line[start:])
    return pieces

def process_file(input_file, output_file):
    """Re-wrap every line of ``input_file`` and write the result to ``output_file``.

    Each input line is stripped of surrounding whitespace and passed to
    ``process_line`` (with its default length limit); all resulting pieces
    are written one per line. Both files are handled as UTF-8, and the output
    has no trailing newline.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the text file to create or overwrite.
    """
    with open(input_file, "r", encoding="utf-8") as src:
        raw_lines = src.readlines()

    # Flatten the per-line piece lists into one list, preserving order.
    pieces = [piece for raw in raw_lines for piece in process_line(raw.strip())]

    with open(output_file, "w", encoding="utf-8") as dst:
        dst.write("\n".join(pieces))

# Example usage: split the blank-line-free corpus into lines short enough for BERT
input_file = r"C:\Users\LJH\Desktop\txt存放\飞书 - 副本\total_deleted_blank.txt"
output_file = r"C:\Users\LJH\Desktop\txt存放\飞书 - 副本\total_deleted_blank_less_512.txt"
process_file(input_file, output_file)
