In [None]:
pip install transformers accelerate


In [None]:
pip install peft bitsandbytes


In [None]:
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from peft import PeftModel

# Load the quantized base model and its tokenizer. Everything below only runs
# when CUDA is available; otherwise `model` and `tokenizer` are never defined
# and the later cells that use them will fail with NameError.
if torch.cuda.is_available():
    # BitsAndBytesConfig that enables 4-bit quantization.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,        # enable 4-bit quantization
        bnb_4bit_quant_type="nf4", # quantization type (commonly 'fp4' or 'nf4')
        bnb_4bit_compute_dtype=torch.float16, # compute dtype is float16
        llm_int8_enable_fp32_cpu_offload=True  # turn on FP32 CPU offload

    )
    model_id = "LLM-Research/Meta-Llama-3.1-70B-Instruct"
    # Load the model.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",  # let the loader place the model on the available GPU(s)
        quantization_config=bnb_config,
        resume_download=True  # try to resume an interrupted download

    )

    # Optionally load a fine-tuned LoRA adapter (path where the fine-tuned checkpoint was saved).
    # NOTE(review): the commented line refers to `base_model`, which is not defined in
    # this notebook; it would have to be `model` if this is re-enabled.
    #model = PeftModel.from_pretrained(base_model, "./llama3-70b-qlora-triples/checkpoint-347")

    # Load the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Optional system-prompt setting; adjust to your needs.
    tokenizer.use_default_system_prompt = False

In [None]:
# Switch the model to evaluation mode before inference; skip this cell when fine-tuning.
model.eval()

In [None]:
# Custom Llama-3-style chat template (Jinja).
#
# Rendered layout: BOS, a system turn (always emitted; its content is empty when
# the conversation has no leading system message), then one
# "<|start_header_id|>ROLE<|end_header_id|>\nCONTENT<|eot_id|>" turn per
# remaining message.
#
# Fixes over the previous template:
#   * a first message that is NOT a system message is rendered as a normal turn
#     instead of being silently skipped by `messages[1:]`;
#   * when `add_generation_prompt` is truthy, an open assistant header is
#     appended so the model continues as the assistant.
# For conversations that start with a system message and do not request a
# generation prompt, the rendered text is unchanged.
tokenizer.chat_template = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
    "{% if messages[0]['role'] == 'system' %}"
    "{{ messages[0]['content'] }}{% set turns = messages[1:] %}"
    "{% else %}{% set turns = messages %}{% endif %}"
    "<|eot_id|>"
    "{% for message in turns %}"
    "<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n{{ message['content'] }}<|eot_id|>"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|start_header_id|>assistant<|end_header_id|>\n{% endif %}"
)

In [None]:
import csv
import ast
import re
import difflib
from collections import defaultdict

# Read a tab-separated text file, keeping only the first two columns (head, relation).
def read_triples_from_file(file_path):
    """Return the (head, relation) pairs found in a tab-separated file.

    Args:
        file_path: Path of a UTF-8 text file parsed with ``csv.reader`` using a
            tab delimiter.

    Returns:
        A list of ``(head, relation)`` tuples, one per row that has at least two
        columns; both values are whitespace-stripped. Rows with fewer than two
        columns (including blank lines) are skipped, and any columns after the
        second are ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [
            (columns[0].strip(), columns[1].strip())
            for columns in csv.reader(handle, delimiter='\t')
            if len(columns) >= 2
        ]

def predict_triples(batch):
    """Ask the model to complete the tails for one batch of (head, relation) pairs.

    Builds a system + user conversation, renders it with the tokenizer's chat
    template, generates up to 256 new tokens with sampling disabled, prints the
    decoded reply, and appends the triples parsed from that reply to the
    notebook-level ``all_predictions`` list.

    Args:
        batch: The (head, relation) pairs to complete; it is interpolated into
            the user message via an f-string.

    Returns:
        None. Results are accumulated in ``all_predictions``.
    """
    chat = [
        {
            "role": "system",
            "content": "You are a knowledge graph expert. Given a list of (head, relation), return only the completed triples in Python list format: [(head, relation, tail), ...].",
        },
        {"role": "user", "content": f"Please complete the tail entity:: {batch}"},
    ]

    # Render the conversation to token ids and move them to the model's device.
    prompt_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(model.device)
    prompt_length = prompt_ids.shape[-1]

    # Generate without tracking gradients.
    with torch.inference_mode():
        generated = model.generate(
            prompt_ids,
            max_new_tokens=256,
            do_sample=False,
            use_cache=True,
        )

    # Decode only the newly generated tokens (everything after the prompt).
    reply = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
    print("LLaMA:", reply.strip())
    all_predictions.extend(parse_triplets(reply))

def parse_triplets(raw_string):
    """Extract ``(head, relation, tail)`` tuples from a model reply.

    The text between the first ``[`` and the next ``]`` is scanned for
    parenthesised, comma-separated three-field groups. If the reply contains no
    ``[`` at all, the whole reply is scanned instead of raising ``IndexError``
    (the previous behaviour), so a bare ``(h, r, t)`` answer is still parsed and
    a reply without any tuple yields an empty list.

    Args:
        raw_string: Decoded model output.

    Returns:
        A list of ``(head, relation, tail)`` string tuples with surrounding
        whitespace removed. Double quotes around the head and tail are dropped
        by the pattern; the relation is returned as matched.
    """
    _, opening, after_opening = raw_string.partition("[")
    # Keep only the bracketed list body when there is one; a missing "]" simply
    # means "scan to the end of the reply".
    candidate = after_opening.partition("]")[0] if opening else raw_string

    # One parenthesised group made of three comma-separated fields.
    pattern = r"\(\s*\"?([^,]+?)\"?\s*,\s*([^,]+?)\s*,\s*\"?([^)]+?)\"?\s*\)"
    return [
        (head.strip(), relation.strip(), tail.strip())
        for head, relation, tail in re.findall(pattern, candidate)
    ]

def _write_triples(filename, get_triples):
    """Write the triples returned by ``get_triples()`` to ``filename`` as TSV lines.

    The triples are fetched lazily, inside the ``try`` and after the file has
    been opened, so any failure (including a missing notebook-level list) is
    reported through the same printed message instead of propagating.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            for triplet in get_triples():
                # One "head<TAB>relation<TAB>tail" line per triple.
                f.write(f"{triplet[0]}\t{triplet[1]}\t{triplet[2]}\n")
        print(f"三元组已成功写入到 '{filename}' 文件")
    except Exception as e:
        print(f"写入文件时出错: {e}")


def output_result(filename, triples=None):
    """Write predicted triples to ``filename``, one tab-separated triple per line.

    Args:
        filename: Destination path; the file is overwritten.
        triples: Iterable of 3-item sequences to write. When omitted, the
            notebook-level ``all_predictions`` list is used.

    Errors are caught and reported with a printed message; nothing is returned.
    """
    _write_triples(filename, lambda: all_predictions if triples is None else triples)


def output__wrong_result(filename, triples=None):
    """Write wrongly predicted triples to ``filename``, one tab-separated triple per line.

    Args:
        filename: Destination path; the file is overwritten.
        triples: Iterable of 3-item sequences to write. When omitted, the
            notebook-level ``wrong_list`` list is used.

    Errors are caught and reported with a printed message; nothing is returned.
    """
    _write_triples(filename, lambda: wrong_list if triples is None else triples)



def load_gold_triples(file_path):
    """Load reference triples from a tab-separated file.

    Args:
        file_path: Path of a UTF-8 file with ``head<TAB>relation<TAB>tail`` rows.
            Lines with fewer than three fields are skipped; fields beyond the
            third are ignored.

    Returns:
        A ``(gold_set, gold_dict)`` pair: the set of all ``(head, relation, tail)``
        triples, and a dict mapping ``(head, relation)`` to a tail. When the same
        ``(head, relation)`` occurs more than once, the dict keeps the tail of
        the last such line.
    """
    triple_set = set()
    tail_by_pair = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) < 3:
                continue
            head, relation, tail = (field.strip() for field in fields[:3])
            triple_set.add((head, relation, tail))
            tail_by_pair[(head, relation)] = tail
    return triple_set, tail_by_pair

def load_predicted_best(file_path):
    """Load predicted tails keyed by ``(head, relation)`` from a tab-separated file.

    Only lines with exactly three tab-separated fields are used. There is no
    score column: when a ``(head, relation)`` pair appears on several lines, the
    tail from the last one wins. The tail is stored as split from the stripped
    line (it is not stripped again on its own).

    Args:
        file_path: Path of a UTF-8 file with ``head<TAB>relation<TAB>tail`` rows.

    Returns:
        A plain dict mapping ``(head, relation)`` to the predicted tail.
    """
    tail_by_pair = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 3:
                continue
            head, relation, tail = fields
            tail_by_pair[(head.strip(), relation.strip())] = tail
    return tail_by_pair

def evaluate_predictions(gold_file, pred_file):
    """Compare predicted tails with gold tails and print precision / recall / F1.

    For every predicted ``(head, relation)`` that also exists in the gold data:
    predictions for the relations ``starred_actors``, ``has_tags`` and
    ``has_genre`` are counted as correct without comparison; any other relation
    is counted as correct when ``is_similar`` accepts the predicted tail.
    Mismatches are printed and appended to the notebook-level ``wrong_list`` as
    ``(head, relation, predicted_tail)``. Predictions whose key is missing from
    the gold data (or whose gold tail is empty) are reported with a warning.

    Precision divides by the number of predicted ``(head, relation)`` keys and
    recall by the number of distinct gold triples.

    Args:
        gold_file: Path of the tab-separated gold triples file.
        pred_file: Path of the tab-separated predicted triples file.

    Returns:
        None. All results are printed.
    """
    # Relations whose predictions are accepted without comparing tails.
    auto_correct_relations = {"starred_actors", "has_tags", "has_genre"}

    gold_set, gold_dict = load_gold_triples(gold_file)
    pred_dict = load_predicted_best(pred_file)
    correct = 0
    total_gold = len(gold_set)
    total_pred = len(pred_dict)
    for (h, r), pred_tail in pred_dict.items():
        gold_tail = gold_dict.get((h, r))
        if gold_tail:
            if r in auto_correct_relations:
                correct += 1
            # Pass (predicted, gold) in the order is_similar declares; the two
            # were previously swapped, and difflib's ratio can depend on
            # argument order.
            elif is_similar(pred_tail, gold_tail):
                correct += 1
            else:
                wrong_list.append((h, r, pred_tail))
                print(f"✘ ({h}, {r})")
                print(f"    predicted: {pred_tail}")
                print(f"    gold:      {gold_tail}")
        else:
            print(f"⚠ ({h}, {r}) not found in gold data.")

    precision = correct / total_pred if total_pred > 0 else 0.0
    recall = correct / total_gold if total_gold > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

    print("\n=== Evaluation Summary ===")
    print(f"Correct (fuzzy match): {correct}")
    print(f"Total Gold: {total_gold}")
    print(f"Total Predicted: {total_pred}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    

def normalize(text):
    """Lower-case ``text``, trim surrounding whitespace, then drop every '.' and ','.

    Only those two punctuation characters are removed; whitespace exposed by
    their removal is left in place.
    """
    drop_dots_and_commas = str.maketrans("", "", ".,")
    return text.lower().strip().translate(drop_dots_and_commas)

def is_similar(pred_tail, gold_tail, threshold=0.4):
    """Decide whether a predicted tail loosely matches the gold tail.

    Both strings are normalized first. They are considered similar when their
    ``difflib.SequenceMatcher`` ratio reaches ``threshold``, or when either
    normalized string contains the other. Note that an empty normalized string
    is contained in any string, so it always matches.

    Args:
        pred_tail: Predicted tail text.
        gold_tail: Reference tail text.
        threshold: Minimum similarity ratio (0..1) accepted as a match.

    Returns:
        True when the tails are considered similar, otherwise False.
    """
    expected = normalize(gold_tail)
    predicted = normalize(pred_tail)

    similarity = difflib.SequenceMatcher(None, predicted, expected).ratio()
    return (
        similarity >= threshold
        or expected in predicted
        or predicted in expected
    )


In [None]:
# Path of the input file (tab-separated rows; the original note assumed a file
# named triples.txt, but the path used here is testtiny.txt).
file_path = 'testtiny.txt'

# Read the rows from the file, keeping only head and relation.
triples = read_triples_from_file(file_path)

# Split the pairs into fixed-size batches (the last batch may be shorter).
batch_size = 10
batches = [triples[i:i + batch_size] for i in range(0, len(triples), batch_size)]

# Accumulators for the batched prediction run: predict_triples extends
# all_predictions; error_list is only referenced by commented-out error handling.
all_predictions = []
error_list = []

In [None]:
# Run every batch through the model, then write all accumulated predictions.
# Exceptions are not caught here, so a failing batch stops the loop before the
# output file is written.
for batch in batches:
    predict_triples(batch)
output_result("output_result.txt")

In [None]:
def singlePredit(head, relation):
    """Ask the model for the tail of a single ``(head, relation)`` pair.

    Builds a system + user conversation, renders it with the tokenizer's chat
    template, generates up to 256 new tokens with sampling disabled, prints the
    decoded reply and returns it. The reply is returned as raw text; it is not
    parsed into a triple here.

    Args:
        head: Head entity text.
        relation: Relation name.

    Returns:
        The decoded model reply with surrounding whitespace stripped.
    """
    system_prompt = "You are a knowledge graph expert. Given (head, relation), return only the completed triples in Python format: (head, relation, tail)."
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Please complete the tail entity: {head} | {relation} | ?"}
    ]

    # Render the conversation to token ids and move them to the model's device.
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

    # Generate without tracking gradients. The former top_k / top_p arguments
    # were dropped: with do_sample=False they have no effect on the output and
    # only make the generation config inconsistent.
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=256,
            do_sample=False,
            use_cache=True
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
    print("LLaMA:", response.strip())
    return response.strip()

In [None]:
# Single-query variant: ask the model about one (head, relation) pair at a time.
result_aa = []  # raw model replies, one per (head, relation) pair, in input order

# Path of the input file (tab-separated rows).
file_path = 'testtiny.txt'

# Read the rows from the file, keeping only head and relation.
triples = read_triples_from_file(file_path)

# No batching here: each "batch" is a single (head, relation) pair.
# batch_size is kept for parity with the batched cell but is not used below.
batch_size = 10
batches = triples

for batch in batches:
    # Keep every reply; previously the return value was discarded and
    # result_aa stayed empty.
    result_aa.append(singlePredit(batch[0], batch[1]))

# NOTE(review): output_result writes the notebook-level `all_predictions`, which
# this loop does not modify, so the file reflects whatever the batched run left
# there. Confirm whether the replies in result_aa should be parsed and written
# instead.
output_result("output_result.txt")

In [None]:
def is_number(s):
    """Return True when ``float(s)`` succeeds and False when it raises ValueError.

    Other exceptions raised by ``float`` (for example ``TypeError`` for ``None``)
    are not handled and propagate to the caller.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

def clean_field(field):
    """Strip whitespace and wrapping quote / parenthesis characters from a field.

    After trimming whitespace, every leading ``'``, ``"`` or ``(`` and every
    trailing ``'``, ``"`` or ``)`` is removed, and the result is trimmed of
    whitespace once more.
    """
    leading_junk = "'\"("
    trailing_junk = "'\")"
    return field.strip().lstrip(leading_junk).rstrip(trailing_junk).strip()

def clean_and_validate_quadruples(input_path, output_clean_path, output_invalid_path):
    """Split a tab-separated prediction file into clean triples and rejected rows.

    Despite the name, rows are expected to have three columns
    (head, relation, tail). A row is accepted when it has exactly three
    tab-separated fields and its relation contains an underscore; accepted
    fields are passed through ``clean_field``. Every other non-blank row is
    written, field by field through ``clean_field``, to the invalid file.

    The underscore test is done once on the cleaned relation: ``clean_field``
    only trims whitespace, quotes and parentheses at the ends, so it cannot add
    or remove an underscore and the former second check could never fail.

    Args:
        input_path: Tab-separated file to validate.
        output_clean_path: Destination for accepted triples (overwritten).
        output_invalid_path: Destination for rejected rows (overwritten).

    Returns:
        None. A summary line with both counts is printed.
    """
    clean_data = []
    invalid_lines = []

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = [clean_field(p) for p in line.split('\t')]

            # Accept only three-column rows whose relation looks like a
            # snake_case relation name.
            if len(parts) == 3 and "_" in parts[1]:
                clean_data.append(tuple(parts))
            else:
                invalid_lines.append(parts)

    # Write the accepted rows.
    with open(output_clean_path, 'w', encoding='utf-8') as f:
        for head, relation, tail in clean_data:
            f.write(f"{head}\t{relation}\t{tail}\n")

    # Write the rejected rows.
    with open(output_invalid_path, 'w', encoding='utf-8') as f:
        for parts in invalid_lines:
            f.write('\t'.join(parts) + '\n')

    print(f"✅ 清洗完成：有效四元组 {len(clean_data)} 条，异常记录 {len(invalid_lines)} 条")

In [None]:
# Clean the raw prediction file: accepted rows go to cleaned_quadruples.txt,
# rejected rows go to invalid_quadruples.txt.
clean_and_validate_quadruples(
    input_path='output_result.txt',
    output_clean_path='cleaned_quadruples.txt',
    output_invalid_path='invalid_quadruples.txt'
)


In [None]:
# Collects (head, relation, predicted_tail) for every mismatch that
# evaluate_predictions finds; re-run this cell to reset it before evaluating again.
wrong_list = []

In [None]:
# Evaluate the cleaned predictions against the gold triples in test.txt.
evaluate_predictions("test.txt", "cleaned_quadruples.txt")

In [None]:
# Show the first 20 wrong predictions collected during evaluation.
wrong_list[:20]

In [None]:
# Write the wrong predictions to the file "wrong_predict" (tab-separated).
output__wrong_result("wrong_predict")

开始微调========================================================

In [None]:
# Switch the model to training mode before fine-tuning.
model.train()

In [None]:
from datasets import Dataset

def load_triples(filepath):
    """Build a prompt/label dataset from a tab-separated triples file.

    Each line with exactly three tab-separated fields (head, relation, tail)
    yields one example: the prompt asks for the tail of ``head | relation`` and
    the label is the tail. Other lines are skipped.

    Args:
        filepath: Path of the UTF-8 triples file.

    Returns:
        A ``Dataset`` with the columns ``"prompt"`` and ``"label"``.
    """
    examples = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 3:
                continue
            head, relation, tail = fields
            examples.append(
                (f"Please complete the tail entity: {head} | {relation} | ?", tail)
            )
    return Dataset.from_dict({
        "prompt": [prompt for prompt, _ in examples],
        "label": [tail for _, tail in examples],
    })

In [None]:
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Prepare the quantized model for k-bit training, then wrap it with LoRA adapters.
model = prepare_model_for_kbit_training(model)

# LoRA configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # modules commonly targeted for LLaMA (per the original note)
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# From here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, peft_config)


In [None]:
# Load the training set. load_triples already returns a datasets.Dataset with
# "prompt" and "label" columns, so the former Dataset.from_list(...) round-trip
# was redundant (and would have dropped the columns for an empty dataset).
train_dataset = load_triples("train.txt")


In [None]:

def tokenize_function(example, max_length=64):
    """Tokenize one prompt/label example and build loss labels for the answer only.

    The prompt and label are joined with a single space and tokenized once,
    padded/truncated to ``max_length``. ``labels`` is then derived from that
    same token sequence: prompt positions and padding positions are set to
    -100, and the remaining (answer) positions carry the corresponding
    ``input_ids``.

    Previously the labels were built from a separate tokenization of the label
    text. That put the tokenizer's own leading special token(s) and a different
    token boundary into ``labels``, so they did not line up with ``input_ids``.

    Args:
        example: Mapping with the string fields ``"prompt"`` and ``"label"``.
        max_length: Fixed sequence length used for truncation and padding.

    Returns:
        The tokenizer output for the joined text with an added ``"labels"`` list
        of length ``max_length``.
    """
    prompt = example["prompt"]
    label = example["label"]

    tokenized = tokenizer(
        prompt + " " + label,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    # Number of tokens the prompt occupies (including any special tokens the
    # tokenizer prepends). Assumes the prompt's tokens are a prefix of the
    # joined text's tokens, which holds when the answer starts with a space.
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=max_length)["input_ids"])

    # Position of the first real token, so the mask is right for either
    # padding side.
    first_real = attention_mask.index(1) if 1 in attention_mask else len(attention_mask)
    answer_start = first_real + prompt_len

    tokenized["labels"] = [
        token_id if (position >= answer_start and mask == 1) else -100
        for position, (token_id, mask) in enumerate(zip(input_ids, attention_mask))
    ]
    return tokenized


# Reuse the EOS token as the padding token (set before mapping, because
# tokenize_function pads to a fixed length), then tokenize every example and
# drop the raw "prompt" / "label" text columns.
tokenizer.pad_token = tokenizer.eos_token
train_dataset = train_dataset.map(tokenize_function, remove_columns=["prompt", "label"])


In [None]:
# Inspect one tokenized training example. train_dataset has already been mapped
# through tokenize_function (its "prompt" / "label" columns are gone), so the
# first row is read directly instead of tokenizing it again; previously
# `tokenized_example` was never defined and this cell raised NameError.
tokenized_example = train_dataset[0]
print(tokenized_example)

In [None]:
# Sanity check: show the first tokenized row and the Python type of its first label value.
print(train_dataset[0])
print(type(train_dataset[0]['labels'][0]))

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW and is no longer importable from recent releases.
from torch.optim import AdamW

# Set up the AdamW optimizer over the model's parameters.
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.001)

# ========== 5. Training arguments ==========
training_args = TrainingArguments(
    output_dir="./llama3-70b-qlora-triples",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
    fp16=True,
    bf16=False,
    report_to="none"
)



In [None]:
# ========== 6. Start the Trainer ==========
# Builds the Trainer from the PEFT-wrapped model, the training arguments, the
# tokenized dataset and the tokenizer, then runs training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, None)  # supply only the optimizer; no scheduler is passed
)

trainer.train()