In [None]:
pip install transformers accelerate


In [None]:
pip install peft bitsandbytes


In [None]:
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from peft import PeftModel

if torch.cuda.is_available():
    # NOTE(review): nothing is defined when CUDA is unavailable, so later cells
    # that use `model` / `tokenizer` will fail with NameError in that case.

    # Set BitsAndBytesConfig to enable 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,        # Enable 4-bit quantization
        bnb_4bit_quant_type="nf4", # Choose the quantization type (commonly 'fp4' or 'nf4')
        bnb_4bit_compute_dtype=torch.float16,# Set computation type to float16
        llm_int8_enable_fp32_cpu_offload=True  # Enable CPU FP32 offload

    )
    model_id = "LLM-Research/Meta-Llama-3.1-70B-Instruct"
    # Load the base model with the quantization settings above
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",  # Automatically allocate to available GPU
        quantization_config=bnb_config,
        resume_download=True  # Attempt checkpoint/resume

    )

    # Optional: load the fine-tuned LoRA adapter from the path where it was saved.
    # NOTE(review): the commented line refers to `base_model`, which is not defined in
    # this cell; it presumably should be `model` - confirm before enabling it.
    #model = PeftModel.from_pretrained(base_model, "./llama3-70b-qlora-triples/checkpoint-347")

    # Load the tokenizer that belongs to the same model id
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Optional system prompt setting, adjust according to your needs
    tokenizer.use_default_system_prompt = False

In [None]:
# Switch the model to evaluation mode before running inference.
# This cell is not required when fine-tuning.
model.eval()

In [None]:
# Custom chat template.
# - A system block is always emitted first; it is empty when the conversation
#   does not start with a system message.
# - Every other message is rendered as its own turn. A leading non-system
#   message is rendered as a normal turn instead of being dropped.
# - When apply_chat_template is called with add_generation_prompt=True, an
#   assistant header is appended; without that flag the rendered text ends
#   after the last message, exactly as before.
tokenizer.chat_template = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
    "{% if messages[0]['role'] == 'system' %}{{ messages[0]['content'] }}{% endif %}<|eot_id|>"
    "{% for message in messages %}"
    "{% if not (loop.first and message['role'] == 'system') %}"
    "<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n{{ message['content'] }}<|eot_id|>"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|start_header_id|>assistant<|end_header_id|>\n{% endif %}"
)

In [None]:
import csv
import ast
import re
import difflib
from collections import defaultdict

# Read data from a txt file, only reading the first two columns (head and relation)
def read_triples_from_file(file_path):
    """Read (head, relation) pairs from a tab-separated text file.

    Only the first two columns of each row are used; further columns are
    ignored and rows with fewer than two columns are skipped.

    Fields are split on tabs only. Quote characters are treated as ordinary
    text (``csv.QUOTE_NONE``), so an entity such as ``"Weird Al" Yankovic``
    is returned unchanged, the same way the other loaders in this notebook
    see it when they split lines on tabs.

    Args:
        file_path: Path to a UTF-8 encoded, tab-separated file.

    Returns:
        A list of ``(head, relation)`` tuples in file order, with
        surrounding whitespace stripped from both fields.
    """
    triples = []
    # newline='' lets the csv module handle line endings itself.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) >= 2:
                # Keep only the first two columns
                triples.append((row[0].strip(), row[1].strip()))
    return triples

def predict_triples(batch):
    """Ask the model to complete the tails for one batch of (head, relation) pairs.

    The batch is embedded in a chat prompt, a reply of at most 256 new tokens
    is generated without sampling, printed, and parsed with ``parse_triplets``.
    The parsed triples are appended to the module-level ``all_predictions``
    list; nothing is returned.

    Args:
        batch: The (head, relation) pairs to complete; it is interpolated
            into the user message as-is.
    """
    instruction = "You are a knowledge graph expert. Given a list of (head, relation), return only the completed triples in Python list format: [(head, relation, tail), ...]."
    user_request = f"Please complete the tail entity:: {batch}"
    conversation = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": user_request},
    ]

    # Render the conversation to token ids and move them to the model's device
    prompt_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    prompt_ids = prompt_ids.to(model.device)

    generation_options = {
        "max_new_tokens": 256,
        "do_sample": False,
        "use_cache": True,
    }
    with torch.inference_mode():
        generated = model.generate(prompt_ids, **generation_options)

    # Decode only the tokens that follow the prompt
    prompt_length = prompt_ids.shape[-1]
    reply = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
    print("LLaMA:", reply.strip())
    all_predictions.extend(parse_triplets(reply))

def parse_triplets(raw_string):
    """Extract (head, relation, tail) triples from a model reply.

    Only the first bracketed list is scanned: the text after the first "["
    up to the next "[" or "]", whichever comes first. Inside it, every
    parenthesised group of three comma-separated fields becomes one triple.
    An optional double quote around the head and around the tail is
    dropped; the relation is returned as written apart from surrounding
    whitespace.

    Args:
        raw_string: The decoded model reply.

    Returns:
        A list of ``(head, relation, tail)`` string tuples. The list is
        empty when the reply contains no "[" or no matching groups.
    """
    segments = raw_string.split("[")
    if len(segments) < 2:
        # No list in the reply: report "nothing parsed" rather than raising,
        # so that one malformed reply does not abort the whole run.
        return []

    list_text = "[" + segments[1]
    list_text = list_text.split("]")[0] + "]"

    # Match the three fields inside the parentheses to form a triple
    pattern = r"\(\s*\"?([^,]+?)\"?\s*,\s*([^,]+?)\s*,\s*\"?([^)]+?)\"?\s*\)"
    matches = re.findall(pattern, list_text)
    # Remove leading and trailing whitespace
    return [(h.strip(), r.strip(), t.strip()) for h, r, t in matches]

def _write_triples(filename, triples):
    """Write triples to `filename`, one tab-separated "head, relation, tail" line each.

    Errors raised while writing are reported with a printed message instead
    of being propagated.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            # Format the triples and write them to a file
            f.writelines(f"{triplet[0]}\t{triplet[1]}\t{triplet[2]}\n" for triplet in triples)
        print(f"三元组已成功写入到 '{filename}' 文件")
    except Exception as e:
        print(f"写入文件时出错: {e}")


def output_result(filename, triples=None):
    """Write predicted triples to a tab-separated file.

    Args:
        filename: Destination path; an existing file is overwritten.
        triples: Iterable of (head, relation, tail). When omitted, the
            module-level ``all_predictions`` list is written.
    """
    if triples is None:
        triples = all_predictions
    _write_triples(filename, triples)


def output__wrong_result(filename, triples=None):
    """Write wrongly predicted triples to a tab-separated file.

    Args:
        filename: Destination path; an existing file is overwritten.
        triples: Iterable of (head, relation, tail). When omitted, the
            module-level ``wrong_list`` list is written.
    """
    if triples is None:
        triples = wrong_list
    _write_triples(filename, triples)



def load_gold_triples(file_path):
    """Load the gold triples from a tab-separated file.

    Lines with fewer than three columns are ignored; only the first three
    columns are used and each is stripped of surrounding whitespace.

    Returns:
        A pair ``(triples, tail_lookup)``: the set of all
        ``(head, relation, tail)`` tuples, and a dict mapping
        ``(head, relation)`` to a tail. When a pair occurs several times
        the dict keeps the tail from the last such line.
    """
    triple_set = set()
    tail_by_pair = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split('\t')
            if len(columns) < 3:
                continue
            head, rel, tail = (column.strip() for column in columns[:3])
            triple_set.add((head, rel, tail))
            tail_by_pair[(head, rel)] = tail
    return triple_set, tail_by_pair

def load_predicted_best(file_path):
    """Load predicted tails from a tab-separated file of triples.

    Only lines with exactly three columns are used. Head, relation and tail
    are all stripped of surrounding whitespace, so the stored tail compares
    cleanly with the gold tails.

    Args:
        file_path: Path to a UTF-8 file with "head<TAB>relation<TAB>tail" lines.

    Returns:
        A dict mapping ``(head, relation)`` to the predicted tail. When a
        pair occurs several times, the last line wins.
    """
    predictions = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                h, r, t = (part.strip() for part in parts)
                predictions[(h, r)] = t
    return predictions

def evaluate_predictions(gold_file, pred_file,
                         auto_correct_relations=("starred_actors", "has_tags", "has_genre")):
    """Compare predicted tails with the gold triples and print precision/recall/F1.

    For every predicted (head, relation) pair:
      * if the pair has no (non-empty) tail in the gold data, a warning is
        printed and the prediction is not counted as correct;
      * if the relation is listed in ``auto_correct_relations``, the
        prediction is counted as correct without comparing tails;
      * otherwise ``is_similar`` decides. Mismatches are printed and
        appended to the module-level ``wrong_list``.

    Precision is ``correct / number of predicted pairs`` and recall is
    ``correct / number of gold triples``.

    Args:
        gold_file: Path to the gold triples file.
        pred_file: Path to the predicted triples file.
        auto_correct_relations: Relations whose predictions are accepted
            without a tail comparison. The default keeps the three
            relations this notebook has always accepted.

    Returns:
        A dict with the keys ``correct``, ``total_gold``, ``total_pred``,
        ``precision``, ``recall`` and ``f1``.
    """
    gold_set, gold_dict = load_gold_triples(gold_file)
    pred_dict = load_predicted_best(pred_file)
    correct = 0
    total_gold = len(gold_set)
    total_pred = len(pred_dict)
    for (h, r), pred_tail in pred_dict.items():
        gold_tail = gold_dict.get((h, r))
        if not gold_tail:
            print(f"⚠ ({h}, {r}) not found in gold data.")
            continue

        # Relations in auto_correct_relations are accepted without comparing
        # tails (presumably because they have several valid tails while the
        # gold lookup keeps only one per pair - confirm).
        if r in auto_correct_relations:
            correct += 1
        # is_similar expects the prediction first and the gold tail second.
        elif is_similar(pred_tail, gold_tail):
            correct += 1
        else:
            wrong_list.append((h, r, pred_tail))
            print(f"✘ ({h}, {r})")
            print(f"    predicted: {pred_tail}")
            print(f"    gold:      {gold_tail}")

    precision = correct / total_pred if total_pred > 0 else 0.0
    recall = correct / total_gold if total_gold > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

    print("\n=== Evaluation Summary ===")
    print(f"Correct (fuzzy match): {correct}")
    print(f"Total Gold: {total_gold}")
    print(f"Total Predicted: {total_pred}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    return {
        "correct": correct,
        "total_gold": total_gold,
        "total_pred": total_pred,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

    

def normalize(text):
    """Return `text` lowercased and trimmed, with every '.' and ',' removed."""
    trimmed = text.lower().strip()
    # Delete only periods and commas; all other characters are kept
    return trimmed.translate(str.maketrans("", "", ".,"))

def is_similar(pred_tail, gold_tail, threshold=0.8):
    """Decide whether a predicted tail matches the gold tail closely enough.

    Both strings are normalized with ``normalize`` first. They match when
    their ``difflib.SequenceMatcher`` ratio reaches ``threshold``, or when
    both are non-empty and one contains the other.

    Args:
        pred_tail: The predicted tail entity.
        gold_tail: The gold tail entity.
        threshold: Minimum similarity ratio that counts as a match.

    Returns:
        True when the two tails are considered the same entity.
    """
    gold_tail_norm = normalize(gold_tail)
    pred_tail_norm = normalize(pred_tail)

    # First, use difflib to check similarity
    ratio = difflib.SequenceMatcher(None, pred_tail_norm, gold_tail_norm).ratio()
    if ratio >= threshold:
        return True

    # An empty string is a substring of every string, so containment only
    # means something when both sides still have content after normalization.
    if not pred_tail_norm or not gold_tail_norm:
        return False

    # If similarity is below the threshold, check for containment
    return gold_tail_norm in pred_tail_norm or pred_tail_norm in gold_tail_norm


In [None]:
# Accumulators used by the prediction loop
all_predictions, error_list = [], []

# Input file with the test triples
file_path = 'testtiny.txt'

# Only the head and relation of every row are kept
triples = read_triples_from_file(file_path)

# Chunk the pairs into consecutive batches of `batch_size`
batch_size = 10
batches = [
    triples[offset:offset + batch_size]
    for offset in range(0, len(triples), batch_size)
]

In [None]:
# Run every batch through the model; predict_triples appends the parsed
# triples to all_predictions. An exception raised for a batch stops the loop.
for batch in batches:
    predict_triples(batch)

# Persist everything collected in all_predictions
output_result("output_result.txt")

In [None]:
def singlePredit(head, relation):
    """Ask the model to complete the tail for a single (head, relation) pair.

    Args:
        head: The head entity.
        relation: The relation name.

    Returns:
        The decoded model reply with surrounding whitespace stripped.
        The reply is also printed.
    """
    system_prompt = "You are a knowledge graph expert. Given (head, relation), return only the completed triples in Python format: (head, relation, tail)."
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Please complete the tail entity: {head} | {relation} | ?"}
    ]

    # Generate input_ids using the tokenizer
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

    # Decoding without sampling: top_k / top_p only apply when do_sample=True,
    # so they are not passed here.
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=256,
            do_sample=False,
            use_cache=True
        )

    # Decode only the tokens that follow the prompt
    response = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
    print("LLaMA:", response.strip())
    return response.strip()

In [None]:
# Raw model replies returned by singlePredit, one per (head, relation) pair
result_aa = []
# File path
file_path = 'testtiny.txt'

# Read triples from the file (only take head and relation)
triples = read_triples_from_file(file_path)

# Each "batch" here is a single (head, relation) pair
batch_size = 10
batches = triples

for batch in batches:
    # Keep the reply instead of discarding it
    result_aa.append(singlePredit(batch[0], batch[1]))

# NOTE(review): output_result writes the module-level all_predictions list, which
# singlePredit does not modify; the replies collected in result_aa are not written here.
output_result("output_result.txt")

In [None]:
def is_number(s):
    """Return True when ``float(s)`` succeeds and False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

def clean_field(field):
    """Trim a field and peel wrapping quotes/parentheses off its ends.

    Surrounding whitespace is removed first; then every leading ``'``,
    ``"`` or ``(`` and every trailing ``'``, ``"`` or ``)`` is dropped, and
    the result is trimmed once more.
    """
    trimmed = field.strip()
    without_prefix = trimmed.lstrip("'\"(")
    without_suffix = without_prefix.rstrip("'\")")
    return without_suffix.strip()

def clean_and_validate_quadruples(input_path, output_clean_path, output_invalid_path):
    """Split a tab-separated prediction file into valid and invalid rows.

    A non-empty line is valid when it has exactly three tab-separated
    columns and its second column (the relation) contains an underscore.
    Valid rows are written to ``output_clean_path`` and all other rows to
    ``output_invalid_path``; every field is passed through ``clean_field``
    before it is written. A summary line is printed at the end.

    Args:
        input_path: File with the raw predictions.
        output_clean_path: Destination for the valid rows.
        output_invalid_path: Destination for the rejected rows.
    """
    kept = []
    rejected = []

    with open(input_path, 'r', encoding='utf-8') as source:
        for raw in source:
            stripped = raw.strip()
            if stripped == "":
                continue
            columns = stripped.split('\t')

            # clean_field only trims whitespace, quotes and parentheses from the
            # ends, so an underscore found before cleaning is still there afterwards.
            if len(columns) == 3 and "_" in columns[1]:
                kept.append(tuple(clean_field(column) for column in columns))
            else:
                rejected.append([clean_field(column) for column in columns])

    # Write the cleaned data
    with open(output_clean_path, 'w', encoding='utf-8') as target:
        target.writelines(f"{head}\t{relation}\t{tail}\n" for head, relation, tail in kept)

    # Write the rejected data
    with open(output_invalid_path, 'w', encoding='utf-8') as target:
        target.writelines('\t'.join(fields) + '\n' for fields in rejected)

    print(f"✅ Cleaning completed: {len(kept)} valid quadruples, {len(rejected)} invalid records")

In [None]:
# Post-process the raw predictions: rows that pass the checks are written to
# 'cleaned_quadruples.txt', all other rows to 'invalid_quadruples.txt'.
clean_and_validate_quadruples(
    input_path='output_result.txt',
    output_clean_path='cleaned_quadruples.txt',
    output_invalid_path='invalid_quadruples.txt'
)


In [None]:
# Mispredicted (head, relation, predicted_tail) tuples; evaluate_predictions appends to this list.
# Re-run this cell before each evaluation to start from an empty list.
wrong_list = []

In [None]:
# Score the cleaned predictions against the gold triples in test.txt
evaluate_predictions("test.txt", "cleaned_quadruples.txt")

In [None]:
# Preview the first 20 wrong predictions
wrong_list[:20]

In [None]:
# Save the wrong predictions as tab-separated lines in the file 'wrong_predict'
output__wrong_result("wrong_predict")

# Start fine-tuning========================================================

In [None]:
# Put the model into training mode before fine-tuning
model.train()

In [None]:
from datasets import Dataset

def load_triples(filepath):
    """Build a prompt/label dataset from a tab-separated triples file.

    Every line with exactly three columns (head, relation, tail) yields one
    example: the prompt asks for the tail of ``head | relation`` and the
    label is the tail. Other lines are skipped.

    Returns:
        A ``Dataset`` with the columns "prompt" and "label".
    """
    records = {"prompt": [], "label": []}
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 3:
                continue
            head, relation, tail = fields
            records["prompt"].append(f"Please complete the tail entity: {head} | {relation} | ?")
            records["label"].append(tail)
    return Dataset.from_dict(records)

In [None]:
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Prepare the quantized model for k-bit training.
# NOTE(review): this cell rebinds `model` twice, so running it a second time operates
# on the already wrapped model; reload the base model before re-running.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 32, alpha 16, dropout 0.05, applied to the
# q_proj and v_proj modules, with no bias terms trained.
peft_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the model with the LoRA configuration
model = get_peft_model(model, peft_config)


In [None]:
# Load the training set. load_triples already returns a datasets.Dataset,
# so no further conversion is needed.
train_dataset = load_triples("train.txt")


In [None]:

def tokenize_function(example, max_length=64):
    """Tokenize one prompt/label pair for causal-LM fine-tuning.

    The prompt and label are joined with a space, tokenized once, and
    padded/truncated to ``max_length``. The labels are taken from that same
    token sequence, so each supervised position holds exactly the token
    present at that position in ``input_ids``. Prompt tokens and padding
    positions are set to -100 so they are ignored by the loss.

    Args:
        example: Mapping with the string fields "prompt" and "label".
        max_length: Fixed sequence length of the returned features.

    Returns:
        The tokenizer output for the joined text with an added "labels"
        list of length ``max_length``.
    """
    prompt = example["prompt"]
    label = example["label"]

    full_text = prompt + " " + label
    tokenized = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=max_length
    )

    # Number of leading tokens that belong to the prompt, including whatever
    # special tokens the tokenizer prepends.
    # NOTE(review): assumes the prompt tokenizes to a prefix of the joined
    # text's tokens - confirm for this tokenizer.
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=max_length)["input_ids"])

    labels = []
    real_tokens_seen = 0
    # Walk the attention mask so the result is correct for either padding side.
    for token_id, is_real in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        if not is_real:
            labels.append(-100)  # padding position
            continue
        # Supervise only the tokens that come after the prompt
        labels.append(token_id if real_tokens_seen >= prompt_len else -100)
        real_tokens_seen += 1

    tokenized["labels"] = labels
    return tokenized


# Use the EOS token as the padding token (tokenize_function pads to max_length)
tokenizer.pad_token = tokenizer.eos_token
# Tokenize every example and drop the raw text columns.
# NOTE(review): re-running this cell on the already mapped dataset is expected to fail
# because "prompt" and "label" no longer exist; reload train_dataset first.
train_dataset = train_dataset.map(tokenize_function, remove_columns=["prompt", "label"])


In [None]:
# Inspect one tokenized example. train_dataset has already been mapped through
# tokenize_function, so its rows are the tokenized examples.
tokenized_example = train_dataset[0]
print(tokenized_example)

In [None]:
# Show the first tokenized row and the Python type of its first label id
print(train_dataset[0])
print(type(train_dataset[0]['labels'][0]))

In [None]:
from torch.optim import AdamW

# Set up the AdamW optimizer.
# torch.optim.AdamW is used in place of the AdamW class that transformers
# deprecated; eps=1e-6 matches the default of that deprecated class.
# Only parameters that require gradients are handed to the optimizer.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=1e-4, weight_decay=0.001, eps=1e-6)

# ========== Training Parameters ==========
training_args = TrainingArguments(
    output_dir="./llama3-70b-qlora-triples",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
    fp16=True,
    bf16=False,
    report_to="none"
)



In [None]:
# ========== Start Trainer ==========
# NOTE(review): passing None as the second element of `optimizers` is expected to make
# the Trainer build its default learning-rate scheduler rather than disable
# scheduling - confirm against the Trainer documentation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, None)  # Custom optimizer; no custom scheduler supplied
)

# Run the fine-tuning loop
trainer.train()

 # After fine-tuning, you can go back to the beginning to reload the fine-tuned model and perform triple prediction