In [5]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()
def clear_cuda_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("CUDA cache cleared and memory freed.")


In [6]:
import json
import os
import re
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    T5ForConditionalGeneration, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import nltk
from typing import List, Tuple
from nltk.tokenize import sent_tokenize
from datasets import Dataset, concatenate_datasets
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
import torch
from sklearn.model_selection import train_test_split

### Training code

In [7]:
# !pip install accelerate -U

In [8]:
# !pip install evaluate nltk rouge_score bert_score transformers[torch]

In [1]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()
def clear_cuda_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("CUDA cache cleared and memory freed.")


import os
os.environ["WANDB_MODE"] = "offline"

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import gc
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import evaluate
import torch
import nltk
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
nltk.download("punkt", quiet=True)

# 1. LOAD AND PREPARE DATA
DATA_FILE = "training_data.json"
with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Loaded {len(data)} records from {DATA_FILE}")
print("Example record:", data[0])

# Create multi-task training pairs
inputs, targets = [], []
for record in data:
    question = record.get("question", "")
    entity = record.get("entity", "")
    sparql = record.get("sparql", "")
#     response = record.get("sparql_response", "")
#     explanation = record.get("explanation", "")
    
    # Task 1: NL to SPARQL
    if question and entity and sparql:
        inputs.append(f"task: generate_sparql\ninput: {question}\nentity{entity}")
        targets.append(sparql)
    
#     # Task 2: Summarize response
#     if question and response and explanation:
#         inputs.append(f"task: summarize_response\nquestion: {question}\nresponse: {response}")
#         targets.append(explanation)

print(f"Generated {len(inputs)} total training pairs from {len(data)} records.")
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.1, random_state=42
)
print(f"Train size: {len(train_inputs)} | Validation size: {len(val_inputs)}")

# Save splits
train_data = [{"input_text": inp, "target_text": tgt} for inp, tgt in zip(train_inputs, train_targets)]
val_data = [{"input_text": inp, "target_text": tgt} for inp, tgt in zip(val_inputs, val_targets)]
with open("train_data_April.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
with open("val_data_April.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)
print("Saved train_data_April.json and val_data_April.json!")

# Build datasets
raw_datasets = DatasetDict({
    "train": Dataset.from_dict({"input_text": train_inputs, "target_text": train_targets}),
    "validation": Dataset.from_dict({"input_text": val_inputs, "target_text": val_targets})
})
print("Train sample:", raw_datasets["train"][3])
print("Validation sample:", raw_datasets["validation"][3])

# 2. LOAD MODEL & TOKENIZER
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Add custom tokens once
# custom_tokens = []
# with open("all_relations_and_classes.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# with open("output_entities.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# num_added_tokens = tokenizer.add_tokens(custom_tokens)
# model.resize_token_embeddings(len(tokenizer))
# print(f"Added {num_added_tokens} new tokens to the tokenizer!")

# 3. PREPROCESSING
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 4. EVALUATION METRICS
metric_rouge = evaluate.load("rouge")
metric_bleu = evaluate.load("bleu")
metric_meteor = evaluate.load("meteor")
metric_bertscore = evaluate.load("bertscore")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    
    results = {}
    rouge_result = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    results.update(rouge_result)
    bleu_result = metric_bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    results["bleu"] = bleu_result["bleu"]
    meteor_result = metric_meteor.compute(predictions=decoded_preds, references=decoded_labels)
    results["meteor"] = meteor_result["meteor"]
    bertscore_result = metric_bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    results["bertscore_precision"] = np.mean(bertscore_result["precision"])
    results["bertscore_recall"] = np.mean(bertscore_result["recall"])
    results["bertscore_f1"] = np.mean(bertscore_result["f1"])
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    results["gen_len"] = np.mean(prediction_lens)
    return results

# 5. TRAINING ARGUMENTS
training_args = Seq2SeqTrainingArguments(
    output_dir="./v1",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    logging_dir="./v1",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust based on GPU
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=50,
    predict_with_generate=True,
    fp16=True,  # Enable if GPU supports
    report_to=["tensorboard"],
    warmup_steps=500,
    lr_scheduler_type="cosine",
)

# 6. TRAINER SETUP
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 7. TRAIN
trainer.train()

Loaded 87963 records from training_data.json
Example record: {'question': 'What is the average cooling command reading for AHU 01 over the last hour?', 'entity': 'bldg:bldg1.AHU.AHU01.CCV', 'sparql': 'SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }'}
Generated 86193 total training pairs from 87963 records.
Train size: 77573 | Validation size: 8620
Saved train_data_April.json and val_data_April.json!
Train sample: {'input_text': 'task: generate_sparql\ninput: Tell me the name or label  of the unoccupied cooling temperature deadband setpoint in the Classroom.\nentitybldg:Classroom \n brick:Unoccupied_Cooling_Temperature_Deadband_Setpoint', 'target_text': 'SELECT ?label WHERE { ?sensor a brick:Unoccupied_Cooling_Temperature_Deadband_Setpoint ; brick:hasLocation bldg:Classroom ; rdfs:label ?label . }'}
Validation sample: {'input_text': 'task: generate_sparql\ninput: Where is the LPG

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/77573 [00:00<?, ? examples/s]

Map:   0%|          | 0/8620 [00:00<?, ? examples/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Meteor,Bertscore Precision,Bertscore Recall,Bertscore F1,Gen Len
1,0.3018,0.015346,0.536662,0.466102,0.534615,0.534699,0.10753,0.346213,0.926892,0.867733,0.896082,19.0
2,0.0175,0.007627,0.542567,0.47897,0.54187,0.541942,0.11023,0.35437,0.928465,0.868757,0.897363,19.0
3,0.0104,0.005042,0.544722,0.484055,0.544101,0.544127,0.111394,0.357442,0.929089,0.869268,0.897927,19.0
4,0.0074,0.003816,0.54662,0.48735,0.546122,0.546136,0.111966,0.35861,0.929511,0.869629,0.898318,19.0
5,0.0058,0.003109,0.547635,0.489003,0.547049,0.547008,0.112414,0.359769,0.92968,0.869721,0.898445,19.0
6,0.0047,0.002623,0.548636,0.49148,0.548114,0.548096,0.112994,0.360515,0.929868,0.869887,0.898621,19.0
7,0.0039,0.002171,0.549185,0.492612,0.548623,0.548562,0.113224,0.361203,0.929998,0.869972,0.898727,19.0
8,0.0033,0.00184,0.549981,0.494543,0.549534,0.549474,0.113421,0.362006,0.930186,0.870131,0.8989,19.0
9,0.0029,0.00163,0.550342,0.495326,0.549858,0.549811,0.113724,0.362451,0.930221,0.870144,0.898924,19.0
10,0.0025,0.001487,0.550424,0.495585,0.549913,0.549911,0.113759,0.362604,0.930254,0.870188,0.898963,19.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=223031, training_loss=0.01638356991428153, metrics={'train_runtime': 20711.7177, 'train_samples_per_second': 187.268, 'train_steps_per_second': 23.409, 'total_flos': 6.036850000212787e+16, 'train_loss': 0.01638356991428153, 'epoch': 23.0})

In [None]:
## V2 with adding curly braces

In [1]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()
def clear_cuda_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("CUDA cache cleared and memory freed.")


import os
os.environ["WANDB_MODE"] = "offline"

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import gc
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import evaluate
import torch
import nltk
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
nltk.download("punkt", quiet=True)

# 1. LOAD AND PREPARE DATA
DATA_FILE = "training_data.json"
with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Loaded {len(data)} records from {DATA_FILE}")
print("Example record:", data[0])

# Create multi-task training pairs
inputs, targets = [], []
for record in data:
    question = record.get("question", "")
    entity = record.get("entity", "")
    sparql = record.get("sparql", "")
#     response = record.get("sparql_response", "")
#     explanation = record.get("explanation", "")
    
    # Task 1: NL to SPARQL
    if question and entity and sparql:
        inputs.append(f"task: generate_sparql\ninput: {question}\nentity{entity}")
        targets.append(sparql)
    
#     # Task 2: Summarize response
#     if question and response and explanation:
#         inputs.append(f"task: summarize_response\nquestion: {question}\nresponse: {response}")
#         targets.append(explanation)

print(f"Generated {len(inputs)} total training pairs from {len(data)} records.")
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.1, random_state=42
)
print(f"Train size: {len(train_inputs)} | Validation size: {len(val_inputs)}")

# Save splits
train_data = [{"input_text": inp, "target_text": tgt} for inp, tgt in zip(train_inputs, train_targets)]
val_data = [{"input_text": inp, "target_text": tgt} for inp, tgt in zip(val_inputs, val_targets)]
with open("train_data_April.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
with open("val_data_April.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)
print("Saved train_data_April.json and val_data_April.json!")

# Build datasets
raw_datasets = DatasetDict({
    "train": Dataset.from_dict({"input_text": train_inputs, "target_text": train_targets}),
    "validation": Dataset.from_dict({"input_text": val_inputs, "target_text": val_targets})
})
print("Train sample:", raw_datasets["train"][3])
print("Validation sample:", raw_datasets["validation"][3])

# 2. LOAD MODEL & TOKENIZER
model_name = "./v1/checkpoint-223031"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Add custom tokens once

new_tokens = ["{", "}"]
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))


# custom_tokens = []
# with open("all_relations_and_classes.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# with open("output_entities.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# num_added_tokens = tokenizer.add_tokens(custom_tokens)
# model.resize_token_embeddings(len(tokenizer))
# print(f"Added {num_added_tokens} new tokens to the tokenizer!")

# 3. PREPROCESSING
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 4. EVALUATION METRICS
metric_rouge = evaluate.load("rouge")
metric_bleu = evaluate.load("bleu")
metric_meteor = evaluate.load("meteor")
metric_bertscore = evaluate.load("bertscore")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    
    results = {}
    rouge_result = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    results.update(rouge_result)
    bleu_result = metric_bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    results["bleu"] = bleu_result["bleu"]
    meteor_result = metric_meteor.compute(predictions=decoded_preds, references=decoded_labels)
    results["meteor"] = meteor_result["meteor"]
    bertscore_result = metric_bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    results["bertscore_precision"] = np.mean(bertscore_result["precision"])
    results["bertscore_recall"] = np.mean(bertscore_result["recall"])
    results["bertscore_f1"] = np.mean(bertscore_result["f1"])
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    results["gen_len"] = np.mean(prediction_lens)
    return results

# 5. TRAINING ARGUMENTS
training_args = Seq2SeqTrainingArguments(
    output_dir="./v2",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    logging_dir="./v2",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust based on GPU
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=50,
    predict_with_generate=True,
    fp16=True,  # Enable if GPU supports
    report_to=["tensorboard"],
    warmup_steps=500,
    lr_scheduler_type="cosine",
)

# 6. TRAINER SETUP
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 7. TRAIN
trainer.train()
print('training completed')


Loaded 87963 records from training_data.json
Example record: {'question': 'What is the average cooling command reading for AHU 01 over the last hour?', 'entity': 'bldg:bldg1.AHU.AHU01.CCV', 'sparql': 'SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }'}
Generated 86193 total training pairs from 87963 records.
Train size: 77573 | Validation size: 8620
Saved train_data_April.json and val_data_April.json!
Train sample: {'input_text': 'task: generate_sparql\ninput: Tell me the name or label  of the unoccupied cooling temperature deadband setpoint in the Classroom.\nentitybldg:Classroom \n brick:Unoccupied_Cooling_Temperature_Deadband_Setpoint', 'target_text': 'SELECT ?label WHERE { ?sensor a brick:Unoccupied_Cooling_Temperature_Deadband_Setpoint ; brick:hasLocation bldg:Classroom ; rdfs:label ?label . }'}
Validation sample: {'input_text': 'task: generate_sparql\ninput: Where is the LPG

Map:   0%|          | 0/77573 [00:00<?, ? examples/s]

Map:   0%|          | 0/8620 [00:00<?, ? examples/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Meteor,Bertscore Precision,Bertscore Recall,Bertscore F1,Gen Len
1,0.0615,0.004297,0.573329,0.524408,0.572891,0.572989,0.142809,0.396221,0.928643,0.870524,0.898422,18.998724
2,0.004,0.001125,0.573235,0.524157,0.572717,0.572807,0.1428,0.396098,0.928633,0.87051,0.89841,18.998724
3,0.0016,0.000813,0.573369,0.52458,0.572886,0.573012,0.142894,0.396181,0.928667,0.870557,0.898451,18.998724
4,0.0012,0.000626,0.573327,0.524403,0.57285,0.572966,0.142839,0.396092,0.928636,0.870544,0.898429,18.998724


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=38788, training_loss=0.017085847724795745, metrics={'train_runtime': 3717.6967, 'train_samples_per_second': 1043.294, 'train_steps_per_second': 130.417, 'total_flos': 1.0498869565587456e+16, 'train_loss': 0.017085847724795745, 'epoch': 4.0})

In [None]:
### V3 adding timeseriesId with storedAt

In [1]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()
def clear_cuda_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("CUDA cache cleared and memory freed.")


import os
os.environ["WANDB_MODE"] = "offline"

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import gc
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import evaluate
import torch
import nltk
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
nltk.download("punkt", quiet=True)

# 1. LOAD AND PREPARE DATA
DATA_FILE = "training_data.json"
with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Loaded {len(data)} records from {DATA_FILE}")
print("Example record:", data[0])

# Create multi-task training pairs
inputs, targets = [], []
for record in data:
    question = record.get("question", "")
    entity = record.get("entity", "")
    sparql = record.get("sparql", "")
#     response = record.get("sparql_response", "")
#     explanation = record.get("explanation", "")
    
    # Task 1: NL to SPARQL
    if question and entity and sparql:
        inputs.append(f"task: generate_sparql\ninput: {question}\nentity{entity}")
        targets.append(sparql)
    
#     # Task 2: Summarize response
#     if question and response and explanation:
#         inputs.append(f"task: summarize_response\nquestion: {question}\nresponse: {response}")
#         targets.append(explanation)

print(f"Generated {len(inputs)} total training pairs from {len(data)} records.")
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.1, random_state=42
)
print(f"Train size: {len(train_inputs)} | Validation size: {len(val_inputs)}")

# Save splits
train_data = [{"input_text": inp, "target_text": tgt} for inp, tgt in zip(train_inputs, train_targets)]
val_data = [{"input_text": inp, "target_text": tgt} for inp, tgt in zip(val_inputs, val_targets)]
with open("train_data_April.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
with open("val_data_April.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)
print("Saved train_data_April.json and val_data_April.json!")

# Build datasets
raw_datasets = DatasetDict({
    "train": Dataset.from_dict({"input_text": train_inputs, "target_text": train_targets}),
    "validation": Dataset.from_dict({"input_text": val_inputs, "target_text": val_targets})
})
print("Train sample:", raw_datasets["train"][3])
print("Validation sample:", raw_datasets["validation"][3])

# 2. LOAD MODEL & TOKENIZER
model_name = "./v2/checkpoint-9697"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Add custom tokens once

new_tokens = ["{", "}"]
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))


# custom_tokens = []
# with open("all_relations_and_classes.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# with open("output_entities.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# num_added_tokens = tokenizer.add_tokens(custom_tokens)
# model.resize_token_embeddings(len(tokenizer))
# print(f"Added {num_added_tokens} new tokens to the tokenizer!")

# 3. PREPROCESSING
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 4. EVALUATION METRICS
metric_rouge = evaluate.load("rouge")
metric_bleu = evaluate.load("bleu")
metric_meteor = evaluate.load("meteor")
metric_bertscore = evaluate.load("bertscore")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    
    results = {}
    rouge_result = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    results.update(rouge_result)
    bleu_result = metric_bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    results["bleu"] = bleu_result["bleu"]
    meteor_result = metric_meteor.compute(predictions=decoded_preds, references=decoded_labels)
    results["meteor"] = meteor_result["meteor"]
    bertscore_result = metric_bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    results["bertscore_precision"] = np.mean(bertscore_result["precision"])
    results["bertscore_recall"] = np.mean(bertscore_result["recall"])
    results["bertscore_f1"] = np.mean(bertscore_result["f1"])
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    results["gen_len"] = np.mean(prediction_lens)
    return results

# 5. TRAINING ARGUMENTS
training_args = Seq2SeqTrainingArguments(
    output_dir="./v3",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    logging_dir="./v3",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust based on GPU
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,  # Enable if GPU supports
    report_to=["tensorboard"],
    warmup_steps=500,
    lr_scheduler_type="cosine",
)

# 6. TRAINER SETUP
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 7. TRAIN
trainer.train()
print('training completed')


Loaded 126160 records from training_data.json
Example record: {'question': 'Where is airq5.01 located?', 'entity': 'bldg:airq5.01', 'sparql': 'SELECT ?location WHERE { bldg:airq5.01 brick:hasLocation ?location . }'}
Generated 125110 total training pairs from 126160 records.
Train size: 112599 | Validation size: 12511
Saved train_data_April.json and val_data_April.json!
Train sample: {'input_text': 'task: generate_sparql\ninput: Show me the sensor name for Illuminance Sensor 5.03.\nentitybldg:Illuminance_Sensor_5.03', 'target_text': 'SELECT ?label WHERE { bldg:Illuminance_Sensor_5.03 rdfs:label ?label . }'}
Validation sample: {'input_text': 'task: generate_sparql\ninput: Tell me the name or label of the heating supply air temperature deadband setpoint in the Storage Room.\nentitybldg:Storage_Room \n brick:Heating_Supply_Air_Temperature_Deadband_Setpoint', 'target_text': 'SELECT ?label WHERE { ?sensor a brick:Heating_Supply_Air_Temperature_Deadband_Setpoint ; brick:hasLocation bldg:Stora

Map:   0%|          | 0/112599 [00:00<?, ? examples/s]

Map:   0%|          | 0/12511 [00:00<?, ? examples/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Meteor,Bertscore Precision,Bertscore Recall,Bertscore F1,Gen Len
1,0.0014,6.4e-05,0.474529,0.428383,0.474523,0.474515,0.090352,0.302615,0.915429,0.834711,0.873153,19.0
2,0.0001,6e-06,0.474529,0.428383,0.474523,0.474515,0.090352,0.302615,0.915429,0.834711,0.873153,19.0
3,0.0,1e-06,0.474529,0.428383,0.474523,0.474515,0.090352,0.302615,0.915429,0.834711,0.873153,19.0
4,0.0,1e-06,0.474529,0.428383,0.474523,0.474515,0.090352,0.302615,0.915429,0.834711,0.873153,19.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


training completed


In [2]:
# test sparql generation

In [None]:
checkpoint-223031

### Evaluation code

### test on new data 

In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load your trained model and tokenizer from the saved directory.
model_dir = "./v3/checkpoint-56300" 
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_sparql(question, entity):
    """
    Generate a SPARQL query given a natural language question and an entity.
    """
    # Format the input as it was during training.
    input_text = f"task: generate_sparql\ninput: {question}\nentity{entity}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    
    # Generate output using beam search.
    outputs = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
    generated_sparql = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_sparql

# Example input data
input_data = {
    "question": "Provide the definition for Heating Ventilation Air Conditioning System.",
    "entity": "brick:Heating_Ventilation_Air_Conditioning_System",
    "sparql": "SELECT ?definition WHERE { brick:Heating_Ventilation_Air_Conditioning_System skos:definition ?definition . }"
}

# Get outputs for both tasks
sparql_output = generate_sparql(input_data["question"], input_data["entity"])

print("Generated SPARQL Query:")
print(sparql_output)


Generated SPARQL Query:
SELECT?definition WHERE { brick:Heating_Ventilation_Air_Conditioning_System skos:definition?definition. }


In [3]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

# Load your trained model and tokenizer from the saved directory.
# model_dir = "./v2/checkpoint-9697"  # Update this path if needed
# tokenizer = T5Tokenizer.from_pretrained(model_dir)
# model = T5ForConditionalGeneration.from_pretrained(model_dir)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

def generate_sparql(question, entity):
    """
    Generate a SPARQL query given a natural language question and an entity.
    """
    # Format the input as it was during training.
    input_text = f"task: generate_sparql\ninput: {question}\nentity: {entity}"  # Added colon after "entity" for consistency
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    
    # Generate output using beam search.
    outputs = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
    generated_sparql = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_sparql

# JSON data (you can also load this from a file using json.load())
test_data = [
    {
        "question": "What is the average cooling command reading for AHU 01 over the last hour?",
        "entity": "bldg:bldg1.AHU.AHU01.CCV",
        "sparql": "SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }"
    },
    {
        "question": "Show me the most recent cooling command value for AHU 01.",
        "entity": "bldg:bldg1.AHU.AHU01.CCV",
        "sparql": "SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }"
    },
    {
        "question": "What is the current cooling command status of AHU 01?",
        "entity": "bldg:bldg1.AHU.AHU01.CCV",
        "sparql": "SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }"
    }
]

# Test the model with the JSON data
for i, item in enumerate(test_data, 1):
    question = item["question"]
    entity = item["entity"]
    original_sparql = item["sparql"]
    
    # Generate SPARQL query
    generated_sparql = generate_sparql(question, entity)
    
    # Print results
    print(f"\nTest Case {i}:")
#     print(f"Question: {question}")
    print(f"Entity: {entity}")
    print("Original SPARQL Query:")
    print(original_sparql)
    print("Generated SPARQL Query:")
    print(generated_sparql)
    print("-" * 80)


Test Case 1:
Entity: bldg:bldg1.AHU.AHU01.CCV
Original SPARQL Query:
SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }
Generated SPARQL Query:
SELECT?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference?ref.?ref a ref:TimeseriesReference ; ref:hasTimeseriesId?timeseriesId. }
--------------------------------------------------------------------------------

Test Case 2:
Entity: bldg:bldg1.AHU.AHU01.CCV
Original SPARQL Query:
SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }
Generated SPARQL Query:
SELECT?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference?ref.?ref a ref:TimeseriesReference ; ref:hasTimeseriesId?timeseriesId. }
--------------------------------------------------------------------------------

Test Case 3:
Entity: bldg:bldg1.A

In [19]:
from transformers import T5Tokenizer

# tokenizer = T5Tokenizer.from_pretrained("t5-base")
text = "SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }"

# Get token ids (including special tokens)
token_ids = tokenizer.encode(text, add_special_tokens=True)
print("Token IDs:", token_ids)

# If you want to see the tokens corresponding to the ids:
tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("Tokens:", tokens)


Token IDs: [3, 23143, 14196, 3, 58, 715, 10833, 7, 196, 26, 549, 17444, 427, 3, 2, 3, 115, 40, 26, 122, 10, 115, 40, 26, 122, 5411, 188, 17861, 5, 188, 17861, 10068, 2823, 553, 6273, 10, 10293, 5420, 2947, 138, 1649, 11788, 3, 58, 60, 89, 3, 5, 3, 58, 60, 89, 3, 9, 6273, 10, 13368, 10833, 7, 1649, 11788, 3, 117, 6273, 10, 10293, 13368, 10833, 7, 196, 26, 3, 58, 715, 10833, 7, 196, 26, 3, 5, 3, 2, 1]
Tokens: ['▁', 'SEL', 'ECT', '▁', '?', 'time', 'serie', 's', 'I', 'd', '▁W', 'HER', 'E', '▁', '<unk>', '▁', 'b', 'l', 'd', 'g', ':', 'b', 'l', 'd', 'g', '1.', 'A', 'HU', '.', 'A', 'HU', '01.', 'CC', 'V', '▁ref', ':', 'has', 'Ex', 'tern', 'al', 'Re', 'ference', '▁', '?', 're', 'f', '▁', '.', '▁', '?', 're', 'f', '▁', 'a', '▁ref', ':', 'Time', 'serie', 's', 'Re', 'ference', '▁', ';', '▁ref', ':', 'has', 'Time', 'serie', 's', 'I', 'd', '▁', '?', 'time', 'serie', 's', 'I', 'd', '▁', '.', '▁', '<unk>', '</s>']


In [23]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json
import re

# Load your trained model and tokenizer from the saved directory.
model_dir = "./v1/checkpoint-223031"  # Update this path if needed
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_sparql(question, entity):
    """
    Generate a SPARQL query given a natural language question and an entity.
    """
    # Format the input as it was during training.
    input_text = f"task: generate_sparql\ninput: {question}\nentity: {entity}"  # Added colon after "entity" for consistency
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    
    # Generate output using beam search.
    outputs = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
    generated_sparql = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_sparql

# JSON data (you can also load this from a file using json.load())
test_data = [
    {
        "question": "What is the average cooling command reading for AHU 01 over the last hour?",
        "entity": "bldg:bldg1.AHU.AHU01.CCV",
        "sparql": "SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }"
    },
    {
        "question": "Show me the most recent cooling command value for AHU 01.",
        "entity": "bldg:bldg1.AHU.AHU01.CCV",
        "sparql": "SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }"
    },
    {
        "question": "What is the current cooling command status of AHU 01?",
        "entity": "bldg:bldg1.AHU.AHU01.CCV",
        "sparql": "SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }"
    }
]

# Test the model with the JSON data
for i, item in enumerate(test_data, 1):
    question = item["question"]
    entity = item["entity"]
    original_sparql = item["sparql"]
    
    # Generate SPARQL query
    generated_sparql = generate_sparql(question, entity)
    

    def add_space_before_question_mark(text):
        return re.sub(r'(?<!\s)\?', ' ?', text)

    generated_sparql = add_space_before_question_mark(generated_sparql)

    # Print results
#     print(f"\nTest Case {i}:")
#     print(f"Question: {question}")
#     print(f"Entity: {entity}")
    print("Original SPARQL Query:")
    print(original_sparql)
    print("Generated SPARQL Query:")
    print(generated_sparql)
    print("-" * 80)

Original SPARQL Query:
SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }
Generated SPARQL Query:
SELECT ?timeseriesId WHERE  bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref. ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId.
--------------------------------------------------------------------------------
Original SPARQL Query:
SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId . }
Generated SPARQL Query:
SELECT ?timeseriesId WHERE  bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref. ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId.
--------------------------------------------------------------------------------
Original SPARQL Query:
SELECT ?timeseriesId WHERE { bldg:bldg1.AHU.AHU01.CCV ref:hasExternalReference ?ref . ?ref a ref:Timese

In [None]:

import json
import random
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate
from sklearn.metrics import precision_score, recall_score, f1_score

# Load your trained model and tokenizer
model_dir = "./v3/checkpoint-56300"  # Update this path if needed
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")

# Load the sample.json file
json_file = "training_data.json"  # Update with your file path
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Loaded {len(data)} records from {json_file}")

# Filter entries with required keys and select 50 random samples
required_keys = {"question", "entity", "sparql"}
filtered_data = [entry for entry in data if all(key in entry for key in required_keys)]
if len(filtered_data) < 50:
    raise ValueError(f"Insufficient entries with required keys. Found {len(filtered_data)}, need 50.")
sample_data = random.sample(filtered_data, 500)
print(f"Selected 500 random samples for evaluation.")

# Function to generate SPARQL query
def generate_sparql(question, entity):
    input_text = f"task: generate_sparql\ninput: {question}\nentity{entity}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
    generated_sparql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    return generated_sparql

# Generate predictions for all 50 samples
predictions = []
references = []
for entry in sample_data:
    pred = generate_sparql(entry["question"], entry["entity"])
    predictions.append(pred)
    references.append(entry["sparql"].strip())

# Load evaluation metrics
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
meteor_metric = evaluate.load("meteor")
bertscore_metric = evaluate.load("bertscore")

# Compute token-level Precision, Recall, F1 (treating SPARQL as sequences of tokens)
def compute_token_prf(predictions, references):
    pred_tokens = [pred.split() for pred in predictions]
    ref_tokens = [ref.split() for ref in references]
    
    # Flatten and binarize for token-level comparison
    all_pred_tokens = set([token for sublist in pred_tokens for token in sublist])
    all_ref_tokens = set([token for sublist in ref_tokens for token in sublist])
    
    # Create binary labels for each unique token
    y_true = [1 if token in all_ref_tokens else 0 for token in all_pred_tokens]
    y_pred = [1] * len(all_pred_tokens)  # Predicted tokens are all "positive" in this context
    
    # Handle edge case where no tokens overlap
    if not y_true or not y_pred:
        return 0.0, 0.0, 0.0
    
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return precision, recall, f1

# Compute all metrics
def compute_metrics(predictions, references):
    results = {}
    
    # ROUGE
    rouge_results = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
    results.update(rouge_results)
    
    # BLEU
    bleu_results = bleu_metric.compute(predictions=predictions, references=[[ref] for ref in references])
    results["bleu"] = bleu_results["bleu"]
    
    # METEOR
    meteor_results = meteor_metric.compute(predictions=predictions, references=references)
    results["meteor"] = meteor_results["meteor"]
    
    # BERTScore
    bertscore_results = bertscore_metric.compute(predictions=predictions, references=references, lang="en")
    results["bertscore_precision"] = np.mean(bertscore_results["precision"])
    results["bertscore_recall"] = np.mean(bertscore_results["recall"])
    results["bertscore_f1"] = np.mean(bertscore_results["f1"])
    
    # Token-level Precision, Recall, F1
    precision, recall, f1 = compute_token_prf(predictions, references)
    results["token_precision"] = precision
    results["token_recall"] = recall
    results["token_f1"] = f1
    
    # Average generation length
    results["gen_len"] = np.mean([len(pred.split()) for pred in predictions])
    
    return results

# Compute metrics
metrics = compute_metrics(predictions, references)

# Display results
print("\n=== Evaluation Results for 50 Random Samples ===")
print("\nSample Outputs (Original vs T5-Generated):")
for i, (ref, pred) in enumerate(zip(references[:5], predictions[:5]), 1):  # Show first 5 for brevity
    print(f"\nSample {i}:")
    print(f"Original SPARQL:  {ref}")
    print(f"T5-Generated:     {pred}")
    print("-" * 80)

print("\nEvaluation Metrics:")
for metric, value in metrics.items():
    if isinstance(value, float):
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: {value}")

# Clear CUDA cache (optional)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("\nCUDA cache cleared.")