In [None]:
# Install / upgrade core dependencies (safe to re-run). Comment out versions to use latest.
%pip install \
  transformers==4.44.2 \
  datasets==2.21.0 \
  evaluate==0.4.2 \
  accelerate==0.34.2 \
  sentencepiece==0.2.0 \
  nltk==3.9.1 \
  rouge-score==0.1.2 \
  bert-score==0.3.13 \
  sacrebleu==2.4.3 \
  scikit-learn==1.5.2 \
  pandas==2.2.2 \
  numpy==1.26.4 \
  torch --upgrade --quiet

import nltk
nltk.download('punkt', quiet=True)
print('Dependencies installed and punkt tokenizer downloaded.')

### Environment & Dependencies

Run the next cell once per environment to ensure all required packages (with tested versions) are installed before training/fine-tuning.

Core libraries:
- transformers (seq2seq model + trainer)
- datasets (dataset handling)
- evaluate (metrics hub)
- accelerate (efficient distributed / mixed precision)
- sentencepiece (tokenization backend for T5)
- nltk (optional: sentence tokenization, metrics)
- rouge_score, bert_score, sacrebleu (evaluation)
- scikit-learn (train/validation split)
- numpy, pandas (general utilities)

Pin versions if you need reproducibility; here we choose recent stable versions that work well with T5-base. Adjust as needed for CUDA / platform constraints.

In [None]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()


def clear_cuda_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("CUDA cache cleared and memory freed.")

In [None]:
import json
import os
import re
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
import nltk
from typing import List, Tuple
from nltk.tokenize import sent_tokenize
from datasets import Dataset, concatenate_datasets
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
import torch
from sklearn.model_selection import train_test_split

2025-04-06 01:25:46.548464: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-06 01:25:46.565218: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743902746.585703    7321 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743902746.591963    7321 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-06 01:25:46.613267: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### Training code

In [3]:
!pip install accelerate -U

Collecting accelerate
  Using cached accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.10.1-py3-none-any.whl (374 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
Successfully installed accelerate-1.10.1


In [5]:
!pip install evaluate nltk rouge_score bert_score transformers[torch]

[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [None]:
## data preparation and making decisions for additional tokens

In [None]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
print(
    tokenizer.tokenize(
        "brick:Natural_Gas rdfs:label ?label SELECT WHERE brick:Air_Flow_Sensor"
    )
)
# Output: ['brick', ':', 'Natural', '_', 'Gas', 'rdfs', ':', 'label', '?', 'label']

['▁brick', ':', 'N', 'atura', 'l', '_', 'G', 'a', 's', '▁', 'r', 'd', 'f', 's', ':', 'l', 'abel', '▁', '?', 'l', 'abel', '▁', 'SEL', 'ECT', '▁W', 'HER', 'E', '▁brick', ':', 'Air', '_', 'Flow', '_', 'S', 'en', 's', 'or']


In [None]:
import json

# Load the first JSON file
with open("updated_bldg_question_pairs_entities.json", "r") as f1:
    data1 = json.load(f1)

# Load the second JSON file
with open("updated_combined_output_with_entity.json", "r") as f2:
    data2 = json.load(f2)


# Function to filter and extract only the required keys
def filter_entry(entry):
    return {
        "question": entry["question"],
        "entity": entry["entity"],
        "sparql": entry["sparql"],
    }


# Process both datasets and combine them
merged_data = []

# Filter entries from file1.json
for entry in data1:
    filtered_entry = filter_entry(entry)
    merged_data.append(filtered_entry)

# Filter entries from file2.json
for entry in data2:
    filtered_entry = filter_entry(entry)
    merged_data.append(filtered_entry)

# Save the merged dataset to a new JSON file
with open("merged_output1.json", "w") as f:
    json.dump(merged_data, f, indent=4)

print(
    "Files merged successfully into 'merged_output.json' with keys: 'question', 'entity', and 'sparql'."
)

Files merged successfully into 'merged_output.json' with keys: 'question', 'entity', and 'sparql'.


In [None]:
import json

# Load the first JSON file
with open("abacws_bldg_question_pairs_entities.json", "r") as f1:
    data1 = json.load(f1)

# Load the second JSON file
with open("abacws_bldg_timeseries_question_pairs_entities.json", "r") as f2:
    data2 = json.load(f2)

#  # Load the second JSON file
# with open("updated_bldg_question_pairs_entities.json", "r") as f3:
#     data3 = json.load(f3)


# Function to filter and extract only the required keys
def filter_entry(entry):
    return {
        "question": entry["question"],
        "entity": entry["entity"],
        "sparql": entry["sparql"],
    }


# Process both datasets and combine them
merged_data = []

# Filter entries from file1.json
for entry in data1:
    filtered_entry = filter_entry(entry)
    merged_data.append(filtered_entry)

# Filter entries from file2.json
for entry in data2:
    filtered_entry = filter_entry(entry)
    merged_data.append(filtered_entry)

# Filter entries from file2.json
# for entry in data3:
#     filtered_entry = filter_entry(entry)
#     merged_data.append(filtered_entry)

# Save the merged dataset to a new JSON file
with open("merged_output2.json", "w") as f:
    json.dump(merged_data, f, indent=4)

print(
    "Files merged successfully into 'merged_output.json' with keys: 'question', 'entity', and 'sparql'."
)

Files merged successfully into 'merged_output.json' with keys: 'question', 'entity', and 'sparql'.


In [None]:
import json

# Load the first JSON file
with open("merged_output2.json", "r") as f1:
    data1 = json.load(f1)
  
# Load the second JSON file
with open("merged_output3.json", "r") as f2:
    data2 = json.load(f2)


# Function to filter and extract only the required keys
def filter_entry(entry):
    return {
        "question": entry["question"],
        "entity": entry["entity"],
        "sparql": entry["sparql"],
    }


# Process both datasets and combine them
merged_data = []

# Filter entries from file1.json
for entry in data1:
    filtered_entry = filter_entry(entry)
    merged_data.append(filtered_entry)

# Filter entries from file2.json
for entry in data2:
    filtered_entry = filter_entry(entry)
    merged_data.append(filtered_entry)

# Save the merged dataset to a new JSON file
with open("training_data.json", "w") as f:
    json.dump(merged_data, f, indent=4)

print(
    "Files merged successfully into 'merged_output.json' with keys: 'question', 'entity', and 'sparql'."
)

Files merged successfully into 'merged_output.json' with keys: 'question', 'entity', and 'sparql'.


In [None]:
import json


# Function to read names from a file (one per line)
def load_names_from_file(filename):
    with open(filename, "r") as f:
        # Strip whitespace and filter out empty lines
        names = [line.strip() for line in f if line.strip()]
    return names


# Load sensors and zones from files
sensors = load_names_from_file("brick_sensors.txt")
zones = load_names_from_file("bldg_zones.txt")

# Add 'brick:' prefix to sensors and 'bldg:' prefix to zones if not already present
sensors = [f"brick:{s}" if not s.startswith("brick:") else s for s in sensors]
zones = [f"bldg:{z}" if not z.startswith("bldg:") else z for z in zones]


# Function to generate SPARQL queries and dataset entries
def generate_sparql_dataset(sensors_list, zones_list):
    dataset = []

    # Base entity template (e.g., "bldg:<Zone> \n brick:<SensorName>")
    def get_entity(sensor, zone):
        sensor_name = sensor.split(":")[1]  # Extract sensor name after "brick:"
        zone_name = zone.split(":")[1]  # Extract zone name after "bldg:"
        return f"bldg:{zone_name} \n brick:{sensor_name}"  # Newline-separated format

    # Loop over each sensor and each zone
    for sensor in sensors_list:
        for zone in zones_list:
            selected_entity = get_entity(sensor, zone)

            # Create the question string with sensor and zone details
            question = (
                f"Tell me the name or label of the {sensor.split(':')[1].replace('_', ' ').lower()} "
                f"in the {zone.split(':')[1].replace('-', ' ').replace('_', ' ')}."
            )

            # Create the corresponding SPARQL query
            sparql_query = f"SELECT ?label WHERE {{ ?sensor a {sensor} ; brick:hasLocation {zone} ; rdfs:label ?label . }}"

            # Create an entry and append it to the dataset
            entry = {
                "question": question,
                "entity": selected_entity,
                "sparql": sparql_query,
            }
            dataset.append(entry)

    return dataset


# Generate the dataset (will produce 5000 entries if there are 100 sensors and 50 zones)
dataset = generate_sparql_dataset(sensors, zones)

# Save to JSON file
with open("merged_output3.json", "w") as f:
    json.dump(dataset, f, indent=4)

print("Dataset generated and saved to 'sparql_dataset1.json'.")

Dataset generated and saved to 'sparql_dataset1.json'.


In [None]:
!pip install evaluate

In [None]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()


def clear_cuda_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("CUDA cache cleared and memory freed.")


import os

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import gc
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import evaluate
import torch
import nltk
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
nltk.download("punkt", quiet=True)

# 1. LOAD AND PREPARE DATA
DATA_FILE = "updated_combined_output_with_entity.json"
with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Loaded {len(data)} records from {DATA_FILE}")
print("Example record:", data[0])

# Create multi-task training pairs
inputs, targets = [], []
for record in data:
    question = record.get("question", "")
    entity = record.get("entity", "")
    sparql = record.get("sparql", "")
    # response = record.get("sparql_response", "")
    # explanation = record.get("explanation", "")

    # Task 1: NL to SPARQL
    if question and entity and sparql:
        inputs.append(f"task: generate_sparql\ninput: {question}\nentity{entity}")
        targets.append(sparql)

    # # Task 2: Summarize response
    # if question and response and explanation:
    #     inputs.append(f"task: summarize_response\nquestion: {question}\nresponse: {response}")
    #     targets.append(explanation)

print(f"Generated {len(inputs)} total training pairs from {len(data)} records.")
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.1, random_state=42
)
print(f"Train size: {len(train_inputs)} | Validation size: {len(val_inputs)}")

# Save splits
train_data = [
    {"input_text": inp, "target_text": tgt}
    for inp, tgt in zip(train_inputs, train_targets)
]
val_data = [
    {"input_text": inp, "target_text": tgt} for inp, tgt in zip(val_inputs, val_targets)
]
with open("train_data_2April.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
with open("val_data_2April.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)
print("Saved train_data_2April.json and val_data_2April.json!")

# Build datasets
raw_datasets = DatasetDict(
    {
        "train": Dataset.from_dict(
            {"input_text": train_inputs, "target_text": train_targets}
        ),
        "validation": Dataset.from_dict(
            {"input_text": val_inputs, "target_text": val_targets}
        ),
    }
)
print("Train sample:", raw_datasets["train"][3])
print("Validation sample:", raw_datasets["validation"][3])

# 2. LOAD MODEL & TOKENIZER
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Add custom tokens once
# custom_tokens = []
# with open("all_relations_and_classes.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# with open("output_entities.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# num_added_tokens = tokenizer.add_tokens(custom_tokens)
# model.resize_token_embeddings(len(tokenizer))
# print(f"Added {num_added_tokens} new tokens to the tokenizer!")


# 3. PREPROCESSING
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"], max_length=512, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 4. EVALUATION METRICS
metric_rouge = evaluate.load("rouge")
metric_bleu = evaluate.load("bleu")
metric_meteor = evaluate.load("meteor")
metric_bertscore = evaluate.load("bertscore")


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    results = {}
    rouge_result = metric_rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    results.update(rouge_result)
    bleu_result = metric_bleu.compute(
        predictions=decoded_preds, references=[[label] for label in decoded_labels]
    )
    results["bleu"] = bleu_result["bleu"]
    meteor_result = metric_meteor.compute(
        predictions=decoded_preds, references=decoded_labels
    )
    results["meteor"] = meteor_result["meteor"]
    bertscore_result = metric_bertscore.compute(
        predictions=decoded_preds, references=decoded_labels, lang="en"
    )
    results["bertscore_precision"] = np.mean(bertscore_result["precision"])
    results["bertscore_recall"] = np.mean(bertscore_result["recall"])
    results["bertscore_f1"] = np.mean(bertscore_result["f1"])
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    results["gen_len"] = np.mean(prediction_lens)
    return results


# 5. TRAINING ARGUMENTS
training_args = Seq2SeqTrainingArguments(
    output_dir="./training_t5small",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    logging_dir="./training_t5small",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust based on GPU
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=30,
    predict_with_generate=True,
    fp16=True,  # Enable if GPU supports
    report_to=["tensorboard"],
    warmup_steps=500,
    lr_scheduler_type="cosine",
)

# 6. TRAINER SETUP
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# 7. TRAIN
trainer.train()

In [None]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()


def clear_cuda_cache():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("CUDA cache cleared and memory freed.")


import os

os.environ["WANDB_MODE"] = "offline"

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import gc
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import evaluate
import torch
import nltk
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
nltk.download("punkt", quiet=True)

# 1. LOAD AND PREPARE DATA
DATA_FILE = "updated_combined_output_with_entity.json"
with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)
print(f"Loaded {len(data)} records from {DATA_FILE}")
print("Example record:", data[0])

# Create multi-task training pairs
inputs, targets = [], []
for record in data:
    question = record.get("question", "")
    entity = record.get("entity", "")
    sparql = record.get("sparql", "")
    response = record.get("sparql_response", "")
    explanation = record.get("explanation", "")

    # Task 1: NL to SPARQL
    if question and entity and sparql:
        inputs.append(f"task: generate_sparql\ninput: {question}\nentity{entity}")
        targets.append(sparql)

    # Task 2: Summarize response
    if question and response and explanation:
        inputs.append(
            f"task: summarize_response\nquestion: {question}\nresponse: {response}"
        )
        targets.append(explanation)

print(f"Generated {len(inputs)} total training pairs from {len(data)} records.")
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.1, random_state=40
)
print(f"Train size: {len(train_inputs)} | Validation size: {len(val_inputs)}")

# Save splits
train_data = [
    {"input_text": inp, "target_text": tgt}
    for inp, tgt in zip(train_inputs, train_targets)
]
val_data = [
    {"input_text": inp, "target_text": tgt} for inp, tgt in zip(val_inputs, val_targets)
]
with open("train_data_2April.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
with open("val_data_2April.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)
print("Saved train_data_2April.json and val_data_2April.json!")

# Build datasets
raw_datasets = DatasetDict(
    {
        "train": Dataset.from_dict(
            {"input_text": train_inputs, "target_text": train_targets}
        ),
        "validation": Dataset.from_dict(
            {"input_text": val_inputs, "target_text": val_targets}
        ),
    }
)
print("Train sample:", raw_datasets["train"][3])
print("Validation sample:", raw_datasets["validation"][3])

# 2. LOAD MODEL & TOKENIZER
model_name = "./training_t5smallv2/checkpoint-343920"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Add custom tokens once
# custom_tokens = []
# with open("all_relations_and_classes.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# with open("output_entities.txt", "r", encoding="utf-8") as f:
#     custom_tokens.extend([line.strip() for line in f.readlines()])
# num_added_tokens = tokenizer.add_tokens(custom_tokens)
# model.resize_token_embeddings(len(tokenizer))
# print(f"Added {num_added_tokens} new tokens to the tokenizer!")


# 3. PREPROCESSING
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"], max_length=512, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 4. EVALUATION METRICS
metric_rouge = evaluate.load("rouge")
metric_bleu = evaluate.load("bleu")
metric_meteor = evaluate.load("meteor")
metric_bertscore = evaluate.load("bertscore")


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    results = {}
    rouge_result = metric_rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    results.update(rouge_result)
    bleu_result = metric_bleu.compute(
        predictions=decoded_preds, references=[[label] for label in decoded_labels]
    )
    results["bleu"] = bleu_result["bleu"]
    meteor_result = metric_meteor.compute(
        predictions=decoded_preds, references=decoded_labels
    )
    results["meteor"] = meteor_result["meteor"]
    bertscore_result = metric_bertscore.compute(
        predictions=decoded_preds, references=decoded_labels, lang="en"
    )
    results["bertscore_precision"] = np.mean(bertscore_result["precision"])
    results["bertscore_recall"] = np.mean(bertscore_result["recall"])
    results["bertscore_f1"] = np.mean(bertscore_result["f1"])
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    results["gen_len"] = np.mean(prediction_lens)
    return results


# 5. TRAINING ARGUMENTS
training_args = Seq2SeqTrainingArguments(
    output_dir="./training_t5small",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    logging_strategy="epoch",
    logging_dir="./training_t5small",
    logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust based on GPU
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=30,
    predict_with_generate=True,
    fp16=True,  # Enable if GPU supports
    report_to=["tensorboard"],
    warmup_steps=500,
    lr_scheduler_type="cosine",
)

# 6. TRAINER SETUP
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# 7. TRAIN
trainer.train()

Loaded 52599 records from updated_combined_output_with_entity.json
Example record: {'question': 'What is the area of building bldg1?', 'entity': 'brick:area', 'sparql': 'SELECT ?value ?unit WHERE { bldg:bldg1 brick:area ?area . ?area brick:value ?value . ?area brick:hasUnits ?unit . }', 'sparql_response': '[{"value": "9973^^xsd:integer", "unit": "FT_2"}]', 'explanation': 'The area of building bldg1 is 9,973 square feet (FT_2). This means that the total floor space occupied by this building is approximately 9,973 square feet. This measurement is commonly used for real estate and construction purposes to describe the size of a building or space. In this case, the number "9,973" represents the precise area of building bldg1 as obtained from the smart building data created using Brickschema ontology. The "FT_2" notation signifies that the unit of measurement is square feet (foot squared).', 'id': 1}
Generated 101897 total training pairs from 52599 records.
Train size: 91707 | Validation si

Map:   0%|          | 0/91707 [00:00<?, ? examples/s]

Map:   0%|          | 0/10190 [00:00<?, ? examples/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Meteor,Bertscore Precision,Bertscore Recall,Bertscore F1,Gen Len
1,0.2957,0.248385,0.504155,0.433181,0.49606,0.49605,0.005084,0.480016,0.934897,0.888933,0.910661,18.019725
2,0.2943,0.248268,0.507756,0.438752,0.499885,0.499957,0.005157,0.482675,0.935279,0.889683,0.911232,18.021295
3,0.293,0.248041,0.511299,0.443744,0.503168,0.503253,0.005315,0.484156,0.9361,0.89065,0.912128,18.02051


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

### Evaluation code

In [None]:
import json
import torch
import numpy as np
from transformers import T5ForConditionalGeneration
import os
import torch
import numpy as np
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    T5Tokenizer,
    DataCollatorForSeq2Seq,
)
import gc
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import evaluate
import nltk
import numpy as np

# -----------------------------------------------------------------------------
# 1. LOAD THE FINE-TUNED MODEL AND TOKENIZER
# -----------------------------------------------------------------------------
model_name = "./training_t5small/checkpoint-343920"  # Use your latest checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")

# -----------------------------------------------------------------------------
# 2. LOAD EVALUATION DATA
# -----------------------------------------------------------------------------
# Option 1: Load from validation split saved during training
val_data_file = "val_data_2April.json"
with open(val_data_file, "r", encoding="utf-8") as f:
    val_data = json.load(f)
print(f"Loaded {len(val_data)} validation records from {val_data_file}")

# Separate tasks
sparql_inputs = []
sparql_targets = []
summarization_inputs = []
summarization_targets = []

for record in val_data:
    input_text = record["input_text"]
    target_text = record["target_text"]

    if "task: generate_sparql" in input_text:
        sparql_inputs.append(input_text)
        sparql_targets.append(target_text)
    elif "task: summarize_response" in input_text:
        summarization_inputs.append(input_text)
        summarization_targets.append(target_text)

print(f"SPARQL task: {len(sparql_inputs)} examples")
print(f"Summarization task: {len(summarization_inputs)} examples")


# -----------------------------------------------------------------------------
# 3. DEFINE HELPER FUNCTIONS
# -----------------------------------------------------------------------------
def generate_predictions(input_texts, max_length=512):
    model.eval()
    predictions = []
    with torch.no_grad():
        for text in input_texts:
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding="max_length",
                max_length=512,
            ).to(device)
            output_ids = model.generate(
                inputs["input_ids"],
                max_length=max_length,
                num_beams=4,  # Beam search for better quality
                early_stopping=True,
            )
            pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            predictions.append(pred.strip())
    return predictions


# -----------------------------------------------------------------------------
# 4. LOAD EVALUATION METRICS
# -----------------------------------------------------------------------------
metric_rouge = evaluate.load("rouge")
metric_bleu = evaluate.load("bleu")
metric_meteor = evaluate.load("meteor")
metric_bertscore = evaluate.load("bertscore")


def compute_metrics(predictions, references, task_name=""):
    results = {}
    # ROUGE
    rouge_result = metric_rouge.compute(
        predictions=predictions, references=references, use_stemmer=True
    )
    results.update({f"{task_name}_rouge_{k}": v for k, v in rouge_result.items()})

    # BLEU
    bleu_result = metric_bleu.compute(
        predictions=predictions, references=[[ref] for ref in references]
    )
    results[f"{task_name}_bleu"] = bleu_result["bleu"]

    # METEOR
    meteor_result = metric_meteor.compute(
        predictions=predictions, references=references
    )
    results[f"{task_name}_meteor"] = meteor_result["meteor"]

    # BERTScore
    bertscore_result = metric_bertscore.compute(
        predictions=predictions, references=references, lang="en"
    )
    results[f"{task_name}_bertscore_precision"] = np.mean(bertscore_result["precision"])
    results[f"{task_name}_bertscore_recall"] = np.mean(bertscore_result["recall"])
    results[f"{task_name}_bertscore_f1"] = np.mean(bertscore_result["f1"])

    # Average generation length
    prediction_lens = [len(pred.split()) for pred in predictions]
    results[f"{task_name}_gen_len"] = np.mean(prediction_lens)

    return results


# -----------------------------------------------------------------------------
# 5. EVALUATE SPARQL GENERATION TASK
# -----------------------------------------------------------------------------
print("\n=== Evaluating SPARQL Generation Task ===")
sparql_predictions = generate_predictions(sparql_inputs, max_length=128)

# Print a few examples
for i, (inp, pred, tgt) in enumerate(
    zip(sparql_inputs[:3], sparql_predictions[:3], sparql_targets[:3])
):
    print(f"Example {i+1}:")
    print("Input:     ", inp)
    print("Prediction:", pred)
    print("Target:    ", tgt)
    print("-" * 50)

# Compute metrics
sparql_metrics = compute_metrics(sparql_predictions, sparql_targets, task_name="sparql")
print("\nSPARQL Generation Metrics:")
for k, v in sparql_metrics.items():
    print(f"{k}: {v:.4f}")

# -----------------------------------------------------------------------------
# 6. EVALUATE SUMMARIZATION TASK
# -----------------------------------------------------------------------------
print("\n=== Evaluating Summarization Task ===")
summarization_predictions = generate_predictions(summarization_inputs, max_length=256)

# Print a few examples
for i, (inp, pred, tgt) in enumerate(
    zip(
        summarization_inputs[:3],
        summarization_predictions[:3],
        summarization_targets[:3],
    )
):
    print(f"Example {i+1}:")
    print("Input:     ", inp)
    print("Prediction:", pred)
    print("Target:    ", tgt)
    print("-" * 50)

# Compute metrics
summarization_metrics = compute_metrics(
    summarization_predictions, summarization_targets, task_name="summarization"
)
print("\nSummarization Metrics:")
for k, v in summarization_metrics.items():
    print(f"{k}: {v:.4f}")

# -----------------------------------------------------------------------------
# 7. CLEAR CUDA CACHE (Optional)
# -----------------------------------------------------------------------------
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared.")

Model loaded on cuda
Loaded 10190 validation records from val_data_2April.json
SPARQL task: 5187 examples
Summarization task: 5003 examples


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!



=== Evaluating SPARQL Generation Task ===
Example 1:
Input:      task: generate_sparql
input: What types is the Occupied Heating Mode Status?
entitybrick:Occupied_Heating_Mode_Status
Prediction: SELECT?definition WHERE?quantity skos:definition?definition.
Target:     SELECT ?type WHERE { brick:Occupied_Heating_Mode_Status a ?type . }
--------------------------------------------------
Example 2:
Input:      task: generate_sparql
input: Provide the definition for Air Flow Setpoint.
entitybrick:Air_Flow_Setpoint
Prediction: SELECT?definition WHERE brick:Fan skos:definition?definition.
Target:     SELECT ?definition WHERE { brick:Air_Flow_Setpoint skos:definition ?definition . }
--------------------------------------------------
Example 3:
Input:      task: generate_sparql
input: Can you provide the definition for the Return Air Humidity Sensor?
entitybrick:Return_Air_Humidity_Sensor
Prediction: SELECT?definition WHERE?parent skos:definition?definition.
Target:     SELECT ?definition WHER

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



SPARQL Generation Metrics:
sparql_rouge_rouge1: 0.6714
sparql_rouge_rouge2: 0.5177
sparql_rouge_rougeL: 0.6709
sparql_rouge_rougeLsum: 0.6709
sparql_bleu: 0.3783
sparql_meteor: 0.6631
sparql_bertscore_precision: 0.9139
sparql_bertscore_recall: 0.8740
sparql_bertscore_f1: 0.8934
sparql_gen_len: 3.8359

=== Evaluating Summarization Task ===
Example 1:
Input:      task: summarize_response
question: What is the quantity linked to the Return Air Enthalpy Sensor?
response: [{"quantity": "Enthalpy"}]
Prediction: The Return Air Enthalpy Sensor in this smart building system measures the enthalpy of the air being returned to the HVAC (Heating, Ventilation, and Air Conditioning) system. Enthalpy is a thermodynamic property that describes the total energy content of a substance (in this case, air) in a given volume of air. In simpler terms, the Return Air Enthalpy Sensor measures the enthalpy of the air being returned to the heating, ventilation, and air conditioning (HVAC) system after it has ci

### test on new data 

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# -----------------------------------------------------------------------------
# 1. LOAD THE TRAINED MODEL AND TOKENIZER
# -----------------------------------------------------------------------------
model_name = "./training_t5small/checkpoint-57320"  # Update with your checkpoint path
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")


# -----------------------------------------------------------------------------
# 2. HELPER FUNCTION TO GENERATE RESPONSES
# -----------------------------------------------------------------------------
def generate_response(input_text, max_length=256):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=512,
        ).to(device)
        output_ids = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            num_beams=4,  # Beam search for better quality
            early_stopping=True,
        )
        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response.strip()


# -----------------------------------------------------------------------------
# 3. INTERACTIVE TESTING LOOP
# -----------------------------------------------------------------------------
def test_model():
    print("\n=== Interactive Model Testing ===")
    print("Enter inputs for SPARQL generation or summarization tasks.")
    print(
        "For SPARQL: Provide a natural language question (e.g., 'What is the capital of France?')"
    )
    print(
        "For Summarization: Provide a question and response (e.g., 'What is the weather like? [Weather data]')"
    )
    print("Type 'exit' to quit.\n")

    while True:
        # Get user input
        task_choice = input("Choose task (1 for SPARQL, 2 for Summarization): ").strip()

        if task_choice.lower() == "exit":
            break

        if task_choice not in ["1", "2"]:
            print("Invalid choice. Please enter 1 for SPARQL or 2 for Summarization.")
            continue

        # Task 1: SPARQL Generation
        if task_choice == "1":
            question = input("Enter your question: ").strip()
            if not question:
                print("Please provide a question.")
                continue

            # Add entity if provided (optional)
            entity = input("Enter entity (optional, press Enter to skip): ").strip()
            input_text = f"task: generate_sparql\ninput: {question}"
            if entity:
                input_text += f"\nentity: {entity}"

            # Generate SPARQL
            sparql_response = generate_response(input_text, max_length=128)
            print("\nGenerated SPARQL:")
            print(sparql_response)
            print("-" * 50)

        # Task 2: Summarization
        elif task_choice == "2":
            question = input("Enter your question: ").strip()
            response = input("Enter the response to summarize: ").strip()
            if not question or not response:
                print("Please provide both a question and a response.")
                continue

            input_text = (
                f"task: summarize_response\nquestion: {question}\nresponse: {response}"
            )

            # Generate summary
            summary_response = generate_response(input_text, max_length=256)
            print("\nGenerated Summary:")
            print(summary_response)
            print("-" * 50)


# -----------------------------------------------------------------------------
# 4. RUN THE TEST
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    test_model()

    # Clear CUDA cache after testing (optional)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("CUDA cache cleared.")

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load your trained model and tokenizer from the saved directory.
model_dir = "./training_t5smallv2/checkpoint-343920"  # Update this path if needed
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def generate_sparql(question, entity):
    """
    Generate a SPARQL query given a natural language question and an entity.
    """
    # Format the input as it was during training.
    input_text = f"task: generate_sparql\ninput: {question}\nentity{entity}"
    input_ids = tokenizer.encode(
        input_text, return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    # Generate output using beam search.
    outputs = model.generate(
        input_ids, max_length=150, num_beams=5, early_stopping=True
    )
    generated_sparql = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_sparql


def summarize_response(question, response):
    """
    Generate a summary (explanation) based on the question and response text.
    """
    # Format the input as it was during training.
    input_text = f"task: summarize_response\nquestion: {question}\nresponse: {response}"
    input_ids = tokenizer.encode(
        input_text, return_tensors="pt", truncation=True, max_length=512
    ).to(device)

    # Generate summary output.
    outputs = model.generate(
        input_ids, max_length=256, num_beams=5, early_stopping=True
    )
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary


# Example input data
input_data = {
    "question": "Provide the definition for Heating Ventilation Air Conditioning System.",
    "entity": "brick:Heating_Ventilation_Air_Conditioning_System",
    "sparql": "SELECT ?definition WHERE { brick:Heating_Ventilation_Air_Conditioning_System skos:definition ?definition . }",
    "sparql_response": '[{"definition": "The equipment, distribution systems and terminals that provide, either collectively or individually, the processes of heating, ventilating or air conditioning to a building or portion of a building"}]',
    "explanation": (
        "The Heating Ventilation Air Conditioning (HVAC) System is a set of equipment, systems, and components that work together "
        "to control and maintain comfortable conditions within a building. This includes processes such as heating the interior space during "
        "cold weather, cooling it during warm weather, moving air for ventilation, and maintaining the desired indoor air quality. The HVAC "
        "system is crucial for providing a comfortable and healthy environment in any building, be it residential or commercial. It can "
        "consist of various components like air handlers, ducts, vents, thermostats, boilers, furnaces, cooling towers, heat pumps, and more, "
        "depending on the specific needs of the building. The HVAC system is designed to ensure that the temperature, humidity, and air quality "
        "within the building are maintained at optimal levels for the comfort and health of its occupants."
    ),
    "id": 49568,
}

# Get outputs for both tasks
sparql_output = generate_sparql(input_data["question"], input_data["entity"])
summary_output = summarize_response(
    input_data["question"], input_data["sparql_response"]
)

print("Generated SPARQL Query:")
print(sparql_output)
print("\nGenerated Summary:")
print(summary_output)

Generated SPARQL Query:
SELECT?definition WHERE brick:Heating_Ventilation_Air_Conditioning_System skos:definition?definition.

Generated Summary:
A Heating Ventilation Air Conditioning (HVAC) System refers to the collection of equipment, distribution systems, and terminals that provide, collectively or individually, the processes of heating, ventilating, or air conditioning to a specific building or portion of a building. This system is responsible for maintaining a comfortable indoor environment by regulating temperature, humidity, and air quality. The primary function of a HVAC system is to maintain a comfortable temperature for occupants while also ensuring energy efficiency.


In [3]:
def add_prefix(input_file: str, output_file: str):
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        for line in infile:
            # Remove any trailing newline or whitespace
            line = line.strip()
            if line:
                # Prepend "bldg:" to the line and write it to the output file
                outfile.write(f" - {line}\n")


if __name__ == "__main__":
    add_prefix("bldg_sensors.txt", "bldg_sensors_sufix.txt")
    print("bldg_sensors_dash.txt")

bldg_sensors_dash.txt


# Continued Fine-Tuning: Combined Extended + Schema Datasets

This section loads the two newly generated datasets:
- `bldg3_dataset_extended.json` (semantic sensor + analytic queries)
- `Transformers/t5_base/training/bldg3/bldg3_schema_dataset.json` (ontology/TBox + structural queries)

It merges them into a single training corpus, converts multi-entity lists into a flat string for the model input, and continues fine-tuning from the latest prior checkpoint (if available) or the base `t5-base` model. New checkpoints will be written under `./trained/combined_t5/`.

You can adjust hyperparameters (epochs, batch size, lr) in the following cells if needed.

In [2]:
# Install / upgrade core dependencies (safe to re-run). Comment out versions to use latest.
%pip install \
  transformers==4.44.2 \
  datasets==2.21.0 \
  evaluate==0.4.2 \
  accelerate==0.34.2 \
  sentencepiece==0.2.0 \
  nltk==3.9.1 \
  rouge-score==0.1.2 \
  bert-score==0.3.13 \
  sacrebleu==2.4.3 \
  scikit-learn==1.5.2 \
  pandas==2.2.2 \
  numpy==1.26.4 \
  torch --upgrade --quiet

import nltk
nltk.download('punkt', quiet=True)
print('Dependencies installed and punkt tokenizer downloaded.')

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Dependencies installed and punkt tokenizer downloaded.


In [3]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp311-cp311-linux_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m75.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (780.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda

In [4]:
!pip install transformers datasets evaluate rouge-score sacrebleu nltk bert-score scikit-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [8]:
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


[0mLooking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (780.5 MB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp311-cp311-linux_x86_64.whl (7.3 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-2.5.1+cu121 torchaudio-2.5.1+cu121 torchvision-0.20.1+cu121
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [9]:
!python -c "import torch; print(f'CUDA: {torch.cuda.is_available()}'); print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"None\"}')"


CUDA: True
GPU: NVIDIA RTX A6000


In [11]:
!pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.20.1-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.21,>=2.20 (from tf-keras)
  Downloading tensorflow-2.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow<2.21,>=2.20->tf-keras)
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting protobuf>=5.28.0 (from tensorflow<2.21,>=2.20->tf-keras)
  Downloading protobuf-6.32.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting tensorboard~=2.20.0 (from tensorflow<2.21,>=2.20->tf-keras)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting keras>=3.10.0 (from tensorflow<2.21,>=2.20->tf-keras)
  Downloading keras-3.11.3-py3-none-any.whl.metadata (5.9 kB)
Collecting h5py>=3.11.0 (from tensorflow<2.21,>=2.20->tf-keras)
  Downloading h5py-3.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting ml_dtypes<

# Training

In [13]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.22.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading gitpython-3.1.45-py3-none-any.whl.metadata (13 kB)
Collecting pydantic<3 (from wandb)
  Downloading pydantic-2.11.9-py3-none-any.whl.metadata (68 kB)
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.39.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3->wandb)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.33.2 (from pydantic<3->wandb)
  Downloading pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting typing-extensions<5,>=4.8 (from wandb)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting typing-inspection>=0.4.0 (from pydantic<3->wandb)
  Downloading typing_inspection-0.4.2-py3-none-any.whl.metadata (2.6 kB)


In [1]:
%sx wandb login 2eda0d96fb7751d51e6cf57d224d6f0792d83fec

['wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc',
 'wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin']

In [2]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33msuhasdevmane[0m ([33msuhasdevmane-cardiff-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import os, json, random, math, gc, sys, time, statistics, traceback
from pathlib import Path
import torch
from torch import nn
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq, EarlyStoppingCallback, TrainerCallback
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import evaluate

# --- NEW: Import wandb for logging ---
try:
    import wandb
    WANDB_AVAILABLE = True
    print("W&B imported successfully.")
except ImportError:
    WANDB_AVAILABLE = False
    print("WARNING: wandb not installed. Install with 'pip install wandb' to enable online logging.")
# --------------------------------------


SEED = 123
random.seed(SEED)
try:
    import numpy as np
    np.random.seed(SEED)
except ImportError:
    pass
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

EXTENDED_PATH = Path('./training/raw_merged_extended_datasets.json')
SCHEMA_PATH = Path('./training/raw_merged_schema_datasets.json')
assert EXTENDED_PATH.exists(), f"Missing {EXTENDED_PATH}"
assert SCHEMA_PATH.exists(), f"Missing {SCHEMA_PATH}"

with EXTENDED_PATH.open('r', encoding='utf-8') as f:
    extended_data = json.load(f)
with SCHEMA_PATH.open('r', encoding='utf-8') as f:
    schema_data = json.load(f)

print(f"Loaded extended examples: {len(extended_data)} | schema examples: {len(schema_data)}")

def normalize(entry):
    q = entry.get('question','').strip()
    sparql = entry.get('sparql','').strip()
    entities = entry.get('entities') or entry.get('entity') or []
    if isinstance(entities, str):
        entities_list = [e.strip() for e in entities.split('\n') if e.strip()]
    else:
        entities_list = entities
    entity_block = '\n'.join(entities_list) if entities_list else ''
    return {
        'question': q,
        'entities_list': entities_list,
        'entity_block': entity_block,
        'sparql': sparql
    }

normalized = [normalize(e) for e in extended_data] + [normalize(e) for e in schema_data]
print(f"Total normalized records: {len(normalized)}")

inputs, targets = [], []
for rec in normalized:
    if rec['question'] and rec['sparql']:
        ent_part = f"\nentity: {rec['entity_block']}" if rec['entity_block'] else ''
        inputs.append(f"task: generate_sparql\ninput: {rec['question']}{ent_part}")
        targets.append(rec['sparql'])

print(f"Prepared training pairs: {len(inputs)} (questions with SPARQL)")

train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.05, random_state=SEED
)
print(f"Train size: {len(train_inputs)} | Validation size: {len(val_inputs)}")

raw_datasets = DatasetDict({
    'train': Dataset.from_dict({'input_text': train_inputs, 'target_text': train_targets}),
    'validation': Dataset.from_dict({'input_text': val_inputs, 'target_text': val_targets}),
})

with open('merged_combined_corpus.json', 'w', encoding='utf-8') as f:
    json.dump([
        {'input_text': i, 'target_text': t} for i, t in zip(inputs, targets)
    ], f, ensure_ascii=False, indent=2)
print('Saved merged_combined_corpus.json')

POSSIBLE_PREV = [
    './trained/checkpoint-2',
]
model_source = None
for path in POSSIBLE_PREV:
    if Path(path).exists():
        model_source = path
        break
if model_source is None:
    model_source = 't5-base'
print(f"Continuing fine-tune from: {model_source}")

try:
    tokenizer = T5Tokenizer.from_pretrained(model_source)
    print(f"Loaded tokenizer from {model_source}")
    print(f"Tokenizer vocabulary size: {len(tokenizer)}")
except Exception as e:
    print(f"Tokenizer load failed from {model_source}, falling back to t5-base: {e}")
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    print(f"Tokenizer vocabulary size: {len(tokenizer)}")

try:
    model = T5ForConditionalGeneration.from_pretrained(model_source, device_map=None)
    print(f"Successfully loaded model from {model_source}")
    print(f"Model embedding size: {model.get_input_embeddings().weight.shape[0]}")
    if len(tokenizer) != model.get_input_embeddings().weight.shape[0]:
        print(f"WARNING: Vocabulary size mismatch!")
        print(f"  Tokenizer vocab size: {len(tokenizer)}")
        print(f"  Model embedding size: {model.get_input_embeddings().weight.shape[0]}")
        print("  This will cause IndexError during generation!")
        sys.exit(1)
    else:
        print(f"✓ Vocabulary sizes match: {len(tokenizer)}")
except Exception as e:
    print(f"Model load failed from {model_source}, falling back to t5-base: {e}")
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    print(f"Model embedding size: {model.get_input_embeddings().weight.shape[0]}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n{'='*60}\nDEVICE CONFIGURATION\n{'='*60}")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"{'='*60}\n")
else:
    print("WARNING: No GPU detected! Training will be very slow on CPU.")
    print("Make sure you have:")
    print("  1. A CUDA-capable GPU")
    print("  2. CUDA toolkit installed")
    print("  3. PyTorch with CUDA support:")
    print("      pip uninstall torch torchvision torchaudio -y")
    print("      pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121")
    print(f"{'='*60}\n")
    response = input("Continue with CPU training? (yes/no): ")
    if response.lower() != 'yes':
        print("Exiting. Please install CUDA support and try again.")
        sys.exit(1)

model.to(device)
print(f"Model moved to {device}")

# ----- T5 generation settings: prevents invalid pad tokens
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.pad_token_id
if hasattr(model, "generation_config"):
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    model.generation_config.eos_token_id = tokenizer.eos_token_id

SPECIAL_TOKENS = []
if SPECIAL_TOKENS:
    added = tokenizer.add_tokens([t for t in SPECIAL_TOKENS if t not in tokenizer.get_vocab()])
    if added:
        model.resize_token_embeddings(len(tokenizer))
        print(f"Added {added} special tokens")

def safe_load_metric(name, pip_pkg=None, alt=None):
    if name in ['rouge', 'bleu', 'meteor'] and alt:
        print(f"Using direct fallback implementation for '{name}' metric")
        return alt, True
    try:
        metric = evaluate.load(name)
        def _call(preds, refs):
            return metric.compute(predictions=preds, references=refs)
        return _call, False
    except Exception as e:
        print(f"Failed to load metric '{name}' from evaluate: {e}")
        if pip_pkg:
            try:
                import subprocess
                subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', pip_pkg], check=False)
                metric = evaluate.load(name)
                def _call(preds, refs):
                    return metric.compute(predictions=preds, references=refs)
                return _call, False
            except Exception as e2:
                print(f"Failed to load metric '{name}' after install attempt: {e2}")
        if alt:
            print(f"Using fallback implementation for '{name}'")
            return alt, True
        else:
            def _noop(preds, refs):
                return {}
            print(f"No fallback available for metric '{name}'. Returning empty dict.")
            return _noop, True

def rouge_fallback(preds, refs):
    try:
        from rouge_score import rouge_scorer
    except ImportError:
        import subprocess
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', 'rouge-score'], check=True)
        from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    r1, r2, rl = [], [], []
    for p, r in zip(preds, refs):
        scores = scorer.score(r, p)
        r1.append(scores['rouge1'].fmeasure)
        r2.append(scores['rouge2'].fmeasure)
        rl.append(scores['rougeL'].fmeasure)
    return {
        'rouge1': sum(r1)/len(r1) if r1 else 0.0,
        'rouge2': sum(r2)/len(r2) if r2 else 0.0,
        'rougeL': sum(rl)/len(rl) if rl else 0.0,
    }

def bleu_fallback(preds, refs):
    try:
        import sacrebleu
    except ImportError:
        import subprocess
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', 'sacrebleu'], check=True)
        import sacrebleu
    bleu = sacrebleu.corpus_bleu(preds, [refs])
    return {'bleu': bleu.score}

def meteor_fallback(preds, refs):
    try:
        import nltk
        from nltk.translate.meteor_score import meteor_score
    except ImportError:
        import subprocess
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', 'nltk'], check=True)
        import nltk
        from nltk.translate.meteor_score import meteor_score
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet', quiet=True)
    # Now split tokens for each sentence
    scores = [meteor_score([r.split()], p.split()) for p, r in zip(preds, refs)]
    return {'meteor': sum(scores)/len(scores) if scores else 0.0}

def bertscore_fallback(preds, refs):
    try:
        from bert_score import score as bert_score
    except ImportError:
        import subprocess
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', 'bert-score'], check=False)
        from bert_score import score as bert_score
    P, R, F = bert_score(preds, refs, lang='en', verbose=False)
    return {
        'bertscore_precision': float(P.mean()),
        'bertscore_recall': float(R.mean()),
        'bertscore_f1': float(F.mean())
    }

metric_rouge, rouge_fallback_used = safe_load_metric('rouge', pip_pkg='rouge-score', alt=rouge_fallback)
metric_bleu, bleu_fallback_used = safe_load_metric('bleu', pip_pkg='sacrebleu', alt=bleu_fallback)
metric_meteor, meteor_fallback_used = safe_load_metric('meteor', pip_pkg='nltk', alt=meteor_fallback)
metric_bertscore, bertscore_fallback_used = safe_load_metric('bertscore', pip_pkg='bert-score', alt=bertscore_fallback)

print("Metric loaders ready. Fallback usage:")
print({
    'rouge_fallback': rouge_fallback_used,
    'bleu_fallback': bleu_fallback_used,
    'meteor_fallback': meteor_fallback_used,
    'bertscore_fallback': bertscore_fallback_used,
})

max_source_len = 512
max_target_len = 256

print(f"Using tokenizer vocabulary size: {len(tokenizer)}")
label_pad_token_id = -100

def preprocess(batch):
    model_inputs = tokenizer(
        batch['input_text'],
        max_length=max_source_len,
        truncation=True,
        padding='max_length'
    )
    labels = tokenizer(
        text_target=batch['target_text'],
        max_length=max_target_len,
        truncation=True,
        padding='max_length'
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized = raw_datasets.map(preprocess, batched=True, remove_columns=['input_text','target_text'])

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=label_pad_token_id)

class MetricsTableCallback(TrainerCallback):
    """Custom callback to log metrics locally to a CSV file."""
    def __init__(self, save_csv_path='epoch_metrics_log.csv'):
        self.save_csv_path = save_csv_path
        self.rows = []
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        # Hugging Face Trainer logs 'loss' and 'eval_loss'.
        # Rename 'eval_loss' to 'validation_loss' for consistency with plotting script.
        row = {
            'epoch': state.epoch, # Add epoch for better CSV plotting
            'step': state.global_step,
            'training_loss': metrics.get('loss', 0.0),
            'validation_loss': metrics.get('eval_loss', 0.0)
        }
        
        # Add all computed metrics
        for k,v in metrics.items():
            if isinstance(v, (int, float)):
                 # Only keep metrics not already captured and not starting with 'eval_'
                if not k.startswith('eval_') and k not in ['loss']:
                    row[k] = v
                # Capture all eval metrics
                elif k.startswith('eval_') and k != 'eval_loss':
                    row[k.replace('eval_', '')] = v
                    
        self.rows.append(row)
        return control
        
    def on_train_end(self, args, state, control, **kwargs):
        if self.rows:
            try:
                import csv
                # Ensure 'epoch' and 'step' are first in the header
                all_keys = {k for r in self.rows for k in r.keys()}
                
                # Define preferred order
                keys_ordered = ['epoch', 'step', 'training_loss', 'validation_loss']
                
                # Append all other keys, sorted, ensuring no duplicates
                remaining_keys = sorted(list(all_keys - set(keys_ordered)))
                keys = keys_ordered + remaining_keys
                
                with open(self.save_csv_path, 'w', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=keys)
                    writer.writeheader()
                    for r in self.rows:
                        # Ensure all rows have all keys for CSV
                        writer.writerow({k: r.get(k, '') for k in keys}) 
                print(f"Saved metrics log to {self.save_csv_path}")
            except Exception as e:
                print(f"Could not save metrics CSV: {e}")

metrics_callback = MetricsTableCallback()

# ----------- SAFE compute_metrics for T5 with decode patch -----------
def compute_metrics(eval_pred):
    import numpy as np
    predictions, labels = eval_pred

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    preds = np.asarray(predictions, dtype=np.int64)
    labs = np.asarray(labels, dtype=np.int64)

    invalid_pred_mask = (preds < 0) | (preds >= tokenizer.vocab_size)
    if invalid_pred_mask.any():
        preds[invalid_pred_mask] = tokenizer.pad_token_id

    labs = np.where(labs == -100, tokenizer.pad_token_id, labs)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labs, skip_special_tokens=True)
    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]
    gen_lens = [len(tokenizer.encode(p)) for p in decoded_preds]
    avg_len = sum(gen_lens)/len(gen_lens) if gen_lens else 0

    rouge_res = metric_rouge(decoded_preds, decoded_labels)
    bleu_res = metric_bleu(decoded_preds, decoded_labels)
    meteor_res = metric_meteor(decoded_preds, decoded_labels)
    bert_res = metric_bertscore(decoded_preds, decoded_labels)
    metrics = {}
    metrics.update({k: float(v) for k,v in rouge_res.items()})
    metrics.update({k: float(v) for k,v in bleu_res.items()})
    metrics.update({k: float(v) for k,v in meteor_res.items()})
    metrics.update({k: float(v) for k,v in bert_res.items()})
    metrics['gen_len'] = avg_len
    
    # Add prefix 'eval_' for all custom metrics to distinguish from training loss
    # The Trainer automatically prefixes its loss metrics.
    return {f"eval_{k}": v for k, v in metrics.items()}


# --- UPDATED TRAINING ARGUMENTS FOR W&B ---
# Define W&B specific settings
WANDB_PROJECT_NAME = "nl2sparql"
WANDB_RUN_NAME = f"T5-base-ft-{time.strftime('%Y%m%d-%H%M%S')}"

training_args = Seq2SeqTrainingArguments(
    output_dir='./trained/checkpoint-4',
    evaluation_strategy='epoch',
    logging_strategy='steps',
    save_strategy='epoch',
    save_total_limit=3,
    logging_steps=250,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=3e-4,
    warmup_steps=200,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    generation_max_length=512,
    generation_num_beams=1,
    # --- W&B INTEGRATION START ---
    report_to=["wandb"] if WANDB_AVAILABLE else ["none"], # Tell the Trainer to use W&B
    run_name=WANDB_RUN_NAME, # Name for the specific run on the dashboard
    # Optional: set project name. If not set, it uses the WANDB_PROJECT environment variable.
    # project_name=WANDB_PROJECT_NAME, 
    # --- W&B INTEGRATION END ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_rougeL', # Use 'eval_' prefix for the metric calculated in compute_metrics
    greater_is_better=True,
    seed=SEED,
    dataloader_num_workers=2 if not sys.platform.startswith('win') else 0,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3), metrics_callback]
)

print("\n" + "="*60)
print("TRAINING CONFIGURATION")
print("="*60)
print(f"Training on: {device}")
print(f"Mixed precision (FP16): {training_args.fp16}")
print(f"Batch size per device: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Training samples: {len(tokenized['train'])}")
print(f"Validation samples: {len(tokenized['validation'])}")
print(f"W&B Logging: {'Enabled' if WANDB_AVAILABLE else 'Disabled (install wandb)'}")
if WANDB_AVAILABLE:
    print(f"W&B Run Name: {WANDB_RUN_NAME}")
print("="*60 + "\n")

# Initialize W&B manually if you want to set the project name or other config outside of Trainer args
# Note: The Trainer handles wandb.init() automatically when report_to="wandb" is set,
# but if you need specific config, you can call it before trainer.train()

trainer.train()

# Optional: end the W&B run manually when training is done
if WANDB_AVAILABLE:
    wandb.finish()


2025-10-04 12:09:55.512530: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-04 12:09:55.553414: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-04 12:09:56.603285: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


W&B imported successfully.
Loaded extended examples: 32573 | schema examples: 4423
Total normalized records: 36996
Prepared training pairs: 36996 (questions with SPARQL)
Train size: 35146 | Validation size: 1850
Saved merged_combined_corpus.json
Continuing fine-tune from: ./trained/checkpoint-2
Loaded tokenizer from ./trained/checkpoint-2
Tokenizer vocabulary size: 32102
Successfully loaded model from ./trained/checkpoint-2
Model embedding size: 32102
✓ Vocabulary sizes match: 32102

DEVICE CONFIGURATION
Using device: cuda
GPU Device: NVIDIA RTX A6000
GPU Memory Available: 47.40 GB
CUDA Version: 12.1

Model moved to cuda
Using direct fallback implementation for 'rouge' metric
Using direct fallback implementation for 'bleu' metric
Using direct fallback implementation for 'meteor' metric
Failed to load metric 'bertscore' from evaluate: Couldn't find a module script at /tf/notebooks/Transformers/T5_base/bertscore/bertscore.py. Module 'bertscore' doesn't exist on the Hugging Face Hub eithe

[0m

Failed to load metric 'bertscore' after install attempt: Couldn't find a module script at /tf/notebooks/Transformers/T5_base/bertscore/bertscore.py. Module 'bertscore' doesn't exist on the Hugging Face Hub either.
Using fallback implementation for 'bertscore'
Metric loaders ready. Fallback usage:
{'rouge_fallback': True, 'bleu_fallback': True, 'meteor_fallback': True, 'bertscore_fallback': True}
Using tokenizer vocabulary size: 32102


Map:   0%|          | 0/35146 [00:00<?, ? examples/s]

Map:   0%|          | 0/1850 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



TRAINING CONFIGURATION
Training on: cuda
Mixed precision (FP16): True
Batch size per device: 4
Gradient accumulation steps: 4
Effective batch size: 16
Number of epochs: 5
Learning rate: 0.0003
Training samples: 35146
Validation samples: 1850
W&B Logging: Enabled
W&B Run Name: T5-base-ft-20251004-121019



[34m[1mwandb[0m: Currently logged in as: [33msuhasdevmane[0m ([33msuhasdevmane-cardiff-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu,Meteor,Bertscore Precision,Bertscore Recall,Bertscore F1,Gen Len
0,0.0005,0.001357,0.999676,0.999459,0.999632,87.185665,0.831378,0.986361,0.973922,0.980094,75.247027
1,0.0008,0.001322,0.999676,0.999459,0.999632,87.185665,0.831378,0.986361,0.973922,0.980094,75.247027
2,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## try reduced learning rate

In [3]:
import os, json, random, math, gc, sys, time, statistics, traceback
from pathlib import Path
import torch
from torch import nn
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq, EarlyStoppingCallback, TrainerCallback
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import evaluate

# --- NEW: Import wandb for logging ---
try:
    import wandb
    WANDB_AVAILABLE = True
    print("W&B imported successfully.")
except ImportError:
    WANDB_AVAILABLE = False
    print("WARNING: wandb not installed. Install with 'pip install wandb' to enable online logging.")
# --------------------------------------


SEED = 123
random.seed(SEED)
try:
    import numpy as np
    np.random.seed(SEED)
except ImportError:
    pass
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

EXTENDED_PATH = Path('./training/raw_merged_extended_datasets.json')
SCHEMA_PATH = Path('./training/raw_merged_schema_datasets.json')
assert EXTENDED_PATH.exists(), f"Missing {EXTENDED_PATH}"
assert SCHEMA_PATH.exists(), f"Missing {SCHEMA_PATH}"

with EXTENDED_PATH.open('r', encoding='utf-8') as f:
    extended_data = json.load(f)
with SCHEMA_PATH.open('r', encoding='utf-8') as f:
    schema_data = json.load(f)

print(f"Loaded extended examples: {len(extended_data)} | schema examples: {len(schema_data)}")

def normalize(entry):
    q = entry.get('question','').strip()
    sparql = entry.get('sparql','').strip()
    entities = entry.get('entities') or entry.get('entity') or []
    if isinstance(entities, str):
        entities_list = [e.strip() for e in entities.split('\n') if e.strip()]
    else:
        entities_list = entities
    entity_block = '\n'.join(entities_list) if entities_list else ''
    return {
        'question': q,
        'entities_list': entities_list,
        'entity_block': entity_block,
        'sparql': sparql
    }

normalized = [normalize(e) for e in extended_data] + [normalize(e) for e in schema_data]
print(f"Total normalized records: {len(normalized)}")

# --- UPDATED DATA PREPARATION TO SAVE ALL FIELDS ---
inputs, targets = [], []
corpus_records = [] # New list to hold records for the JSON dump
for rec in normalized:
    if rec['question'] and rec['sparql']:
        ent_part = f"\nentity: {rec['entity_block']}" if rec['entity_block'] else ''
        
        input_text = f"task: generate_sparql\ninput: {rec['question']}{ent_part}"
        target_text = rec['sparql']
        
        # Add to training lists
        inputs.append(input_text)
        targets.append(target_text)

        # Add to corpus list for saving, including separate fields
        corpus_records.append({
            'input_text': input_text,
            'target_text': target_text,
            'question': rec['question'],
            'entities_list': rec['entities_list'],
            'entity_block': rec['entity_block'],
        })
# ---------------------------------------------------

print(f"Prepared training pairs: {len(inputs)} (questions with SPARQL)")

train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.05, random_state=SEED
)
print(f"Train size: {len(train_inputs)} | Validation size: {len(val_inputs)}")

raw_datasets = DatasetDict({
    'train': Dataset.from_dict({'input_text': train_inputs, 'target_text': train_targets}),
    'validation': Dataset.from_dict({'input_text': val_inputs, 'target_text': val_targets}),
})

# --- UPDATED JSON DUMPING ---
with open('merged_combined_corpus.json', 'w', encoding='utf-8') as f:
    json.dump(corpus_records, f, ensure_ascii=False, indent=2)
print('Saved merged_combined_corpus.json with input text, target SPARQL, question, and entities.')
# ----------------------------

POSSIBLE_PREV = [
    './trained/checkpoint-2',
]
model_source = None
for path in POSSIBLE_PREV:
    if Path(path).exists():
        model_source = path
        break
if model_source is None:
    model_source = 't5-base'
print(f"Continuing fine-tune from: {model_source}")

try:
    tokenizer = T5Tokenizer.from_pretrained(model_source)
    print(f"Loaded tokenizer from {model_source}")
    print(f"Tokenizer vocabulary size: {len(tokenizer)}")
except Exception as e:
    print(f"Tokenizer load failed from {model_source}, falling back to t5-base: {e}")
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    print(f"Tokenizer vocabulary size: {len(tokenizer)}")

try:
    model = T5ForConditionalGeneration.from_pretrained(model_source, device_map=None)
    print(f"Successfully loaded model from {model_source}")
    print(f"Model embedding size: {model.get_input_embeddings().weight.shape[0]}")
    if len(tokenizer) != model.get_input_embeddings().weight.shape[0]:
        print(f"WARNING: Vocabulary size mismatch!")
        print(f"  Tokenizer vocab size: {len(tokenizer)}")
        print(f"  Model embedding size: {model.get_input_embeddings().weight.shape[0]}")
        print("  This will cause IndexError during generation!")
        sys.exit(1)
    else:
        print(f"✓ Vocabulary sizes match: {len(tokenizer)}")
except Exception as e:
    print(f"Model load failed from {model_source}, falling back to t5-base: {e}")
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    print(f"Model embedding size: {model.get_input_embeddings().weight.shape[0]}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n{'='*60}\nDEVICE CONFIGURATION\n{'='*60}")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"{'='*60}\n")
else:
    print("WARNING: No GPU detected! Training will be very slow on CPU.")
    print("Make sure you have:")
    print("  1. A CUDA-capable GPU")
    print("  2. CUDA toolkit installed")
    print("  3. PyTorch with CUDA support:")
    print("      pip uninstall torch torchvision torchaudio -y")
    print("      pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121")
    print(f"{'='*60}\n")
    response = input("Continue with CPU training? (yes/no): ")
    if response.lower() != 'yes':
        print("Exiting. Please install CUDA support and try again.")
        sys.exit(1)

model.to(device)
print(f"Model moved to {device}")

# ----- T5 generation settings: prevents invalid pad tokens
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.pad_token_id
if hasattr(model, "generation_config"):
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    model.generation_config.eos_token_id = tokenizer.eos_token_id

SPECIAL_TOKENS = []
if SPECIAL_TOKENS:
    added = tokenizer.add_tokens([t for t in SPECIAL_TOKENS if t not in tokenizer.get_vocab()])
    if added:
        model.resize_token_embeddings(len(tokenizer))
        print(f"Added {added} special tokens")

def safe_load_metric(name, pip_pkg=None, alt=None):
    if name in ['rouge', 'bleu', 'meteor'] and alt:
        print(f"Using direct fallback implementation for '{name}' metric")
        return alt, True
    try:
        metric = evaluate.load(name)
        def _call(preds, refs):
            return metric.compute(predictions=preds, references=refs)
        return _call, False
    except Exception as e:
        print(f"Failed to load metric '{name}' from evaluate: {e}")
        if pip_pkg:
            try:
                import subprocess
                subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', pip_pkg], check=False)
                metric = evaluate.load(name)
                def _call(preds, refs):
                    return metric.compute(predictions=preds, references=refs)
                return _call, False
            except Exception as e2:
                print(f"Failed to load metric '{name}' after install attempt: {e2}")
        if alt:
            print(f"Using fallback implementation for '{name}'")
            return alt, True
        else:
            def _noop(preds, refs):
                return {}
            print(f"No fallback available for metric '{name}'. Returning empty dict.")
            return _noop, True

def rouge_fallback(preds, refs):
    try:
        from rouge_score import rouge_scorer
    except ImportError:
        import subprocess
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', 'rouge-score'], check=True)
        from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
    r1, r2, rl = [], [], []
    for p, r in zip(preds, refs):
        scores = scorer.score(r, p)
        r1.append(scores['rouge1'].fmeasure)
        r2.append(scores['rouge2'].fmeasure)
        rl.append(scores['rougeL'].fmeasure)
    return {
        'rouge1': sum(r1)/len(r1) if r1 else 0.0,
        'rouge2': sum(r2)/len(r2) if r2 else 0.0,
        'rougeL': sum(rl)/len(rl) if rl else 0.0,
    }

def bleu_fallback(preds, refs):
    try:
        import sacrebleu
    except ImportError:
        import subprocess
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', 'sacrebleu'], check=True)
        import sacrebleu
    bleu = sacrebleu.corpus_bleu(preds, [refs])
    return {'bleu': bleu.score}

def meteor_fallback(preds, refs):
    try:
        import nltk
        from nltk.translate.meteor_score import meteor_score
    except ImportError:
        import subprocess
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', 'nltk'], check=True)
        import nltk
        from nltk.translate.meteor_score import meteor_score
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet', quiet=True)
    # Now split tokens for each sentence
    scores = [meteor_score([r.split()], p.split()) for p, r in zip(preds, refs)]
    return {'meteor': sum(scores)/len(scores) if scores else 0.0}

def bertscore_fallback(preds, refs):
    try:
        from bert_score import score as bert_score
    except ImportError:
        import subprocess
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--quiet', 'bert-score'], check=False)
        from bert_score import score as bert_score
    P, R, F = bert_score(preds, refs, lang='en', verbose=False)
    return {
        'bertscore_precision': float(P.mean()),
        'bertscore_recall': float(R.mean()),
        'bertscore_f1': float(F.mean())
    }

metric_rouge, rouge_fallback_used = safe_load_metric('rouge', pip_pkg='rouge-score', alt=rouge_fallback)
metric_bleu, bleu_fallback_used = safe_load_metric('bleu', pip_pkg='sacrebleu', alt=bleu_fallback)
metric_meteor, meteor_fallback_used = safe_load_metric('meteor', pip_pkg='nltk', alt=meteor_fallback)
metric_bertscore, bertscore_fallback_used = safe_load_metric('bertscore', pip_pkg='bert-score', alt=bertscore_fallback)

print("Metric loaders ready. Fallback usage:")
print({
    'rouge_fallback': rouge_fallback_used,
    'bleu_fallback': bleu_fallback_used,
    'meteor_fallback': meteor_fallback_used,
    'bertscore_fallback': bertscore_fallback_used,
})

max_source_len = 512
max_target_len = 256

print(f"Using tokenizer vocabulary size: {len(tokenizer)}")
label_pad_token_id = -100

def preprocess(batch):
    model_inputs = tokenizer(
        batch['input_text'],
        max_length=max_source_len,
        truncation=True,
        padding='max_length'
    )
    labels = tokenizer(
        text_target=batch['target_text'],
        max_length=max_target_len,
        truncation=True,
        padding='max_length'
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized = raw_datasets.map(preprocess, batched=True, remove_columns=['input_text','target_text'])

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=label_pad_token_id)

class MetricsTableCallback(TrainerCallback):
    """Custom callback to log metrics locally to a CSV file."""
    def __init__(self, save_csv_path='epoch_metrics_log.csv'):
        self.save_csv_path = save_csv_path
        self.rows = []
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        # Hugging Face Trainer logs 'loss' and 'eval_loss'.
        # Rename 'eval_loss' to 'validation_loss' for consistency with plotting script.
        row = {
            'epoch': state.epoch, # Add epoch for better CSV plotting
            'step': state.global_step,
            'training_loss': metrics.get('loss', 0.0),
            'validation_loss': metrics.get('eval_loss', 0.0)
        }
        
        # Add all computed metrics
        for k,v in metrics.items():
            if isinstance(v, (int, float)):
                 # Only keep metrics not already captured and not starting with 'eval_'
                if not k.startswith('eval_') and k not in ['loss']:
                    row[k] = v
                # Capture all eval metrics
                elif k.startswith('eval_') and k != 'eval_loss':
                    row[k.replace('eval_', '')] = v
                    
        self.rows.append(row)
        return control
        
    def on_train_end(self, args, state, control, **kwargs):
        if self.rows:
            try:
                import csv
                # Ensure 'epoch' and 'step' are first in the header
                all_keys = {k for r in self.rows for k in r.keys()}
                
                # Define preferred order
                keys_ordered = ['epoch', 'step', 'training_loss', 'validation_loss']
                
                # Append all other keys, sorted, ensuring no duplicates
                remaining_keys = sorted(list(all_keys - set(keys_ordered)))
                keys = keys_ordered + remaining_keys
                
                with open(self.save_csv_path, 'w', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=keys)
                    writer.writeheader()
                    for r in self.rows:
                        # Ensure all rows have all keys for CSV
                        writer.writerow({k: r.get(k, '') for k in keys}) 
                print(f"Saved metrics log to {self.save_csv_path}")
            except Exception as e:
                print(f"Could not save metrics CSV: {e}")

metrics_callback = MetricsTableCallback()

# ----------- SAFE compute_metrics for T5 with decode patch -----------
def compute_metrics(eval_pred):
    import numpy as np
    predictions, labels = eval_pred

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    preds = np.asarray(predictions, dtype=np.int64)
    labs = np.asarray(labels, dtype=np.int64)

    invalid_pred_mask = (preds < 0) | (preds >= tokenizer.vocab_size)
    if invalid_pred_mask.any():
        preds[invalid_pred_mask] = tokenizer.pad_token_id

    labs = np.where(labs == -100, tokenizer.pad_token_id, labs)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labs, skip_special_tokens=True)
    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]
    gen_lens = [len(tokenizer.encode(p)) for p in decoded_preds]
    avg_len = sum(gen_lens)/len(gen_lens) if gen_lens else 0

    rouge_res = metric_rouge(decoded_preds, decoded_labels)
    bleu_res = metric_bleu(decoded_preds, decoded_labels)
    meteor_res = metric_meteor(decoded_preds, decoded_labels)
    bert_res = metric_bertscore(decoded_preds, decoded_labels)
    metrics = {}
    metrics.update({k: float(v) for k,v in rouge_res.items()})
    metrics.update({k: float(v) for k,v in bleu_res.items()})
    metrics.update({k: float(v) for k,v in meteor_res.items()})
    metrics.update({k: float(v) for k,v in bert_res.items()})
    metrics['gen_len'] = avg_len
    
    # Add prefix 'eval_' for all custom metrics to distinguish from training loss
    # The Trainer automatically prefixes its loss metrics.
    return {f"eval_{k}": v for k, v in metrics.items()}


# --- UPDATED TRAINING ARGUMENTS FOR W&B ---
# Define W&B specific settings
WANDB_PROJECT_NAME = "SPARQL-CodeGen-T5"
WANDB_RUN_NAME = f"T5-base-ft-{time.strftime('%Y%m%d-%H%M%S')}"

training_args = Seq2SeqTrainingArguments(
    output_dir='./trained/checkpoint-5',
    evaluation_strategy='epoch',
    logging_strategy='steps',
    save_strategy='epoch',
    save_total_limit=3,
    logging_steps=250,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    # --- REDUCED LEARNING RATE FOR STABILITY ---
    learning_rate=1e-4, 
    # ------------------------------------------
    warmup_steps=200,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    generation_max_length=512,
    generation_num_beams=1,
    # --- W&B INTEGRATION START ---
    report_to=["wandb"] if WANDB_AVAILABLE else ["none"], # Tell the Trainer to use W&B
    run_name=WANDB_RUN_NAME, # Name for the specific run on the dashboard
    # Optional: set project name. If not set, it uses the WANDB_PROJECT environment variable.
    # project_name=WANDB_PROJECT_NAME, 
    # --- W&B INTEGRATION END ---
    load_best_model_at_end=True,
    metric_for_best_model='eval_rougeL', # Use 'eval_' prefix for the metric calculated in compute_metrics
    greater_is_better=True,
    seed=SEED,
    dataloader_num_workers=2 if not sys.platform.startswith('win') else 0,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3), metrics_callback]
)

print("\n" + "="*60)
print("TRAINING CONFIGURATION")
print("="*60)
print(f"Training on: {device}")
print(f"Mixed precision (FP16): {training_args.fp16}")
print(f"Batch size per device: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Training samples: {len(tokenized['train'])}")
print(f"Validation samples: {len(tokenized['validation'])}")
print(f"W&B Logging: {'Enabled' if WANDB_AVAILABLE else 'Disabled (install wandb)'}")
if WANDB_AVAILABLE:
    print(f"W&B Run Name: {WANDB_RUN_NAME}")
print("="*60 + "\n")

# Initialize W&B manually if you want to set the project name or other config outside of Trainer args
# Note: The Trainer handles wandb.init() automatically when report_to="wandb" is set,
# but if you need specific config, you can call it before trainer.train()

trainer.train()

# Optional: end the W&B run manually when training is done
if WANDB_AVAILABLE:
    wandb.finish()


2025-10-04 14:52:40.262357: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-04 14:52:40.303217: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-04 14:52:41.580165: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


W&B imported successfully.
Loaded extended examples: 32573 | schema examples: 4423
Total normalized records: 36996
Prepared training pairs: 36996 (questions with SPARQL)
Train size: 35146 | Validation size: 1850
Saved merged_combined_corpus.json with input text, target SPARQL, question, and entities.
Continuing fine-tune from: ./trained/checkpoint-2
Loaded tokenizer from ./trained/checkpoint-2
Tokenizer vocabulary size: 32102
Successfully loaded model from ./trained/checkpoint-2
Model embedding size: 32102
✓ Vocabulary sizes match: 32102

DEVICE CONFIGURATION
Using device: cuda
GPU Device: NVIDIA RTX A6000
GPU Memory Available: 47.40 GB
CUDA Version: 12.1

Model moved to cuda
Using direct fallback implementation for 'rouge' metric
Using direct fallback implementation for 'bleu' metric
Using direct fallback implementation for 'meteor' metric
Failed to load metric 'bertscore' from evaluate: Couldn't find a module script at /tf/notebooks/Transformers/T5_base/bertscore/bertscore.py. Module

[0m

Failed to load metric 'bertscore' after install attempt: Couldn't find a module script at /tf/notebooks/Transformers/T5_base/bertscore/bertscore.py. Module 'bertscore' doesn't exist on the Hugging Face Hub either.
Using fallback implementation for 'bertscore'
Metric loaders ready. Fallback usage:
{'rouge_fallback': True, 'bleu_fallback': True, 'meteor_fallback': True, 'bertscore_fallback': True}
Using tokenizer vocabulary size: 32102


Map:   0%|          | 0/35146 [00:00<?, ? examples/s]

Map:   0%|          | 0/1850 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



TRAINING CONFIGURATION
Training on: cuda
Mixed precision (FP16): True
Batch size per device: 4
Gradient accumulation steps: 4
Effective batch size: 16
Number of epochs: 5
Learning rate: 0.0001
Training samples: 35146
Validation samples: 1850
W&B Logging: Enabled
W&B Run Name: T5-base-ft-20251004-145305



[34m[1mwandb[0m: Currently logged in as: [33msuhasdevmane[0m ([33msuhasdevmane-cardiff-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu,Meteor,Bertscore Precision,Bertscore Recall,Bertscore F1,Gen Len
0,0.0001,1e-05,0.999928,0.999884,0.999928,87.203594,0.831681,0.98641,0.973968,0.980142,75.257297
1,0.0004,1.1e-05,0.999961,0.999958,0.999961,87.205611,0.831723,0.986428,0.973974,0.980153,75.251892
2,0.0004,1.1e-05,0.999961,0.999958,0.999961,87.205611,0.831723,0.986428,0.973974,0.980153,75.251892
4,0.0004,1.1e-05,0.999961,0.999958,0.999961,87.205611,0.831723,0.986428,0.973974,0.980153,75.251892


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Saved metrics log to epoch_metrics_log.csv


0,1
eval/bertscore_f1,▁████
eval/bertscore_precision,▁████
eval/bertscore_recall,▁████
eval/bleu,▁████
eval/gen_len,█▁▁▁▁
eval/loss,▁▇███
eval/meteor,▁████
eval/rouge1,▁████
eval/rouge2,▁████
eval/rougeL,▁████

0,1
eval/bertscore_f1,0.98015
eval/bertscore_precision,0.98643
eval/bertscore_recall,0.97397
eval/bleu,87.20561
eval/gen_len,75.25189
eval/loss,1e-05
eval/meteor,0.83172
eval/rouge1,0.99996
eval/rouge2,0.99996
eval/rougeL,0.99996


# without wandb run

In [3]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
[0m

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import savgol_filter
import os
import math # Added for subplot calculation

# Publication settings
sns.set(style="whitegrid")
custom_palette = sns.color_palette("Set2", 8)
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['figure.figsize'] = (9, 6)
plt.rcParams['axes.facecolor'] = 'none'
plt.rcParams['figure.facecolor'] = 'none'
plt.rcParams['font.size'] = 16
plt.rcParams['axes.labelsize'] = 19
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['legend.fontsize'] = 17
plt.rcParams['xtick.labelsize'] = 15
plt.rcParams['ytick.labelsize'] = 15

# Load CSV (Assuming this file exists in the execution environment)
try:
    df = pd.read_csv('epoch_metrics_log.csv')
    df = df.fillna(0)
except FileNotFoundError:
    print("Error: 'epoch_metrics_log.csv' not found. Creating a dummy DataFrame.")
    # Create dummy data for demonstration if file is missing
    data = {
        'step': np.arange(1, 11) * 100,
        'epoch': np.arange(1, 11),
        'training_loss': np.linspace(2.0, 0.5, 10) + np.random.rand(10) * 0.1,
        'validation_loss': np.linspace(2.5, 0.8, 10) + np.random.rand(10) * 0.1,
        'ROUGE-1': np.linspace(0.2, 0.45, 10) + np.random.rand(10) * 0.05,
        'ROUGE-2': np.linspace(0.05, 0.2, 10) + np.random.rand(10) * 0.03,
        'ROUGE-L': np.linspace(0.15, 0.4, 10) + np.random.rand(10) * 0.05,
        'BLEU': np.linspace(0.1, 0.35, 10) + np.random.rand(10) * 0.04,
    }
    df = pd.DataFrame(data)
    df.iloc[7, df.columns.get_loc('validation_loss')] = 0.75 # Lower min val loss
    df.iloc[8, df.columns.get_loc('ROUGE-L')] = 0.48 # Higher max metric
    df = df.round(4)


metrics = [col for col in df.columns if col not in ['step', 'epoch', 'training_loss', 'validation_loss']]
score_metrics = [m for m in metrics if any(x in m.lower() for x in ['rouge', 'bleu', 'meteor', 'bert'])]

# --- Loss Plot Function ---
def plot_loss(df):
    """Plots training and validation loss curves, annotating minimums."""
    fig, ax = plt.subplots(figsize=(8, 6))
    
    if 'epoch' in df.columns and len(df['epoch'].unique()) > 1:
        x = df['epoch']
        x_label = 'Epoch'
    elif 'step' in df.columns and len(df['step'].unique()) > 1:
        x = df['step']
        x_label = 'Step'
    else:
        x = np.arange(len(df))
        x_label = 'Index'
        
    # Training Loss Plotting
    if 'training_loss' in df and df['training_loss'].count() > 1:
        ax.plot(x, df['training_loss'], color=custom_palette[0], marker='o', linestyle='-', label='Training Loss', markersize=6, alpha=0.7)
        min_tr = df['training_loss'].min()
        min_tr_idx = df['training_loss'].idxmin()
        ax.annotate(f"Min {min_tr:.3f}", (x.iloc[min_tr_idx], min_tr),
                    textcoords="offset points", xytext=(0,-20), ha='center', fontsize=15, color=custom_palette[0],
                    bbox=dict(boxstyle="round,pad=0.3", fc="white", alpha=0.6, ec=custom_palette[0]))
        
    # Validation Loss Plotting
    if 'validation_loss' in df and df['validation_loss'].count() > 1:
        ax.plot(x, df['validation_loss'], color=custom_palette[1], marker='s', linestyle='-', label='Validation Loss', markersize=6, alpha=0.7)
        min_val = df['validation_loss'].min()
        min_val_idx = df['validation_loss'].idxmin()
        ax.annotate(f"Min {min_val:.3f}", (x.iloc[min_val_idx], min_val), 
                    textcoords="offset points", xytext=(0,20), ha='center', fontsize=15, color=custom_palette[1],
                    bbox=dict(boxstyle="round,pad=0.3", fc="white", alpha=0.6, ec=custom_palette[1]))

    ax.set_xlabel(x_label)
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Loss Curves')
    ax.legend(loc='best', frameon=True, shadow=True)
    ax.grid(visible=True, which='major', linestyle='-', alpha=0.6)
    ax.grid(visible=True, which='minor', linestyle='--', alpha=0.3)
    ax.minorticks_on()
    
    plt.tight_layout()
    plt.savefig('loss_curves.png', transparent=True)
    plt.savefig('loss_curves.pdf', transparent=True)
    plt.close()

# --- Metrics Plot Function (Improved Grouping) ---
def plot_metrics(df, metrics):
    """Plots score metrics over epochs, grouping related metrics if necessary."""
    
    # Simple strategy: Group ROUGE metrics, others go to a second group
    rouge_metrics = sorted([m for m in metrics if 'rouge' in m.lower()])
    other_metrics = sorted([m for m in metrics if m not in rouge_metrics])

    metric_groups = []
    if rouge_metrics:
        metric_groups.append(('ROUGE Scores by Epoch', rouge_metrics))
    if other_metrics:
        metric_groups.append(('Other Scores by Epoch', other_metrics))

    if not metric_groups:
        print("No score metrics found to plot.")
        return

    x = df['epoch'] if 'epoch' in df.columns else np.arange(len(df))
    x_label = 'Epoch' if 'epoch' in df.columns else 'Index'

    for plot_title, current_metrics in metric_groups:
        if not current_metrics: continue

        fig, ax = plt.subplots(figsize=(9, 6))
        
        for i, metric in enumerate(current_metrics):
            if metric in df:
                y = df[metric]
                # Optionally smooth curves for >4 epochs/steps
                y_smooth = y
                if len(y) > 4:
                    try:
                        # Ensures window is odd and less than or equal to the length of data
                        window = min(7, len(y) // 2 * 2 + 1 if len(y) % 2 == 0 else len(y))
                        if window < 3: window = 3 # Minimum window length
                        y_smooth = savgol_filter(y, window_length=window, polyorder=2)
                    except Exception as e:
                        # print(f"Smoothing failed for {metric}: {e}")
                        y_smooth = y
                        
                color = custom_palette[i % len(custom_palette)]
                ax.plot(x, y_smooth, label=metric, color=color, linewidth=2, linestyle='-')
                
                # Plot original data points (optional, for visibility)
                # ax.plot(x, y, color=color, marker='o', linestyle='', alpha=0.4, markersize=5)

                # Annotate best score on the original data for accuracy
                best_val = y.max()
                best_idx = y.idxmax()
                
                # Check for multiple max values and pick the latest one for annotation clarity
                latest_best_idx = y[y == best_val].index[-1]
                
                ax.annotate(f"Max {best_val:.3f}", (x.iloc[latest_best_idx], y.iloc[latest_best_idx]), 
                            textcoords="offset points", xytext=(0,10), ha='center', fontsize=14, color='black',
                            bbox=dict(boxstyle="round,pad=0.2", fc=color, alpha=0.3, ec=color))
        
        ax.set_xlabel(x_label)
        ax.set_ylabel('Score')
        ax.set_title(plot_title)
        ax.legend(loc='best', frameon=True, shadow=True)
        ax.grid(visible=True, which='major', linestyle='-', alpha=0.6)
        ax.grid(visible=True, which='minor', linestyle='--', alpha=0.3)
        ax.minorticks_on()
        
        plt.tight_layout()
        filename_base = plot_title.split(' ')[0].lower() # e.g., 'rouge' or 'other'
        plt.savefig(f'{filename_base}_metrics_by_epoch.png', transparent=True)
        plt.savefig(f'{filename_base}_metrics_by_epoch.pdf', transparent=True)
        plt.close()

# --- Best Scores Bar Plot Function ---
def plot_best_scores(df, metrics):
    """Plots best metric scores from the epoch with minimum validation loss."""
    # Find the row corresponding to the minimum validation loss
    best_idx = df['validation_loss'].idxmin() if 'validation_loss' in df.columns and df['validation_loss'].count() > 0 else df.index[-1]
    best_row = df.iloc[best_idx]
    
    scores = {metric: best_row[metric] for metric in metrics if metric in best_row and pd.notna(best_row[metric])}
    
    if not scores:
        print("No valid scores found for the best model to plot.")
        return

    fig, ax = plt.subplots(figsize=(8,6))
    
    # Sort scores alphabetically by metric name for consistent plotting
    sorted_scores = dict(sorted(scores.items()))
    
    bars = sns.barplot(x=list(sorted_scores.keys()), y=list(sorted_scores.values()), 
                       palette=custom_palette, ax=ax, saturation=0.7)
    
    for bar, value in zip(bars.patches, sorted_scores.values()):
        # Annotate values above the bars
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() * 1.01, 
                f'{value:.4f}', ha='center', va='bottom', fontsize=16, color='black')

    ax.set_ylabel('Score')
    ax.set_xlabel('Metric')
    ax.set_title(f'Performance Metrics at Epoch {best_row.get("epoch", best_idx)}')
    ax.set_ylim(0, max(scores.values()) * 1.1 + 0.05) # Better Y-axis scaling

    plt.tight_layout()
    plt.savefig('best_score_barplot.png', transparent=True)
    plt.savefig('best_score_barplot.pdf', transparent=True)
    plt.close()

# --- Summary Table Function ---
def plot_summary_table(df, metrics):
    """Creates a visually styled summary table of the best metric scores."""
    # Find the row corresponding to the minimum validation loss
    best_idx = df['validation_loss'].idxmin() if 'validation_loss' in df.columns and df['validation_loss'].count() > 0 else df.index[-1]
    best_row = df.iloc[best_idx]
    
    cell_text = []
    row_labels = []
    
    # Collect valid scores, sorted
    valid_metrics = sorted([m for m in metrics if m in best_row and pd.notna(best_row[m])])
    
    for metric in valid_metrics:
        score = best_row[metric]
        cell_text.append([f"{score:.4f}"])
        row_labels.append(metric)

    if not row_labels:
        print("No valid scores found for the summary table.")
        return

    # Calculate figure size dynamically
    fig_height = 1.5 + len(cell_text) * 0.4
    fig_width = 6
    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    
    fig.patch.set_visible(False)
    ax.axis('off')
    
    # Create table
    table = ax.table(cellText=cell_text, rowLabels=row_labels, colLabels=['Score'], 
                     loc='center', cellLoc='center', fontsize=17,
                     cellColours=[['#f5f5f5']]*len(cell_text),
                     colColours=[custom_palette[5]],
                     rowColours=[custom_palette[6]]*len(cell_text))
                     
    table.auto_set_font_size(False)
    table.set_fontsize(16)
    table.scale(1.2, 1.5) # Scale slightly for better visibility
    
    plt.title('Best Model Performance Metrics', fontsize=20, y=0.95)
    plt.tight_layout()
    plt.savefig('metrics_summary_table.png', transparent=True)
    plt.savefig('metrics_summary_table.pdf', transparent=True)
    plt.close()

# --- NEW: Combined Dashboard Plot Function ---
def create_dashboard(df, metrics):
    """
    Creates a multi-panel figure suitable for publication, combining
    Loss Curves and key Metric Scores.
    """
    if 'validation_loss' not in df.columns or df['validation_loss'].count() < 2 or not metrics:
        print("Insufficient data for dashboard (need validation loss and score metrics).")
        return

    # Determine which x-axis to use
    if 'epoch' in df.columns and len(df['epoch'].unique()) > 1:
        x = df['epoch']
        x_label = 'Epoch'
    elif 'step' in df.columns and len(df['step'].unique()) > 1:
        x = df['step']
        x_label = 'Step'
    else:
        x = np.arange(len(df))
        x_label = 'Index'

    # Setup the figure (1 row, 2 columns)
    fig, axes = plt.subplots(1, 2, figsize=(14, 6)) # A wider figure
    
    # --- Panel 1: Loss Curves ---
    ax1 = axes[0]
    
    # Training Loss
    if 'training_loss' in df and df['training_loss'].count() > 1:
        ax1.plot(x, df['training_loss'], color=custom_palette[0], linestyle='-', 
                 label='Training Loss', linewidth=2, alpha=0.7)
        min_tr = df['training_loss'].min()
        min_tr_idx = df['training_loss'].idxmin()
        ax1.plot(x.iloc[min_tr_idx], min_tr, 'o', color=custom_palette[0], markersize=8)
        
    # Validation Loss
    if 'validation_loss' in df and df['validation_loss'].count() > 1:
        ax1.plot(x, df['validation_loss'], color=custom_palette[1], linestyle='-', 
                 label='Validation Loss', linewidth=2, alpha=0.9)
        min_val = df['validation_loss'].min()
        min_val_idx = df['validation_loss'].idxmin()
        ax1.plot(x.iloc[min_val_idx], min_val, 's', color=custom_palette[1], markersize=8) # Square marker for min
        
    ax1.set_xlabel(x_label)
    ax1.set_ylabel('Loss')
    ax1.set_title('(a) Training and Validation Loss')
    ax1.legend(loc='best', frameon=True)
    ax1.grid(visible=True, which='major', linestyle='-', alpha=0.6)
    ax1.minorticks_on()


    # --- Panel 2: Metric Scores ---
    ax2 = axes[1]
    
    # Prioritize ROUGE-L and/or BLEU as key metrics if they exist, otherwise use the first two
    key_metrics = []
    if 'ROUGE-L' in metrics: key_metrics.append('ROUGE-L')
    if 'BLEU' in metrics and 'BLEU' not in key_metrics: key_metrics.append('BLEU')
    if len(key_metrics) < 2 and metrics:
        for m in metrics:
            if m not in key_metrics:
                key_metrics.append(m)
                if len(key_metrics) == 2: break
    
    current_metrics = key_metrics # Only plot 2-3 key metrics in the dashboard for clarity
    
    for i, metric in enumerate(current_metrics):
        if metric in df:
            y = df[metric]
            # Smooth the curve
            y_smooth = y
            if len(y) > 4:
                try:
                    window = min(7, len(y) // 2 * 2 + 1 if len(y) % 2 == 0 else len(y))
                    if window < 3: window = 3
                    y_smooth = savgol_filter(y, window_length=window, polyorder=2)
                except Exception:
                    y_smooth = y
            
            color = custom_palette[i % len(custom_palette) + 2] # Use different colors than loss
            ax2.plot(x, y_smooth, label=metric, color=color, linewidth=2, linestyle='-')
            
            # Annotate best score
            best_val = y.max()
            best_idx = y.idxmax()
            latest_best_idx = y[y == best_val].index[-1]
            ax2.plot(x.iloc[latest_best_idx], y.iloc[latest_best_idx], 'D', color=color, markersize=8) # Diamond marker for max
            
    ax2.set_xlabel(x_label)
    ax2.set_ylabel('Score')
    ax2.set_title('(b) Key Evaluation Scores')
    ax2.legend(loc='best', frameon=True)
    ax2.grid(visible=True, which='major', linestyle='-', alpha=0.6)
    ax2.minorticks_on()
    
    # Adjust overall layout
    plt.suptitle('Model Performance Over Training', fontsize=24, y=1.02)
    plt.tight_layout(rect=[0, 0, 1, 0.95]) # Make room for suptitle
    plt.savefig('performance_dashboard.png', transparent=True)
    plt.savefig('performance_dashboard.pdf', transparent=True)
    plt.close()

# --- Execution ---
plot_loss(df)
plot_metrics(df, score_metrics)
plot_best_scores(df, score_metrics)
plot_summary_table(df, score_metrics)
create_dashboard(df, score_metrics)

# print("Saved: loss_curves(.png/.pdf), [rouge/other]_metrics_by_epoch(.png/.pdf), best_score_barplot(.png/.pdf), metrics_summary_table(.png/.pdf), performance_dashboard(.png/.pdf)")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  bars = sns.barplot(x=list(sorted_scores.keys()), y=list(sorted_scores.values()),


### SMOKE TEST

In [5]:
# Quick smoke test on a couple of random samples after training (run after previous cell finishes)
import random, json, torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from pathlib import Path

model_dir = './trained/checkpoint-5/checkpoint-10980'
if not Path(model_dir).exists():
    print('Final model directory not found, please run the training cell first.')
else:
    tokenizer = T5Tokenizer.from_pretrained(model_dir)
    model = T5ForConditionalGeneration.from_pretrained(model_dir)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    with open('merged_combined_corpus.json','r',encoding='utf-8') as f:
        merged = json.load(f)

    samples = random.sample(merged, k=min(3, len(merged)))
    for i,s in enumerate(samples,1):
        inp = s['input_text']
        tgt = s['target_text']
        inputs = tokenizer(inp, return_tensors='pt', truncation=True, padding='max_length', max_length=512).to(device)
        out_ids = model.generate(inputs['input_ids'], max_length=256, num_beams=4)
        pred = tokenizer.decode(out_ids[0], skip_special_tokens=True)
        print(f'--- Sample {i} ---')
        print('INPUT:\n', inp[:400])
        print('TARGET:\n', tgt)
        print('PRED:\n', pred)
        print('-'*60)

--- Sample 1 ---
INPUT:
 task: generate_sparql
input: What failure trends have been observed in  CO  Sensor 5.01's data over the last month?
entity: bldg:CO_Level_Sensor_5.01
TARGET:
 SELECT ?timeseriesId ?storedAt WHERE { bldg:CO_Level_Sensor_5.01 ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId ; ref:storedAt ?storedAt . }
PRED:
 SELECT ?timeseriesId ?storedAt WHERE { bldg:CO_Level_Sensor_5.01 ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId ; ref:storedAt ?storedAt . }
------------------------------------------------------------
--- Sample 2 ---
INPUT:
 task: generate_sparql
input: What improvements can be made if Sensor 5.02's data deviates from our smart building standards?
entity: bldg:Air_Quality_Sensor_5.02
TARGET:
 SELECT ?timeseriesId ?storedAt WHERE { bldg:Air_Quality_Sensor_5.02 ref:hasExternalReference ?ref . ?ref a ref:TimeseriesReference ; ref:hasTimeseriesId ?timeseriesId

### Metrics Fallback Explanation
If `evaluate.load('rouge')` fails (e.g., offline environment, hub connectivity, or cache corruption), the previous cell now:

- Tries to `pip install` a required package (e.g. `rouge-score`, `sacrebleu`, `nltk`, `bert-score`).
- If that still fails, it uses a lightweight local fallback implementation for: ROUGE (rouge-score), BLEU (sacrebleu), METEOR (nltk), BERTScore (bert-score).
- Prints which metrics are using fallbacks.

Usage Notes:
- To force re-download from Hugging Face Hub later, ensure network is available and re-run the cell.
- If you want to skip a metric entirely, you can comment out its block in `compute_metrics`.
- Starting training: uncomment `trainer.train()` at the bottom of the previous cell.

Troubleshooting:
- If BERTScore is slow on CPU, you can remove it or set a smaller batch by modifying the fallback.
- For deterministic runs, set `seed` in `Seq2SeqTrainingArguments` and also `random.seed`, `numpy.random.seed`, `torch.manual_seed` before training.
