In [1]:
import os
import time
import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from codecarbon import EmissionsTracker
import evaluate
from typing import Dict, Tuple, List
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')
from peft import LoraConfig, get_peft_model, TaskType

Mounted at /content/drive


In [2]:
def loadsquaddata(dataset_fraction: float) -> Tuple[any, any]:
    """Load SQuAD 2.0 and keep a reproducible subset of the training split.

    The training split is shuffled with a fixed seed (42) and the first
    ``int(len(train) * dataset_fraction)`` examples are kept, so a smaller
    fraction is always a prefix of a larger one. The validation split is
    returned unchanged.

    Args:
        dataset_fraction: Share of the training split to keep; must satisfy
            ``0 < dataset_fraction <= 1``.

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple.

    Raises:
        ValueError: If ``dataset_fraction`` is outside ``(0, 1]`` (this also
            rejects NaN).
    """
    # Fail fast, before any download or disk access: a fraction above 1 would
    # otherwise only fail inside ``select`` and a fraction of 0 would silently
    # produce an empty training set.
    if not 0 < dataset_fraction <= 1:
        raise ValueError(
            f"dataset_fraction must be in the interval (0, 1], got {dataset_fraction!r}"
        )

    print(f"\n{'='*60}")
    print(f"Loading SQuAD 2.0 dataset (fraction: {dataset_fraction})...")
    print(f"{'='*60}")

    dataset = load_dataset("squad_v2")

    train_size = int(len(dataset["train"]) * dataset_fraction)
    train_dataset = dataset["train"].shuffle(seed=42).select(range(train_size))
    valid_dataset = dataset["validation"]

    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(valid_dataset)}")

    return train_dataset, valid_dataset

In [3]:
def preprocessFunction(examples: Dict, tokenizer, max_length: int = 384) -> Dict:
    """Tokenize a batch of SQuAD-style examples and attach answer-span labels.

    Each (question, context) pair is encoded as one sequence padded to
    ``max_length``; only the context is truncated. The character span of the
    first listed answer is converted to token indices using the tokenizer's
    offset mapping.

    Args:
        examples: Batch dict with ``"question"``, ``"context"`` and
            ``"answers"`` columns; each answer is a dict holding the parallel
            lists ``"text"`` and ``"answer_start"``.
        tokenizer: Callable tokenizer whose result supports
            ``pop("offset_mapping")`` and ``sequence_ids(i)``.
        max_length: Maximum (and padded) sequence length in tokens.

    Returns:
        The tokenizer output without ``offset_mapping`` and with
        ``start_positions`` / ``end_positions`` added. Examples that have no
        answer, or whose answer is not fully inside the (possibly truncated)
        context, are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Unanswerable question: point both labels at position 0.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Token positions that belong to the context (sequence id 1). Collecting
        # them up front avoids running off the end of ``sequence_ids`` when the
        # context is missing or extends to the last position.
        sequence_ids = inputs.sequence_ids(i)
        context_positions = [
            pos for pos, seq_id in enumerate(sequence_ids) if seq_id == 1
        ]
        if not context_positions:
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = context_positions[0]
        context_end = context_positions[-1]

        # The answer must lie *fully* inside the surviving context. Checking
        # only for "no overlap at all" would let an answer cut off by
        # truncation through, and its end label would then land on the token
        # after the context (a special token).
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after
            # the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [4]:
def tokenizeDatasets(train_dataset, valid_dataset, model_name: str) -> Tuple[any, any, any]:
    """Tokenize the training and validation splits with the model's tokenizer.

    Both splits are mapped in batches through ``preprocessFunction`` and
    their original columns are dropped, so only the tokenizer outputs and the
    span labels remain.

    Args:
        train_dataset: Raw training split.
        valid_dataset: Raw validation split.
        model_name: Checkpoint name handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        ``(tokenized_train, tokenized_valid, tokenizer)``.
    """
    print("\nTokenizing datasets...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _encode(batch):
        # Bind the tokenizer so ``map`` only has to supply the batch.
        return preprocessFunction(batch, tokenizer)

    def _tokenize_split(split):
        return split.map(
            _encode,
            batched=True,
            remove_columns=split.column_names,
        )

    tokenized_train = _tokenize_split(train_dataset)
    tokenized_valid = _tokenize_split(valid_dataset)

    print("Tokenization complete!")
    return tokenized_train, tokenized_valid, tokenizer


In [5]:
def computeMetrics(eval_pred) -> Dict[str, float]:
    """Compute position-level exact match and F1 from start/end logits.

    Both metrics are computed on token *indices*, not on decoded answer text:
    exact match requires the arg-max start and end to equal the labels, and F1
    is the overlap between the predicted index range ``[start, end]`` and the
    labelled one. They are therefore not the official SQuAD text-level scores.

    Args:
        eval_pred: ``(predictions, labels)`` where ``predictions`` is
            ``(start_logits, end_logits)`` and ``labels`` is indexable as
            ``labels[0]`` (start positions) and ``labels[1]`` (end positions).

    Returns:
        ``{"f1": float, "exact_match": float}`` as plain Python floats. Both
        are ``0.0`` when there are no examples.
    """
    predictions, labels = eval_pred
    start_logits, end_logits = predictions

    # Most likely start/end token per example.
    start_preds = start_logits.argmax(axis=-1)
    end_preds = end_logits.argmax(axis=-1)

    start_labels = labels[0]
    end_labels = labels[1]

    num_examples = len(start_preds)
    if num_examples == 0:
        # Nothing to score; avoid dividing by zero / averaging an empty array.
        return {"f1": 0.0, "exact_match": 0.0}

    # Exact match: both boundaries correct.
    exact_match = float(
        ((start_preds == start_labels) & (end_preds == end_labels)).mean()
    )

    # F1 over index ranges, using interval arithmetic instead of building a
    # set of indices per example (same counts, constant work per example).
    f1_total = 0.0
    for pred_start, pred_end, gold_start, gold_end in zip(
        start_preds, end_preds, start_labels, end_labels
    ):
        pred_start, pred_end = int(pred_start), int(pred_end)
        gold_start, gold_end = int(gold_start), int(gold_end)

        # An inverted range (end < start) is an empty span.
        pred_len = max(0, pred_end - pred_start + 1)
        gold_len = max(0, gold_end - gold_start + 1)

        if pred_len == 0 and gold_len == 0:
            f1_total += 1.0
            continue
        if pred_len == 0 or gold_len == 0:
            continue

        overlap = max(0, min(pred_end, gold_end) - max(pred_start, gold_start) + 1)
        if overlap == 0:
            continue

        precision = overlap / pred_len
        recall = overlap / gold_len
        f1_total += 2 * precision * recall / (precision + recall)

    return {"f1": f1_total / num_examples, "exact_match": exact_match}

In [6]:
def trainModel(
    model_name: str,
    tokenized_train,
    tokenized_valid,
    dataset_fraction: float,
    output_dir: str = "./results",
) -> Dict:
    """Fully fine-tune a question-answering model while tracking emissions.

    All model weights are trained for two epochs, with evaluation and
    checkpointing after each epoch. A CodeCarbon ``EmissionsTracker`` wraps
    the training call only; the final evaluation is not tracked.

    Args:
        model_name: Checkpoint passed to
            ``AutoModelForQuestionAnswering.from_pretrained``.
        tokenized_train: Tokenized training split (with span labels).
        tokenized_valid: Tokenized validation split (with span labels).
        dataset_fraction: Fraction of the training data in use; only used for
            naming and reporting.
        output_dir: Directory for checkpoints and CodeCarbon output.

    Returns:
        A result dict with run metadata, ``f1_score``, ``exact_match``,
        ``eval_loss`` and ``training_time_hours``. When the tracker reports a
        value, ``emissions_kg`` and ``duration_hours`` are filled in and the
        remaining emissions fields get placeholder values.
    """
    print(f"\n{'='*60}")
    print(f"Training {model_name} with {dataset_fraction*100}% of data")
    print(f"{'='*60}")

    # Load model
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"{output_dir}/{model_name.replace('/', '_')}_{dataset_fraction}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",
    )

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        data_collator=default_data_collator,
        compute_metrics=computeMetrics,
    )

    # Start carbon tracking
    tracker = EmissionsTracker(
        project_name=f"roberta_squad_{dataset_fraction}",
        output_dir=output_dir,
        log_level="warning"
    )

    start_time = time.time()
    tracker.start()

    # Train the model. The tracker is stopped in ``finally`` so it does not
    # keep running if training raises.
    print("\nStarting training...")
    try:
        trainer.train()
    finally:
        emissions_data = tracker.stop()
        training_time = time.time() - start_time

    # Evaluate. NOTE(review): with load_best_model_at_end=True the Trainer is
    # expected to have restored the best-F1 checkpoint by now — confirm.
    print("\nEvaluating model...")
    eval_results = trainer.evaluate()

    # Compile results
    results = {
        "training_method": "Full Fine-Tuning",
        "model_name": model_name,
        "train_samples": len(tokenized_train),
        "valid_samples": len(tokenized_valid),
        "dataset_fraction": dataset_fraction,
        "f1_score": eval_results.get("eval_f1", 0.0),
        "exact_match": eval_results.get("eval_exact_match", 0.0),
        "eval_loss": eval_results.get("eval_loss", 0.0),
        "training_time_hours": training_time / 3600,
    }

    # Add emissions data if available. ``tracker.stop()`` reports emissions
    # (kg CO2-eq), not a duration, so the tracked duration is taken from the
    # wall-clock measurement instead of being derived from that number.
    if emissions_data is not None:
        results.update({
            "duration_hours": training_time / 3600,
            "emissions_kg": emissions_data if isinstance(emissions_data, (int, float)) else 0,
            "emissions_rate_kg_per_s": 0,  # Will be in emissions file
            "energy_consumed_kwh": 0,
            "cpu_energy_kwh": 0,
            "gpu_energy_kwh": 0,
            "ram_energy_kwh": 0,
            "cpu_power_w": 0,
            "gpu_power_w": 0,
            "ram_power_w": 0,
            "cpu_model": "",
            "cpu_count": 0,
            "gpu_model": "",
            "gpu_count": 0,
            "ram_total_size_gb": 0,
            "country_name": "",
            "region": "",
            "pue": 1.0,
        })

    print(f"\n✓ Training complete!")
    print(f"  F1 Score: {results['f1_score']:.4f}")
    print(f"  Exact Match: {results['exact_match']:.4f}")
    print(f"  Training Time: {results['training_time_hours']:.4f} hours")

    return results


In [7]:
def load_emissions_data(output_dir: str, dataset_fraction: float) -> Dict:
    """Read the most recent row of ``emissions.csv`` as a result-style dict.

    Args:
        output_dir: Directory expected to contain ``emissions.csv``.
        dataset_fraction: Accepted for call-site symmetry; not used to pick
            the row.

    Returns:
        A dict keyed with the result column names (``duration_hours``,
        ``emissions_kg``, ...). Columns missing from the CSV fall back to
        ``0``, ``""`` or ``1.0``. An empty dict is returned when the file is
        missing or cannot be parsed.
    """
    emissions_file = os.path.join(output_dir, "emissions.csv")

    if not os.path.exists(emissions_file):
        print(f"Warning: Emissions file not found at {emissions_file}")
        return {}

    # (result key, CSV column, fallback when the column is absent)
    field_map = (
        ("emissions_kg", "emissions", 0),
        ("emissions_rate_kg_per_s", "emissions_rate", 0),
        ("energy_consumed_kwh", "energy_consumed", 0),
        ("cpu_energy_kwh", "cpu_energy", 0),
        ("gpu_energy_kwh", "gpu_energy", 0),
        ("ram_energy_kwh", "ram_energy", 0),
        ("cpu_power_w", "cpu_power", 0),
        ("gpu_power_w", "gpu_power", 0),
        ("ram_power_w", "ram_power", 0),
        ("cpu_model", "cpu_model", ""),
        ("cpu_count", "cpu_count", 0),
        ("gpu_model", "gpu_model", ""),
        ("gpu_count", "gpu_count", 0),
        ("ram_total_size_gb", "ram_total_size", 0),
        ("country_name", "country_name", ""),
        ("region", "region", ""),
        ("pue", "pue", 1.0),
    )

    try:
        # The last row is the most recently written run.
        latest = pd.read_csv(emissions_file).iloc[-1]

        # Duration is stored in seconds; everything else is copied as-is.
        details = {"duration_hours": latest.get("duration", 0) / 3600}
        for result_key, column, fallback in field_map:
            details[result_key] = latest.get(column, fallback)
        return details
    except Exception as e:
        print(f"Error loading emissions data: {e}")
        return {}


In [8]:
def save_results_to_csv(results_list: List[Dict], output_file: str = "training_results.csv"):
    """Append result rows to a CSV file, keeping columns aligned with its header.

    A plain header-less append writes values by position, so rows whose keys
    differ from the existing header (extra key, missing key, different order)
    would land under the wrong columns. This function instead:

    * creates the file with a header when it does not exist (or is empty);
    * reorders the new rows to the existing header and appends them when they
      introduce no new columns;
    * otherwise rewrites the file with the union of old and new columns.

    Args:
        results_list: One dict per training run.
        output_file: Destination CSV path.
    """
    df = pd.DataFrame(results_list)
    if df.empty and len(df.columns) == 0:
        # Writing a column-less frame would create a file with no usable header.
        print("No results to save.")
        return

    file_has_content = os.path.exists(output_file) and os.path.getsize(output_file) > 0
    if file_has_content:
        existing_columns = pd.read_csv(output_file, nrows=0).columns.tolist()
        if set(df.columns) <= set(existing_columns):
            # Same schema (possibly reordered or missing keys): align and append.
            df.reindex(columns=existing_columns).to_csv(
                output_file, mode="a", header=False, index=False
            )
        else:
            # New columns appeared: rewrite so the header covers every row.
            existing = pd.read_csv(output_file, float_precision="round_trip")
            combined = pd.concat([existing, df], ignore_index=True)
            combined.to_csv(output_file, index=False)
    else:
        df.to_csv(output_file, index=False)

    print(f"\n{'='*60}")
    print(f"Results saved to: {output_file}")
    print(f"{'='*60}")
    print(df.to_string(index=False))

In [None]:
def fullfinetuning():
    """Run full fine-tuning of roberta-base on several training-set fractions.

    For each fraction the data is loaded and tokenized, the model is trained
    with carbon tracking, and the run's result dict is merged with the
    details read back from the emissions CSV. All runs are written to
    ``training_results.csv`` in the output directory once the loop finishes.
    """
    model_name = "roberta-base"
    fractions = [0.25, 0.50, 0.80]
    output_dir = "/content/drive/MyDrive/Carbonemission/Version3_Roberta"

    os.makedirs(output_dir, exist_ok=True)

    collected_runs = []
    for fraction in fractions:
        # Data -> tokens -> trained model metrics
        train_split, valid_split = loadsquaddata(fraction)
        train_tokens, valid_tokens, _tokenizer = tokenizeDatasets(
            train_split, valid_split, model_name
        )
        run_summary = trainModel(
            model_name, train_tokens, valid_tokens, fraction, output_dir
        )

        # Overlay the detailed figures CodeCarbon wrote to disk.
        run_summary.update(load_emissions_data(output_dir, fraction))
        collected_runs.append(run_summary)

        # Release the tokenized splits and cached GPU memory before the next run.
        del train_tokens, valid_tokens
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    save_results_to_csv(
        collected_runs, os.path.join(output_dir, "training_results.csv")
    )

    banner = "=" * 60
    print("\n" + banner)
    print("All training runs completed successfully!")
    print(banner)


#fullfinetuning()


Loading SQuAD 2.0 dataset (fraction: 0.25)...


README.md: 0.00B [00:00, ?B/s]

squad_v2/train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

squad_v2/validation-00000-of-00001.parqu(…):   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Training samples: 32579
Validation samples: 11873

Tokenizing datasets...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/32579 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Tokenization complete!

Training roberta-base with 25.0% of data


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU




Starting training...


Epoch,Training Loss,Validation Loss,F1,Exact Match
1,1.1103,0.960686,0.707861,0.61703
2,0.7299,0.963777,0.73623,0.645414



Evaluating model...



✓ Training complete!
  F1 Score: 0.7362
  Exact Match: 0.6454
  Training Time: 0.2920 hours

Loading SQuAD 2.0 dataset (fraction: 0.5)...
Training samples: 65159
Validation samples: 11873

Tokenizing datasets...


Map:   0%|          | 0/65159 [00:00<?, ? examples/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenization complete!

Training roberta-base with 50.0% of data


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU




Starting training...


Epoch,Training Loss,Validation Loss,F1,Exact Match
1,0.9581,0.885081,0.748305,0.660574
2,0.7014,0.892498,0.767831,0.67624



Evaluating model...



✓ Training complete!
  F1 Score: 0.7678
  Exact Match: 0.6762
  Training Time: 0.5514 hours

Loading SQuAD 2.0 dataset (fraction: 0.8)...
Training samples: 104255
Validation samples: 11873

Tokenizing datasets...


Map:   0%|          | 0/104255 [00:00<?, ? examples/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenization complete!

Training roberta-base with 80.0% of data


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU




Starting training...


Epoch,Training Loss,Validation Loss,F1,Exact Match
1,0.9223,0.898659,0.744793,0.65131
2,0.6335,0.84364,0.779488,0.688621



Evaluating model...



✓ Training complete!
  F1 Score: 0.7795
  Exact Match: 0.6886
  Training Time: 0.8632 hours

Results saved to: /content/drive/MyDrive/Carbonemission/Version3_Roberta/training_results.csv
 training_method   model_name  train_samples  valid_samples  dataset_fraction  f1_score  exact_match  eval_loss  training_time_hours  duration_hours  emissions_kg  emissions_rate_kg_per_s  energy_consumed_kwh  cpu_energy_kwh  gpu_energy_kwh  ram_energy_kwh  cpu_power_w  gpu_power_w  ram_power_w                      cpu_model  cpu_count                 gpu_model  gpu_count  ram_total_size_gb    country_name    region  pue
Full Fine-Tuning roberta-base          32579          11873              0.25  0.736230     0.645414   0.963777             0.292015        0.292010      0.034176                 0.000033             0.127702        0.012409        0.100694        0.014599         42.5    58.165831         50.0 Intel(R) Xeon(R) CPU @ 2.20GHz         12 1 x NVIDIA A100-SXM4-80GB          1         167.

## LoRA Fine-Tuning

In [9]:
def trainLoraModel(
    model_name: str,
    tokenized_train,
    tokenized_valid,
    dataset_fraction: float,
    output_dir: str = "./results",
    lora_r: int = 16,
    lora_alpha: int = 32,
    lora_dropout: float = 0.1,
) -> Dict:
    """Fine-tune a question-answering model with LoRA adapters while tracking emissions.

    The base model is wrapped with a PEFT LoRA configuration that targets the
    ``query`` and ``value`` modules, then trained for two epochs with
    evaluation and checkpointing after each epoch. A CodeCarbon
    ``EmissionsTracker`` wraps the training call only; the final evaluation
    is not tracked.

    Args:
        model_name: Checkpoint passed to
            ``AutoModelForQuestionAnswering.from_pretrained``.
        tokenized_train: Tokenized training split (with span labels).
        tokenized_valid: Tokenized validation split (with span labels).
        dataset_fraction: Fraction of the training data in use; only used for
            naming and reporting.
        output_dir: Directory for checkpoints and CodeCarbon output.
        lora_r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        lora_dropout: Dropout applied inside the LoRA layers.

    Returns:
        A result dict with run metadata, ``f1_score``, ``exact_match``,
        ``eval_loss`` and ``training_time_hours``. When the tracker reports a
        value, ``emissions_kg`` and ``duration_hours`` are filled in and the
        remaining emissions fields get placeholder values.
    """
    print(f"\n{'='*60}")
    print(f"Training {model_name} with LoRA ({dataset_fraction*100}% of data)")
    print(f"{'='*60}")

    # Load base model
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # Configure LoRA
    peft_config = LoraConfig(
        task_type=TaskType.QUESTION_ANS,
        inference_mode=False,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=["query", "value"],
    )

    # Apply LoRA to model
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"{output_dir}/{model_name.replace('/', '_')}_lora_{dataset_fraction}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",
    )

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        data_collator=default_data_collator,
        compute_metrics=computeMetrics,
    )

    tracker = EmissionsTracker(
        project_name=f"roberta_squad_lora_{dataset_fraction}",
        output_dir=output_dir,
        log_level="warning"
    )

    start_time = time.time()
    tracker.start()

    # The tracker is stopped in ``finally`` so it does not keep running if
    # training raises.
    print("\nStarting LoRA training...")
    try:
        trainer.train()
    finally:
        emissions_data = tracker.stop()
        training_time = time.time() - start_time

    print("\nEvaluating LoRA model...")
    eval_results = trainer.evaluate()

    results = {
        "training_method": "LoRA Fine-Tuning",
        "model_name": model_name,
        "train_samples": len(tokenized_train),
        "valid_samples": len(tokenized_valid),
        "dataset_fraction": dataset_fraction,
        "f1_score": eval_results.get("eval_f1", 0.0),
        "exact_match": eval_results.get("eval_exact_match", 0.0),
        "eval_loss": eval_results.get("eval_loss", 0.0),
        "training_time_hours": training_time / 3600,
    }

    # Add emissions data if available. ``tracker.stop()`` reports emissions
    # (kg CO2-eq), not a duration, so the tracked duration is taken from the
    # wall-clock measurement instead of being derived from that number.
    if emissions_data is not None:
        results.update({
            "duration_hours": training_time / 3600,
            "emissions_kg": emissions_data if isinstance(emissions_data, (int, float)) else 0,
            "emissions_rate_kg_per_s": 0,
            "energy_consumed_kwh": 0,
            "cpu_energy_kwh": 0,
            "gpu_energy_kwh": 0,
            "ram_energy_kwh": 0,
            "cpu_power_w": 0,
            "gpu_power_w": 0,
            "ram_power_w": 0,
            "cpu_model": "",
            "cpu_count": 0,
            "gpu_model": "",
            "gpu_count": 0,
            "ram_total_size_gb": 0,
            "country_name": "",
            "region": "",
            "pue": 1.0,
        })

    print(f"\n✓ LoRA Training complete!")
    print(f"  F1 Score: {results['f1_score']:.4f}")
    print(f"  Exact Match: {results['exact_match']:.4f}")
    print(f"  Training Time: {results['training_time_hours']:.4f} hours")

    return results



In [None]:
def lora_fine_tuning():
    """Run a single LoRA fine-tuning experiment on roberta-base and save the result.

    Loads and tokenizes 80% of the training split, trains with LoRA under
    carbon tracking, merges in the details read back from the emissions CSV,
    and appends the run to ``training_results.csv`` in the output directory.

    Returns:
        The result dict of the run.
    """
    model_name = "roberta-base"
    fraction = 0.8
    output_dir = "/content/drive/MyDrive/Carbonemission/Version3_Roberta"

    os.makedirs(output_dir, exist_ok=True)

    train_split, valid_split = loadsquaddata(fraction)
    train_tokens, valid_tokens, _tokenizer = tokenizeDatasets(
        train_split, valid_split, model_name
    )

    run_summary = trainLoraModel(
        model_name, train_tokens, valid_tokens, fraction, output_dir
    )

    # Overlay the detailed figures CodeCarbon wrote to disk.
    run_summary.update(load_emissions_data(output_dir, fraction))

    save_results_to_csv(
        [run_summary], os.path.join(output_dir, "training_results.csv")
    )

    banner = "=" * 60
    print("\n" + banner)
    print("LoRA fine-tuning completed!")
    print(banner)

    # Release the tokenized splits and cached GPU memory.
    del train_tokens, valid_tokens
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return run_summary

In [None]:
#lora_results = lora_fine_tuning()


Loading SQuAD 2.0 dataset (fraction: 0.8)...


README.md: 0.00B [00:00, ?B/s]

squad_v2/train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

squad_v2/validation-00000-of-00001.parqu(…):   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Training samples: 104255
Validation samples: 11873

Tokenizing datasets...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/104255 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Tokenization complete!

Training roberta-base with LoRA (80.0% of data)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 591,362 || all params: 124,647,940 || trainable%: 0.4744


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU




Starting LoRA training...


Epoch,Training Loss,Validation Loss,F1,Exact Match
1,1.0941,0.991645,0.71051,0.617872
2,0.979,0.900181,0.742627,0.650552





Evaluating LoRA model...



✓ LoRA Training complete!
  F1 Score: 0.7426
  Exact Match: 0.6506
  Training Time: 0.6569 hours

Results saved to: /content/drive/MyDrive/Carbonemission/Version3_Roberta/training_results.csv
 training_method   model_name  train_samples  valid_samples  dataset_fraction  f1_score  exact_match  eval_loss  training_time_hours  duration_hours  emissions_kg  emissions_rate_kg_per_s  energy_consumed_kwh  cpu_energy_kwh  gpu_energy_kwh  ram_energy_kwh  cpu_power_w  gpu_power_w  ram_power_w                      cpu_model  cpu_count                 gpu_model  gpu_count  ram_total_size_gb country_name  region  pue
LoRA Fine-Tuning roberta-base         104255          11873               0.8  0.742627     0.650552   0.900181             0.656856        0.656696      0.136088                 0.000058             0.289067        0.027906        0.228331         0.03283         42.5   320.255283         50.0 Intel(R) Xeon(R) CPU @ 2.20GHz         12 1 x NVIDIA A100-SXM4-80GB          1         167.

## Few-Shot Fine-Tuning (Frozen Backbone)

In [10]:
def trainFewshotModel(
    model_name: str,
    tokenized_train,
    tokenized_valid,
    num_samples: int,
    output_dir: str = "./results",
) -> Dict:
    """Train only the QA head on a small sample set while tracking emissions.

    Every parameter of ``model.base_model`` is frozen and only
    ``model.qa_outputs`` stays trainable. The head is trained for ten epochs
    with evaluation and checkpointing after each epoch. A CodeCarbon
    ``EmissionsTracker`` wraps the training call only; the final evaluation
    is not tracked.

    Args:
        model_name: Checkpoint passed to
            ``AutoModelForQuestionAnswering.from_pretrained``.
        tokenized_train: Tokenized few-shot training split (with span labels).
        tokenized_valid: Tokenized validation split (with span labels).
        num_samples: Number of training examples requested; used for naming
            and reported as ``few_shot_samples``.
        output_dir: Directory for checkpoints and CodeCarbon output.

    Returns:
        A result dict with run metadata, ``f1_score``, ``exact_match``,
        ``eval_loss``, ``training_time_hours`` and ``few_shot_samples``. When
        the tracker reports a value, ``emissions_kg`` and ``duration_hours``
        are filled in and the remaining emissions fields get placeholder
        values.
    """
    print(f"\n{'='*60}")
    print(f"Few-Shot Learning: {model_name} with {num_samples} samples")
    print(f"{'='*60}")

    # Load model
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # Freeze all base model parameters
    for param in model.base_model.parameters():
        param.requires_grad = False

    # Keep only the QA head trainable
    for param in model.qa_outputs.parameters():
        param.requires_grad = True

    # Print trainable parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nTrainable parameters: {trainable_params:,} / {total_params:,} "
          f"({100 * trainable_params / total_params:.2f}%)")

    # Training arguments - adjusted for few-shot learning
    training_args = TrainingArguments(
        output_dir=f"{output_dir}/{model_name.replace('/', '_')}_fewshot_{num_samples}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-3,  # Higher LR for training only head
        per_device_train_batch_size=8,  # Smaller batch for few-shot
        per_device_eval_batch_size=16,
        num_train_epochs=10,  # More epochs for few-shot learning
        weight_decay=0.01,
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",
        warmup_steps=50,  # Warmup for stability
    )

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        data_collator=default_data_collator,
        compute_metrics=computeMetrics,
    )

    # Start carbon tracking
    tracker = EmissionsTracker(
        project_name=f"roberta_squad_fewshot_{num_samples}",
        output_dir=output_dir,
        log_level="warning"
    )

    start_time = time.time()
    tracker.start()

    # Train the model. The tracker is stopped in ``finally`` so it does not
    # keep running if training raises.
    print("\nStarting few-shot training (frozen backbone)...")
    try:
        trainer.train()
    finally:
        emissions_data = tracker.stop()
        training_time = time.time() - start_time

    # Evaluate
    print("\nEvaluating few-shot model...")
    eval_results = trainer.evaluate()

    # Compile results
    results = {
        "training_method": "Few-Shot (Frozen Backbone)",
        "model_name": model_name,
        "train_samples": len(tokenized_train),
        "valid_samples": len(tokenized_valid),
        "dataset_fraction": len(tokenized_train) / 130319,  # Total SQuAD 2.0 train size
        "f1_score": eval_results.get("eval_f1", 0.0),
        "exact_match": eval_results.get("eval_exact_match", 0.0),
        "eval_loss": eval_results.get("eval_loss", 0.0),
        "training_time_hours": training_time / 3600,
        "few_shot_samples": num_samples,
    }

    # Add emissions data if available. ``tracker.stop()`` reports emissions
    # (kg CO2-eq), not a duration, so the tracked duration is taken from the
    # wall-clock measurement instead of being derived from that number.
    if emissions_data is not None:
        results.update({
            "duration_hours": training_time / 3600,
            "emissions_kg": emissions_data if isinstance(emissions_data, (int, float)) else 0,
            "emissions_rate_kg_per_s": 0,
            "energy_consumed_kwh": 0,
            "cpu_energy_kwh": 0,
            "gpu_energy_kwh": 0,
            "ram_energy_kwh": 0,
            "cpu_power_w": 0,
            "gpu_power_w": 0,
            "ram_power_w": 0,
            "cpu_model": "",
            "cpu_count": 0,
            "gpu_model": "",
            "gpu_count": 0,
            "ram_total_size_gb": 0,
            "country_name": "",
            "region": "",
            "pue": 1.0,
        })

    print(f"\n✓ Few-Shot Training complete!")
    print(f"  F1 Score: {results['f1_score']:.4f}")
    print(f"  Exact Match: {results['exact_match']:.4f}")
    print(f"  Training Time: {results['training_time_hours']:.4f} hours")
    print(f"  Trainable Params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")

    return results


In [11]:
def loadFewshotData(num_samples: int) -> Tuple[any, any]:
    """Load SQuAD 2.0 and keep a fixed number of training examples.

    The training split is shuffled with a fixed seed (42) and the first
    ``num_samples`` examples are kept, so a smaller request is always a
    prefix of a larger one. The validation split is returned unchanged.

    Args:
        num_samples: Number of training examples to keep; must be positive
            and no larger than the training split.

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple.

    Raises:
        ValueError: If ``num_samples`` is not positive or exceeds the size of
            the training split.
    """
    # Fail fast, before any download or disk access: a non-positive count
    # would otherwise silently produce an empty training set.
    if num_samples <= 0:
        raise ValueError(f"num_samples must be a positive integer, got {num_samples!r}")

    print(f"\n{'='*60}")
    print(f"Loading SQuAD 2.0 for Few-Shot Learning ({num_samples} samples)...")
    print(f"{'='*60}")

    # Load full dataset
    dataset = load_dataset("squad_v2")

    available = len(dataset["train"])
    if num_samples > available:
        raise ValueError(
            f"num_samples ({num_samples}) exceeds the training split size ({available})"
        )

    # Sample specific number of training examples
    train_dataset = dataset["train"].shuffle(seed=42).select(range(num_samples))
    valid_dataset = dataset["validation"]

    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(valid_dataset)}")

    return train_dataset, valid_dataset

In [14]:
def fewshot_fine_tuning():
    """Run the frozen-backbone few-shot experiments for each sample size.

    For every configured sample size the data is loaded and tokenized, only
    the QA head is trained under carbon tracking, and the run's result dict
    is merged with the details read back from the emissions CSV. All runs are
    written to ``fewshot_finetuning_results.csv`` once the loop finishes.

    Returns:
        The list of per-run result dicts.
    """
    model_name = "roberta-base"
    sample_sizes = [500, 1000]
    output_dir = "/content/drive/MyDrive/Carbonemission/Version3_Roberta"

    os.makedirs(output_dir, exist_ok=True)

    collected_runs = []
    for num_samples in sample_sizes:
        # Data -> tokens -> head-only training metrics
        train_split, valid_split = loadFewshotData(num_samples)
        train_tokens, valid_tokens, _tokenizer = tokenizeDatasets(
            train_split, valid_split, model_name
        )
        run_summary = trainFewshotModel(
            model_name, train_tokens, valid_tokens, num_samples, output_dir
        )

        # Overlay the detailed figures CodeCarbon wrote to disk.
        run_summary.update(load_emissions_data(output_dir, num_samples))
        collected_runs.append(run_summary)

        # Release the tokenized splits and cached GPU memory before the next run.
        del train_tokens, valid_tokens
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    save_results_to_csv(
        collected_runs, os.path.join(output_dir, "fewshot_finetuning_results.csv")
    )

    banner = "=" * 60
    print("\n" + banner)
    print("Few-shot fine-tuning completed!")
    print(banner)

    return collected_runs

In [15]:
# Run the few-shot experiments. The call is the only expression in this cell,
# so its return value (the list of per-run result dicts) is echoed as output.
fewshot_fine_tuning()
# Disabled: combine the results of all three experiment families into one CSV.
# NOTE(review): `full_results` and `fewshot_results` are not assigned in any
# visible cell, and `lora_results` only in a commented-out line, so these
# lines would raise NameError if re-enabled as-is — capture the return values
# of the runs first.
#all_results = full_results + [lora_results] + fewshot_results
#save_results_to_csv(all_results, "./squad_training_results/combined_results.csv")



Loading SQuAD 2.0 for Few-Shot Learning (500 samples)...
Training samples: 500
Validation samples: 11873

Tokenizing datasets...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenization complete!

Few-Shot Learning: roberta-base with 500 samples


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Trainable parameters: 1,538 / 124,056,578 (0.00%)


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU




Starting few-shot training (frozen backbone)...


Epoch,Training Loss,Validation Loss,F1,Exact Match
1,5.4229,5.322933,0.018053,0.000758
2,4.9033,4.79267,0.025405,0.00539
3,4.816,4.520408,0.034151,0.014487
4,4.3804,4.337534,0.060514,0.042197
5,4.3909,4.209374,0.085048,0.068054
6,4.2881,4.151436,0.075571,0.057862
7,4.3538,4.09049,0.091529,0.075381
8,4.216,4.054918,0.096074,0.080266
9,4.2956,4.035199,0.098861,0.08313
10,4.2764,4.03124,0.097852,0.082119





Evaluating few-shot model...



✓ Few-Shot Training complete!
  F1 Score: 0.0989
  Exact Match: 0.0831
  Training Time: 0.1763 hours
  Trainable Params: 1,538 (0.00%)

Loading SQuAD 2.0 for Few-Shot Learning (1000 samples)...
Training samples: 1000
Validation samples: 11873

Tokenizing datasets...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenization complete!

Few-Shot Learning: roberta-base with 1000 samples


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Trainable parameters: 1,538 / 124,056,578 (0.00%)


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU




Starting few-shot training (frozen backbone)...


Epoch,Training Loss,Validation Loss,F1,Exact Match
1,4.8659,4.729486,0.053795,0.032174
2,4.2923,4.279586,0.076282,0.056767
3,4.4481,4.030134,0.117599,0.100901
4,4.2028,3.900332,0.137899,0.121536
5,4.1412,3.821613,0.146706,0.131054
6,3.9299,3.792376,0.127525,0.111935
7,4.1216,3.741582,0.151567,0.13636
8,3.93,3.71697,0.158308,0.143098
9,3.8561,3.711665,0.154755,0.139392
10,3.9576,3.705425,0.157485,0.142592



Evaluating few-shot model...



✓ Few-Shot Training complete!
  F1 Score: 0.1583
  Exact Match: 0.1431
  Training Time: 0.1850 hours
  Trainable Params: 1,538 (0.00%)

Results saved to: /content/drive/MyDrive/Carbonemission/Version3_Roberta/fewshot_finetuning_results.csv
           training_method   model_name  train_samples  valid_samples  dataset_fraction  f1_score  exact_match  eval_loss  training_time_hours  few_shot_samples  duration_hours  emissions_kg  emissions_rate_kg_per_s  energy_consumed_kwh  cpu_energy_kwh  gpu_energy_kwh  ram_energy_kwh  cpu_power_w  gpu_power_w  ram_power_w                      cpu_model  cpu_count                 gpu_model  gpu_count  ram_total_size_gb country_name  region  pue
Few-Shot (Frozen Backbone) roberta-base            500          11873          0.003837  0.098861     0.083130   4.035199             0.176312               500        0.176307      0.031395                 0.000049             0.066687        0.007492        0.052497        0.006698         42.5   226.161517 

[{'training_method': 'Few-Shot (Frozen Backbone)',
  'model_name': 'roberta-base',
  'train_samples': 500,
  'valid_samples': 11873,
  'dataset_fraction': 0.0038367390787222124,
  'f1_score': 0.09886107437246154,
  'exact_match': 0.08312979028046828,
  'eval_loss': 4.03519868850708,
  'training_time_hours': 0.17631172988149854,
  'few_shot_samples': 500,
  'duration_hours': np.float64(0.17630693592583332),
  'emissions_kg': np.float64(0.0313950852297272),
  'emissions_rate_kg_per_s': np.float64(4.9464060857629286e-05),
  'energy_consumed_kwh': np.float64(0.0666869560492355),
  'cpu_energy_kwh': np.float64(0.0074918335775881),
  'gpu_energy_kwh': np.float64(0.0524966664417439),
  'ram_energy_kwh': np.float64(0.0066984560299033),
  'cpu_power_w': np.float64(42.5),
  'gpu_power_w': np.float64(226.1615172148813),
  'ram_power_w': np.float64(38.0),
  'cpu_model': 'Intel(R) Xeon(R) CPU @ 2.20GHz',
  'cpu_count': np.int64(12),
  'gpu_model': '1 x NVIDIA A100-SXM4-40GB',
  'gpu_count': np.int6