In [1]:
%%capture
!pip install duckduckgo-search langchain-community
!pip install -U ddgs
!pip install datasets==3.6.0
!pip install -U bitsandbytes accelerate
!pip install -U transformers
!pip install trl

In [2]:
import re
import os
import gc
from enum import Enum
from google.colab import drive, userdata
from google import genai
from google.genai import types
from typing import Union, List, Dict, Any, Optional, Tuple, Union
from functools import partial
import requests
import json
import textwrap
import sys
import pandas as pd
from pydantic import BaseModel, Field
import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.functional import softmax
from scipy.special import softmax
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    BitsAndBytesConfig,
    TrainingArguments,
    EarlyStoppingCallback,
    PreTrainedModel,
    PreTrainedTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import random
from sklearn.metrics import average_precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from peft import PeftModel, PeftConfig, LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset, Dataset, Features, Value, DatasetDict, concatenate_datasets
from tqdm.auto import tqdm
from trl import SFTTrainer
import matplotlib.pyplot as plt
import seaborn as sns

## Prerequisites

In [3]:
def checkpoint(dataset, path, filename):
  os.makedirs(path, exist_ok=True)
  dataset.to_csv(f"{path}/{filename}")

def load_checkpoint(path, filename):
  return pd.read_csv(f"{path}/{filename}")

def upload_to_huggingface(path, filename, repo_id):
  df = load_checkpoint(path, filename)
  hf_dataset = Dataset.from_pandas(df)
  hf_dataset.push_to_hub(repo_id)

def load_from_huggingface(repo_id, split=None):
  dataset_dict = load_dataset(repo_id)
  if not split:
    return dataset_dict
  dataset = dataset_dict[split]
  return dataset.to_pandas()


In [4]:
def get_hf_repo_id(username, model_name, hparams):
    short_codes = {'epoch': 'ep', 'batch_size': 'bs', 'learning_rate': 'lr',
                   'lora_rank': 'r', 'lora_alpha': 'a', 'dropout': 'do'}

    param_parts = []
    for k, v in sorted(hparams.items()):
        code = short_codes.get(k, k)
        param_parts.append(f"{code}{v}")

    param_str = "-".join(param_parts)
    clean_model = model_name.split('/')[-1]

    repo_name = f"factguard-{clean_model}-{param_str}"
    return f"{username}/{repo_name}"

In [5]:
ENCODER_DECODER_MODEL_TYPES = ['t5gemma']
DECODER_ONLY_MODEL_TYPES = ['gemma2']

In [6]:
AVAILABE_MODELS = ['google/flan-t5-large', 'google/t5gemma-2b-2b-ul2-it']

In [7]:
FINETUNED_MODELS = {"rickpereira/FactGuard-Distilled-T5": 'google/t5gemma-2b-2b-ul2-it'}

In [8]:
class FineTunedModelType(str, Enum):
    T5GEMMA = "t5gemma"
    GEMMA2 = "gemma2"

    def model_repo(self) -> str:
      if self == FineTunedModelType.T5GEMMA:
        return "rickpereira/FactGuard-Distilled-T5"
      elif self == FineTunedModelType.GEMMA2:
        return "rickpereira/FactGuard-Distilled-Decoder"
      else:
        raise ValueError(f"Unknown model type: {self}")

    def base_model(self) -> str:
        if self == FineTunedModelType.T5GEMMA:
            return "google/t5gemma-2b-2b-ul2-it"
        elif self == FineTunedModelType.GEMMA2:
            return "google/gemma-2-2b-it"
        else:
            raise ValueError(f"Unknown model type: {self}")

    def auto_model_class(self) -> str:
        if self == FineTunedModelType.T5GEMMA:
            return AutoModelForSeq2SeqLM
        elif self == FineTunedModelType.GEMMA2:
            return AutoModelForCausalLM

In [9]:
class EncoderDecoderModelType(str, Enum):
    T5GEMMA = "t5gemma"

    def model_repo(self) -> str:
      return 'google/t5gemma-2b-2b-ul2-it'

    def base_model(self) -> str:
        return "google/t5gemma-2b-2b-ul2-it"

    def auto_model_class(self) -> str:
        return AutoModelForSeq2SeqLM

    def finetuned(self, hparams) -> str:
        return get_hf_repo_id("rickpereira", "t5gemma", hparams)

class DecoderOnlyModelType(str, Enum):
    GEMMA2 = "gemma2"

    def model_repo(self) -> str:
      return 'google/gemma-2-2b-it'

    def base_model(self) -> str:
        return "google/gemma-2-2b-it"

    def auto_model_class(self) -> str:
        return AutoModelForCausalLM

    def finetuned(self, hparams) -> str:
        return get_hf_repo_id("karan-mids24-hf", "gemma-2-2b-it", hparams)

In [10]:
def get_user_instruction(claim: str,
                         context: Optional[str] = None,
                         enable_model_reasoning: bool = False) -> str:
    VERDICT_RULES = textwrap.dedent("""
    Determine the final verdict:
    * **Yes:** If the claim is fully supported by the Context (if provided) or by external knowledge.
    * **No:** If the claim is false, contradicted, or if there is insufficient evidence to support or deny the claim.
    """).strip()

    if enable_model_reasoning:
        OUTPUT_REQUIREMENT = "Output Requirement: Output the final verdict ('Yes' or 'No') immediately followed by the rationale/evidence."
    else:
        OUTPUT_REQUIREMENT = "Output Requirement: Output the final verdict ('Yes' or 'No') and nothing else."

    instruction_parts = [
        "**Fact-Check and Evidence Verification**" if context else "**Fact-Check**",
        "",
        VERDICT_RULES,
        OUTPUT_REQUIREMENT,
        ""
    ]
    if context:
        instruction_parts.append(f"--- Context ---\n{context}\n")

    instruction_parts.append(f"--- Claim ---\n{claim}")
    instruction_parts.append("--- Verdict and Rationale ---" if enable_model_reasoning else "--- Verdict ---")

    user_instruction = "\n".join(instruction_parts).strip()
    return user_instruction

In [11]:
def get_model_output(model_output_label: str,
                     rationale: Optional[str] = None,
                     enable_model_reasoning: bool = False) -> str:

    if enable_model_reasoning:
        if rationale is None:
             rationale = "No reasoning provided in the dataset."

        model_output = textwrap.dedent(f"""
        {model_output_label} RATIONALE: {rationale}
        """).strip()

        return model_output
    else:
        model_output = model_output_label.strip()
        return model_output

In [12]:
def get_probabilities_decoder_only(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, inputs, device):
  with torch.no_grad():
    logits = model(**inputs).logits

  # Extract the logits for the Yes and No tokens
  vocab = tokenizer.get_vocab()
  selected_logits = logits[0, -1, [vocab['True'], vocab['False']]]

  # Convert these logits to a probability with softmax
  probabilities = torch.softmax(selected_logits, dim=0)
  return probabilities


In [13]:
def get_probabilities_encoder_decoder(
    model: PreTrainedModel, tokenizer: PreTrainedTokenizer, inputs, device):
  decoder_start_token_id = tokenizer.pad_token_id
  if decoder_start_token_id is None:
      raise ValueError(f"Model {model.config.model_type} is Seq2Seq but has no pad_token_id defined for decoder_input_ids.")

  inputs['decoder_input_ids'] = torch.tensor(
      [[decoder_start_token_id]] * inputs['input_ids'].shape[0], # Batch size
      dtype=torch.long,
      device=device
  )
  with torch.no_grad():
      model.eval()
      outputs = model(**inputs)
      logits = outputs.logits

  vocab = tokenizer.get_vocab()
  LOGIT_BIAS_B = 1.45
  yes_token_id = vocab.get('Yes') or vocab.get(' Yes')
  no_token_id = vocab.get('No') or vocab.get(' No')
  raw_yes_logit = logits[0, -1, yes_token_id] # 0 -> -1
  raw_no_logit = logits[0, -1, no_token_id]
  adjusted_yes_logit = raw_yes_logit + LOGIT_BIAS_B
  selected_logits = torch.stack([adjusted_yes_logit, raw_no_logit], dim=0)
  probabilities = torch.softmax(selected_logits, dim=0)
  return probabilities


In [14]:
def get_probabilities(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, inputs, device):
    model_type = model.config.model_type
    is_seq2seq = model_type in ENCODER_DECODER_MODEL_TYPES
    return (
        get_probabilities_encoder_decoder(model, tokenizer, inputs, device)
        if is_seq2seq
        else get_probabilities_decoder_only(model, tokenizer, inputs, device)
    )

In [15]:
os.environ['LANGEXTRACT_API_KEY'] = userdata.get('GEMINI_API_KEY')
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

## Building LLM

In [16]:
def load_model(model_type: Union[DecoderOnlyModelType, EncoderDecoderModelType],
               use_quantiziation: bool = False,
               finetuned: bool = False,
               hparams = None) -> Tuple[PreTrainedModel, AutoTokenizer, Dict[str, Any]]:
  if finetuned:
    model_repo = model_type.finetuned(hparams)
    print(model_repo)
  else:
    model_repo = model_type.model_repo()
  tokenizer_name = model_type.base_model()
  AutoModelClass = model_type.auto_model_class()

  # 4-bit Quantization Config
  nf4_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.float16
  )
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
  model = AutoModelClass.from_pretrained(
      model_repo,
      quantization_config=nf4_config if use_quantiziation else None,
      device_map='auto'
  )
  return model, tokenizer

In [None]:
gemma2b_base_model, tokenizer = load_model(DecoderOnlyModelType.GEMMA2, use_quantiziation=True)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

## Distillation

In [None]:
TARGET_COLUMNS_CAUSAL_LM = ['original_id', 'claim', 'label', 'rationale', 'context']

In [None]:
def format_text(example: Dict[str, Any]) -> Dict[str, str]:
  claim = example.get('claim', '')
  context = example.get('context', 'No specific context provided.')
  label = example.get('label')
  rationale = example.get('rationale', 'No reasoning provided.')
  if not isinstance(label, bool):
    print(f"Unexpected label type: {type(label)}")
    return

  user_prompt = get_user_instruction(context=context, claim=claim, enable_model_reasoning=False)
  model_output = get_model_output(
      model_output_label="Yes" if label else "No",
      rationale=rationale,
      enable_model_reasoning=False)
  text = (
      f"<start_of_turn>user\n{user_prompt}<end_of_turn>\n"
      f"<start_of_turn>model\n{model_output}<end_of_turn>"
  )

  return {"text": text}

In [None]:
def prepare_data(dataset_df: pd.DataFrame) -> DatasetDict:
  FINAL_FT_COLUMNS = ['claim', 'context', 'label', 'rationale']
  if dataset_df.empty:
    return DatasetDict({'train': Dataset.from_dict({'text': []})})

  subset = dataset_df[FINAL_FT_COLUMNS].copy()
  formatted_series = subset.apply(format_text, axis=1)
  data_list = [item for item in list(formatted_series) if item is not None]
  dataset_hf = Dataset.from_list(data_list)
  return DatasetDict({'train': dataset_hf})

In [None]:
def _load_and_align_datasets(fever_repo: str, squad_repo: str) -> DatasetDict:

    fever_distilled_datasets = load_from_huggingface(fever_repo)
    squad_distilled_datasets = load_from_huggingface(squad_repo)

    # FEVER alignment
    fever_aligned = fever_distilled_datasets.remove_columns("label")
    fever_aligned = fever_aligned.rename_column("verdict", "label")
    fever_aligned = fever_aligned.rename_column("original_fever_id", "original_id")
    fever_aligned = fever_aligned.cast_column("original_id", Value("string"))
    fever_aligned = fever_aligned.select_columns(TARGET_COLUMNS_CAUSAL_LM)

    # SQuAD alignment
    squad_aligned = squad_distilled_datasets.rename_column("original_squad_id", "original_id")
    squad_aligned = squad_aligned.select_columns(TARGET_COLUMNS_CAUSAL_LM)

    return DatasetDict(fever=fever_aligned, squad=squad_aligned)

In [None]:
def _combine_datasets(aligned_datasets: DatasetDict) -> DatasetDict:

    combined_datasets = DatasetDict()
    fever_data = aligned_datasets['fever']
    squad_data = aligned_datasets['squad']

    split_names = set(squad_data.keys()) & set(fever_data.keys())

    for split in split_names:
        combined_datasets[split] = concatenate_datasets([
            squad_data[split],
            fever_data[split]
        ])
    return combined_datasets

In [None]:
def _prepare_for_seq2seq_tuning(dataset_dict: DatasetDict) -> DatasetDict:

    def seq2seq_mapping_function(example):
        # INPUT_TEXT (Encoder Input): Instruction + Claim + Context
        if not isinstance(example.get('label'), bool):
            print(f"WARNING: Label Value - {example.get('label')} is {type(example.get('label'))}")
            return

        enable_model_reasoning = False
        input_text = get_user_instruction(
            claim=example['claim'],
            context=example['context'],
            enable_model_reasoning=enable_model_reasoning)
        label = "Yes" if example.get('label') else "No"
        target_text = get_model_output(
            model_output_label=label,
            rationale=example['rationale'],
            enable_model_reasoning=enable_model_reasoning)
        return {
            'input_text': input_text,
            'target_text': target_text
        }

    print("Preparing data for Encoder-Decoder (Seq2Seq) fine-tuning...")

    # Apply the mapping function to create the required text columns
    seq2seq_datasets = dataset_dict.map(
        seq2seq_mapping_function,
        remove_columns=TARGET_COLUMNS_CAUSAL_LM,
        batched=False
    )
    return seq2seq_datasets

In [None]:
def get_distilled_datasets(repo_id: Optional[str] = None, prepare_for_seq2seq: bool = False) -> DatasetDict:
    if repo_id:
        return load_from_huggingface(repo_id)

    aligned_data = _load_and_align_datasets(
        fever_repo="rickpereira/factguard_fever_distilled_datasets",
        squad_repo="rickpereira/factguard_squad_distilled_datasets"
    )
    combined_datasets = _combine_datasets(aligned_data)

    if prepare_for_seq2seq:
        processed_datasets = _prepare_for_seq2seq_tuning(combined_datasets)
        final_dataset = processed_datasets['train']
    else:
        print("Preparing data for Decoder-Only (CausalLM/SFTTrainer) fine-tuning...")
        df = combined_datasets['train'].to_pandas()
        prepared = prepare_data(df)
        final_dataset = prepared['train']

    split_datasets = final_dataset.train_test_split(
        test_size=0.1,
        seed=42
    )
    for split_name in combined_datasets.keys():
        if split_name != 'train' and split_name in split_datasets:
            split_datasets[split_name] = combined_datasets[split_name]

    return split_datasets

## Fine-Tuning with Evidence Prompt

In [None]:
datasets = get_distilled_datasets(prepare_for_seq2seq=False)

README.md:   0%|          | 0.00/462 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/2.87M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9100 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/430 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/7.86M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9100 [00:00<?, ? examples/s]

Preparing data for Decoder-Only (CausalLM/SFTTrainer) fine-tuning...


In [None]:
train_dataset = datasets['train']
eval_dataset = datasets['test']

In [None]:
def setup_training_arguments(is_seq2seq: bool, hparams: Dict[str, Any]) -> Any:
    """Sets up common training arguments using hparams."""

    ArgsClass = Seq2SeqTrainingArguments if is_seq2seq else TrainingArguments

    training_args = ArgsClass(
        num_train_epochs=hparams.get('epoch', 1),
        per_device_train_batch_size=hparams.get('batch_size', 4),
        learning_rate=hparams.get('learning_rate', 5e-5),

        # Hardcoded / Structural defaults
        gradient_accumulation_steps=2, # You might want to add this to hparams if you need control
        logging_steps=50,
        save_strategy="epoch",
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        gradient_checkpointing=True,
        overwrite_output_dir=True,
        fp16=False,
        bf16=True if torch.cuda.get_device_capability()[0] >= 8 else False,
        report_to="none",
        # predict_with_generate=is_seq2seq,
        dataloader_num_workers=8,
        optim="adamw_bnb_8bit"
    )
    return training_args

In [None]:
def finetune_decoder_only(train_dataset: Any, eval_dataset: Any, model: PreTrainedModel,
                    tokenizer: PreTrainedTokenizer, hparams: Dict[str, Any], use_lora: bool):

    # Pass hparams to setup arguments
    training_args = setup_training_arguments(is_seq2seq=False, hparams=hparams)

    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.0,
    )

    peft_config = None
    if use_lora:
        peft_config = LoraConfig(
            r=hparams.get('lora_rank', 64),
            lora_alpha=hparams.get('lora_alpha', 128),
            lora_dropout=hparams.get('dropout', 0.05),
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        callbacks=[early_stopping]
    )
    print(f"Starting TRL fine-tuning for Decoder-Only model: {model.config.model_type}")
    trainer.train()
    return trainer

In [None]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 27090
})

In [None]:
train_dataset[5]

{'text': '<start_of_turn>user\n**Fact-Check and Evidence Verification**\n\nDetermine the final verdict:\n* **Yes:** If the claim is fully supported by the Context (if provided) or by external knowledge.\n* **No:** If the claim is false, contradicted, or if there is insufficient evidence to support or deny the claim.\nOutput Requirement: Output the final verdict (\'Yes\' or \'No\') and nothing else.\n\n--- Context ---\nKatie Stevens is an American actress and singer, widely known for her role as Jane Sloan in "The Bold Type." She was born on December 8, 1992, not 1982.\n\n--- Claim ---\nKatie Stevens\' birth year was 1982.\n--- Verdict ---<end_of_turn>\n<start_of_turn>model\nNo<end_of_turn>'}

In [None]:
def finetune_encoder_decoder(train_dataset: Any, eval_dataset: Any, model: PreTrainedModel,
                             tokenizer: PreTrainedTokenizer, hparams: Dict[str, Any], use_lora: bool):
    if hasattr(model, "unload"):
        print("Unloading existing PEFT configuration (if present)...")
        model.unload()

    training_args = setup_training_arguments(is_seq2seq=True, hparams=hparams)

    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.0,
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    peft_config = None
    if use_lora:
        peft_config = LoraConfig(
            r=hparams.get('lora_rank', 64),
            lora_alpha=hparams.get('lora_alpha', 128),
            lora_dropout=hparams.get('dropout', 0.05),
            bias="all",
            task_type="SEQ_2_SEQ_LM",
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        )

        model = get_peft_model(model, peft_config)
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        elif hasattr(model, "base_model") and hasattr(model.base_model, "enable_input_require_grads"):
             model.base_model.enable_input_require_grads()
        else:
             print("Warning: Could not find enable_input_require_grads method.")

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[early_stopping],
    )
    print(f"Starting HF fine-tuning for Encoder-Decoder model: {model.config.model_type}")
    trainer.train()
    return trainer

In [None]:
def preprocess_encoder_decoder(examples: Dict[str, Any], tokenizer: Any, max_input_length: int = 512, max_target_length: int = 128) -> Dict[str, Any]:
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True
    )

    labels = tokenizer(
        examples["target_text"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]

    if tokenizer.pad_token_id is not None:
        model_inputs["labels"] = np.array(model_inputs["labels"])
        model_inputs["labels"][model_inputs["labels"] == tokenizer.pad_token_id] = -100
        model_inputs["labels"] = model_inputs["labels"].tolist() # Convert back to list for dataset

    return model_inputs

In [None]:
def finetune(train_dataset: Any, eval_dataset: Any, model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer, hparams: Dict[str, Any], use_lora: bool = False):
    gc.collect()
    torch.cuda.empty_cache()

    model_type = model.config.model_type

    # Assuming ENCODER_DECODER_MODEL_TYPES is defined globally elsewhere
    if model_type in ENCODER_DECODER_MODEL_TYPES:
        print(f"Preprocessing data for Encoder-Decoder model: {model_type}")

        eval_dataset = eval_dataset.map(
            lambda x: preprocess_encoder_decoder(x, tokenizer=tokenizer),
            batched=True,
            remove_columns=['input_text', 'target_text'],
            desc="Tokenizing evaluation data"
        )

        train_dataset = train_dataset.map(
            lambda x: preprocess_encoder_decoder(x, tokenizer=tokenizer),
            batched=True,
            remove_columns=['input_text', 'target_text'],
            desc="Tokenizing training data"
        )

        return finetune_encoder_decoder(train_dataset, eval_dataset, model, tokenizer, hparams, use_lora)
    else:
        return finetune_decoder_only(train_dataset, eval_dataset, model, tokenizer, hparams, use_lora)

In [17]:
gc.collect()
torch.cuda.empty_cache()

In [17]:
def do_experiment(all_hparams):
  for hparams in all_hparams:
    print(f"Running experiment with hparams: {hparams}")
    model, tokenizer = load_model(DecoderOnlyModelType.GEMMA2, use_quantiziation=True)
    trainer = finetune(train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      model=model,
                      tokenizer=tokenizer,
                      hparams=hparams,
                      use_lora=True)
    sft_model = trainer.model
    repo_id = get_hf_repo_id("karan-mids24-hf", "gemma-2-2b-it", hparams)
    print(f"Pushing: {repo_id}")
    sft_model.push_to_hub(repo_id)


In [None]:
hparams2 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 64,
  'lora_alpha': 32,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams3 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 8,
  'lora_alpha': 16,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams4 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 16,
  'lora_alpha': 16,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams5 = {
  'epoch': 2,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.01,
  'gradient_checkpointing': True
}

hparams6 = {
  'epoch': 1,
  'batch_size': 2,
  'learning_rate': 5e-5,
  'lora_rank': 32,
  'lora_alpha': 64,
  'dropout': 0.01,
  'gradient_checkpointing': True
}


all_hparams = [hparams2, hparams3, hparams4, hparams5, hparams6]

In [None]:
do_experiment(all_hparams)

Running experiment with hparams: {'epoch': 1, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Adding EOS to train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


Starting TRL fine-tuning for Decoder-Only model: gemma2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.7318,0.718542,0.741813,6925132.0,0.828487


Pushing: karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr0.0005-a32-r64


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 30.2kB /  332MB            

Running experiment with hparams: {'epoch': 1, 'batch_size': 4, 'learning_rate': 5e-05, 'lora_rank': 8, 'lora_alpha': 16, 'dropout': 0.05, 'gradient_checkpointing': True}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Adding EOS to train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


Starting TRL fine-tuning for Decoder-Only model: gemma2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,1.1428,1.137772,1.138848,6925132.0,0.746124


Pushing: karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr5e-05-a16-r8


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 41.7kB / 41.6MB            

Running experiment with hparams: {'epoch': 1, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 16, 'lora_alpha': 16, 'dropout': 0.05, 'gradient_checkpointing': True}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Adding EOS to train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


Starting TRL fine-tuning for Decoder-Only model: gemma2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.8721,0.858416,0.870796,6925132.0,0.798009


Pushing: karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr0.0005-a16-r16


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   1%|1         | 1.11MB / 83.1MB            

Running experiment with hparams: {'epoch': 2, 'batch_size': 4, 'learning_rate': 5e-05, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.01, 'gradient_checkpointing': True}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Adding EOS to train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


Starting TRL fine-tuning for Decoder-Only model: gemma2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.8678,0.851828,0.870646,6925132.0,0.799657
2,0.4559,0.663583,0.598915,13850264.0,0.847013


  return fn(*args, **kwargs)


Pushing: karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.01-ep2-gradient_checkpointingTrue-lr5e-05-a128-r64


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 30.2kB /  332MB            

Running experiment with hparams: {'epoch': 1, 'batch_size': 2, 'learning_rate': 5e-05, 'lora_rank': 32, 'lora_alpha': 64, 'dropout': 0.01, 'gradient_checkpointing': True}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Adding EOS to train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


Starting TRL fine-tuning for Decoder-Only model: gemma2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,1.0265,0.996927,0.995247,6925132.0,0.770802


Pushing: karan-mids24-hf/factguard-gemma-2-2b-it-bs2-do0.01-ep1-gradient_checkpointingTrue-lr5e-05-a64-r32


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 41.9kB /  166MB            

In [None]:
hparams1 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.05,
  'gradient_checkpointing': True

}

hparams7 = {
  'epoch': 3,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

all_hparams = [hparams1, hparams7]

In [None]:
do_experiment(all_hparams)

Running experiment with hparams: {'epoch': 1, 'batch_size': 4, 'learning_rate': 5e-05, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.05, 'gradient_checkpointing': True}


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Adding EOS to train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


Starting TRL fine-tuning for Decoder-Only model: gemma2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.9652,0.960314,0.966046,6925132.0,0.778056


Pushing: karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr5e-05-a128-r64


README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 30.2kB /  332MB            

Running experiment with hparams: {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.05, 'gradient_checkpointing': True}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Adding EOS to train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


Starting TRL fine-tuning for Decoder-Only model: gemma2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,1.4348,1.378027,1.428894,6925132.0,0.708589
2,0.8857,0.934313,0.862523,13850264.0,0.788192
3,0.3562,0.535326,0.43503,20775396.0,0.883776


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Pushing: karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep3-gradient_checkpointingTrue-lr0.0005-a128-r64


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 30.2kB /  332MB            

In [None]:
# => Running experiment with hparams: {'epoch': 1, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}

# Epoch	Training Loss	Validation Loss	 Entropy	    Num Tokens	     Mean Token Accuracy
# 1	0.731800	0.718542	 0.741813        6925132.000000	       0.828487

hparams8 = {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}

all_hparams = [hparams8]

In [None]:
do_experiment(all_hparams)

Running experiment with hparams: {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Adding EOS to train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/27090 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3010 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.


Starting TRL fine-tuning for Decoder-Only model: gemma2


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.9169,0.871339,0.875595,6925132.0,0.794332
2,0.4082,0.517399,0.486662,13850264.0,0.876134
3,0.1588,0.364928,0.256372,20775396.0,0.927389


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Pushing: karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep3-gradient_checkpointingTrue-lr0.0005-a32-r64


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 30.2kB /  332MB            

## Creating RAG

In [17]:
def non_retrieval_pipeline(claim: str, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, device: str) -> Dict[str, Any]:

    model_type = model.config.model_type
    is_seq2seq = any(t in model_type for t in ENCODER_DECODER_MODEL_TYPES)
    user_instruction = get_user_instruction(claim=claim, context=None)

    if is_seq2seq:
        final_prompt_string = f"{user_instruction}<bos> "
    else:
        final_prompt_string = f"<start_of_turn>user\n{user_instruction}<end_of_turn>\n<start_of_turn>model\n"

    tokenized_inputs = tokenizer(final_prompt_string, return_tensors="pt", truncation=True)

    try:
        if is_seq2seq:
          inputs_dict = {k: v for k, v in tokenized_inputs.items()}
          inputs_dict = {k: v.to(device) for k, v in inputs_dict.items()}
          probabilities_tensor = get_probabilities(model, tokenizer, inputs_dict, device)
        else:
          inputs = tokenized_inputs.to(device)
          probabilities_tensor = get_probabilities(model, tokenizer, inputs, device)

        # Probabilities
        prob_yes = probabilities_tensor[0].item()
        prob_no = probabilities_tensor[1].item()

        # Determine the final verdict
        verdict = "Yes" if prob_yes > prob_no else "No"

        return {"verdict": verdict, "probability_yes": prob_yes}

    except Exception as e:
        print(f"Error during logit extraction: {e}")
        return {"verdict": "ERROR", "probability_yes": 0.0}

In [18]:
def factguard_pipeline(claim: str, search_tool: DuckDuckGoSearchRun, model, tokenizer, device):
    # Search for Context
    web_context = search_tool.invoke(claim)
    if not web_context:
        web_context = "Search returned no relevant information."

    model_type = model.config.model_type
    is_seq2seq = any(t in model_type for t in ENCODER_DECODER_MODEL_TYPES)
    user_instruction = get_user_instruction(claim=claim, context=web_context)

    if is_seq2seq:
        final_prompt_string = f"{user_instruction}<bos> "
    else:
        final_prompt_string = f"<start_of_turn>user\n{user_instruction}<end_of_turn>\n<start_of_turn>model\n"

    tokenized_inputs = tokenizer(final_prompt_string, return_tensors="pt", truncation=True)
    inputs_dict = {k: v for k, v in tokenized_inputs.items()}
    inputs_dict = {k: v.to(device) for k, v in inputs_dict.items()}

    try:
        probabilities_tensor = get_probabilities(model, tokenizer, inputs_dict, device)

        # Probabilities
        prob_yes = probabilities_tensor[0].item()
        prob_no = probabilities_tensor[1].item()

        # Determine the final verdict
        verdict = "Yes" if prob_yes > prob_no else "No"

        return {"verdict": verdict, "probability_yes": prob_yes}

    except Exception as e:
        print(f"Error during logit extraction: {e}")
        return {"verdict": "ERROR", "probability_yes": 0.0}

## Evaluating

In [None]:
hparams2 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 64,
  'lora_alpha': 32,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams3 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 8,
  'lora_alpha': 16,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams4 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 16,
  'lora_alpha': 16,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams5 = {
  'epoch': 2,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.01,
  'gradient_checkpointing': True
}

hparams6 = {
  'epoch': 1,
  'batch_size': 2,
  'learning_rate': 5e-5,
  'lora_rank': 32,
  'lora_alpha': 64,
  'dropout': 0.01,
  'gradient_checkpointing': True
}


all_hparams = [hparams2, hparams3, hparams4, hparams5, hparams6]

In [19]:
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
BASE_DIR = '/content/drive/MyDrive/experiments/FactGuard/'
os.makedirs(BASE_DIR, exist_ok=True)

In [21]:
def save_experiment_results(df, dataset, model_name, run_type, hparams=None):
    if run_type == 'baseline' or not hparams:
        param_str = "no_params"
    else:
        short_codes = {
            'epoch': 'ep', 'batch_size': 'bs', 'learning_rate': 'lr',
            'lora_rank': 'r', 'lora_alpha': 'a', 'dropout': 'do'
        }

        parts = []
        for k, v in sorted(hparams.items()):
            code = short_codes.get(k, k)
            parts.append(f"{code}{v}")

        param_str = "_".join(parts)

    # Structure: {DATASET}_{MODEL}_{TYPE}_{PARAMS}.parquet
    filename = f"{dataset}_{model_name}_{run_type}_{param_str}.parquet"
    full_path = os.path.join(BASE_DIR, filename)

    # 4. Save
    df.to_parquet(full_path, index=False)
    print(f"Saved: {filename}")

In [22]:
def load_experiment_result(filepath):
    try:
        df = pd.read_parquet(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None

    # Remove path and extension to get: "FEVER_gemma2_finetune_ep3_bs16..."
    filename = os.path.splitext(os.path.basename(filepath))[0]

    # Split the main components (assuming the structure from the previous step)
    # Note: This split logic assumes Model Name does not contain underscores.
    # If your model names have underscores (e.g. 't5_base'), we might need a stricter delimiter.
    parts = filename.split('_')

    dataset = parts[0]
    model_name = parts[1]
    run_type = parts[2]

    # 3. Add basic metadata to DataFrame
    df['dataset'] = dataset
    df['model'] = model_name
    df['run_type'] = run_type

    # The remaining parts of the list are the hyperparameters (e.g., ['ep3', 'bs16', 'lr2e-5'])
    param_parts = parts[3:]

    # Define the mapping from filename short-codes back to full column names
    code_map = {
        'ep': 'epoch',
        'bs': 'batch_size',
        'lr': 'learning_rate',
        'r':  'lora_rank',
        'a':  'lora_alpha',
        'do': 'dropout'
    }

    if 'no_params' in param_parts:
        # Initialize columns with None or NaN for baselines
        for col in code_map.values():
            df[col] = None
    else:
        for item in param_parts:
            # Use Regex to separate the letters (key) from the numbers (value)
            # ^([a-z]+) matches the start letters (e.g., 'lr')
            # (.*)$ matches the rest (e.g., '2e-05' or '0.1')
            match = re.match(r"^([a-z]+)(.*)$", item)

            if match:
                code, value = match.groups()

                col_name = code_map.get(code, code) # fallback to code if not in map

                try:
                    clean_val = float(value)
                    # Convert to int if it's actually an integer (e.g. 3.0 -> 3)
                    if clean_val.is_integer():
                        clean_val = int(clean_val)
                except ValueError:
                    clean_val = value # Keep as string if not a number

                # Assign to the dataframe
                df[col_name] = clean_val

    return df

In [23]:
def get_probabilities_from_dataset(dataset, pipeline):
  if type(dataset) is not pd.DataFrame:
    dataset = dataset.to_pandas()

  yes_probs = []
  no_probs = []
  for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    claim = row['claim']
    result = pipeline(claim=claim)

    verdict = result['verdict']
    yes_prob = result['probability_yes']

    yes_probs.append(yes_prob)
    no_probs.append(1 - yes_prob)

  dataset['Yes'] = yes_probs
  dataset['No'] = no_probs
  return dataset

In [24]:
def display_confusion_matrix(yes_probs, yes_labels, threshold = 0.5):
  predictions = (yes_probs >= threshold).astype(int)
  cm = confusion_matrix(yes_labels, predictions)
  TN, FP, FN, TP = cm.ravel()
  recall_yes = TP / (TP + FN) if (TP + FN) > 0 else 0
  specificity_no = TN / (TN + FP) if (TN + FP) > 0 else 0
  accuracy = (TP + TN) / (TP + FP + FN + TN) if (TP + FP + FN + TN) > 0 else 0
  precision = TP / (TP + FP) if (TP + FP) > 0 else 0
  f1_score = 2 * (precision * recall_yes) / (precision + recall_yes) if (precision + recall_yes) > 0 else 0

  # 5. Print the results
  print(f"--- Performance at Threshold: {threshold} ---")
  print(f"Confusion Matrix:\n{cm}")
  print(f"Total Actual Yes's (TP + FN): {TP + FN}")
  print(f"Total Actual No's (TN + FP): {TN + FP}")
  print("---------------------------------------------")
  print(f"Percentage of **Actual Yes's** correct (Recall): {recall_yes:.2%}")
  print(f"Percentage of **Actual No's** correct (Specificity): {specificity_no:.2%}")
  print(f"Overall Accuracy: {accuracy:.2%}")
  print(f"Precision: {precision:.2%}")
  print(f"F1 Score: {f1_score:.2%}")
  print("---------------------------------------------")

In [25]:
class EvaluationDatasetType(Enum):
  fever = "fever"
  boolq = "boolq"
  liar = "liar"

  def _load_dataset(self):
    if self == EvaluationDatasetType.fever:
      return load_dataset("rickpereira/FEVER")
    elif self == EvaluationDatasetType.boolq:
      return load_dataset("google/boolq")
    elif self == EvaluationDatasetType.liar:
      return load_dataset("rickpereira/liar")
    else:
      raise ValueError(f"Unknown dataset type: {self}")

  def _prepare_fever(self, dataset):
    TARGET_COLUMNS = ['label', 'claim']
    fever = dataset.select_columns(TARGET_COLUMNS)
    return fever

  def _prepare_boolq(self, dataset):
    TARGET_COLUMNS = ['question', 'answer']
    boolq = dataset.select_columns(TARGET_COLUMNS)
    boolq = boolq.rename_column('question', 'claim')
    boolq= boolq.rename_column('answer', 'label')
    return boolq

  def _prepare_liar(self, dataset):
    TARGET_COLUMNS = ['label', 'claim']
    dataset = dataset.filter(lambda x: x['label'] in ['true', 'false'])
    dataset = dataset.rename_column("statement", "claim")
    liar = dataset.select_columns(TARGET_COLUMNS)
    return liar

  def _prepare(self, dataset):
    if self == EvaluationDatasetType.fever:
      return self._prepare_fever(dataset)
    elif self == EvaluationDatasetType.boolq:
      return self._prepare_boolq(dataset)
    elif self == EvaluationDatasetType.liar:
      return self._prepare_liar(dataset)
    else:
      raise ValueError(f"Unknown dataset type: {self}")

  def load_and_prepare(self):
    dataset = self._load_dataset()
    return self._prepare(dataset)

### FEVER

In [26]:
fever_datasets = EvaluationDatasetType.fever.load_and_prepare()

README.md:   0%|          | 0.00/657 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211057 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/52765 [00:00<?, ? examples/s]

In [27]:
test_dataset = fever_datasets['test'].to_pandas().sample(n=1000, random_state=42)

In [28]:
test_dataset

Unnamed: 0,label,claim
12623,SUPPORTS,Ben Affleck is an actor.
34211,SUPPORTS,Paul Thomas Anderson directed a movie.
6181,SUPPORTS,The Fujitsu iPAD was introduced in 2002.
36479,SUPPORTS,Absolute Beginners featured Patsy Kensit.
2975,REFUTES,The Celtic F.C. is based in Edinburgh.
...,...,...
42743,REFUTES,Tall Story is an American novel.
20254,SUPPORTS,Steffi Graf is an athlete.
18310,SUPPORTS,The Philippines was named by an explorer.
14198,SUPPORTS,Sam Worthington was born in the 20th century.


#### Baseline Results

In [None]:
model = gemma2b_base_model.to("cuda")

In [None]:
baseline = partial(non_retrieval_pipeline,
                   model=model,
                   tokenizer=tokenizer,
                   device="cuda")

In [None]:
baseline_probs = get_probabilities_from_dataset(test_dataset, baseline)

  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
baseline_probs

Unnamed: 0,label,claim,Yes,No
12623,SUPPORTS,Ben Affleck is an actor.,0.996582,0.003418
34211,SUPPORTS,Paul Thomas Anderson directed a movie.,0.995605,0.004395
6181,SUPPORTS,The Fujitsu iPAD was introduced in 2002.,0.013664,0.986336
36479,SUPPORTS,Absolute Beginners featured Patsy Kensit.,0.659180,0.340820
2975,REFUTES,The Celtic F.C. is based in Edinburgh.,0.004192,0.995808
...,...,...,...,...
42743,REFUTES,Tall Story is an American novel.,0.042694,0.957306
20254,SUPPORTS,Steffi Graf is an athlete.,0.992676,0.007324
18310,SUPPORTS,The Philippines was named by an explorer.,0.104309,0.895691
14198,SUPPORTS,Sam Worthington was born in the 20th century.,0.047943,0.952057


In [None]:
yes_labels = baseline_probs.label.apply(lambda x : 1 if x == 'SUPPORTS' else 0)
yes_probs = baseline_probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[Baseline] AU-PRC: {auprc}")

[Baseline] AU-PRC: 0.9378596602180433


In [None]:
display_confusion_matrix(yes_probs, yes_labels)

--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[212  29]
 [297 462]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 60.87%
Percentage of **Actual No's** correct (Specificity): 87.97%
Overall Accuracy: 67.40%
Precision: 94.09%
F1 Score: 73.92%
---------------------------------------------


In [None]:
save_experiment_results(
    baseline_probs,
    dataset='FEVER',
    model_name='gemma2b',
    run_type='baseline'
)

Saved: FEVER_gemma2b_baseline_no_params.parquet


#### FactGuard - LLM Eval

In [None]:
factguard_llm = partial(non_retrieval_pipeline,
                        model=sft_model,
                        tokenizer=tokenizer, device="cuda")

In [None]:
probs = get_probabilities_from_dataset(test_dataset, factguard_llm)

  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
yes_labels = probs.label.apply(lambda x : 1 if x == 'SUPPORTS' else 0)
yes_probs = probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[SFT - LLM ONLY] AU-PRC: {auprc}")

[SFT - LLM ONLY] AU-PRC: 0.9701184261955839


In [None]:
display_confusion_matrix(yes_probs)

--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[125 116]
 [ 16 743]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 97.89%
Percentage of **Actual No's** correct (Specificity): 51.87%
Overall Accuracy: 86.80%
Precision: 86.50%
F1 Score: 91.84%
---------------------------------------------


In [None]:
save_experiment_results(
    probs,
    dataset='FEVER',
    model_name='t5gemma',
    run_type='finetune',
    hparams=hparams
)

Saved: FEVER_t5gemma_finetune_bs8_do0.05_ep1_lr5e-05_a128_r64.parquet


#### FactGuard - RAG Eval

In [None]:
sft_model = gemma2b_ft_1

In [None]:
factguard = partial(factguard_pipeline,
                    search_tool=DuckDuckGoSearchRun(),
                    model=sft_model,
                    tokenizer=tokenizer, device="cuda")

In [None]:
probs = get_probabilities_from_dataset(test_dataset, factguard)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
yes_labels = probs.label.apply(lambda x : 1 if x == 'SUPPORTS' else 0)
yes_probs = probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[SFT] AU-PRC: {auprc}")

[SFT] AU-PRC: 0.9855739540438013


In [None]:
display_confusion_matrix(yes_probs, yes_labels, threshold=0.03)

--- Performance at Threshold: 0.03 ---
Confusion Matrix:
[[218  23]
 [ 93 666]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 87.75%
Percentage of **Actual No's** correct (Specificity): 90.46%
Overall Accuracy: 88.40%
Precision: 96.66%
F1 Score: 91.99%
---------------------------------------------


In [None]:
# hparams1 = {'epoch': 1, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}
# gemma2b_ft_1, tokenizer = load_model(DecoderOnlyModelType.GEMMA2, use_quantiziation=True, finetuned=True, hparams = hparams1)

In [None]:
save_experiment_results(
    probs,
    dataset='FEVER',
    model_name='gemma2b',
    run_type='rag',
    hparams=hparams1
)

Saved: FEVER_gemma2b_rag_bs4_do0.05_ep1_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet


In [None]:
def evaluate_fever(hparams, sft_model, tokenizer):
  print(f"Evals: {hparams}")
  fever_datasets = EvaluationDatasetType.fever.load_and_prepare()
  test_dataset = fever_datasets['test'].to_pandas().sample(n=1000, random_state=42)
  # LLM ONLY
  factguard_llm = partial(non_retrieval_pipeline,
                        model=sft_model,
                        tokenizer=tokenizer, device="cuda")
  probs = get_probabilities_from_dataset(test_dataset, factguard_llm)
  yes_labels = probs.label.apply(lambda x : 1 if x == 'SUPPORTS' else 0)
  yes_probs = probs['Yes']
  auprc = average_precision_score(yes_labels, yes_probs)
  print(f"[SFT - LLM ONLY] AU-PRC: {auprc}")
  display_confusion_matrix(yes_probs, yes_labels)
  save_experiment_results(
      probs,
      dataset='FEVER',
      model_name='gemma2b',
      run_type='finetune',
      hparams=hparams
  )
  # RAG
  factguard = partial(factguard_pipeline,
                    search_tool=DuckDuckGoSearchRun(),
                    model=sft_model,
                    tokenizer=tokenizer, device="cuda")
  probs = get_probabilities_from_dataset(test_dataset, factguard)
  yes_labels = probs.label.apply(lambda x : 1 if x == 'SUPPORTS' else 0)
  yes_probs = probs['Yes']
  auprc = average_precision_score(yes_labels, yes_probs)
  print(f"[SFT] AU-PRC: {auprc}")
  display_confusion_matrix(yes_probs, yes_labels)
  save_experiment_results(
      probs,
      dataset='FEVER',
      model_name='gemma2b',
      run_type='rag',
      hparams=hparams
  )

In [None]:
def evaluate_fever_all(all_hparams):
  for hparams in all_hparams:
    sft_model, tokenizer = load_model(
        model_type=DecoderOnlyModelType.GEMMA2,
        use_quantiziation=True, finetuned=True, hparams=hparams
    )
    sft_model = sft_model.to("cuda")
    evaluate_fever(hparams, sft_model, tokenizer)

In [None]:
all_hparams = [hparams2, hparams3, hparams4, hparams5, hparams6]

In [None]:
hparams1 = {'epoch': 1, 'batch_size': 4, 'learning_rate': 5e-05, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.05, 'gradient_checkpointing': True}
hparams7 = {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.05, 'gradient_checkpointing': True}
hparams8 = {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}
all_hparams = [hparams1, hparams7, hparams8]

In [None]:
evaluate_fever_all(all_hparams)

karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr5e-05-a128-r64


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Evals: {'epoch': 1, 'batch_size': 4, 'learning_rate': 5e-05, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.9703830074327656
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[230  11]
 [280 479]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 63.11%
Percentage of **Actual No's** correct (Specificity): 95.44%
Overall Accuracy: 70.90%
Precision: 97.76%
F1 Score: 76.70%
---------------------------------------------
Saved: FEVER_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a128_r64.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

[SFT] AU-PRC: 0.9896875543936053
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[237   4]
 [163 596]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 78.52%
Percentage of **Actual No's** correct (Specificity): 98.34%
Overall Accuracy: 83.30%
Precision: 99.33%
F1 Score: 87.71%
---------------------------------------------
Saved: FEVER_gemma2b_rag_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a128_r64.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep3-gradient_checkpointingTrue-lr0.0005-a128-r64


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Evals: {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.9444078327853931
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[236   5]
 [492 267]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 35.18%
Percentage of **Actual No's** correct (Specificity): 97.93%
Overall Accuracy: 50.30%
Precision: 98.16%
F1 Score: 51.79%
---------------------------------------------
Saved: FEVER_gemma2b_finetune_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a128_r64.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

[SFT] AU-PRC: 0.9438147403052941
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[222  19]
 [372 387]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 50.99%
Percentage of **Actual No's** correct (Specificity): 92.12%
Overall Accuracy: 60.90%
Precision: 95.32%
F1 Score: 66.44%
---------------------------------------------
Saved: FEVER_gemma2b_rag_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a128_r64.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep3-gradient_checkpointingTrue-lr0.0005-a32-r64


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Evals: {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.9745845131459866
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[230  11]
 [226 533]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 70.22%
Percentage of **Actual No's** correct (Specificity): 95.44%
Overall Accuracy: 76.30%
Precision: 97.98%
F1 Score: 81.81%
---------------------------------------------
Saved: FEVER_gemma2b_finetune_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

[SFT] AU-PRC: 0.9783787986677408
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[226  15]
 [135 624]]
Total Actual Yes's (TP + FN): 759
Total Actual No's (TN + FP): 241
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 82.21%
Percentage of **Actual No's** correct (Specificity): 93.78%
Overall Accuracy: 85.00%
Precision: 97.65%
F1 Score: 89.27%
---------------------------------------------
Saved: FEVER_gemma2b_rag_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet


### BOOLQ

In [42]:
hparams1 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.05,
  'gradient_checkpointing': True

}

hparams2 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 64,
  'lora_alpha': 32,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams3 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 8,
  'lora_alpha': 16,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams4 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 16,
  'lora_alpha': 16,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams5 = {
  'epoch': 2,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.01,
  'gradient_checkpointing': True
}

hparams6 = {
  'epoch': 1,
  'batch_size': 2,
  'learning_rate': 5e-5,
  'lora_rank': 32,
  'lora_alpha': 64,
  'dropout': 0.01,
  'gradient_checkpointing': True
}


hparams7 = {
  'epoch': 3,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams8 = {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}

all_hparams = [hparams1, hparams2, hparams3, hparams4, hparams5, hparams6, hparams7, hparams8]

In [29]:
boolq_datasets = EvaluationDatasetType.boolq.load_and_prepare()

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [30]:
boolq_datasets

DatasetDict({
    train: Dataset({
        features: ['claim', 'label'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['claim', 'label'],
        num_rows: 3270
    })
})

In [31]:
boolq_test_dataset = boolq_datasets['validation'].to_pandas()

In [32]:
boolq_test_dataset

Unnamed: 0,claim,label
0,does ethanol take more energy make that produces,False
1,is house tax and property tax are same,True
2,is pain experienced in a missing body part or ...,True
3,is harry potter and the escape from gringotts ...,True
4,is there a difference between hydroxyzine hcl ...,True
...,...,...
3265,is manic depression the same as bi polar,True
3266,was whiskey galore based on a true story,True
3267,are there plants on the international space st...,True
3268,does the hockey puck have to cross the line to...,True


In [33]:
boolq_test_dataset = boolq_test_dataset.sample(n=1000, random_state=42)

#### Baseline

In [34]:
# model, tokenizer, generation_config = load_model(EncoderDecoderModelType.T5GEMMA)
gemma2b_base_model, tokenizer = load_model(DecoderOnlyModelType.GEMMA2, use_quantiziation=True)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [35]:
gemma2b_base_model = gemma2b_base_model.to("cuda")

In [36]:
baseline = partial(non_retrieval_pipeline,
                   model=gemma2b_base_model,
                   tokenizer=tokenizer,
                   device="cuda")

In [37]:
baseline_probs = get_probabilities_from_dataset(boolq_test_dataset, baseline)

  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [38]:
baseline_probs

Unnamed: 0,claim,label,Yes,No
1644,is living at high altitude good for you,False,0.039978,0.960022
134,is fate and the furious the last movie,False,0.018021,0.981979
411,do the miami dolphins have a real dolphin in t...,False,0.048584,0.951416
203,is tess carroll died in charlie st cloud,False,0.017075,0.982925
1159,did the beatles ever play at red rocks,True,0.014229,0.985771
...,...,...,...,...
1225,do male and female pronghorn antelopes have horns,True,0.089905,0.910095
2202,has serbia ever won the fifa world cup,False,0.068176,0.931824
2576,does a soccer kick off have to go forward,False,0.891602,0.108398
767,is a soccer stadium bigger than a football sta...,True,0.236572,0.763428


In [39]:
yes_labels = baseline_probs.label.apply(lambda x : 1 if x else 0)
yes_probs = baseline_probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[Baseline] AU-PRC: {auprc}")

[Baseline] AU-PRC: 0.6902373090714296


In [41]:
display_confusion_matrix(yes_probs, yes_labels)

--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[322  77]
 [390 211]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 35.11%
Percentage of **Actual No's** correct (Specificity): 80.70%
Overall Accuracy: 53.30%
Precision: 73.26%
F1 Score: 47.47%
---------------------------------------------


In [42]:
save_experiment_results(
    baseline_probs,
    dataset='boolq',
    model_name='gemma2b',
    run_type='baseline'
)

Saved: boolq_gemma2b_baseline_no_params.parquet


#### LLM Only

In [None]:
factguard_llm = partial(non_retrieval_pipeline,
                        model=sft_model,
                        tokenizer=tokenizer, device="cuda")

In [None]:
probs = get_probabilities_from_dataset(boolq_test_dataset, factguard_llm)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
yes_labels = probs.label.apply(lambda x : 1 if x else 0)
yes_probs = probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[SFT - LLM ONLY] AU-PRC: {auprc}")

[SFT - LLM ONLY] AU-PRC: 0.7504224314830383


In [None]:
display_confusion_matrix(yes_probs, threshold=0.59)

--- Performance at Threshold: 0.59 ---
Confusion Matrix:
[[203 196]
 [128 473]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 78.70%
Percentage of **Actual No's** correct (Specificity): 50.88%
Overall Accuracy: 67.60%
Precision: 70.70%
F1 Score: 74.49%
---------------------------------------------


In [None]:
save_experiment_results(
    probs,
    dataset='boolq',
    model_name='t5gemma',
    run_type='finetune',
    hparams=hparams
)

Saved: boolq_t5gemma_finetune_bs8_do0.05_ep1_lr5e-05_a128_r64.parquet


#### RAG

In [None]:
factguard = partial(factguard_pipeline,
                    search_tool=DuckDuckGoSearchRun(),
                    model=sft_model,
                    tokenizer=tokenizer, device="cuda")

In [None]:
probs = get_probabilities_from_dataset(boolq_test_dataset, factguard)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
yes_labels = probs.label.apply(lambda x : 1 if x else 0)
yes_probs = probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[SFT - LLM ONLY] AU-PRC: {auprc}")

[SFT - LLM ONLY] AU-PRC: 0.7920889713755833


In [None]:
display_confusion_matrix(yes_probs, threshold=0.59)

--- Performance at Threshold: 0.59 ---
Confusion Matrix:
[[373  26]
 [428 173]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 28.79%
Percentage of **Actual No's** correct (Specificity): 93.48%
Overall Accuracy: 54.60%
Precision: 86.93%
F1 Score: 43.25%
---------------------------------------------


In [None]:
save_experiment_results(
    probs,
    dataset='boolq',
    model_name='t5gemma',
    run_type='rag',
    hparams=hparams
)

Saved: boolq_t5gemma_rag_bs8_do0.05_ep1_lr5e-05_a128_r64.parquet


In [43]:
def evaluate_boolq(hparams, sft_model, tokenizer):
  print(f"Evals: {hparams}")
  factguard_llm = partial(non_retrieval_pipeline,
                          model=sft_model,
                          tokenizer=tokenizer, device="cuda")
  probs = get_probabilities_from_dataset(boolq_test_dataset, factguard_llm)
  yes_labels = probs.label.apply(lambda x : 1 if x else 0)
  yes_probs = probs['Yes']
  auprc = average_precision_score(yes_labels, yes_probs)
  print(f"[SFT - LLM ONLY] AU-PRC: {auprc}")
  display_confusion_matrix(yes_probs, yes_labels)
  save_experiment_results(
      probs,
      dataset='boolq',
      model_name='gemma2b',
      run_type='finetune',
      hparams=hparams
  )
  # RAG
  factguard = partial(factguard_pipeline,
                      search_tool=DuckDuckGoSearchRun(),
                      model=sft_model,
                      tokenizer=tokenizer, device="cuda")
  probs = get_probabilities_from_dataset(boolq_test_dataset, factguard)
  yes_labels = probs.label.apply(lambda x : 1 if x else 0)
  yes_probs = probs['Yes']
  auprc = average_precision_score(yes_labels, yes_probs)
  print(f"[SFT - RAG ONLY] AU-PRC: {auprc}")
  display_confusion_matrix(yes_probs, yes_labels)
  save_experiment_results(
      probs,
      dataset='boolq',
      model_name='gemma2b',
      run_type='rag',
      hparams=hparams
  )

In [44]:
def evaluate_boolq_all(all_hparams):
  for hparams in all_hparams:
    sft_model, tokenizer = load_model(
        model_type=DecoderOnlyModelType.GEMMA2,
        use_quantiziation=True, finetuned=True, hparams=hparams
    )
    sft_model = sft_model.to("cuda")
    evaluate_boolq(hparams, sft_model, tokenizer)

In [45]:
evaluate_boolq_all(all_hparams)

karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr5e-05-a128-r64


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Evals: {'epoch': 1, 'batch_size': 4, 'learning_rate': 5e-05, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.7260315910687847
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[362  37]
 [462 139]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 23.13%
Percentage of **Actual No's** correct (Specificity): 90.73%
Overall Accuracy: 50.10%
Precision: 78.98%
F1 Score: 35.78%
---------------------------------------------
Saved: boolq_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a128_r64.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

[SFT - RAG ONLY] AU-PRC: 0.8526320593805663
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[342  57]
 [258 343]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 57.07%
Percentage of **Actual No's** correct (Specificity): 85.71%
Overall Accuracy: 68.50%
Precision: 85.75%
F1 Score: 68.53%
---------------------------------------------
Saved: boolq_gemma2b_rag_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a128_r64.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr0.0005-a32-r64


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Evals: {'epoch': 1, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.6958017820545177
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[374  25]
 [520  81]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 13.48%
Percentage of **Actual No's** correct (Specificity): 93.73%
Overall Accuracy: 45.50%
Precision: 76.42%
F1 Score: 22.91%
---------------------------------------------
Saved: boolq_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

[SFT - RAG ONLY] AU-PRC: 0.84883312929533
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[349  50]
 [272 329]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 54.74%
Percentage of **Actual No's** correct (Specificity): 87.47%
Overall Accuracy: 67.80%
Precision: 86.81%
F1 Score: 67.14%
---------------------------------------------
Saved: boolq_gemma2b_rag_bs4_do0.05_ep1_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr5e-05-a16-r8


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

Evals: {'epoch': 1, 'batch_size': 4, 'learning_rate': 5e-05, 'lora_rank': 8, 'lora_alpha': 16, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.720334182380546
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[233 166]
 [236 365]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 60.73%
Percentage of **Actual No's** correct (Specificity): 58.40%
Overall Accuracy: 59.80%
Precision: 68.74%
F1 Score: 64.49%
---------------------------------------------
Saved: boolq_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a16_r8.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

Error during logit extraction: CUDA out of memory. Tried to allocate 2.97 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.32 GiB is free. Process 4997 has 12.42 GiB memory in use. Of the allocated memory 10.95 GiB is allocated by PyTorch, and 1.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Error during logit extraction: CUDA out of memory. Tried to allocate 2.75 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.32 GiB is free. Process 4997 has 12.42 GiB memory in use. Of the allocated memory 10.67 GiB is allocated by PyTorch, and 1.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memor

adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

Evals: {'epoch': 1, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 16, 'lora_alpha': 16, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.7249804757843747
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[380  19]
 [524  77]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 12.81%
Percentage of **Actual No's** correct (Specificity): 95.24%
Overall Accuracy: 45.70%
Precision: 80.21%
F1 Score: 22.09%
---------------------------------------------
Saved: boolq_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr0.0005_a16_r16.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

Error during logit extraction: CUDA out of memory. Tried to allocate 2.77 GiB. GPU 0 has a total capacity of 14.74 GiB of which 82.12 MiB is free. Process 4997 has 14.66 GiB memory in use. Of the allocated memory 12.85 GiB is allocated by PyTorch, and 1.68 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Error during logit extraction: CUDA out of memory. Tried to allocate 1.93 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.83 GiB is free. Process 4997 has 12.91 GiB memory in use. Of the allocated memory 11.38 GiB is allocated by PyTorch, and 1.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memo

adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Evals: {'epoch': 2, 'batch_size': 4, 'learning_rate': 5e-05, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.01, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.7162677875281007
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[369  30]
 [487 114]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 18.97%
Percentage of **Actual No's** correct (Specificity): 92.48%
Overall Accuracy: 48.30%
Precision: 79.17%
F1 Score: 30.60%
---------------------------------------------
Saved: boolq_gemma2b_finetune_bs4_do0.01_ep2_gradient_checkpointingTrue_lr5e-05_a128_r64.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

Error during logit extraction: CUDA out of memory. Tried to allocate 1.12 GiB. GPU 0 has a total capacity of 14.74 GiB of which 538.12 MiB is free. Process 4997 has 14.21 GiB memory in use. Of the allocated memory 12.79 GiB is allocated by PyTorch, and 1.30 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Error during logit extraction: CUDA out of memory. Tried to allocate 1.19 GiB. GPU 0 has a total capacity of 14.74 GiB of which 474.12 MiB is free. Process 4997 has 14.28 GiB memory in use. Of the allocated memory 12.86 GiB is allocated by PyTorch, and 1.29 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for M

adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

Evals: {'epoch': 1, 'batch_size': 2, 'learning_rate': 5e-05, 'lora_rank': 32, 'lora_alpha': 64, 'dropout': 0.01, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.7168403699841147
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[364  35]
 [462 139]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 23.13%
Percentage of **Actual No's** correct (Specificity): 91.23%
Overall Accuracy: 50.30%
Precision: 79.89%
F1 Score: 35.87%
---------------------------------------------
Saved: boolq_gemma2b_finetune_bs2_do0.01_ep1_gradient_checkpointingTrue_lr5e-05_a64_r32.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

Error during logit extraction: CUDA out of memory. Tried to allocate 2.76 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.48 GiB is free. Process 4997 has 12.26 GiB memory in use. Of the allocated memory 4.32 GiB is allocated by PyTorch, and 7.81 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[SFT - RAG ONLY] AU-PRC: 0.8287452330059868
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[320  79]
 [232 369]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 61.40%
Percentage of **Actual No's** correct (Specificity): 80.20%
Overall Accuracy: 68.90%
Precision: 82.37%
F1 Score: 70.35%
---------------------------------------------


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Evals: {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 128, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.6362830497339172
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[375  24]
 [564  37]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 6.16%
Percentage of **Actual No's** correct (Specificity): 93.98%
Overall Accuracy: 41.20%
Precision: 60.66%
F1 Score: 11.18%
---------------------------------------------
Saved: boolq_gemma2b_finetune_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a128_r64.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

Error during logit extraction: CUDA out of memory. Tried to allocate 2.76 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.41 GiB is free. Process 4997 has 12.32 GiB memory in use. Of the allocated memory 6.71 GiB is allocated by PyTorch, and 5.49 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[SFT - RAG ONLY] AU-PRC: 0.6973348823438021
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[332  67]
 [437 164]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 27.29%
Percentage of **Actual No's** correct (Specificity): 83.21%
Overall Accuracy: 49.60%
Precision: 71.00%
F1 Score: 39.42%
---------------------------------------------


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Evals: {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}


  0%|          | 0/1000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.6931035148091488
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[270 129]
 [317 284]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 47.25%
Percentage of **Actual No's** correct (Specificity): 67.67%
Overall Accuracy: 55.40%
Precision: 68.77%
F1 Score: 56.02%
---------------------------------------------
Saved: boolq_gemma2b_finetune_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet


  0%|          | 0/1000 [00:00<?, ?it/s]

Error during logit extraction: CUDA out of memory. Tried to allocate 2.75 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.24 GiB is free. Process 4997 has 12.50 GiB memory in use. Of the allocated memory 9.09 GiB is allocated by PyTorch, and 3.28 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[SFT - RAG ONLY] AU-PRC: 0.7806468224751634
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[252 147]
 [150 451]]
Total Actual Yes's (TP + FN): 601
Total Actual No's (TN + FP): 399
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 75.04%
Percentage of **Actual No's** correct (Specificity): 63.16%
Overall Accuracy: 70.30%
Precision: 75.42%
F1 Score: 75.23%
---------------------------------------------


### LIAR

In [43]:
liar_datasets = EvaluationDatasetType.liar.load_and_prepare()

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/169k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/168k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10240 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1267 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1284 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10240 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1267 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [44]:
liar_dataset = liar_datasets['test'].to_pandas()

In [45]:
liar_dataset

Unnamed: 0,label,claim
0,true,Building a wall on the U.S.-Mexico border will...
1,false,Wisconsin is on pace to double the number of l...
2,false,Says John McCain has done nothing to help the ...
3,true,Over the past five years the federal governmen...
4,true,Says that Tennessee law requires that schools ...
...,...,...
452,true,Says Barack Obama promised to halve the defici...
453,true,I am the only senator who turned down the stat...
454,false,There is no system to vet refugees from the Mi...
455,false,I think its seven or eight of the California s...


#### BASELINE

In [46]:
# model, tokenizer, generation_config = load_model(EncoderDecoderModelType.T5GEMMA)
gemma2b_base_model, tokenizer = load_model(DecoderOnlyModelType.GEMMA2, use_quantiziation=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [47]:
gemma2b_base_model = gemma2b_base_model.to("cuda")

In [48]:
baseline = partial(non_retrieval_pipeline,
                   model=gemma2b_base_model,
                   tokenizer=tokenizer,
                   device="cuda")

In [49]:
baseline_probs = get_probabilities_from_dataset(liar_dataset, baseline)

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [50]:
yes_labels = baseline_probs.label.apply(lambda x : 1 if x == 'true' else 0)
yes_probs = baseline_probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[Baseline] AU-PRC: {auprc}")

[Baseline] AU-PRC: 0.5587429569262828


In [51]:
display_confusion_matrix(yes_probs, yes_labels)

--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[231  18]
 [177  31]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 14.90%
Percentage of **Actual No's** correct (Specificity): 92.77%
Overall Accuracy: 57.33%
Precision: 63.27%
F1 Score: 24.12%
---------------------------------------------


In [52]:
save_experiment_results(
    baseline_probs,
    dataset='liar',
    model_name='gemma2b',
    run_type='baseline'
)

Saved: liar_gemma2b_baseline_no_params.parquet


#### LLM ONLY

In [None]:
factguard_llm = partial(non_retrieval_pipeline,
                        model=sft_model,
                        tokenizer=tokenizer, device="cuda")

In [None]:
probs = get_probabilities_from_dataset(liar_dataset, factguard_llm)

  0%|          | 0/457 [00:00<?, ?it/s]

In [None]:
yes_labels = probs.label.apply(lambda x : 1 if x == 'true' else 0)
yes_probs = probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[Baseline] AU-PRC: {auprc}")

[Baseline] AU-PRC: 0.5513606119081219


In [None]:
display_confusion_matrix(yes_probs, threshold=0.5)

--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[ 23 226]
 [ 14 194]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 93.27%
Percentage of **Actual No's** correct (Specificity): 9.24%
Overall Accuracy: 47.48%
Precision: 46.19%
F1 Score: 61.78%
---------------------------------------------


In [None]:
save_experiment_results(
    probs,
    dataset='liar',
    model_name='t5gemma',
    run_type='finetune',
    hparams=hparams
)

Saved: liar_t5gemma_finetune_bs8_do0.05_ep1_lr5e-05_a128_r64.parquet


#### RAG

In [None]:
factguard = partial(factguard_pipeline,
                    search_tool=DuckDuckGoSearchRun(),
                    model=sft_model,
                    tokenizer=tokenizer, device="cuda")

In [None]:
probs = get_probabilities_from_dataset(liar_dataset, factguard)

  0%|          | 0/457 [00:00<?, ?it/s]

In [None]:
yes_labels = probs.label.apply(lambda x : 1 if x == 'true' else 0)
yes_probs = probs['Yes']
auprc = average_precision_score(yes_labels, yes_probs)
print(f"[Baseline] AU-PRC: {auprc}")

[Baseline] AU-PRC: 0.5223248272721103


In [None]:
display_confusion_matrix(yes_probs)

--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[ 89 160]
 [ 74 134]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 64.42%
Percentage of **Actual No's** correct (Specificity): 35.74%
Overall Accuracy: 48.80%
Precision: 45.58%
F1 Score: 53.39%
---------------------------------------------


In [None]:
save_experiment_results(
    probs,
    dataset='liar',
    model_name='t5gemma',
    run_type='rag',
    hparams=hparams
)

Saved: liar_t5gemma_rag_bs8_do0.05_ep1_lr5e-05_a128_r64.parquet


In [30]:
def evaluate_liar(hparams, sft_model, tokenizer):
  factguard_llm = partial(non_retrieval_pipeline,
                          model=sft_model,
                          tokenizer=tokenizer, device="cuda")

  probs = get_probabilities_from_dataset(liar_dataset, factguard_llm)

  yes_labels = probs.label.apply(lambda x : 1 if x == 'true' else 0)
  yes_probs = probs['Yes']
  auprc = average_precision_score(yes_labels, yes_probs)
  print(f"[SFT - LLM ONLY] AU-PRC: {auprc}")
  display_confusion_matrix(yes_probs, yes_labels)
  save_experiment_results(
      probs,
      dataset='liar',
      model_name='gemma2b',
      run_type='finetune',
      hparams=hparams
  )

  # RAG
  factguard = partial(factguard_pipeline,
                      search_tool=DuckDuckGoSearchRun(),
                      model=sft_model,
                      tokenizer=tokenizer, device="cuda")

  probs = get_probabilities_from_dataset(liar_dataset, factguard)

  yes_labels = probs.label.apply(lambda x : 1 if x == 'true' else 0)
  yes_probs = probs['Yes']
  auprc = average_precision_score(yes_labels, yes_probs)
  print(f"[SFT - RAG] AU-PRC: {auprc}")
  display_confusion_matrix(yes_probs, yes_labels)
  save_experiment_results(
      probs,
      dataset='liar',
      model_name='gemma2b',
      run_type='rag',
      hparams=hparams
  )

In [31]:
def evaluate_liar_all(all_hparams):
  for hparams in all_hparams:
    sft_model, tokenizer = load_model(
        model_type=DecoderOnlyModelType.GEMMA2,
        use_quantiziation=True, finetuned=True, hparams=hparams
    )
    sft_model = sft_model.to("cuda")
    evaluate_liar(hparams, sft_model, tokenizer)

In [32]:
hparams1 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.05,
  'gradient_checkpointing': True

}

hparams2 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 64,
  'lora_alpha': 32,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams3 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 8,
  'lora_alpha': 16,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams4 = {
  'epoch': 1,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 16,
  'lora_alpha': 16,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams5 = {
  'epoch': 2,
  'batch_size': 4,
  'learning_rate': 5e-5,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.01,
  'gradient_checkpointing': True
}

hparams6 = {
  'epoch': 1,
  'batch_size': 2,
  'learning_rate': 5e-5,
  'lora_rank': 32,
  'lora_alpha': 64,
  'dropout': 0.01,
  'gradient_checkpointing': True
}


hparams7 = {
  'epoch': 3,
  'batch_size': 4,
  'learning_rate': 5e-4,
  'lora_rank': 64,
  'lora_alpha': 128,
  'dropout': 0.05,
  'gradient_checkpointing': True
}

hparams8 = {'epoch': 3, 'batch_size': 4, 'learning_rate': 0.0005, 'lora_rank': 64, 'lora_alpha': 32, 'dropout': 0.05, 'gradient_checkpointing': True}

all_hparams = [hparams1, hparams2, hparams3, hparams4, hparams5, hparams6, hparams7, hparams8]

evaluate_liar_all(all_hparams)

karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr5e-05-a128-r64


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

adapter_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.5980312381464264
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[203  46]
 [143  65]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 31.25%
Percentage of **Actual No's** correct (Specificity): 81.53%
Overall Accuracy: 58.64%
Precision: 58.56%
F1 Score: 40.75%
---------------------------------------------
Saved: liar_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a128_r64.parquet


  0%|          | 0/457 [00:00<?, ?it/s]

[SFT - RAG] AU-PRC: 0.4877755908463439
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[142 107]
 [114  94]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 45.19%
Percentage of **Actual No's** correct (Specificity): 57.03%
Overall Accuracy: 51.64%
Precision: 46.77%
F1 Score: 45.97%
---------------------------------------------
Saved: liar_gemma2b_rag_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a128_r64.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr0.0005-a32-r64


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.6158024117797856
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[186  63]
 [115  93]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 44.71%
Percentage of **Actual No's** correct (Specificity): 74.70%
Overall Accuracy: 61.05%
Precision: 59.62%
F1 Score: 51.10%
---------------------------------------------
Saved: liar_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet


  0%|          | 0/457 [00:00<?, ?it/s]

[SFT - RAG] AU-PRC: 0.5358032607591445
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[142 107]
 [100 108]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 51.92%
Percentage of **Actual No's** correct (Specificity): 57.03%
Overall Accuracy: 54.70%
Precision: 50.23%
F1 Score: 51.06%
---------------------------------------------
Saved: liar_gemma2b_rag_bs4_do0.05_ep1_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr5e-05-a16-r8


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.6171260297818655
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[ 87 162]
 [ 47 161]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 77.40%
Percentage of **Actual No's** correct (Specificity): 34.94%
Overall Accuracy: 54.27%
Precision: 49.85%
F1 Score: 60.64%
---------------------------------------------
Saved: liar_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a16_r8.parquet


  0%|          | 0/457 [00:00<?, ?it/s]

[SFT - RAG] AU-PRC: 0.5122073079629179
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[ 98 151]
 [ 73 135]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 64.90%
Percentage of **Actual No's** correct (Specificity): 39.36%
Overall Accuracy: 50.98%
Precision: 47.20%
F1 Score: 54.66%
---------------------------------------------
Saved: liar_gemma2b_rag_bs4_do0.05_ep1_gradient_checkpointingTrue_lr5e-05_a16_r8.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep1-gradient_checkpointingTrue-lr0.0005-a16-r16


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.5976394556012132
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[205  44]
 [128  80]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 38.46%
Percentage of **Actual No's** correct (Specificity): 82.33%
Overall Accuracy: 62.36%
Precision: 64.52%
F1 Score: 48.19%
---------------------------------------------
Saved: liar_gemma2b_finetune_bs4_do0.05_ep1_gradient_checkpointingTrue_lr0.0005_a16_r16.parquet


  0%|          | 0/457 [00:00<?, ?it/s]

[SFT - RAG] AU-PRC: 0.5400635986970166
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[168  81]
 [103 105]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 50.48%
Percentage of **Actual No's** correct (Specificity): 67.47%
Overall Accuracy: 59.74%
Precision: 56.45%
F1 Score: 53.30%
---------------------------------------------
Saved: liar_gemma2b_rag_bs4_do0.05_ep1_gradient_checkpointingTrue_lr0.0005_a16_r16.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.01-ep2-gradient_checkpointingTrue-lr5e-05-a128-r64


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.5766227800055963
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[197  52]
 [130  78]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 37.50%
Percentage of **Actual No's** correct (Specificity): 79.12%
Overall Accuracy: 60.18%
Precision: 60.00%
F1 Score: 46.15%
---------------------------------------------
Saved: liar_gemma2b_finetune_bs4_do0.01_ep2_gradient_checkpointingTrue_lr5e-05_a128_r64.parquet


  0%|          | 0/457 [00:00<?, ?it/s]

Error during logit extraction: CUDA out of memory. Tried to allocate 1.41 GiB. GPU 0 has a total capacity of 14.74 GiB of which 582.12 MiB is free. Process 3622 has 14.17 GiB memory in use. Of the allocated memory 13.14 GiB is allocated by PyTorch, and 931.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[SFT - RAG] AU-PRC: 0.532038353926687
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[165  84]
 [107 101]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 48.56%
Percentage of **Actual No's** correct (Specificity): 66.27%
Overall Accuracy: 58.21%
Precision: 54.59%
F1 Score: 51.40%
---------------------------------------------
S

adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.6521251288063845
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[193  56]
 [118  90]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 43.27%
Percentage of **Actual No's** correct (Specificity): 77.51%
Overall Accuracy: 61.93%
Precision: 61.64%
F1 Score: 50.85%
---------------------------------------------
Saved: liar_gemma2b_finetune_bs2_do0.01_ep1_gradient_checkpointingTrue_lr5e-05_a64_r32.parquet


  0%|          | 0/457 [00:00<?, ?it/s]

Error during logit extraction: CUDA out of memory. Tried to allocate 380.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 240.12 MiB is free. Process 3622 has 14.50 GiB memory in use. Of the allocated memory 14.11 GiB is allocated by PyTorch, and 276.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Error during logit extraction: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 240.12 MiB is free. Process 3622 has 14.50 GiB memory in use. Of the allocated memory 14.27 GiB is allocated by PyTorch, and 115.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentati

adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.5102122470546421
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[236  13]
 [187  21]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 10.10%
Percentage of **Actual No's** correct (Specificity): 94.78%
Overall Accuracy: 56.24%
Precision: 61.76%
F1 Score: 17.36%
---------------------------------------------
Saved: liar_gemma2b_finetune_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a128_r64.parquet


  0%|          | 0/457 [00:00<?, ?it/s]

[SFT - RAG] AU-PRC: 0.4554396602130063
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[143 106]
 [113  95]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 45.67%
Percentage of **Actual No's** correct (Specificity): 57.43%
Overall Accuracy: 52.08%
Precision: 47.26%
F1 Score: 46.45%
---------------------------------------------
Saved: liar_gemma2b_rag_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a128_r64.parquet
karan-mids24-hf/factguard-gemma-2-2b-it-bs4-do0.05-ep3-gradient_checkpointingTrue-lr0.0005-a32-r64


adapter_config.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

  0%|          | 0/457 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[SFT - LLM ONLY] AU-PRC: 0.5157037973117609
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[162  87]
 [115  93]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 44.71%
Percentage of **Actual No's** correct (Specificity): 65.06%
Overall Accuracy: 55.80%
Precision: 51.67%
F1 Score: 47.94%
---------------------------------------------
Saved: liar_gemma2b_finetune_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet


  0%|          | 0/457 [00:00<?, ?it/s]

[SFT - RAG] AU-PRC: 0.48427150902559746
--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[140 109]
 [106 102]]
Total Actual Yes's (TP + FN): 208
Total Actual No's (TN + FP): 249
---------------------------------------------
Percentage of **Actual Yes's** correct (Recall): 49.04%
Percentage of **Actual No's** correct (Specificity): 56.22%
Overall Accuracy: 52.95%
Precision: 48.34%
F1 Score: 48.69%
---------------------------------------------
Saved: liar_gemma2b_rag_bs4_do0.05_ep3_gradient_checkpointingTrue_lr0.0005_a32_r64.parquet


## Playground

In [None]:
search_tool=DuckDuckGoSearchRun()
a = search_tool.invoke("is Barack Obama the 45th president of the United States?")
print(a)

The first president , George Washington, won a unanimous vote of the Electoral College.[4] The incumbent president is Donald Trump, who assumed office on January 20, 2025.[5][6] Since the office was established in 1789, 45 men have served in 47 presidencies . Barack Obama (born August 4, 1961, Honolulu, Hawaii, U . S .) is the 44 th president of the United States (2009–17) and the first African American to hold the office. Before winning the presidency , Obama represented Illinois in the U . S . Senate (2005–08). Barack H. Obama is the 44 th President of the United States . Few presidents have walked a more improbable path to the White House.But Michelle Obama has spent her life challenging us to reconsider where that “supposed to” comes from — and who determines it. Presidents of the United States website. U . S . Presidents in Order. President Name.2000 election victory decided by Supreme Court. Barack Obama . Here are all the presidents of the United States in chronological order, f

In [None]:
a

'The first president , George Washington, won a unanimous vote of the Electoral College.[4] The incumbent president is Donald Trump, who assumed office on January 20, 2025.[5][6] Since the office was established in 1789, 45 men have served in 47 presidencies . Barack Obama (born August 4, 1961, Honolulu, Hawaii, U . S .) is the 44 th president of the United States (2009–17) and the first African American to hold the office. Before winning the presidency , Obama represented Illinois in the U . S . Senate (2005–08). Barack H. Obama is the 44 th President of the United States . Few presidents have walked a more improbable path to the White House.But Michelle Obama has spent her life challenging us to reconsider where that “supposed to” comes from — and who determines it. Presidents of the United States website. U . S . Presidents in Order. President Name.2000 election victory decided by Supreme Court. Barack Obama . Here are all the presidents of the United States in chronological order, 

### [Experimental] TruthfulQA

In [None]:
truthful_qa_datasets = load_dataset("domenicrosati/TruthfulQA")

README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/817 [00:00<?, ? examples/s]

In [None]:
def truthful_qa_splits(train_size=0.8, eval_size=0.1, random_state=42):
    print("\n--- Loading and Refactoring 'train' split ---")
    dataset = load_dataset("domenicrosati/TruthfulQA", split="train")

    df = dataset.to_pandas()
    all_claims = []

    for index, row in tqdm(df.iterrows(), total=len(df), desc="Refactoring data"):
        question = row['Question']
        true_claims = [claim.strip() for claim in row['Correct Answers'].split(';') if claim.strip()]
        false_claims = [claim.strip() for claim in row['Incorrect Answers'].split(';') if claim.strip()]
        for claim in true_claims:
            all_claims.append({
                'question': question,
                'claim': claim,
                'label': 1
            })
        for claim in false_claims:
            all_claims.append({
                'question': question,
                'claim': claim,
                'label': 0
            })

    refactored_df = pd.DataFrame(all_claims)
    print(f"\nTotal Refactored Claims (before split): {len(refactored_df)}")

    # Calculate the remaining size for the combined eval and test set
    test_and_eval_size = 1.0 - train_size

    # Calculate the proportion of the *remaining* data that should be eval
    # This ensures train_size + eval_size + test_size = 1.0
    eval_proportion_of_remaining = eval_size / test_and_eval_size

    df_train, df_temp = train_test_split(
        refactored_df,
        train_size=train_size,
        random_state=random_state,
        stratify=refactored_df['label']
    )

    df_eval, df_test = train_test_split(
        df_temp,
        train_size=eval_proportion_of_remaining, # Split the temp set
        random_state=random_state,
        stratify=df_temp['label']
    )

    datasets = {
        'train': df_train,
        'eval': df_eval,
        'test': df_test
    }

    # Print summary
    print(f"\n--- Final Split Sizes ---")
    print(f"Train size: {len(df_train)}")
    print(f"Eval size:  {len(df_eval)}")
    print(f"Test size:  {len(df_test)}")

    return datasets

In [None]:
tqa_datasets = truthful_qa_splits(train_size=0.8, eval_size=0.1, random_state=42)


--- Loading and Refactoring 'train' split ---


Refactoring data:   0%|          | 0/817 [00:00<?, ?it/s]


Total Refactored Claims (before split): 6209

--- Final Split Sizes ---
Train size: 4967
Eval size:  621
Test size:  621


In [None]:
probs = get_probabilities_from_dataset(tqa_datasets['test'], search_tool, model, tokenizer, "cuda")

  0%|          | 0/621 [00:00<?, ?it/s]

--- Processing Claim: **The best medicine is laughter** ---
CONTEXT: Jan 2, 2016 · The word "best" is an adjective, and adjectives do not take articles by themselves. Because the noun car is modified by the superlative adjective best, and because this makes … Oct 18, 2018 · In the following sentence, however, best is an adjective: "What was best?" If we insert the word the, we get a noun phrase, the best. You could certainly declare that after … May 25, 2022 · "Which one is the best" is obviously a question format, so it makes sense that " which one the best is " should be the correct form. This is very good instinct, and you could … May 29, 2023 · So, " It is the best ever " means it's the best of all time, up to the present. " It was the best ever " means either it was the best up to that point in time, and a better one may have … Oct 20, 2016 · Both sentences could mean the same thing, however I like you best. I like chocolate best, better than anything else can be used when what on

In [None]:
true_labels = probs.label
yes_probs = probs['True']
auprc = average_precision_score(true_labels, yes_probs)
print(f"[SFT] AU-PRC: {auprc}")

[SFT] AU-PRC: 0.4464978920787068


In [None]:
display_confusion_matrix(yes_probs)

--- Performance at Threshold: 0.5 ---
Confusion Matrix:
[[120 217]
 [116 168]]
Total Actual True's (TP + FN): 284
Total Actual False's (TN + FP): 337
---------------------------------------------
Percentage of **Actual True's** correct (Recall): 59.15%
Percentage of **Actual False's** correct (Specificity): 35.61%


In [None]:
fever_datasets = load_dataset("rickpereira/FEVER")
test_set = fever_datasets['test'].to_pandas()

README.md:   0%|          | 0.00/657 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211057 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/52765 [00:00<?, ? examples/s]

In [None]:
def evaluate_fever(search_tool, model, tokenizer, device):
  probs = get_probabilities_from_dataset(test_set[:1000], search_tool, model, tokenizer, device)
  label_map = {'SUPPORTS': 1, 'REFUTES': 0}
  probs['label_binary'] = probs['label'].map(label_map)
  true_labels = probs.label_binary
  true_probs = probs['True']
  auprc = average_precision_score(true_labels, true_probs)
  print(f"[SFT] AU-PRC: {auprc}")
  display_confusion_matrix(true_probs)
  return probs

In [None]:
probs = evaluate_fever(search_tool, model, tokenizer, "cuda")

  0%|          | 0/1000 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
CONTEXT: Aishwarya Rai Bachchan. (pronounced [ɛːʃʋəɾjᵊ ɾɑːj ˈbətːʃən]; née Rai ; born 1 November 1973) is an Indian actress who is primarily known for her work in Hindi and Tamil films . Rai won the Miss World 1994 pageant and later established herself as one of the most-popular and influential... movies : Aishwarya Rai , who paired up with Rajini for the first time with Enthiran, rejected his 4 film offers before that.Before Endran, aishwarya Rai was approached for four films opposite Rajini. aishwarya Rai refused to act in those four films due to various reasons. Aishwarya Rai Bachchan was offered the 2014 film Happy New Year. But she refused to be a part of it despite it being backed by actors like Shah Rukh Khan, Abhishek Bachchan, and Sonu Sood. Did you know that Aishwarya Rai was offered a role in 'Kuch Kuch Hota Hai', but she rejected it. The reason will shock you. Tamil Nadu.The film was adored by the audiences, a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['True'] = true_probs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['False'] = false_probs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  probs['label_binary'] = probs['label'].map(label_map)


ValueError: Found input variables with inconsistent numbers of samples: [621, 1000]