In [1]:
%%capture
!pip install datasets==3.6.0
!pip install -U bitsandbytes accelerate
!pip install -U transformers
!pip install langextract
!pip install trl

In [2]:
from google.colab import drive, userdata
from google import genai
from google.genai import types
from typing import List
import requests
import os
import json
from pydantic import BaseModel, Field
import langextract as lx
import textwrap
import sys
import pandas as pd
from sklearn.metrics import average_precision_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.functional import softmax
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, EarlyStoppingCallback, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm
from trl import SFTTrainer, SFTConfig
from langextract import data
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
os.environ['LANGEXTRACT_API_KEY'] = userdata.get('GEMINI_API_KEY')
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

### Dataset

In [6]:
def load_from_huggingface(repo_id, split=None):
  dataset_dict = load_dataset(repo_id)
  if not split:
    return dataset_dict
  dataset = dataset_dict[split]
  return dataset.to_pandas()

In [7]:
factchecking_datasets = load_from_huggingface(repo_id='rickpereira/factguard_factchecking_datasets')

README.md:   0%|          | 0.00/537 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/368k [00:00<?, ?B/s]

data/eval-00000-of-00001.parquet:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/55.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6513 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/814 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/815 [00:00<?, ? examples/s]

In [8]:
## Split dataset into train, eval and test sets.
ds_train = factchecking_datasets['train']
ds_eval = factchecking_datasets['eval']
ds_test = factchecking_datasets['test']

In [9]:
ds_train

Dataset({
    features: ['question', 'claim', 'label'],
    num_rows: 6513
})

In [10]:
ds_train[0]

{'question': 'In France they speak French. In England they speak English. In Wales they speak Welsh. In Austria they speak what?',
 'claim': 'The main language of Austria is German or Austrian German',
 'label': 1}

### Baseline Analysis

In [11]:
## Baseline 1: Dummy Classifier
# DummyClassifier makes predictions that ignore the input features.
# This classifier serves as a simple baseline to compare against other more complex classifiers.
# The "most_frequent" strategy predicts the method that always returns the most frequent class label in the observed y argument.
X_train = ds_train['claim']
y_train = ds_train['label']
X_test = ds_test['claim']
y_test = ds_test['label']

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_pred_prob = dummy_clf.predict_proba(X_test)
y_pred = dummy_clf.predict(X_test)
y_test_true = np.array([[1, 0] if label == 0 else [0, 1] for label in y_test])

In [12]:
dummy_auprc = average_precision_score(y_test_true, y_pred_prob)
print(f"[DUMMY CLASSIFIER] Average Precision Score: {dummy_auprc}")

[DUMMY CLASSIFIER] Average Precision Score: 0.5


In [13]:
print(classification_report(np.array(y_test), y_pred, labels = [0, 1], zero_division=0))

              precision    recall  f1-score   support

           0       0.53      1.00      0.69       434
           1       0.00      0.00      0.00       381

    accuracy                           0.53       815
   macro avg       0.27      0.50      0.35       815
weighted avg       0.28      0.53      0.37       815



In [14]:
print(confusion_matrix(y_test, y_pred))

[[434   0]
 [381   0]]


In [15]:
## Baseline 2: TF-IDF + Logistic Regression
# Words are given a "score" based on how important they are.
# This score is a multiplication of two things:
# 1. TF (Term Frequency): How often does a word appear in this one document?
# 2. IDF (Inverse Document Frequency): How rare is this word across all of the documents (entire dataset)?
ds_train_pd = ds_train.to_pandas()
ds_test_pd = ds_test.to_pandas()
ds_eval_pd = ds_eval.to_pandas()
ds_train_pd['text_input'] = ds_train_pd['question'] + ' ' + ds_train_pd['claim']
ds_test_pd['text_input'] = ds_test_pd['question'] + ' ' + ds_test_pd['claim']
ds_eval_pd['text_input'] = ds_eval_pd['question'] + ' ' + ds_eval_pd['claim']

In [16]:
ds_train_pd.at[0, 'text_input']

'In France they speak French. In England they speak English. In Wales they speak Welsh. In Austria they speak what? The main language of Austria is German or Austrian German'

In [17]:
model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5, max_df=0.9)),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear'))
])

In [18]:
print(len(ds_train_pd[ds_train_pd['label']==1]))
print(len(ds_train_pd[ds_train_pd['label']==0]))

3047
3466


In [19]:
model_pipeline.fit(ds_train_pd['text_input'], ds_train_pd['label'])

In [20]:
## Training Set ##
predictions_train = model_pipeline.predict(ds_train_pd['text_input'])
dummy_auprc = average_precision_score(ds_train_pd['label'], predictions_train)
print(f"[LOGISTIC REGRESSION CLASSIFIER] Average Precision Score: {dummy_auprc}")

## Eval Set ##
predictions_eval = model_pipeline.predict(ds_eval_pd['text_input'])
dummy_auprc = average_precision_score(ds_eval_pd['label'], predictions_eval)
print(f"[LOGISTIC REGRESSION CLASSIFIER] Average Precision Score: {dummy_auprc}")

## Test Set ##
predictions_test = model_pipeline.predict(ds_test_pd['text_input'])
dummy_auprc = average_precision_score(ds_test_pd['label'], predictions_test)
print(f"[LOGISTIC REGRESSION CLASSIFIER] Average Precision Score: {dummy_auprc}")

[LOGISTIC REGRESSION CLASSIFIER] Average Precision Score: 0.76009225967164
[LOGISTIC REGRESSION CLASSIFIER] Average Precision Score: 0.605922773645889
[LOGISTIC REGRESSION CLASSIFIER] Average Precision Score: 0.585111121666553


In [21]:
print("### Trainining Set ###")
print(classification_report(ds_train_pd['label'], predictions_train, labels = [0, 1]))
print("\n")
print("### Eval Set ###")
print(classification_report(ds_eval_pd['label'], predictions_eval, labels = [0, 1]))
print("\n")
print("### Test Set ###")
print(classification_report(ds_test_pd['label'], predictions_test, labels = [0, 1]))

### Trainining Set ###
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      3466
           1       0.83      0.81      0.82      3047

    accuracy                           0.83      6513
   macro avg       0.83      0.83      0.83      6513
weighted avg       0.83      0.83      0.83      6513



### Eval Set ###
              precision    recall  f1-score   support

           0       0.70      0.74      0.72       433
           1       0.68      0.64      0.66       381

    accuracy                           0.69       814
   macro avg       0.69      0.69      0.69       814
weighted avg       0.69      0.69      0.69       814



### Test Set ###
              precision    recall  f1-score   support

           0       0.68      0.72      0.70       434
           1       0.66      0.62      0.64       381

    accuracy                           0.67       815
   macro avg       0.67      0.67      0.67       815
weighted avg 

In [22]:
## Baseline 3: LLM Google gemma-2b-it with zero shot training
OUTPUT_DIR = '/output'
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "google/gemma-2b-it"
print(f"Loading model '{MODEL_NAME}' on device: {device}")

# 4-bit Quantization Config
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

print(f"Loading model '{MODEL_NAME}' with 4-bit quantization on T4...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=nf4_config,
    device_map={"": 0}
)

# Set generation configuration
generation_config = {
    "max_new_tokens": 512,
    "temperature": 0.5,
    "do_sample": True,
}

Loading model 'google/gemma-2b-it' on device: cuda
Loading model 'google/gemma-2b-it' with 4-bit quantization on T4...


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [23]:
def generate_text(model, inputs, max_new_tokens=50):
  inputs = inputs.to(model.device)
  outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return generated_text

def apply_chat(tokenizer, prompt, dtype=torch.float16, device=None):
  inputs = tokenizer(prompt, return_tensors="pt")
  if device:
    return inputs.to(device)
  return inputs

def create_prompt(question, claim):
  return (
      f"Is the following statement true or false based on the question?\n\n"
      f"Question: {question}\n"
      f"Statement: {claim}\n\n"
      f"Answer with 'Yes' if the statement is True, and 'No' if it is False.\n"
      f"Do not provide an explanation reasoning, simply answer 'Yes' or 'No'."
      f"Answer: "
  )

def get_probabilities(model, tokenizer, inputs, device=device):
  with torch.no_grad():
    logits = model(**inputs).logits

  # Extract the logits for the Yes and No tokens
  vocab = tokenizer.get_vocab()
  selected_logits = logits[0, -1, [vocab['Yes'], vocab['No']]]

  # Convert these logits to a probability with softmax
  probabilities = softmax(selected_logits, dim=0)
  return probabilities

def get_probabilities_from_dataset(dataset, model, tokenizer):
  dataset = dataset.copy()
  yes_probs = []
  no_probs = []
  for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    prompt = create_prompt(row['question'], row['claim'])
    inputs = apply_chat(tokenizer, prompt, device=device)

    #print(generate_text(model, inputs))
    prob_yes, prob_no = get_probabilities(model, tokenizer, inputs)
    prob_yes = prob_yes.item()
    prob_no = prob_no.item()

    yes_probs.append(prob_yes)
    no_probs.append(prob_no)
  dataset['Yes'] = yes_probs
  dataset['No'] = no_probs
  return dataset

def predict(yes_probs, threshold = 0.5):
  predictions = (yes_probs > threshold).astype(int)
  return predictions

In [24]:
probs = get_probabilities_from_dataset(pd.DataFrame(ds_test), model, tokenizer)

  0%|          | 0/815 [00:00<?, ?it/s]

In [25]:
true_labels_test = probs.label
predictions_test = predict(probs.Yes)
auprc = average_precision_score(true_labels_test, predictions_test)
print(f"[GEMMA 2B IT CLASSIFIER] AU-PRC: {auprc}")

[GEMMA 2B IT CLASSIFIER] AU-PRC: 0.4515735073651353


In [26]:
print("### Test Set ###")
print(classification_report(true_labels_test, predictions_test, labels = [0, 1]))

### Test Set ###
              precision    recall  f1-score   support

           0       0.49      0.40      0.44       434
           1       0.44      0.54      0.48       381

    accuracy                           0.46       815
   macro avg       0.47      0.47      0.46       815
weighted avg       0.47      0.46      0.46       815



In [27]:
ds_train[0]

{'question': 'In France they speak French. In England they speak English. In Wales they speak Welsh. In Austria they speak what?',
 'claim': 'The main language of Austria is German or Austrian German',
 'label': 1}

### Fine-Tuning Analysis

In [28]:
def format_for_training(row):
    answer_text = "Yes" if row['label'] == 1 else "No"

    user_prompt = (
        f"Is the following statement true or false based on the question?\n\n"
        f"Question: {row['question']}\n"
        f"Statement: {row['claim']}\n\n"
        f"Answer with 'Yes' if the statement is True, and 'No' if it is False.\n"
        f"Do not provide an explanation reasoning, simply answer 'Yes' or 'No'."
    )

    messages = [
        {"role": "user", "content": user_prompt},
        {"role": "model", "content": answer_text}
    ]

    # Apply the gemma 2b Chat Template
    row["formatted_chat"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return row

ds_train = ds_train.map(format_for_training)
ds_eval = ds_eval.map(format_for_training)
ds_test = ds_test.map(format_for_training)

Map:   0%|          | 0/6513 [00:00<?, ? examples/s]

Map:   0%|          | 0/814 [00:00<?, ? examples/s]

Map:   0%|          | 0/815 [00:00<?, ? examples/s]

In [29]:
ds_train[0]

{'question': 'In France they speak French. In England they speak English. In Wales they speak Welsh. In Austria they speak what?',
 'claim': 'The main language of Austria is German or Austrian German',
 'label': 1,
 'formatted_chat': "<bos><start_of_turn>user\nIs the following statement true or false based on the question?\n\nQuestion: In France they speak French. In England they speak English. In Wales they speak Welsh. In Austria they speak what?\nStatement: The main language of Austria is German or Austrian German\n\nAnswer with 'Yes' if the statement is True, and 'No' if it is False.\nDo not provide an explanation reasoning, simply answer 'Yes' or 'No'.<end_of_turn>\n<start_of_turn>model\nYes<end_of_turn>\n"}

In [30]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"] # Targets all linear layers for better performance
)

sft_config = SFTConfig(
    output_dir="./gemma-classification-finetune",
    dataset_text_field="formatted_chat",
    max_length=512,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    max_steps=100,
    optim="paged_adamw_8bit",
    packing=False,
)

gemma_2b_trainer = SFTTrainer(
    model=model,
    train_dataset=ds_train,
    peft_config=peft_config,
    args=sft_config,
    processing_class=tokenizer
)

Adding EOS to train dataset:   0%|          | 0/6513 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/6513 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/6513 [00:00<?, ? examples/s]

In [31]:
gemma_2b_trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:


Abort: 

In [None]:
def get_probabilities(model, tokenizer, inputs):
    with torch.no_grad():
        logits = model(**inputs).logits

    vocab = tokenizer.get_vocab()
    selected_logits = logits[0, -1, [vocab['Yes'], vocab['No']]]

    probabilities = softmax(selected_logits, dim=0)

    return probabilities

def get_probabilities_from_dataset(dataset, model, tokenizer):
    dataset = dataset.copy()
    yes_probs = []
    no_probs = []

    print("Running inference on dataset...")
    for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
        user_prompt = (
            f"Is the following statement true or false based on the question?\n\n"
            f"Question: {row['question']}\n"
            f"Statement: {row['claim']}\n\n"
            f"Answer with 'Yes' if the statement is True, and 'No' if it is False.\n"
            f"Do not provide an explanation reasoning, simply answer 'Yes' or 'No'."
        )

        messages = [{"role": "user", "content": user_prompt}]

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        probs = get_probabilities(model, tokenizer, inputs)

        yes_probs.append(probs[0].item()) # Index 0 is 'Yes'
        no_probs.append(probs[1].item())  # Index 1 is 'No'

    dataset['Yes'] = yes_probs
    dataset['No'] = no_probs
    return dataset

In [None]:
model.eval()

results_df = get_probabilities_from_dataset(df_test, model, tokenizer)