In [1]:
import torch
import torch.nn as nn
from transformers import AutoModelForQuestionAnswering, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import bitsandbytes as bnb

# Load ScienceQA dataset (without streaming)
dataset = load_dataset("derek-thomas/ScienceQA", split={"train": "train", "validation": "validation"})

# Use DistilBERT for Question Answering
MODEL_NAME = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [2]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [3]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)

# model.lm_head = CastOutputToFloat(model.lm_head)

In [4]:
for name, module in model.named_modules():
    print(name)


distilbert
distilbert.embeddings
distilbert.embeddings.word_embeddings
distilbert.embeddings.position_embeddings
distilbert.embeddings.LayerNorm
distilbert.embeddings.dropout
distilbert.transformer
distilbert.transformer.layer
distilbert.transformer.layer.0
distilbert.transformer.layer.0.attention
distilbert.transformer.layer.0.attention.dropout
distilbert.transformer.layer.0.attention.q_lin
distilbert.transformer.layer.0.attention.k_lin
distilbert.transformer.layer.0.attention.v_lin
distilbert.transformer.layer.0.attention.out_lin
distilbert.transformer.layer.0.sa_layer_norm
distilbert.transformer.layer.0.ffn
distilbert.transformer.layer.0.ffn.dropout
distilbert.transformer.layer.0.ffn.lin1
distilbert.transformer.layer.0.ffn.lin2
distilbert.transformer.layer.0.ffn.activation
distilbert.transformer.layer.0.output_layer_norm
distilbert.transformer.layer.1
distilbert.transformer.layer.1.attention
distilbert.transformer.layer.1.attention.dropout
distilbert.transformer.layer.1.attention.q

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [6]:
from peft import LoraConfig, get_peft_model, TaskType 

config = LoraConfig(
    r=16, #attention heads
    lora_alpha=32, #alpha scaling
    target_modules=["distilbert.transformer.layer.5.attention.q_lin", "distilbert.transformer.layer.5.attention.v_lin", "distilbert.transformer.layer.5.attention.out_lin"], #if you know the 
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.QUESTION_ANS # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 75266 || all params: 65267716 || trainable%: 0.11531888138999685


In [7]:
def find_answer_positions(context, answer_text):
    """Finds the start and end positions of the answer text within the tokenized context."""
    tokenized_context = tokenizer(context, add_special_tokens=False, return_offsets_mapping=True)
    
    # Convert answer to token IDs
    answer_tokens = tokenizer(answer_text,  add_special_tokens=False)["input_ids"]

    # Find start position
    for i in range(len(tokenized_context["input_ids"])):
        if tokenized_context["input_ids"][i:i+len(answer_tokens)] == answer_tokens:
            return i, i + len(answer_tokens) - 1  # Return start & end positions

    return None, None  # If answer not found


In [8]:
# def preprocess_function(examples):
#     """
#     Extracts (question, choices), finds the correct answer, tokenizes, 
#     and returns training data in the format expected by DistilBERT.
#     """
#     questions = examples["question"]
#     choices = examples["choices"]
#     answer_indices = examples["answer"]

#     tokenized_inputs = {
#         "input_ids": [],
#         "attention_mask": [],
#         "start_positions": [],
#         "end_positions": [],
#     }

#     for q, c, idx in zip(questions, choices, answer_indices):
#         if idx < 0 or idx >= len(c):  # Handle bad indices
#             continue

#         correct_answer = c[idx]
#         context = f"{q} Choices: {', '.join(c)}"
#         print(f"context {context}")

#         # Tokenize the entire question + choices
#         tokenized = tokenizer(context, padding="max_length", truncation=True, max_length=512, return_offsets_mapping=True)
#         start_pos, end_pos = find_answer_positions(context, correct_answer)
#         print(f"start_pos, end_pos: {start_pos} {end_pos}")

#         if start_pos is None or end_pos is None:
#             continue  # Skip if answer not found in tokenized context

#         tokenized_inputs["input_ids"].append(tokenized["input_ids"])
#         tokenized_inputs["attention_mask"].append(tokenized["attention_mask"])
#         tokenized_inputs["start_positions"].append(start_pos)
#         tokenized_inputs["end_positions"].append(end_pos)

#     return tokenized_inputs

In [10]:
def preprocess_function(examples):
    """
    Extracts question, choices, correct answer, and tokenizes them.
    """
    questions = examples["question"]
    choices = examples["choices"]
    answer_indices = examples["answer"]
    
    answers_text = []
    contexts = []
    start_pos = []
    end_pos = []
    index = 0
    qnc_offset = []
    for e in examples["question"]:
        choices = examples["choices"][index]
        answer_index = examples["answer"][index]
        answer = choices[answer_index]
        c =  f"{e} Choices: {', '.join(choices)}"
        qnc_offset = len(f"{e} Choices: ,")
        answers_text.append(answer)
        contexts.append(c)
        start_pos.append(qnc_offset+answer_index)
        end_pos.append(qnc_offset+len(answer))
        index+=1

    # Extract correct answer text
#     answers_text = [
#         a[idx] if 0 <= idx < len(a) else "" for a, idx in zip(choices, answer_indices)
#     ]

    # Tokenize question
    tokenized_questions = tokenizer(
        contexts,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    # Tokenize correct answer
    tokenized_answers = tokenizer(
        answers_text,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    
    print(f'input sizes {len(tokenized_questions["input_ids"])} {len(tokenized_answers["input_ids"])}')

    # Combine into a single dictionary
    tokenized_inputs = {
        "input_ids": tokenized_questions["input_ids"],
        "attention_mask": tokenized_questions["attention_mask"],
        "labels": tokenized_answers["input_ids"],  # Labels for causal language modeling
        "start_positions":  start_pos,
        "end_positions": end_pos
    }

    return tokenized_inputs

In [11]:
# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Convert to PyTorch dataset
train_dataset = tokenized_dataset["train"].with_format("torch")
valid_dataset = tokenized_dataset["validation"].with_format("torch")

Map:   0%|          | 0/12726 [00:00<?, ? examples/s]

input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 726 726


Map:   0%|          | 0/4241 [00:00<?, ? examples/s]

input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 1000 1000
input sizes 241 241


In [12]:
training_args = TrainingArguments(
    output_dir="./scienceqa_finetuned_lora",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    fp16=torch.cuda.is_available(),
    max_steps=1000,
    no_cuda=True
)



In [13]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(
max_steps is given, it will override any value given in num_train_epochs


In [14]:
# Fine-tune the model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mshaddie77[0m ([33mshaddie77-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,5.4832,5.462069


TrainOutput(global_step=1000, training_loss=6.4942233276367185, metrics={'train_runtime': 2638.627, 'train_samples_per_second': 3.032, 'train_steps_per_second': 0.379, 'total_flos': 523537268736000.0, 'train_loss': 6.4942233276367185, 'epoch': 0.6285355122564424})

In [15]:

# Save the fine-tuned model
trainer.save_model("./scienceqa_finetuned_peft_lora_model")
tokenizer.save_pretrained("./scienceqa_finetuned_peft_lora_model")

print("Fine-tuning complete! Model saved at './scienceqa_finetuned_peft_lora_model'")

Fine-tuning complete! Model saved at './scienceqa_finetuned_peft_lora_model'


In [16]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
model.push_to_hub("shaddie/scienceqa_finetuned_peft_lora_model",
                  use_auth_token=True,
                  commit_message="fine-tuning-exercises",
                  private=True)



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/302k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shaddie/scienceqa_finetuned_peft_lora_model/commit/fe21929c0a1c0deb349795f310cd3a72b3cc4cf8', commit_message='fine-tuning-exercises', commit_description='', oid='fe21929c0a1c0deb349795f310cd3a72b3cc4cf8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shaddie/scienceqa_finetuned_peft_lora_model', endpoint='https://huggingface.co', repo_type='model', repo_id='shaddie/scienceqa_finetuned_peft_lora_model'), pr_revision=None, pr_num=None)

In [18]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

peft_model_id = "shaddie/scienceqa_finetuned_peft_lora_model"
config = PeftConfig.from_pretrained(peft_model_id)

adapter_config.json:   0%|          | 0.00/820 [00:00<?, ?B/s]

In [19]:
model = PeftModel.from_pretrained(model, peft_model_id)

adapter_model.safetensors:   0%|          | 0.00/302k [00:00<?, ?B/s]

In [26]:
def answer_question(question, choices):
    """
    Given a science question, retrieves a context from ScienceQA and predicts the answer.
    """
#     context = f"{prompt_question} Choices: {', '.join(choices)}"
    c = ', '.join(choices)
    context = f"{prompt_question} Choices: "+c 
    inputs = tokenizer(context, add_special_tokens=False)

#     # Tokenize question with context
#     inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    
    inputs = tokenizer(context, return_tensors="pt", truncation=True, max_length=512)


    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Get the most probable start and end positions
    start_index = torch.argmax(start_logits).item()
    end_index = torch.argmax(end_logits).item()

    # Ensure indices are within valid range
    if start_index >= end_index or start_index <= 0 or end_index > len(inputs["input_ids"][0]):
        return "Answer not found."

    # Convert token IDs to words
    answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer.strip()



In [27]:
prompt_question = "What causes tides?"
prompt_choices = ["Tides are caused by the moon's gravitational pull.", "The sun also has an effect, but the moon is the dominant factor" ]

answer_question(prompt_question, prompt_choices)

"the moon ' s gravitational pull"