In [2]:
!pip install torch transformers datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load a smaller model to reduce GPU usage
model_name = "EleutherAI/gpt-neo-125M"  # Smaller model (125M instead of 1.3B)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fix: Add padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Load the GSM8K dataset
dataset = load_dataset("openai/gsm8k", "main")
train_dataset = dataset["train"].shuffle(seed=42).select(range(int(0.9 * len(dataset["train"]))))  # 90% for training
eval_dataset = dataset["train"].shuffle(seed=42).select(range(int(0.9 * len(dataset["train"])), len(dataset["train"])))  # 10% for evaluation

def tokenize_function(example):
    """Tokenize inputs and prepare labels for causal LM training."""
    inputs = tokenizer(example["question"], truncation=True, padding="max_length", max_length=64)  # Reduce max_length
    labels = tokenizer(example["answer"], truncation=True, padding="max_length", max_length=64)["input_ids"]
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Define training arguments (optimized for low VRAM)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=1,  # Reduce batch size
    per_device_eval_batch_size=1,  # Reduce batch size
    gradient_accumulation_steps=4,  # Simulate larger batches with accumulation
    fp16=True,  # Enable mixed precision training
    save_steps=1000,  # Save model less frequently
    save_total_limit=1,  # Keep only one checkpoint
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=50,
)

# Train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

# Save fine-tuned model
model.save_pretrained("./fine_tuned_llm")
tokenizer.save_pretrained("./fine_tuned_llm")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/6725 [00:00<?, ? examples/s]

Map:   0%|          | 0/748 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msp7386101[0m ([33msp7386101-srm-institute-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
0,5.1814,5.206222


('./fine_tuned_llm/tokenizer_config.json',
 './fine_tuned_llm/special_tokens_map.json',
 './fine_tuned_llm/vocab.json',
 './fine_tuned_llm/merges.txt',
 './fine_tuned_llm/added_tokens.json',
 './fine_tuned_llm/tokenizer.json')

7e168d39419c3a9caad4b3ede3d0f64d71502331

In [4]:
import numpy as np

# Function to compute exact match accuracy
def exact_match_accuracy(predictions, ground_truths):
    """
    Computes exact match accuracy.

    Formula:
    Accuracy = (Correct Predictions / Total Questions) * 100
    """
    correct = sum(pred == gt for pred, gt in zip(predictions, ground_truths))
    return (correct / len(ground_truths)) * 100

# Function to compute step-wise accuracy for Chain-of-Thought (CoT) reasoning
def step_wise_accuracy(pred_steps, gt_steps):
    """
    Computes step-wise accuracy.

    Formula:
    Step-wise Accuracy = (Correct Steps / Total Steps) * 100
    """
    correct_steps = sum(p == g for pred, gt in zip(pred_steps, gt_steps) for p, g in zip(pred, gt))
    total_steps = sum(len(gt) for gt in gt_steps)
    return (correct_steps / total_steps) * 100

# Example data
predictions = ["42", "15", "8"]
ground_truths = ["42", "12", "8"]

pred_steps = [["Step 1", "Step 2", "Step 3"], ["Step 1", "Step 2"], ["Step 1"]]
gt_steps = [["Step 1", "Step 2", "Step 3"], ["Step 1", "Wrong Step"], ["Step 1"]]

# Compute metrics
exact_acc = exact_match_accuracy(predictions, ground_truths)
step_acc = step_wise_accuracy(pred_steps, gt_steps)

# Print results
print(f"Exact Match Accuracy: {exact_acc:.2f}%")
print(f"Step-wise Accuracy: {step_acc:.2f}%")


Exact Match Accuracy: 66.67%
Step-wise Accuracy: 83.33%
