In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created: September 18, 2025
Author: Pranaydeep Singh
Last Modified: November 6, 2025
Modified by: Pranaydeep Singh
Description: Script for fine-tuning a fine-tuned BERT model for text classification with inference.
"""

'\nCreated: September 18, 2025\nLast Modified: November 6, 2025\nAuthor: Pranaydeep Singh\nDescription: Script for fine-tuning a fine-tuned BERT model for text classification with inference.\n'

In [None]:
#install dependencies
!pip install transformers datasets scikit-learn accelerate

In [None]:
#imports
from transformers import BertForSequenceClassification, BertTokenizer
import torch
from datasets import load_dataset
from transformers import Trainer, TrainingArguments

### Starter Script 1: Fine-tuning a BERT model for Classification

Use a BERT-base-uncased model directly from the HuggingFace Hub to fine-tune for classification on a dataset also hosted directly on the HF Hub.

Fine-tuning with almost standard hyper-parameters and a quick eval loop in the end. 

Please refer to the advanced metrics script to check additional metrics like F1, Precision, Recall, etc.


In [None]:
MODEL_NAME = "bert-base-uncased" # Model name as in the HuggingFace Hub
TOKENIZER_NAME = "bert-base-uncased" # Tokenizer name as in the HuggingFace Hub
num_labels = 5  # Number of classes for classification

tokenizer = BertTokenizer.from_pretrained(TOKENIZER_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sample data preparation using the datasets library

Here we use the "ag_news" dataset as an example

You can replace it with any text classification dataset of your choice

Make sure the dataset has a 'text' field and a 'label' field

To use locally available datasets, you can load them accordingly with load_csv or other methods

In [None]:


DATASET_NAME = "ag_news" # Example dataset name from the HuggingFace Hub

from datasets import load_dataset

dataset = load_dataset(DATASET_NAME, split="train[:1%]") # Using a small subset for demonstration


def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

encoded_dataset = dataset.map(preprocess_function, batched=True)

encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Fine-tuning the model using the Trainer API

You can adjust the training arguments as needed

For more advanced training, consider using custom training loops or other libraries

Refer to the HuggingFace documentation for more details on training, evaluation and the hyperparameters

<https://huggingface.co/docs/transformers/en/main_classes/trainer>

In [3]:


OUTPUT_DIR = "./results" # Directory to save model checkpoints and logs

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch", # Evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16, # Batch size for training, in case of memory issues, reduce this value
    per_device_eval_batch_size=16, # Batch size for evaluation, in case of memory issues, reduce this value
    num_train_epochs=3, 
    weight_decay=0.01,
    logging_dir=f"{OUTPUT_DIR}/logs", # Directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
)
trainer.train()

#Model checkpoints are saved in the specified output directory after training


Epoch,Training Loss,Validation Loss
1,0.6461,0.519238
2,0.3603,0.265013
3,0.1992,0.189239


TrainOutput(global_step=225, training_loss=0.5442601781421238, metrics={'train_runtime': 683.8516, 'train_samples_per_second': 5.264, 'train_steps_per_second': 0.329, 'total_flos': 236806328217600.0, 'train_loss': 0.5442601781421238, 'epoch': 3.0})

Load saved model for inference or further evaluation

In [None]:

model = BertForSequenceClassification.from_pretrained(f"{OUTPUT_DIR}/checkpoint-225") # Don't forget to change the checkpoint number based on your training
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # Tokenizer isn't saved in the output directory, load it from the original source

# Function to classify text for a batch of texts
# You can modify this function to take input from a file or other sources as needed

def classify_texts(texts):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_classes = torch.argmax(logits, dim=1).tolist()
    return predicted_classes

sample_texts = [
    "The stock market crashed today due to economic uncertainty.",
    "The new movie released last week has received rave reviews.",
]
predictions = classify_texts(sample_texts)
print(predictions)

# Map predicted class indices to labels

label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"} # Example label mapping for AG News dataset
predicted_labels = [label_map[pred] for pred in predictions]
print(predicted_labels)

# Refer to notebook on metrics for calculating additional metrics like F1, Precision, Recall, etc.
  

[2, 3]
['Business', 'Sci/Tech']
