# In [None]:
import pandas as pd
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
import torch

def preprocess_for_bert(texts, labels=None, max_length=128, batch_size=16,
                        shuffle=True, tokenizer=None,
                        model_name="bert-base-uncased"):
    """Tokenize texts for BERT and wrap the resulting tensors in a DataLoader.

    Args:
        texts: A single string or an iterable of strings to encode.
        labels: Optional labels, one per text, in any form accepted by
            ``torch.tensor``. When given, every batch carries the labels as
            a third tensor.
        max_length: Each sequence is truncated or padded to this many tokens.
        batch_size: Number of examples per batch.
        shuffle: Whether the DataLoader reshuffles the examples. Defaults to
            True, the previous hard-coded behaviour; pass False for
            evaluation or inference so batches stay aligned with ``texts``.
        tokenizer: Optional callable following the Hugging Face tokenizer
            call convention (returns a mapping with ``"input_ids"`` and
            ``"attention_mask"``). When None, a ``BertTokenizer`` for
            ``model_name`` is loaded.
        model_name: Checkpoint name passed to
            ``BertTokenizer.from_pretrained`` when no tokenizer is supplied.

    Returns:
        A DataLoader yielding ``(input_ids, attention_mask)`` batches, or
        ``(input_ids, attention_mask, labels)`` when labels are provided.

    Raises:
        ValueError: If ``texts`` is empty or the number of labels does not
            match the number of texts.
    """
    # Materialize the input so pandas Series, tuples and generators all work,
    # while keeping a lone string as a single example.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        raise ValueError("texts must contain at least one item")

    # Build and validate the labels before the (comparatively expensive)
    # tokenization so mismatched inputs fail fast with a clear message.
    labels_tensor = None
    if labels is not None:
        labels_tensor = torch.tensor(labels)
        if labels_tensor.dim() == 0 or labels_tensor.size(0) != len(texts):
            raise ValueError(
                "labels must contain exactly one entry per text "
                f"(expected {len(texts)})"
            )

    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(model_name)

    # Tokenize and encode the texts
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    tensors = [encoded["input_ids"], encoded["attention_mask"]]
    if labels_tensor is not None:
        tensors.append(labels_tensor)
    dataset = TensorDataset(*tensors)

    # Create DataLoader
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Load the CSV that holds the texts and their severity labels.
file_path = "most_frequent_severity_with_def_text.csv"
data = pd.read_csv(file_path)

# Extract the text and label columns as plain Python lists.
# NOTE(review): assumes "def_text" has no missing values and that
# "Most_Frequent_Severity" is numeric (torch.tensor cannot encode strings) —
# confirm against the CSV.
texts = data["def_text"].tolist()
labels = data["Most_Frequent_Severity"].tolist()

# Tokenize the texts and build the DataLoader with the default settings.
dataloader = preprocess_for_bert(texts, labels)

# Example: iterate through batches.
# The per-batch labels get their own name so the loop does not overwrite the
# full `labels` list defined above.
for batch in dataloader:
    input_ids, attention_masks, batch_labels = batch
    print(f"Input IDs: {input_ids}")
    print(f"Attention Masks: {attention_masks}")
    print(f"Labels: {batch_labels}")
