In [4]:
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    TrainingArguments, Trainer,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from tqdm.auto import tqdm

# Clear CUDA cache
torch.cuda.empty_cache()

# Load Alpaca-style dataset
with open("qa_alpaca_cleaned.json", 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Prepare prompt-answer format
qa_pairs = [
    {
        "prompt": f"{item['instruction']}\n\n{item['input']}",
        "answer": item['output']
    } for item in raw_data
]

dataset = Dataset.from_list(qa_pairs)
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Use BanglaBERT (Encoder-Decoder version for QnA)
model_id = "csebuetnlp/banglat5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Tokenization function
def tokenize(example):
    inputs = tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    targets = tokenizer(
        example["answer"],
        truncation=True,
        padding="max_length",
        max_length=64
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

print("Tokenizing training dataset...")
tokenized_train = train_dataset.map(tokenize, remove_columns=train_dataset.column_names, desc="Train tokenizing")
print("Tokenizing evaluation dataset...")
tokenized_eval = eval_dataset.map(tokenize, remove_columns=eval_dataset.column_names, desc="Eval tokenizing")

# Metric function
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=-1)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    pred_flat = preds.flatten()
    label_flat = labels.flatten()

    precision, recall, f1, _ = precision_recall_fscore_support(label_flat, pred_flat, average='macro')
    acc = accuracy_score(label_flat, pred_flat)

    torch.cuda.empty_cache()
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# TrainingArguments
training_args = TrainingArguments(
    output_dir="./banglabert-qa-checkpoints",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=3e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    eval_accumulation_steps=1,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Start training
trainer.train()
torch.cuda.empty_cache()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Tokenizing training dataset...


Train tokenizing: 100%|██████████| 2961/2961 [00:02<00:00, 1448.86 examples/s]


Tokenizing evaluation dataset...


Eval tokenizing: 100%|██████████| 329/329 [00:00<00:00, 1513.86 examples/s]
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 329) + inhomogeneous part.

In [1]:
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    TrainingArguments, Trainer,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from tqdm.auto import tqdm

# Clear CUDA cache
torch.cuda.empty_cache()

# Load Alpaca-style dataset
with open("qa_alpaca_cleaned.json", 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Prepare prompt-answer format
qa_pairs = [
    {
        "prompt": f"{item['instruction']}\n\n{item['input']}",
        "answer": item['output']
    } for item in raw_data
]

dataset = Dataset.from_list(qa_pairs)
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Use BanglaBERT (Encoder-Decoder version for QnA)
model_id = "csebuetnlp/banglat5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Tokenization function
def tokenize(example):
    inputs = tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    targets = tokenizer(
        example["answer"],
        truncation=True,
        padding="max_length",
        max_length=64
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

print("Tokenizing training dataset...")
tokenized_train = train_dataset.map(tokenize, remove_columns=train_dataset.column_names, desc="Train tokenizing")
print("Tokenizing evaluation dataset...")
tokenized_eval = eval_dataset.map(tokenize, remove_columns=eval_dataset.column_names, desc="Eval tokenizing")

# Metric function
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    # Decode the predictions and labels to text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute accuracy by comparing text
    acc = np.mean([pred.strip() == label.strip() for pred, label in zip(decoded_preds, decoded_labels)])

    # Compute other metrics like precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(decoded_labels, decoded_preds, average='macro')
    
    torch.cuda.empty_cache()
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# TrainingArguments
training_args = TrainingArguments(
    output_dir="./banglabert-qa-checkpoints",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=3e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    eval_accumulation_steps=1,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Start training
trainer.train()
torch.cuda.empty_cache()


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Tokenizing training dataset...


Train tokenizing: 100%|██████████| 2961/2961 [00:02<00:00, 1413.04 examples/s]


Tokenizing evaluation dataset...


Eval tokenizing: 100%|██████████| 329/329 [00:00<00:00, 1429.62 examples/s]
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0,


KeyError: "The `metric_for_best_model` training argument is set to 'eval_f1', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_loss']. Consider changing the `metric_for_best_model` via the TrainingArguments."

In [7]:
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    TrainingArguments, Trainer,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from tqdm.auto import tqdm

# Clear CUDA cache
torch.cuda.empty_cache()

# Load Alpaca-style dataset
with open("qa_alpaca_cleaned.json", 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Prepare prompt-answer format
qa_pairs = [
    {
        "prompt": f"{item['instruction']}\n\n{item['input']}",
        "answer": item['output']
    } for item in raw_data
]

dataset = Dataset.from_list(qa_pairs)
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Use BanglaBERT (Encoder-Decoder version for QnA)
model_id = "csebuetnlp/banglat5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Tokenization function
def tokenize(example):
    inputs = tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    targets = tokenizer(
        example["answer"],
        truncation=True,
        padding="max_length",
        max_length=64
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

print("Tokenizing training dataset...")
tokenized_train = train_dataset.map(tokenize, remove_columns=train_dataset.column_names, desc="Train tokenizing")
print("Tokenizing evaluation dataset...")
tokenized_eval = eval_dataset.map(tokenize, remove_columns=eval_dataset.column_names, desc="Eval tokenizing")

# Metric function
# def compute_metrics(eval_pred):
#     preds, labels = eval_pred
#     preds = np.argmax(preds, axis=-1)  # Get the index of the max logit as the prediction

#     # Avoid special tokens (-100 is used for padding in the labels)
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

#     # Decode predictions and labels to text
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # Compute accuracy by comparing text
#     precision, recall, f1, _ = precision_recall_fscore_support(decoded_labels, decoded_preds, average='macro')
#     acc = accuracy_score(decoded_labels, decoded_preds)

#     torch.cuda.empty_cache()
#     return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    # Ensure predictions are 2D (batch_size, num_labels)
    preds = np.squeeze(preds, axis=-1)  # Removes the last singleton dimension if it's there
    preds = np.argmax(preds, axis=-1)  # Get the index of the max logit as the prediction
    # Avoid special tokens (-100 is used for padding in the labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Calculate accuracy and other metrics here
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}


# TrainingArguments
training_args = TrainingArguments(
    output_dir="./banglabert-qa-checkpoints",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=3e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    eval_accumulation_steps=1,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Start training
trainer.train()
torch.cuda.empty_cache()


Tokenizing training dataset...


Train tokenizing: 100%|██████████| 2961/2961 [00:02<00:00, 1389.42 examples/s]


Tokenizing evaluation dataset...


Eval tokenizing: 100%|██████████| 329/329 [00:00<00:00, 1246.70 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 329) + inhomogeneous part.

In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    BitsAndBytesConfig, EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from tqdm.auto import tqdm

torch.cuda.empty_cache()

# Load dataset
data_path = "qa_alpaca_cleaned.json"
with open(data_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Prepare prompt-answer pairs
qa_pairs = [{
    "prompt": f"প্রশ্ন: {item['input']}\nউত্তরের জন্য প্রাসঙ্গিক পাঠ্য:\n{item['instruction']}",
    "answer": item["output"]
} for item in raw_data]

dataset = Dataset.from_list(qa_pairs)

# Train-test split
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Load tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = '[PAD]'
tokenizer.save_pretrained("./llama3-tokenizer")

# Load model with 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": torch.cuda.current_device()},
    trust_remote_code=True
)
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Tokenization
def tokenize(example):
    full_input = f"{example['prompt']}\nউত্তর: {example['answer']}"
    tokenized = tokenizer(
        full_input,
        truncation=True,
        padding="max_length",
        max_length=256  # reduced from 512
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Add progress bar using tqdm
print("Tokenizing training dataset...")
tokenized_train = train_dataset.map(tokenize, remove_columns=train_dataset.column_names, desc="Tokenizing train")

print("Tokenizing evaluation dataset...")
tokenized_eval = eval_dataset.map(tokenize, remove_columns=eval_dataset.column_names, desc="Tokenizing eval")

# Metric function
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=-1)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    pred_flat = preds.flatten()
    label_flat = labels.flatten()

    precision, recall, f1, _ = precision_recall_fscore_support(label_flat, pred_flat, average='macro')
    acc = accuracy_score(label_flat, pred_flat)

    # Clear cache to prevent OOM
    torch.cuda.empty_cache()

    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Training configuration
training_args = TrainingArguments(
    output_dir="./llama3-qa-checkpoints",
    per_device_train_batch_size=1,  # Reduce batch size
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    eval_accumulation_steps=1,  # Prevents memory overflow during eval
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Start training
trainer.train()

torch.cuda.empty_cache()


Loading checkpoint shards: 100%|██████████| 4/4 [00:17<00:00,  4.34s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Tokenizing training dataset...


Tokenizing train: 100%|██████████| 2961/2961 [00:03<00:00, 817.77 examples/s]


Tokenizing evaluation dataset...


Tokenizing eval: 100%|██████████| 329/329 [00:00<00:00, 768.67 examples/s]
  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  r

Epoch,Training Loss,Validation Loss


In [4]:
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
import torch
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

torch.cuda.empty_cache()

# Load dataset
data_path = "qa_alpaca_cleaned.json"
with open(data_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Combine input and instruction (if instruction exists)
inputs = [f"{item['input']} {item.get('instruction', '')}".strip() for item in raw_data]
labels = [item['output'] for item in raw_data]

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Create Hugging Face Dataset
dataset = Dataset.from_list([{
    "text": inp,
    "label": lbl
} for inp, lbl in zip(inputs, encoded_labels)])

# Split into train/test
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Load tokenizer and model
model_id = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label_encoder.classes_)
)

# Tokenization
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_eval = eval_dataset.map(tokenize, batched=True)

# Compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='macro', zero_division=0)
    recall = recall_score(labels, preds, average='macro', zero_division=0)
    f1 = f1_score(labels, preds, average='macro', zero_division=0)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# TrainingArguments (compatible with old versions)
training_args = TrainingArguments(
    output_dir="./banglabert-qa-checkpoints",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=3e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    eval_accumulation_steps=1,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Evaluate
eval_result = trainer.evaluate()
print("\nFinal Evaluation Results:")
print(eval_result)

# Print classification report
predictions = trainer.predict(tokenized_eval)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# print("\n Classification Report:")
# print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2961/2961 [00:00<00:00, 7618.24 examples/s]
Map: 100%|██████████| 329/329 [00:00<00:00, 5774.14 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,8.1183,8.123231,0.0,0.0,0.0,0.0
2,8.1035,8.048121,0.0,0.0,0.0,0.0
3,8.0835,8.620607,0.0,0.0,0.0,0.0
4,8.028,9.406298,0.0,0.0,0.0,0.0



Final Evaluation Results:
{'eval_loss': 8.123230934143066, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.9405, 'eval_samples_per_second': 169.545, 'eval_steps_per_second': 21.644, 'epoch': 4.988521269412559}


In [12]:
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
import torch
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

torch.cuda.empty_cache()

# Load dataset
data_path = "qa_alpaca_cleaned.json"
with open(data_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Combine input and instruction (if instruction exists)
inputs = [f"{item['input']} {item.get('instruction', '')}".strip() for item in raw_data]

# --- Heuristic for converting answers into categories ---
def categorize_output(output_text):
    output_text = output_text.lower()
    if any(word in output_text for word in ["না", "ভুল", "সঠিক নয়", "পারবো না", "অসম্ভব"]):
        return "Negative"
    elif any(word in output_text for word in ["হ্যাঁ", "সঠিক", "ঠিক আছে", "অবশ্যই", "সম্ভব"]):
        return "Positive"
    else:
        return "Neutral"

labels = [categorize_output(item['output']) for item in raw_data]

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Create Hugging Face Dataset
dataset = Dataset.from_list([{
    "text": inp,
    "label": lbl
} for inp, lbl in zip(inputs, encoded_labels)])

# Split into train/test
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Load tokenizer and model
model_id = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(label_encoder.classes_)
)

# Tokenization
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_eval = eval_dataset.map(tokenize, batched=True)

# Compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='macro', zero_division=0)
    recall = recall_score(labels, preds, average='macro', zero_division=0)
    f1 = f1_score(labels, preds, average='macro', zero_division=0)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# TrainingArguments
training_args = TrainingArguments(
    output_dir="./banglabert-qa-checkpoints",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    learning_rate=3e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    eval_accumulation_steps=1,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train
trainer.train()

# Evaluate
eval_result = trainer.evaluate()
print("\nFinal Evaluation Results:")
print(eval_result)

# Predict and Classification Report
predictions = trainer.predict(tokenized_eval)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# print("\nClassification Report:")
# print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

eval_result = trainer.evaluate()
print(f"\nAccuracy: {eval_result['eval_accuracy']:.4f}")


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2961/2961 [00:00<00:00, 5939.48 examples/s]
Map: 100%|██████████| 329/329 [00:00<00:00, 5070.00 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7514,0.716253,0.671733,0.434295,0.437394,0.43312
2,0.629,0.619833,0.765957,0.523996,0.495049,0.495849
3,0.4786,0.646625,0.75076,0.490936,0.503232,0.496831
4,0.2803,1.011809,0.726444,0.762016,0.665585,0.691912
5,0.1381,1.060871,0.738602,0.761085,0.661951,0.6962



Final Evaluation Results:
{'eval_loss': 1.06087064743042, 'eval_accuracy': 0.7386018237082067, 'eval_precision': 0.7610850897736143, 'eval_recall': 0.6619511247954287, 'eval_f1': 0.696199946430963, 'eval_runtime': 1.6888, 'eval_samples_per_second': 194.815, 'eval_steps_per_second': 24.87, 'epoch': 5.0}



Accuracy: 0.7386
