In [3]:
pip install datasets




In [4]:
pip install transformers




In [5]:
import torch
import random
import numpy as np
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
# Load the "banking77" dataset through the `datasets` library; later cells index
# the result by split name (dataset["train"]).
dataset = load_dataset("banking77")

Downloading builder script:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.89k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.3k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3080 [00:00<?, ? examples/s]

In [7]:
# Pull the "text" and "label" columns out of the training split in one statement
texts, labels = dataset["train"]["text"], dataset["train"]["label"]

In [None]:
from collections import Counter

class_distribution = Counter(labels)

# Header first, then one "Class <label>: <n> records" line per distinct label,
# in the order the labels were first encountered.
print("Class Distribution:")
for label in class_distribution:
    count = class_distribution[label]
    print(f"Class {label}: {count} records")

In [9]:
# Hold out 20% of the examples for validation; the fixed random_state keeps the
# partition identical from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Seed every RNG in play before the first stochastic step so runs are repeatable:
# the classification head created below is newly (randomly) initialised, and the
# training DataLoader shuffles using torch's global generator.
seed = 42  # same value as the random_state used for the train/validation split
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Initialize Electra tokenizer and model.
# num_labels=77 sizes the classification head for the dataset's 77 classes.
model_name = "google/electra-small-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=77)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:

# Turn both text splits into PyTorch tensors: over-long inputs are truncated and
# shorter ones are padded (padding=True) so each split forms a rectangular batch.
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [12]:

# Bundle input ids, attention masks and labels so that each index yields one
# (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_encodings["input_ids"],
    train_encodings["attention_mask"],
    torch.tensor(train_labels),
)
val_dataset = TensorDataset(
    val_encodings["input_ids"],
    val_encodings["attention_mask"],
    torch.tensor(val_labels),
)

In [13]:
# Batch both datasets with the same batch size; only the training loader shuffles.
batch_size = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
)

In [14]:
# Number of examples in the training TensorDataset
train_dataset_size = len(train_dataset)

In [15]:
# Number of examples in the validation TensorDataset; the training loop divides the
# count of correct predictions by this value to obtain validation accuracy.
val_dataset_size = len(val_dataset)

In [16]:
# Add up the element counts of every parameter tensor the model owns, then report it
num_params = sum(map(torch.numel, model.parameters()))
print(f"Number of parameters in the model: {num_params}")

Number of parameters in the model: 13568589


In [17]:
# Number of passes over train_loader; also multiplies into the scheduler's total step count
num_epochs = 5

In [18]:
# Initialize optimizer and learning rate scheduler.
# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated in
# favour of the PyTorch implementation. eps and weight_decay are passed explicitly
# because the two classes ship different defaults (transformers: eps=1e-6,
# weight_decay=0.0), so the hyperparameters stay what they were.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Linear schedule with no warmup; one scheduler step is taken per training batch,
# so the total is (batches per epoch) * (number of epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs,
)




In [None]:
# TrainingArguments is not part of the notebook's import cell; without this import
# the cell fails with NameError on a fresh kernel.
from transformers import TrainingArguments

# Training arguments
# NOTE(review): no Trainer is constructed in the visible cells (the manual loop that
# follows does the training), so this object appears to be unused -- confirm before
# relying on any of these settings.
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    evaluation_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True,
    # Best model is chosen by eval loss, where lower is better.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_steps=500,
)


In [None]:
# Training loop
#
# For each epoch: one optimisation pass over train_loader, then a gradient-free
# pass over val_loader that reports average loss, accuracy and a per-class
# classification report.
#
# The per-batch label tensor is named `batch_labels` so it does not overwrite the
# notebook-level `labels` list, which the class-distribution and train/validation
# split cells read.

# Run batches on whatever device the model currently lives on, so the loop works
# unchanged on CPU and also if the model has been moved to an accelerator.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself (outputs.loss).
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per batch, not per epoch

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0.0
    correct_val_predictions = 0
    all_val_labels = []
    all_val_predictions = []

    # No gradients are needed anywhere in the validation pass.
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)

            total_val_loss += outputs.loss.item()
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            correct_val_predictions += (predicted_labels == batch_labels).sum().item()

            all_val_labels.extend(batch_labels.tolist())
            all_val_predictions.extend(predicted_labels.tolist())

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = correct_val_predictions / val_dataset_size

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train loss: {avg_train_loss:.4f}")
    print(f"Validation loss: {avg_val_loss:.4f}")
    print(f"Validation accuracy: {val_accuracy * 100:.2f}%")

    # Classification Report
    # zero_division=0 keeps the reported numbers the same as the default but
    # silences the warning emitted for every class that received no predictions.
    class_report = classification_report(all_val_labels, all_val_predictions, zero_division=0)
    print("Classification Report:")
    print(class_report)

print("Training complete.")


In [None]:
import os

# Write the model and the tokenizer into the same directory, creating it if needed.
output_dir = "./saved_model"
os.makedirs(output_dir, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print("Training complete. Model saved.")





