In [1]:
!pip install -U pyarrow --quiet
!pip install datasets transformers torch seqeval evaluate  --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.
ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m210.6 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Load pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load IMDB dataset
dataset = load_dataset("imdb")

# Define pruning function
def prune_bert_model(model, pruning_ratio):
    for module in model.modules():
        if isinstance(module, (nn.Linear, nn.Embedding)):
            weight = module.weight.data.abs()
            threshold = torch.kthvalue(weight.view(-1), int(weight.numel() * pruning_ratio)).values
            mask = weight > threshold
            module.weight.data *= mask
    return model

# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

# Prepare dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['text'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')

# Create data loaders
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=16)
eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16)

# Prune the model
pruning_ratio = 0.3  # 30% of weights will be pruned
pruned_model = prune_bert_model(model, pruning_ratio)

# Fine-tuning function
def fine_tune(model, train_dataloader, eval_dataloader, epochs, device):
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # Calculate total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        model.eval()
        correct = 0
        total = 0
        for batch in tqdm(eval_dataloader, desc=f"Evaluating Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch['labels']).sum().item()
            total += batch['labels'].size(0)

        accuracy = correct / total
        print(f"Epoch {epoch+1} Accuracy: {accuracy:.4f}")

    return model

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tune the pruned model
fine_tuned_model = fine_tune(pruned_model, train_dataloader, eval_dataloader, epochs=3, device=device)

# Save the pruned and fine-tuned model
torch.save(fine_tuned_model.state_dict(), 'pruned_bert_model.pth')

# Final evaluation
fine_tuned_model.eval()
correct = 0
total = 0
for batch in tqdm(eval_dataloader, desc="Final Evaluation"):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = fine_tuned_model(**batch)
    predictions = torch.argmax(outputs.logits, dim=-1)
    correct += (predictions == batch['labels']).sum().item()
    total += batch['labels'].size(0)

final_accuracy = correct / total
print(f"Final Pruned BERT model accuracy: {final_accuracy:.4f}")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



Training Epoch 1:   0%|          | 0/1563 [00:00<?, ?it/s]

Average training loss: 0.2597


Evaluating Epoch 1:   0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1 Accuracy: 0.9291


Training Epoch 2:   0%|          | 0/1563 [00:00<?, ?it/s]

Average training loss: 0.1413


Evaluating Epoch 2:   0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2 Accuracy: 0.9368


Training Epoch 3:   0%|          | 0/1563 [00:00<?, ?it/s]

Average training loss: 0.0705


Evaluating Epoch 3:   0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 3 Accuracy: 0.9395


Final Evaluation:   0%|          | 0/1563 [00:00<?, ?it/s]

Final Pruned BERT model accuracy: 0.9395
