In [None]:
!pip install transformers datasets --quiet


# BERT DISTIL

In [None]:
pip install --upgrade transformers




In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Request the IMDB train and test splits in one call; `dataset` is then a
# two-element list ordered as [train, test].
dataset = load_dataset("imdb", split=["train", "test"])

# Shuffle each split with a fixed seed and keep its first 1000 reviews so that
# training and evaluation stay quick.
small_train, small_test = (
    split.shuffle(seed=42).select(range(1000)) for split in dataset
)

# Tokenizer for the same checkpoint the classifier is loaded from
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 256 tokens.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=256)
    return tokenizer(examples['text'], **encoding_options)

# Tokenize both subsets in batches (train first, then test)
tokenized_train, tokenized_test = (
    subset.map(tokenize_function, batched=True) for subset in (small_train, small_test)
)

# Expose only the columns the model consumes, formatted for PyTorch
for tokenized_subset in (tokenized_train, tokenized_test):
    tokenized_subset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# DistilBERT encoder with a sequence-classification head (binary)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Define compute_metrics function for evaluation
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a binary classifier.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis). This is what ``Trainer`` hands to ``compute_metrics``.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 keeps the scores well-defined (0.0) and silences sklearn's
    # UndefinedMetricWarning when the model predicts no positive examples at all,
    # which can happen early in fine-tuning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # `eval_strategy` replaces the older `evaluation_strategy` argument name
    eval_strategy='epoch',
    save_strategy='no',
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=False,
    # Disable external experiment trackers explicitly instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to='none',
    seed=42,
)

# Trainer: fine-tunes on the training subset and evaluates on the test subset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)


# Train
trainer.train()

# Evaluate the fine-tuned model on the test subset
results = trainer.evaluate()
print(results)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3821,0.443254,0.819,0.786305,0.927577,0.682377
2,0.2651,0.327526,0.865,0.868549,0.827458,0.913934
3,0.1432,0.371999,0.878,0.879447,0.849237,0.911885


{'eval_loss': 0.37199896574020386, 'eval_accuracy': 0.878, 'eval_f1': 0.8794466403162056, 'eval_precision': 0.8492366412213741, 'eval_recall': 0.9118852459016393, 'eval_runtime': 6.7123, 'eval_samples_per_second': 148.979, 'eval_steps_per_second': 2.384, 'epoch': 3.0}


# RANDOM LAYER DELETION

In [None]:
# prompt: FROM THE ABOVE PERFORMED TRAIN/TEST DATA, PERFORM RANDOM LAYER DELETION IN BERT

import random

def delete_random_layers(model, num_layers_to_delete, seed=None):
    """Remove randomly chosen transformer blocks from a DistilBERT model, in place.

    Args:
        model: Model exposing its blocks as ``model.distilbert.transformer.layer``.
        num_layers_to_delete: Number of blocks to remove. Must be non-negative
            and strictly smaller than the current number of blocks.
        seed: Optional seed for a private random generator, making the
            selection reproducible. When ``None`` (the default) the global
            ``random`` module state is used.

    Returns:
        Sorted list with the original indices of the removed blocks.

    Raises:
        ValueError: If ``num_layers_to_delete`` is negative or would leave the
            model without any block.
    """
    layers = model.distilbert.transformer.layer
    total_layers = len(layers)

    if num_layers_to_delete < 0:
        raise ValueError("Number of layers to delete must be non-negative.")
    if num_layers_to_delete >= total_layers:
        raise ValueError("Number of layers to delete is greater than or equal to the total number of layers.")

    # A private generator keeps a seeded call from disturbing the global RNG state.
    rng = random if seed is None else random.Random(seed)
    deleted_indices = sorted(rng.sample(range(total_layers), num_layers_to_delete))
    deleted_lookup = set(deleted_indices)

    # Keep the surviving blocks in their original order.
    new_layers = [layer for i, layer in enumerate(layers) if i not in deleted_lookup]
    model.distilbert.transformer.layer = torch.nn.ModuleList(new_layers)

    # Keep the configuration's layer count in step with the pruned module list
    # so that anything reading the config sees the new depth.
    config = getattr(model, "config", None)
    if config is not None and hasattr(config, "n_layers"):
        config.n_layers = len(new_layers)

    print(f"Deleted {num_layers_to_delete} random layers.")
    print(f"New number of layers: {len(model.distilbert.transformer.layer)}")
    return deleted_indices

import copy

# Example usage: delete 2 random layers from a copy of the trained model
num_layers_to_delete = 2

# Seed Python's RNG so the same layers are picked every time this cell runs.
random.seed(42)

# delete_random_layers edits its argument in place. Pruning a deep copy keeps the
# fine-tuned `model` intact and makes this cell idempotent: re-running it no longer
# removes two more layers from an already pruned model.
pruned_model = copy.deepcopy(model)
delete_random_layers(pruned_model, num_layers_to_delete)

# You would typically re-train or fine-tune the model after deleting layers.
# Here we only evaluate the pruned copy, using a separate Trainer so the
# original `trainer` keeps pointing at the unpruned model.
pruned_trainer = Trainer(
    model=pruned_model,
    args=training_args,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

# Evaluate the model after layer deletion
print("\nEvaluating model after random layer deletion:")
results_after_deletion = pruned_trainer.evaluate()
results_after_deletion

Deleted 2 random layers.
New number of layers: 4

Evaluating model after random layer deletion:


{'eval_loss': 0.489665150642395,
 'eval_accuracy': 0.806,
 'eval_f1': 0.7849223946784922,
 'eval_precision': 0.855072463768116,
 'eval_recall': 0.7254098360655737,
 'eval_runtime': 4.4379,
 'eval_samples_per_second': 225.334,
 'eval_steps_per_second': 3.605,
 'epoch': 3.0}

# BERT BASE

In [None]:
!pip install -U "fsspec<2023.10.0" datasets --quiet


In [None]:
from datasets import load_dataset

# Load the IMDb train and test splits (a [train, test] list)
dataset = load_dataset("imdb", split=["train", "test"])

# Seeded shuffle of each split, then keep its first 500 examples
train_data, test_data = (
    split.shuffle(seed=42).select(range(500)) for split in dataset
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import BertTokenizer

# NOTE: this rebinds the module-level `tokenizer` that `tokenize_function`
# (DistilBERT section) also reads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(example):
    """Tokenize the ``'text'`` entry of ``example`` with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 256 tokens.
    """
    texts = example['text']
    return tokenizer(texts, padding='max_length', truncation=True, max_length=256)

# Tokenize both subsets in batches (train first, then test)
train_dataset, test_dataset = (
    subset.map(tokenize, batched=True) for subset in (train_data, test_data)
)

# Expose only the model inputs and the label, formatted for PyTorch
for encoded_subset in (train_dataset, test_dataset):
    encoded_subset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader

# Batches of 8; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)


In [None]:
import torch


In [None]:
from transformers import BertForSequenceClassification

# BERT-base encoder with a two-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The trailing semicolon stops the notebook from echoing the full module repr
# that `.to()` returns as the cell's last expression.
model.to(device);


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
from torch import nn, optim

# Cross-entropy over the classifier logits, optimised with AdamW at lr 2e-5
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=2e-5)

def train(model, loader):
    """Run one optimisation pass over ``loader`` and return the mean per-batch loss.

    Relies on the module-level ``optimizer``, ``loss_fn`` and ``device``.
    Each batch must be a mapping with a ``'label'`` entry; every other entry
    is moved to ``device`` and passed to the model as a keyword argument.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        optimizer.zero_grad()
        labels = batch['label'].to(device)
        features = {name: tensor.to(device) for name, tensor in batch.items() if name != 'label'}
        logits = model(**features).logits
        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    # Averaged over the number of batches, not the number of examples
    return sum(batch_losses) / len(loader)

def evaluate(model, loader, device=None):
    """Return the classification accuracy of ``model`` over ``loader``.

    Args:
        model: Module whose call returns an object with a ``logits`` tensor;
            the predicted class is the argmax over dim 1.
        loader: Iterable of batches. Each batch is a mapping with a ``'label'``
            entry; every other entry is passed to the model as a keyword argument.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU for a parameter-free model), so the
            function no longer depends on a module-level ``device`` variable.

    Returns:
        Fraction of correctly classified examples, as a float.

    Raises:
        ZeroDivisionError: If ``loader`` yields no examples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()  # leaves the model in eval mode
    correct, total = 0, 0
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
            labels = batch['label'].to(device)
            outputs = model(**inputs)
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


In [None]:
# Three passes over the training subset; test accuracy is measured after each pass
for epoch in range(3):
    train_loss = train(model, train_loader)
    accuracy = evaluate(model, test_loader)
    epoch_summary = f"Epoch {epoch+1}: Loss = {train_loss:.4f}, Accuracy = {accuracy:.4f}"
    print(epoch_summary)


Epoch 1: Loss = 0.6451, Accuracy = 0.7900
Epoch 2: Loss = 0.3687, Accuracy = 0.8820
Epoch 3: Loss = 0.1667, Accuracy = 0.8620


In [None]:
# prompt: for the above 3 epoch result can you calculate final accuracy

# The "final" accuracy is the test accuracy of the model as it stands after
# the last training epoch, i.e. the figure the training loop printed for
# Epoch 3. It is not an average over the three epochs.
#
# The model is re-evaluated on the test loader here to report that figure
# explicitly; it should match the Epoch 3 accuracy printed by the loop.
final_accuracy = evaluate(model, test_loader)
print(f"\nFinal Accuracy after 3 epochs: {final_accuracy:.4f}")