In [87]:
import torch
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

In [88]:
device

device(type='mps')

In [89]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

In [96]:
teacher_model_name = "distilbert-base-uncased"
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_name, num_labels=2
)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
dataset = load_dataset("imdb")
def tokenize_function(examples):
    return teacher_tokenizer(
        examples["text"],
        padding="max_length",  # Ensures all sequences are padded to the max_length
        truncation=True,  # Truncates sequences longer than max_length
        max_length=512,  # Set to the maximum sequence length expected by the model
    )


# def tokenize_function(examples):
#     return teacher_tokenizer(examples["text"], truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 25000/25000 [00:08<00:00, 2973.79 examples/s]


In [77]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [78]:
import torch
import psutil


def get_hardware_info():
    # Check if a CUDA GPU is available
    gpu_available = torch.cuda.is_available()

    # Check if MPS is available (for Apple Silicon devices like M1/M2)
    mps_available = torch.backends.mps.is_available()

    # Get available system RAM
    total_ram = psutil.virtual_memory().total / (1024**3)  # Convert to GB
    available_ram = psutil.virtual_memory().available / (1024**3)  # Convert to GB

    # Get CPU core count
    cpu_cores = psutil.cpu_count(logical=False)  # Physical cores
    logical_cores = psutil.cpu_count(logical=True)  # Logical cores

    # Get GPU details if a CUDA GPU is available
    if gpu_available:
        gpu_count = torch.cuda.device_count()
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / (
            1024**3
        )  # Convert to GB
        gpu_type = "CUDA"

    # Get MPS details if an MPS GPU is available
    elif mps_available:
        gpu_count = 1  # MPS typically means 1 Apple GPU
        gpu_mem = None  # No easy way to fetch memory for MPS currently
        gpu_type = "MPS"

    # If neither CUDA nor MPS is available
    else:
        gpu_count = 0
        gpu_mem = 0
        gpu_type = "None"

    # Combine the hardware info into a dictionary
    hardware_info = {
        "gpu_available": gpu_available,
        "mps_available": mps_available,
        "gpu_type": gpu_type,
        "gpu_count": gpu_count,
        "gpu_memory": gpu_mem,
        "total_ram": total_ram,
        "available_ram": available_ram,
        "cpu_cores": cpu_cores,
        "logical_cores": logical_cores,
    }

    return hardware_info


# Check hardware info
hardware_info = get_hardware_info()
print(hardware_info)

{'gpu_available': False, 'mps_available': True, 'gpu_type': 'MPS', 'gpu_count': 1, 'gpu_memory': None, 'total_ram': 8.0, 'available_ram': 2.030853271484375, 'cpu_cores': 8, 'logical_cores': 8}


In [79]:
class AdaptiveStudentModel(nn.Module):
    def __init__(self, teacher_model_name, hardware_info):
        super(AdaptiveStudentModel, self).__init__()

        # Load teacher model to get its hidden size
        self.teacher_model = AutoModelForSequenceClassification.from_pretrained(
            teacher_model_name, num_labels=2
        )
        hidden_size = self.teacher_model.config.hidden_size

        # Adjust the student model architecture based on hardware
        if hardware_info["gpu_available"]:
            if hardware_info["gpu_memory"] < 4:  # For low-end GPUs
                self.student_model = nn.Linear(hidden_size, 128)  # Fewer units
            else:
                self.student_model = nn.Linear(
                    hidden_size, 256
                )  # Higher-end GPUs get a larger model
        elif hardware_info["mps_available"]:
            self.student_model = nn.Linear(
                hidden_size, 128
            )  # Adapt for MPS (Apple Silicon)
        else:
            self.student_model = nn.Linear(
                hidden_size, 64
            )  # Smaller model for CPU-only devices

        self.output_layer = nn.Linear(
            self.student_model.out_features, 2
        )  # Output layer size stays the same

    def forward(self, input_ids, attention_mask):
        teacher_outputs = self.teacher_model(input_ids, attention_mask=attention_mask)
        student_outputs = self.student_model(teacher_outputs[1])  # Use pooled output
        return self.output_layer(student_outputs)

In [80]:
class AdaptiveStudentModel(nn.Module):
    def __init__(self, teacher_model_name, hardware_info):
        super(AdaptiveStudentModel, self).__init__()
        
        # Load teacher model to get its hidden size
        self.teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_name, num_labels=2)
        hidden_size = self.teacher_model.config.hidden_size
        
        # Adjust the student model architecture based on hardware
        if hardware_info["gpu_available"]:
            if hardware_info["gpu_memory"] < 4:  # For low-end GPUs
                self.student_model = nn.Linear(hidden_size, 128)  # Fewer units
            else:
                self.student_model = nn.Linear(hidden_size, 256)  # Higher-end GPUs get a larger model
        elif hardware_info["mps_available"]:
            self.student_model = nn.Linear(hidden_size, 128)  # Adapt for MPS (Apple Silicon)
        else:
            self.student_model = nn.Linear(hidden_size, 64)  # Smaller model for CPU-only devices

        self.output_layer = nn.Linear(self.student_model.out_features, 2)  # Output layer size stays the same

    def forward(self, input_ids, attention_mask):
        teacher_outputs = self.teacher_model(input_ids, attention_mask=attention_mask)
        student_outputs = self.student_model(teacher_outputs[1])  # Use pooled output
        return self.output_layer(student_outputs)

In [81]:
# Distillation loss function
def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    teacher_probs = nn.functional.softmax(teacher_logits / temperature, dim=-1)
    student_probs = nn.functional.log_softmax(student_logits / temperature, dim=-1)
    # Kullback-Leibler (KL) Divergence
    return nn.KLDivLoss()(student_probs, teacher_probs) * (temperature**2)

In [82]:
def get_adaptive_batch_size(hardware_info):
    if hardware_info["gpu_available"]:
        if hardware_info["gpu_memory"] < 4: 
            return 16
        else:  
            return 32
    elif hardware_info["mps_available"]:
        return 16  
    else:
        return 8  

In [83]:
# Training function (example implementation)
def adaptive_train(student_model, teacher_model, tokenized_datasets, hardware_info):
    student_model.train()
    optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)

    for epoch in range(epochs):
        for batch in tokenized_datasets["train"]:
            optimizer.zero_grad()

            # Prepare inputs
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Get teacher outputs
            with torch.no_grad():
                teacher_outputs = teacher_model(
                    input_ids, attention_mask=attention_mask
                )

            # Get student outputs
            student_outputs = student_model(input_ids, attention_mask=attention_mask)

            # Compute loss (make sure you have a loss function defined)
            loss = distillation_loss(student_outputs, teacher_outputs.logits)
            loss.backward()
            optimizer.step()

            print(f"Epoch: {epoch}, Loss: {loss.item()}")

In [84]:
def adaptive_evaluate(student_model, tokenized_datasets):
    student_model.eval()
    correct_predictions = 0
    total_predictions = 0

    for batch in tokenized_datasets["test"]:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            outputs = student_model(input_ids, attention_mask)
            predictions = torch.argmax(outputs, dim=-1)

        correct_predictions += (predictions == batch["label"]).sum().item()
        total_predictions += len(predictions)

    accuracy = correct_predictions / total_predictions
    print(f"Accuracy: {accuracy:.4f}")

In [85]:
hardware_info = get_hardware_info()

student_model = AdaptiveStudentModel(teacher_model_name, hardware_info).to(device)

teacher_model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [86]:
adaptive_train(student_model, teacher_model, tokenized_datasets, hardware_info)

RuntimeError: The size of tensor a (363) must match the size of tensor b (512) at non-singleton dimension 1