In [1]:
import torch
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

In [2]:
device

device(type='mps')

In [3]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
teacher_model_name = "distilbert-base-uncased"
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_name, num_labels=2
)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
dataset = load_dataset("imdb")
# Updated tokenization function with padding and truncation
def tokenize_function(examples):
    # Ensure the input is a list of strings
    texts = [str(text) for text in examples["text"]]

    # Tokenize each example and ensure padding, truncation, and correct tensor formatting
    tokenized = teacher_tokenizer(
        texts,
        padding="max_length",  # Use max_length padding
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Remove the batch dimension added by the tokenizer
    tokenized = {k: v.squeeze(0) for k, v in tokenized.items()}

    return tokenized


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 25000/25000 [00:08<00:00, 2893.25 examples/s]


In [6]:
tokenized_datasets["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 25000
})

In [7]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Define a data collator that pads sequences dynamically
data_collator = DataCollatorWithPadding(teacher_tokenizer, return_tensors="pt")

In [8]:
import torch
import psutil


def get_hardware_info():
    # Check if a CUDA GPU is available
    gpu_available = torch.cuda.is_available()

    # Check if MPS is available (for Apple Silicon devices like M1/M2)
    mps_available = torch.backends.mps.is_available()

    # Get available system RAM
    total_ram = psutil.virtual_memory().total / (1024**3)  # Convert to GB
    available_ram = psutil.virtual_memory().available / (1024**3)  # Convert to GB

    # Get CPU core count
    cpu_cores = psutil.cpu_count(logical=False)  # Physical cores
    logical_cores = psutil.cpu_count(logical=True)  # Logical cores

    # Get GPU details if a CUDA GPU is available
    if gpu_available:
        gpu_count = torch.cuda.device_count()
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / (
            1024**3
        )  # Convert to GB
        gpu_type = "CUDA"

    # Get MPS details if an MPS GPU is available
    elif mps_available:
        gpu_count = 1  # MPS typically means 1 Apple GPU
        gpu_mem = None  # No easy way to fetch memory for MPS currently
        gpu_type = "MPS"

    # If neither CUDA nor MPS is available
    else:
        gpu_count = 0
        gpu_mem = 0
        gpu_type = "None"

    # Combine the hardware info into a dictionary
    hardware_info = {
        "gpu_available": gpu_available,
        "mps_available": mps_available,
        "gpu_type": gpu_type,
        "gpu_count": gpu_count,
        "gpu_memory": gpu_mem,
        "total_ram": total_ram,
        "available_ram": available_ram,
        "cpu_cores": cpu_cores,
        "logical_cores": logical_cores,
    }

    return hardware_info


# Check hardware info
hardware_info = get_hardware_info()
print(hardware_info)

{'gpu_available': False, 'mps_available': True, 'gpu_type': 'MPS', 'gpu_count': 1, 'gpu_memory': None, 'total_ram': 8.0, 'available_ram': 2.5740509033203125, 'cpu_cores': 8, 'logical_cores': 8}


In [9]:
class AdaptiveStudentModel(nn.Module):
    def __init__(self, teacher_model_name, hardware_info):
        super(AdaptiveStudentModel, self).__init__()

        # Load teacher model to get its hidden size
        self.teacher_model = AutoModelForSequenceClassification.from_pretrained(
            teacher_model_name, num_labels=2
        )
        hidden_size = self.teacher_model.config.hidden_size

        # Adjust the student model architecture based on hardware
        if hardware_info["gpu_available"]:
            if hardware_info["gpu_memory"] < 4:  # For low-end GPUs
                self.student_model = nn.Linear(hidden_size, 128)  # Fewer units
            else:
                self.student_model = nn.Linear(
                    hidden_size, 256
                )  # Higher-end GPUs get a larger model
        elif hardware_info["mps_available"]:
            self.student_model = nn.Linear(
                hidden_size, 128
            )  # Adapt for MPS (Apple Silicon)
        else:
            self.student_model = nn.Linear(
                hidden_size, 64
            )  # Smaller model for CPU-only devices

        self.output_layer = nn.Linear(
            self.student_model.out_features, 2
        )  # Output layer size stays the same

    def forward(self, input_ids, attention_mask):
        teacher_outputs = self.teacher_model(input_ids, attention_mask=attention_mask)
        student_outputs = self.student_model(teacher_outputs[1])  # Use pooled output
        return self.output_layer(student_outputs)

In [10]:
class AdaptiveStudentModel(nn.Module):
    def __init__(self, teacher_model_name, hardware_info):
        super(AdaptiveStudentModel, self).__init__()

        # Load teacher model to get its hidden size
        self.teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_name, num_labels=2)
        hidden_size = self.teacher_model.config.hidden_size

        # Adjust the student model architecture based on hardware
        if hardware_info["gpu_available"]:
            if hardware_info["gpu_memory"] < 4:  # For low-end GPUs
                self.student_model = nn.Linear(hidden_size, 128)  # Fewer units
            else:
                self.student_model = nn.Linear(hidden_size, 256)  # Higher-end GPUs get a larger model
        elif hardware_info["mps_available"]:
            self.student_model = nn.Linear(hidden_size, 128)  # Adapt for MPS (Apple Silicon)
        else:
            self.student_model = nn.Linear(hidden_size, 64)  # Smaller model for CPU-only devices

        self.output_layer = nn.Linear(self.student_model.out_features, 2)  # Output layer size stays the same

    def forward(self, input_ids, attention_mask):
        teacher_outputs = self.teacher_model(input_ids, attention_mask=attention_mask)
        student_outputs = self.student_model(
            teacher_outputs.pooler_output
        )  # Use pooled output
        return self.output_layer(student_outputs)

In [11]:
# Distillation loss function
def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    teacher_probs = nn.functional.softmax(teacher_logits / temperature, dim=-1)
    student_probs = nn.functional.log_softmax(student_logits / temperature, dim=-1)
    # Kullback-Leibler (KL) Divergence
    return nn.KLDivLoss()(student_probs, teacher_probs) * (temperature**2)

In [12]:
def get_adaptive_batch_size(hardware_info):
    if hardware_info["gpu_available"]:
        if hardware_info["gpu_memory"] < 4: 
            return 16
        else:  
            return 32
    elif hardware_info["mps_available"]:
        return 16  
    else:
        return 8  

In [13]:
# Training function (example implementation)
def adaptive_train(
    student_model, teacher_model, tokenized_datasets, hardware_info, epochs=3
):
    student_model.train()
    optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)

    # Define the DataLoader with dynamic padding and batching
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        batch_size=get_adaptive_batch_size(hardware_info),
        collate_fn=data_collator,
        shuffle=True,
    )

    for epoch in range(epochs):
        for batch in train_dataloader:
            optimizer.zero_grad()

            # input_ids and attention_mask are now PyTorch tensors
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Get teacher outputs
            with torch.no_grad():
                teacher_outputs = teacher_model(
                    input_ids, attention_mask=attention_mask
                )

            # Get student outputs
            student_outputs = student_model(input_ids, attention_mask=attention_mask)

            # Compute loss
            loss = distillation_loss(student_outputs, teacher_outputs.logits)
            loss.backward()
            optimizer.step()

            print(f"Epoch: {epoch}, Loss: {loss.item()}")

In [14]:
def adaptive_evaluate(student_model, tokenized_datasets):
    student_model.eval()
    correct_predictions = 0
    total_predictions = 0

    # Use DataLoader for evaluation as well
    eval_dataloader = DataLoader(
        tokenized_datasets["test"],
        batch_size=get_adaptive_batch_size(hardware_info),
        collate_fn=data_collator,
    )

    for batch in eval_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with torch.no_grad():
            outputs = student_model(input_ids, attention_mask)
            predictions = torch.argmax(outputs, dim=-1)

        correct_predictions += (predictions == batch["label"]).sum().item()
        total_predictions += len(predictions)

    accuracy = correct_predictions / total_predictions
    print(f"Accuracy: {accuracy:.4f}")

In [15]:
hardware_info = get_hardware_info()

student_model = AdaptiveStudentModel(teacher_model_name, hardware_info).to(device)

teacher_model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [16]:
epochs = 3
adaptive_train(student_model, teacher_model, tokenized_datasets, hardware_info)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'teacher_tokenizer' is not defined

In [5]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
import psutil

# Device setup
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Load teacher model and tokenizer
teacher_model_name = "distilbert-base-uncased"
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_name, num_labels=2
)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)

# Load dataset
dataset = load_dataset("imdb")


# Updated tokenization function
def tokenize_function(examples):
    # Ensure the input is a list of strings
    texts = [str(text) for text in examples["text"]]

    # Tokenize each example and ensure padding, truncation, and correct tensor formatting
    tokenized = teacher_tokenizer(
        texts,
        padding="max_length",  # Use max_length padding
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Remove the batch dimension added by the tokenizer
    tokenized = {k: v.squeeze(0) for k, v in tokenized.items()}

    # Add labels to the tokenized output
    tokenized["labels"] = examples["label"]

    return tokenized


# Update the dataset processing
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset["train"].column_names
)
tokenized_datasets.set_format(
    "torch", columns=["input_ids", "attention_mask", "labels"]
)


# Hardware info function
def get_hardware_info():
    gpu_available = torch.cuda.is_available()
    mps_available = torch.backends.mps.is_available()
    total_ram = psutil.virtual_memory().total / (1024**3)  # Convert to GB
    available_ram = psutil.virtual_memory().available / (1024**3)  # Convert to GB
    cpu_cores = psutil.cpu_count(logical=False)  # Physical cores
    logical_cores = psutil.cpu_count(logical=True)  # Logical cores

    if gpu_available:
        gpu_count = torch.cuda.device_count()
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / (
            1024**3
        )  # Convert to GB
        gpu_type = "CUDA"
    elif mps_available:
        gpu_count = 1
        gpu_mem = None
        gpu_type = "MPS"
    else:
        gpu_count = 0
        gpu_mem = 0
        gpu_type = "None"

    return {
        "gpu_available": gpu_available,
        "mps_available": mps_available,
        "gpu_type": gpu_type,
        "gpu_count": gpu_count,
        "gpu_memory": gpu_mem,
        "total_ram": total_ram,
        "available_ram": available_ram,
        "cpu_cores": cpu_cores,
        "logical_cores": logical_cores,
    }


# Check hardware info
hardware_info = get_hardware_info()
print(hardware_info)


class AdaptiveStudentModel(nn.Module):
    def __init__(self, teacher_model_name, hardware_info):
        super(AdaptiveStudentModel, self).__init__()
        self.teacher_model = AutoModelForSequenceClassification.from_pretrained(
            teacher_model_name, num_labels=2
        )
        hidden_size = self.teacher_model.config.hidden_size

        # Determine the size of the intermediate layer
        if hardware_info["gpu_available"]:
            if hardware_info["gpu_memory"] < 4:
                intermediate_size = 128
            else:
                intermediate_size = 256
        elif hardware_info["mps_available"]:
            intermediate_size = 128
        else:
            intermediate_size = 64

        # Create a sequence of layers
        self.student_model = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size),
            nn.ReLU(),
            nn.Linear(intermediate_size, intermediate_size),
            nn.ReLU(),
        )

        self.output_layer = nn.Linear(intermediate_size, 2)

    def forward(self, input_ids, attention_mask):
        # Use the teacher model to get hidden states
        with torch.no_grad():
            teacher_outputs = self.teacher_model(
                input_ids, attention_mask=attention_mask, output_hidden_states=True
            )
            teacher_hidden_states = teacher_outputs.hidden_states[
                -1
            ]  # Last hidden layer

        # Pool the hidden states (mean pooling)
        pooled_output = teacher_hidden_states.mean(
            dim=1
        )  # Shape: (batch_size, hidden_size)

        # Process the pooled output through the student model
        student_hidden = self.student_model(pooled_output)

        # Final classification layer
        student_logits = self.output_layer(student_hidden)

        return student_logits


# Distillation loss function
def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    teacher_probs = nn.functional.softmax(teacher_logits / temperature, dim=-1)
    student_probs = nn.functional.log_softmax(student_logits / temperature, dim=-1)
    return nn.KLDivLoss()(student_probs, teacher_probs) * (temperature**2)


# Function to get adaptive batch size based on hardware
def get_adaptive_batch_size(hardware_info):
    if hardware_info["gpu_available"]:
        return 32 if hardware_info["gpu_memory"] >= 4 else 16
    elif hardware_info["mps_available"]:
        return 16
    else:
        return 8


# Training function
# Update the training function
def adaptive_train(student_model, teacher_model, train_dataloader, hardware_info, epochs=3):
    student_model.train()
    teacher_model.eval()  # Ensure teacher model is in evaluation mode
    optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)

    for epoch in range(epochs):
        for batch in train_dataloader:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids, attention_mask=attention_mask)

            student_logits = student_model(input_ids, attention_mask=attention_mask)

            # Compute loss
            distill_loss = distillation_loss(student_logits, teacher_outputs.logits)
            
            # Add a classification loss component
            clf_loss = nn.CrossEntropyLoss()(student_logits, labels)
            
            # Combine losses
            loss = distill_loss + clf_loss
            
            loss.backward()
            optimizer.step()

            print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Evaluation function
def adaptive_evaluate(student_model, eval_dataloader):
    student_model.eval()
    correct_predictions = 0
    total_predictions = 0

    for batch in eval_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = student_model(input_ids, attention_mask)
            predictions = torch.argmax(outputs, dim=-1)

        correct_predictions += (predictions == labels).sum().item()
        total_predictions += len(predictions)

    accuracy = correct_predictions / total_predictions
    print(f"Accuracy: {accuracy:.4f}")


# Initialize models and run training
hardware_info = get_hardware_info()
student_model = AdaptiveStudentModel(teacher_model_name, hardware_info).to(device)
teacher_model.to(device)

# Create DataLoaders
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=get_adaptive_batch_size(hardware_info),
    shuffle=True,
)

eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=get_adaptive_batch_size(hardware_info)
)

# Run training
epochs = 3
adaptive_train(
    student_model, teacher_model, train_dataloader, hardware_info, epochs=epochs
)

# Run evaluation
adaptive_evaluate(student_model, eval_dataloader)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'gpu_available': False, 'mps_available': True, 'gpu_type': 'MPS', 'gpu_count': 1, 'gpu_memory': None, 'total_ram': 8.0, 'available_ram': 1.310760498046875, 'cpu_cores': 8, 'logical_cores': 8}
Epoch: 0, Loss: 0.6953972578048706
Epoch: 0, Loss: 0.6927683353424072
Epoch: 0, Loss: 0.6878712177276611
Epoch: 0, Loss: 0.6898900866508484
Epoch: 0, Loss: 0.698832631111145
Epoch: 0, Loss: 0.6825283765792847
Epoch: 0, Loss: 0.7095991969108582
Epoch: 0, Loss: 0.6893239617347717
Epoch: 0, Loss: 0.6985973715782166
Epoch: 0, Loss: 0.6958765387535095
Epoch: 0, Loss: 0.6848708391189575
Epoch: 0, Loss: 0.6960991024971008
Epoch: 0, Loss: 0.6945747137069702
Epoch: 0, Loss: 0.6977376341819763
Epoch: 0, Loss: 0.6892644166946411
Epoch: 0, Loss: 0.6899850964546204
Epoch: 0, Loss: 0.6892284154891968
Epoch: 0, Loss: 0.6915101408958435
Epoch: 0, Loss: 0.6922131776809692
Epoch: 0, Loss: 0.7037166953086853
Epoch: 0, Loss: 0.691405177116394
Epoch: 0, Loss: 0.6984713673591614
Epoch: 0, Loss: 0.701328456401825
Epoch

KeyboardInterrupt: 