In [2]:
import os
import torch
import time
from huggingface_hub import notebook_login
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, GenerationConfig
#from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training, AdaLoraConfig
from transformers.models.roberta.modeling_roberta import RobertaForSequenceClassification, RobertaLayer
from transformers import TrainingArguments
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
import logging

from datasets import load_dataset
raw_datasets  = load_dataset("glue", 'sst2')

In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification


model_name = "FacebookAI/roberta-base"
# Load the configuration of the pre-trained model
config = AutoConfig.from_pretrained(model_name)

config.hidden_dropout_prob=0.0
config.attention_probs_dropout_prob=0.00
# Load the tokenizer for the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding


col_to_delete = ['idx', 'sentence']

def preprocessing_function(examples):
    return tokenizer(examples['sentence'], truncation=True, max_length=128)

tokenized_datasets = raw_datasets.map(preprocessing_function, batched=True, remove_columns=col_to_delete)

tokenized_datasets.set_format("torch")
# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:


#     LoRAExpert defines a lightweight adapter module that uses low-rank 
#     approximation for efficient fine-tuning of neural networks.
   

class LoRAExpert(nn.Module):
    def __init__(self, input_size, output_size, rank):
        super().__init__()
        self.lora_A = nn.Linear(input_size, rank, bias=False)
        self.lora_B = nn.Linear(rank, output_size, bias=False)

    def forward(self, x):
        # Pass input through the first layer (A) and then the second layer (B)
        return self.lora_B(self.lora_A(x))


In [7]:
from transformers import PretrainedConfig

class SparseMoeConfig(PretrainedConfig):
    model_type = "sparse_moe"

    def __init__(
        self,
        vocab_size=50265,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        position_embedding_type="absolute",
        is_decoder=False,
        use_cache=True,
        pad_token_id=1,
        classifier_dropout=None,
        # DenseMoE-specific parameters
        num_local_experts=8,  # Number of local experts
        num_experts_per_tok=1,
        top_k=8,         # Number of top-k experts activated
        router_jitter_noise=0.01,  # Jitter noise for router logits
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.top_k = top_k  
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.is_decoder = is_decoder
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
        self.pad_token_id = pad_token_id
        # SparseMoE-specific attributes
        self.num_local_experts = num_local_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.router_jitter_noise = router_jitter_noise





In [8]:
from transformers.models.roberta.modeling_roberta import RobertaForSequenceClassification
import torch
import torch.nn as nn
import torch.nn.functional as F
class SparseMoeBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_local_experts
        self.top_k = config.top_k
        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
        self.experts = nn.ModuleList([
            LoRAExpert(self.hidden_dim, self.hidden_dim, rank=4) for _ in range(self.num_experts)
        ])  # List of experts
        self.aux_loss_weight = 0.05  # Weight for auxiliary loss

    def forward(self, hidden_states):
        batch_size, seq_length, hidden_dim = hidden_states.shape
        hidden_states_flat = hidden_states.view(-1, hidden_dim)   # Flatten the input for processing

        # Compute routing logits and routing weights
        router_logits = self.gate(hidden_states_flat)
        routing_weights, selected_experts = torch.topk(F.softmax(router_logits, dim=-1), self.top_k, dim=-1)

        # Auxiliary loss: Balance routing probabilities
        expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).sum(dim=1)
        fraction_tokens = expert_mask.float().sum(dim=0) / hidden_states_flat.size(0)
        average_probs = (expert_mask.float() * routing_weights).sum(dim=0) / hidden_states_flat.size(0)
        auxiliary_loss = self.aux_loss_weight * self.num_experts * torch.sum(fraction_tokens * average_probs)

        # Forward pass for selected experts
        final_hidden_states = torch.zeros_like(hidden_states_flat)
        for expert_idx in range(self.num_experts):
        # Get tokens routed to this expert
            token_idx = torch.where(expert_mask[:, expert_idx] > 0)[0]   # Get tokens for the expert
            token_states = hidden_states_flat[token_idx]
            
            # Compute expert output
            expert_output = self.experts[expert_idx](token_states)
            expert_output *= routing_weights[token_idx, expert_idx].unsqueeze(-1)
        
            # Accumulate expert contributions
            final_hidden_states.index_add_(0, token_idx, expert_output)

        # Reshape output and return with auxiliary loss
        return final_hidden_states.view(batch_size, seq_length, hidden_dim), auxiliary_loss



In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import RobertaConfig

config = RobertaConfig.from_pretrained("roberta-base")



from transformers.models.roberta.modeling_roberta import RobertaLayer

class RobertaLayerWithMoE(RobertaLayer):
    def __init__(self, config, moe_config):
        super().__init__(config)
        self.moe_block = SparseMoeBlock(moe_config)   # Initialize SparseMoeBlock with its configuration (though we implement DenseMoE )
        self.use_moe = True

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        use_cache=False,
    ):
        # Compute attention outputs
        attention_output = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
        )[0]

        if self.use_moe:   # Apply MoE block if enabled
            moe_output, auxiliary_loss = self.moe_block(attention_output)
            ffn_output = self.intermediate(moe_output)
            layer_output = self.output(ffn_output, moe_output)
            return layer_output, auxiliary_loss
        else:   # Default transformer behavior
            intermediate_output = self.intermediate(attention_output)  # Process through intermediate FFN
            layer_output = self.output(intermediate_output, attention_output)  # Combine with output layer
            return layer_output, 0.0  # Return output and zero auxiliary loss



In [10]:
from transformers import RobertaForSequenceClassification

class RobertaForSequenceClassificationWithMoE(RobertaForSequenceClassification):
    def __init__(self, config, moe_config):
        super().__init__(config)
        # Replace the encoder layers with MoE-enabled layers for the last 4 layers
        self.roberta.encoder.layer = nn.ModuleList([
            RobertaLayerWithMoE(config, moe_config) if i >= config.num_hidden_layers - 4 else RobertaLayer(config)
            for i in range(config.num_hidden_layers)
        ])

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        # Forward pass through the RoBERTa model
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Extract hidden states and auxiliary losses from MoE layers
        hidden_states = outputs.last_hidden_state  # Retrieve the final hidden states
        auxiliary_losses = [layer[1] for layer in outputs if isinstance(layer, tuple)]  # Collect auxiliary losses
        total_auxiliary_loss = sum(auxiliary_losses) if auxiliary_losses else 0.0  # Sum auxiliary losses
        # Pass hidden states through the classification head
        logits = self.classifier(hidden_states)
        # Compute the combined loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            main_loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))  # Compute main loss
            loss = main_loss + total_auxiliary_loss  # Add auxiliary loss to main loss
        # Return loss and logits if loss is computed, otherwise just return logits
        return (loss, logits) if loss is not None else logits


In [19]:
# Configuration for our Dense MoE LoRA

moe_config = SparseMoeConfig(
    vocab_size=50265,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,  # Match the pretrained model
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=514,  # Match RoBERTa
    type_vocab_size=1,  # Match RoBERTa
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    position_embedding_type="absolute",
    num_local_experts=8,
    num_experts_per_tok=1,
    router_jitter_noise=0.05,
    top_k=8
)



from transformers import RobertaModel
import torch.nn.init as init

base_model = RobertaForSequenceClassification.from_pretrained(model_name)

# Initialize the custom model
custom_model = RobertaForSequenceClassificationWithMoE(config=base_model.config, moe_config=moe_config)

# Manually load the pretrained weights into the custom model
custom_model.load_state_dict(base_model.state_dict(), strict=False)

# Initialize MoE-Specific Parameters
def initialize_moe_params(moe_block):
    # Initialize gate weights
    init.xavier_uniform_(moe_block.gate.weight)
    # Initialize LoRA expert weights
    for expert in moe_block.experts:
        if isinstance(expert, LoRAExpert):  # Check for LoRAExpert instance
            init.xavier_uniform_(expert.lora_A.weight)
            init.xavier_uniform_(expert.lora_B.weight)
        else:
            # Handle cases where the expert is not a LoRAExpert
            for layer in expert:
                if isinstance(layer, nn.Linear):
                    init.xavier_uniform_(layer.weight)
                    if layer.bias is not None:
                        init.zeros_(layer.bias)

# Apply initialization to MoE blocks
for layer in custom_model.roberta.encoder.layer:
    if hasattr(layer, "moe_block"):
        initialize_moe_params(layer.moe_block)


num_moe_layers = 4  # Enable MoE for the last 4 layers
for i, layer in enumerate(custom_model.roberta.encoder.layer):
    if i < len(custom_model.roberta.encoder.layer) - num_moe_layers:
        layer.use_moe = False # Disable MoE for earlier layers
    else:
        layer.use_moe = True  # Enable MoE for later layers
# Freeze base model parameters
for param in custom_model.roberta.parameters():
    param.requires_grad = False

# Unfreeze LoRA parameters
for name, param in custom_model.named_parameters():
    if "lora_" in name:  # LoRA-specific parameters
        param.requires_grad = True

# Unfreeze MoE parameters

for layer in custom_model.roberta.encoder.layer:
    if hasattr(layer, "moe_block"):
        # Unfreeze gate weights and LoRA expert parameters
        for name, param in layer.moe_block.named_parameters():
            param.requires_grad = True




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):


    logits, labels = eval_pred # eval_pred is the tuple of predictions and labels returned by the model
    predictions = np.argmax(logits, axis=-1)
    
    precision = metrics.precision_score(labels, predictions, average="macro")
    recall = metrics.recall_score(labels, predictions, average="macro")
    f1 = metrics.f1_score(labels, predictions, average="macro")
    accuracy = metrics.accuracy_score(labels, predictions)
    
    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}

In [21]:
from transformers import TrainingArguments, Trainer


import time
from transformers import Trainer, TrainingArguments


# Define training arguments

training_args = TrainingArguments(
    output_dir='loramoesparse',
    learning_rate=2e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.00,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=10000000,
    logging_steps=100,
   
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",  # We can choose from 'linear', 'cosine', 'cosine_with_restarts', 'polynomial', etc.
    warmup_steps=500,
)

trainer = Trainer(
    model=custom_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],

    data_collator=data_collator,
    compute_metrics=compute_metrics
)



In [22]:
def count_trainable_parameters(custom_model):
    return sum(p.numel() for p in custom_model.parameters() if p.requires_grad)

# Display trainable parameters
print(f"Number of trainable parameters: {count_trainable_parameters(custom_model):,}")

Number of trainable parameters: 813,314


In [23]:
trainer.train()



Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
100,0.6953,0.693069,0.254587,0.5,0.337386,0.509174
200,0.6677,0.490498,0.785132,0.780005,0.77791,0.77867
300,0.4905,0.337031,0.867805,0.863833,0.864158,0.864679
400,0.396,0.2807,0.888628,0.887156,0.887423,0.887615
500,0.35,0.276373,0.888085,0.886124,0.885234,0.885321
600,0.3469,0.251877,0.903637,0.903637,0.903637,0.90367
700,0.3326,0.239842,0.919108,0.918277,0.918488,0.918578
800,0.3172,0.251329,0.9076,0.902795,0.903271,0.90367
900,0.3095,0.24614,0.920291,0.918024,0.918398,0.918578
1000,0.2975,0.245509,0.919265,0.916856,0.917239,0.917431


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=1404, training_loss=0.38400899540325173, metrics={'train_runtime': 690.1852, 'train_samples_per_second': 195.162, 'train_steps_per_second': 2.034, 'total_flos': 3304446813017292.0, 'train_loss': 0.38400899540325173, 'epoch': 2.0})