## Loss switching

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim

# =========================================================
# 1) DUMMY MODEL
# =========================================================
# A simple linear model for demonstration purposes
# Input: 10 features, Output: 2 classes
model = nn.Linear(10, 2)

# Optimizer: decides how to adjust model weights to reduce errors
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# =========================================================
# 2) DUMMY DATA
# =========================================================
# Small batch of 32 samples, 10 features each
x = torch.randn(32, 10)
# Random integer labels for classification (0 or 1)
y = torch.randint(0, 2, (32,))

# =========================================================
# 3) DEFINE TWO LOSSES
# =========================================================
# Loss1: CrossEntropyLoss for classification
criterion1 = nn.CrossEntropyLoss()
# Loss2: MSELoss as a second task/dummy loss
criterion2 = nn.MSELoss()

# Number of epochs for training
NUM_EPOCHS = 4

# =========================================================
# 4) TRAINING LOOP WITH LOSS SWITCHING
# =========================================================
for epoch in range(NUM_EPOCHS):
    # Step 4.1: Zero gradients from previous step
    optimizer.zero_grad()
    
    # Step 4.2: Forward pass through the model
    logits = model(x)
    
    # Step 4.3: Compute both losses
    # Loss1: standard classification loss
    loss1 = criterion1(logits, y)
    # Loss2: MSE between predicted probabilities and one-hot labels
    y_onehot = torch.nn.functional.one_hot(y, num_classes=2).float()
    loss2 = criterion2(torch.softmax(logits, dim=1), y_onehot)
    
    # -----------------------------
    # Step 4.4: LOSS SWITCHING LOGIC
    # -----------------------------
    # Alpha factor controls contribution of each loss
    # Start with 100% loss1 and 0% loss2
    # End with 0% loss1 and 100% loss2
    # Linear interpolation over epochs
    alpha = 1 - (epoch / (NUM_EPOCHS - 1))  # 1 → 0 across epochs
    
    # Combine losses according to alpha
    total_loss = alpha * loss1 + (1 - alpha) * loss2
    
    # Step 4.5: Backpropagation
    # Compute gradients for all model parameters
    total_loss.backward()
    
    # Step 4.6: Update model weights based on gradients
    optimizer.step()
    
    # Step 4.7: Print metrics
    print(
        f"Epoch {epoch+1}: "
        f"alpha={alpha:.2f}, "
        f"Loss1={loss1.item():.4f}, "
        f"Loss2={loss2.item():.4f}, "
        f"Total={total_loss.item():.4f}"
    )

Epoch 1: alpha=1.00, Loss1=0.7756, Loss2=0.2785, Total=0.7756
Epoch 2: alpha=0.67, Loss1=0.7737, Loss2=0.2778, Total=0.6084
Epoch 3: alpha=0.33, Loss1=0.7718, Loss2=0.2770, Total=0.4419
Epoch 4: alpha=0.00, Loss1=0.7700, Loss2=0.2763, Total=0.2763


## Demo Of MoE

In [15]:
"""
Mixture-of-Experts (MoE) demo combining two NLP experts (BERT + DistilBERT).

- Uses pretrained models from Hugging Face when available (internet required).
- Falls back to simple text models if transformers are unavailable or offline.
- Uses a synthetic text dataset so it runs fully offline.
- Demonstrates how the gating network learns to combine expert outputs.

🟩 BERT excels at: Understanding nuanced language
Great for semantic understanding, complex sentiment, entailment, question answering, and coreference resolution.
Example: “The movie was bad, but the acting was brilliant.”
→ BERT captures long-range context

🟨 DistilBERT excels at: Speed and Efficiency
~40% smaller, ~60% faster at inference.

"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import random

# =========================================================
# 1) CONFIGURATION AND DEVICE SETUP
# =========================================================
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

BATCH_SIZE = 16
NUM_CLASSES = 10
EMBED_DIM = 256
NUM_EPOCHS = 1
USE_PRETRAINED = True

# =========================================================
# 2) SYNTHETIC TEXT DATASET
# =========================================================
class SyntheticSentimentDataset(Dataset):
    """
    Creates short, meaningful sentences with positive or negative sentiment.
    Each sample: (sentence, label)
    Label: 1 = Positive, 0 = Negative
    """

    def __init__(self, n=512):
        self.n = n

        # Define word pools
        self.positive_subjects = ["I", "We", "My friend", "Our team", "The movie"]
        self.negative_subjects = ["I", "We", "My boss", "The service", "The food"]

        self.positive_verbs = ["love", "enjoy", "like", "appreciate", "recommend"]
        self.negative_verbs = ["hate", "dislike", "regret", "complain about", "avoid"]

        self.positive_objects = ["the product", "this place", "the performance",
                                 "the design", "the food", "the experience"]
        self.negative_objects = ["the delay", "the taste", "this service",
                                 "the experience", "the product", "the noise"]

        self.positive_endings = ["It was amazing!", "Really great.", "Highly recommended!",
                                 "Would come again.", "Such a good feeling."]
        self.negative_endings = ["It was terrible.", "Really bad.", "Not worth it.",
                                 "Never again.", "Disappointing experience."]

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        # Randomly choose sentiment
        label = random.choice([0, 1])  # 0 = negative, 1 = positive

        if label == 1:
            subj = random.choice(self.positive_subjects)
            verb = random.choice(self.positive_verbs)
            obj = random.choice(self.positive_objects)
            end = random.choice(self.positive_endings)
        else:
            subj = random.choice(self.negative_subjects)
            verb = random.choice(self.negative_verbs)
            obj = random.choice(self.negative_objects)
            end = random.choice(self.negative_endings)

        # Create a simple but meaningful sentence
        text = f"{subj} {verb} {obj}. {end}"

        return text, label


# Instantiate dataset and dataloader
dataset = SyntheticSentimentDataset(n=512)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

Device: cuda:0


In [14]:
print("\nSample records from SyntheticTextDataset:")
for i in range(5):  # show 5 random samples
    text, label = dataset[i]
    print(f"{i+1:>2}) Text: {text}")
    print(f"    Label: {label}\n")


Sample records from SyntheticTextDataset:
 1) Text: My boss avoid the delay. It was terrible.
    Label: 0

 2) Text: Our team love this place. Such a good feeling.
    Label: 1

 3) Text: My friend appreciate this place. Highly recommended!
    Label: 1

 4) Text: I like the product. Would come again.
    Label: 1

 5) Text: The movie enjoy the design. It was amazing!
    Label: 1



In [5]:
# =========================================================
# 3) TOKENIZER SETUP (uses Hugging Face if available)
# =========================================================

try:
    # Attempt to import Hugging Face transformers library
    from transformers import AutoTokenizer, AutoModel
    HF_AVAILABLE = True  # Flag indicating that HF tokenizer/model can be used

    # Initialize BERT tokenizer
    tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased")
    # Initialize DistilBERT tokenizer
    tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert-base-uncased")

except Exception:
    # If transformers library or pretrained models are not available
    HF_AVAILABLE = False  # Fall back to simple tokenizer
    tokenizer_bert = tokenizer_distilbert = None
    print("Transformers not available; falling back to simple tokenizer.")


def simple_tokenize(batch_texts, max_len=16, tokenizer=None):
    """
    Tokenize a batch of text strings into input IDs and attention masks.
    
    Args:
        batch_texts: list of text strings
        max_len: maximum sequence length (truncate/pad sentences)
        tokenizer: optional Hugging Face tokenizer to use
    
    Returns:
        Dictionary with:
            'input_ids': tensor of token IDs [batch_size, max_len]
            'attention_mask': tensor of 0/1 mask [batch_size, max_len]
    """

    if HF_AVAILABLE and tokenizer is not None:
        # Use Hugging Face tokenizer if available
        enc = tokenizer(
            batch_texts,
            padding="max_length",       # pad sentences to max_len
            truncation=True,            # truncate sentences longer than max_len
            max_length=max_len,
            return_tensors="pt"         # return PyTorch tensors
        )
        return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]}

    else:
        # Fallback: simple tokenization if HF tokenizer is unavailable
        input_ids, attention_mask = [], []

        for t in batch_texts:
            # Split sentence into words and truncate to max_len
            toks = t.split()[:max_len]

            # Convert words to small integer IDs via hashing
            ids = [(abs(hash(w)) % 1000) + 1 for w in toks]

            # Pad sequence to max_len with 0s
            pad_len = max_len - len(ids)
            ids = ids + [0]*pad_len

            # Create attention mask: 1 for real tokens, 0 for padding
            mask = [1]*len(toks) + [0]*pad_len

            input_ids.append(ids)
            attention_mask.append(mask)

        # Convert lists to PyTorch tensors
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long)
        }


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
# =========================================================
# 4) NLP EXPERT MODELS (BERT + DISTILBERT)
# =========================================================
class NLPExpert(nn.Module):
    """
    This class represents a single NLP “expert” in our mixture-of-experts setup.
    Conceptually, it takes a piece of text and turns it into a fixed-size numerical vector 
    (embedding) that summarizes the meaning of the sentence.

    - If a pretrained transformer (like BERT or DistilBERT) is available, it uses it 
      to get a sophisticated understanding of the text.
    - If transformers are not available (offline or demo mode), it uses a small simple model 
      that still produces a vector representation from the words.
    """

    def __init__(self, model_name="bert-base-uncased", out_dim=EMBED_DIM):
        super().__init__()
        self.model_name = model_name

        try:
            # Try to load a pretrained transformer for rich language understanding
            self.transformer = AutoModel.from_pretrained(model_name)
            # A small linear layer converts transformer output to the size we need for our model
            self.fc = nn.Linear(self.transformer.config.hidden_size, out_dim)
            self._is_pretrained = True  # remember that we are using a real transformer
        except Exception:
            # Fallback: lightweight model for when transformers aren't available
            # Embedding layer converts words to vectors
            self.embedding = nn.Embedding(1001, 128, padding_idx=0)
            # Simple pooling over words to get one vector representing the whole sentence
            self.pool = nn.AdaptiveAvgPool1d(1)
            # Linear layer maps the pooled vector to the size our MoE model expects
            self.fc = nn.Linear(128, out_dim)
            self._is_pretrained = False  # using simple fallback

    def forward(self, text_inputs):
        """
        Convert a batch of text into a batch of fixed-size vectors.

        Input:
            text_inputs: a dictionary containing tokenized text
                         ('input_ids' and 'attention_mask')
        Output:


In [7]:
# =========================================================
# 5) GATING NETWORK — decides expert weights per sample
# =========================================================
class SimpleGate(nn.Module):
    """
    This is the “decision maker” of the Mixture-of-Experts model.

    Conceptually:
    - The gate looks at a summary vector representing the input (context_feat).
    - It decides **how much to trust each expert** for this particular input.
    - Outputs a set of weights (one per expert) that sum to 1.
      These weights will later be used to combine expert outputs.
    """

    def __init__(self, input_dim=EMBED_DIM, num_experts=2):
        super().__init__()
        # Linear layer maps input context vector to one score per expert
        self.fc = nn.Linear(input_dim, num_experts)

    def forward(self, context_feat):
        """
        Forward pass:
        1. Take a context vector representing the input.
        2. Produce raw scores for each expert using a linear layer.
        3. Convert scores to probabilities using softmax (so they sum to 1).
        4. Return these probabilities (weights) for combining experts.

        Input:
            context_feat: [batch_size, input_dim] summary of the input
        Output:
            weights: [batch_size, num_experts] probabilities for each expert
        """
        logits = self.fc(context_feat)         # Raw score per expert
        weights = F.softmax(logits, dim=-1)    # Convert to probabilities
        return weights


In [8]:
# =========================================================
# 6) MIXTURE OF EXPERTS (COMBINES BERT + DISTILBERT)
# =========================================================
class TextMoE(nn.Module):
    """
    This class implements the Mixture-of-Experts (MoE) model for text.
    
    Conceptually:
    - We have multiple NLP “experts” (e.g., BERT and DistilBERT).
    - Each expert produces its own vector representation of the input sentence.
    - A gating network decides how much to trust each expert for this particular input.
    - The expert outputs are then combined (weighted sum) according to the gate.
    - The combined vector is passed to a final classifier to predict the label.
    """

    def __init__(self, expert1, expert2, gate, out_classes=NUM_CLASSES):
        super().__init__()
        # Store the two experts in a ModuleList
        self.experts = nn.ModuleList([expert1, expert2])
        # The gating network decides per-input expert weights
        self.gate = gate
        # Final classification layer maps combined embedding to class probabilities
        self.output_head = nn.Linear(EMBED_DIM, out_classes)

    def forward(self, tokenized1, tokenized2):
        """
        Forward pass for the MoE model.

        Inputs:
            tokenized1: tokenized text for expert1 (e.g., BERT)
            tokenized2: tokenized text for expert2 (e.g., DistilBERT)

        Steps:
        1. Get embeddings from each expert separately
            - out1 = expert1(tokenized1)
            - out2 = expert2(tokenized2)
        2. Create a “context vector” for the gate
            - Here, we simply take the average of the expert embeddings
        3. Compute soft weights for each expert using the gating network
        4. Stack expert embeddings together
        5. Multiply each expert embedding by its weight and sum
           → produces a single combined vector representing the input
        6. Pass the combined vector through a classification layer
           → produces final logits for each class
        7. Return both logits and the gate weights (useful for inspection)
        """
        # Step 1: forward through both experts
        out1 = self.experts[0](tokenized1)   # BERT output
        out2 = self.experts[1](tokenized2)   # DistilBERT output

        # Step 2: compute context for gating
        context = (out1 + out2) / 2

        # Step 3: gate decides weights for each expert
        weights = self.gate(context)  # [batch_size, 2]

        # Step 4: stack expert embeddings for weighted combination
        stacked = torch.stack([out1, out2], dim=-1)  # [B, EMBED_DIM, 2]

        # Step 5: combine experts according to weights
        mixed = (stacked * weights.unsqueeze(1)).sum(dim=-1)

        # Step 6: final classification
        logits = self.output_head(mixed)

        # Step 7: return logits and gate weights
        return logits, weights


In [9]:
# =========================================================
# 7) MODEL INITIALIZATION
# =========================================================
expert1 = NLPExpert(model_name="bert-base-uncased", out_dim=EMBED_DIM)
expert2 = NLPExpert(model_name="distilbert-base-uncased", out_dim=EMBED_DIM)
gate = SimpleGate(input_dim=EMBED_DIM, num_experts=2)

model = TextMoE(expert1, expert2, gate).to(device)
print("Model created.")
print(f"Expert1: {expert1.model_name}, pretrained: {expert1._is_pretrained}")
print(f"Expert2: {expert2.model_name}, pretrained: {expert2._is_pretrained}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Model created.
Expert1: bert-base-uncased, pretrained: True
Expert2: distilbert-base-uncased, pretrained: True


In [10]:
# =========================================================
# 8) OPTIMIZER / LOSS FUNCTION
# =========================================================
# Optimizer: decides how to adjust model weights to reduce errors
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Loss function: measures how wrong the model's predictions are
# CrossEntropyLoss is used for multi-class classification tasks
criterion = nn.CrossEntropyLoss()


# =========================================================
# 9) TRAINING LOOP
# =========================================================
# We train the model for NUM_EPOCHS passes over the dataset
for epoch in range(NUM_EPOCHS):
    model.train()  # set model to training mode
    running_loss, correct, total = 0.0, 0, 0
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")

    for texts, labels in pbar:
        # Step 1: Tokenize text for both experts
        tokenized1 = simple_tokenize(texts, tokenizer=tokenizer_bert)
        tokenized2 = simple_tokenize(texts, tokenizer=tokenizer_distilbert)

        # Move all inputs to the correct device (CPU/GPU)
        for k in tokenized1: tokenized1[k] = tokenized1[k].to(device)
        for k in tokenized2: tokenized2[k] = tokenized2[k].to(device)
        labels = labels.to(device)

        # Step 2: Zero gradients from previous step
        optimizer.zero_grad()

        # Step 3: Forward pass through the Mixture-of-Experts model
        # Returns:
        #   logits = predicted class scores
        #   gate_w = weights assigned to each expert for this batch
        logits, gate_w = model(tokenized1, tokenized2)

        # Step 4: Compute loss between predictions and true labels
        loss = criterion(logits, labels)

        # Step 5: Backpropagation — compute gradients
        loss.backward()

        # Step 6: Update model weights based on gradients
        optimizer.step()

        # Step 7: Track running metrics
        running_loss += loss.item() * labels.size(0)
        _, predicted = logits.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        # Step 8: Update progress bar with loss, accuracy, and gate weights
        pbar.set_postfix({
            "loss": f"{running_loss/total:.4f}",
            "acc": f"{correct/total:.3f}",
            "w1": f"{gate_w[:,0].mean().item():.3f}",  # average weight for expert1
            "w2": f"{gate_w[:,1].mean().item():.3f}"   # average weight for expert2
        })

    # Step 9: Compute and print epoch-level metrics
    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = correct / total
    print(f"Epoch {epoch+1}: Loss={epoch_loss:.4f}, Acc={epoch_acc:.4f}")

Epoch 1/1: 100%|███████████████████████████| 32/32 [00:03<00:00,  9.41it/s, loss=2.3116, acc=0.094, w1=0.510, w2=0.490]

Epoch 1: Loss=2.3116, Acc=0.0938



