# Text Classification - Vanilla Mixture of Experts (Hard, Pretrained) Expert Check

----

## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Load - Loading our model and state dict and data
* Dataset - Create PyTorch Dataset
* Model - Create PyTorch Vanilla MoE Hard Pretrained
* Helper - helper functions
* Choices - analysis of the choice of experts


## $\color{blue}{Preamble:}$

This notebook analyses which experts are activated for each author, to answer the question of whether the model has learnt to activate the correct expert.

## $\color{blue}{Admin:}$

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive (Colab only) and make the Drive root the working directory,
# so that the relative "class/..." paths used later in the notebook resolve.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/'

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
%%capture
!pip install torch
!pip install dill

In [None]:
import torch
import numpy as np
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(device)

cuda


In [None]:
import pandas as pd
# Directory holding the pickled train/dev/test DataFrames (relative to the
# current working directory).
path = "class/datasets/"
# NOTE(review): read_pickle can execute arbitrary code while unpickling —
# only load files you created or otherwise trust.
df_train, df_dev, df_test = (
    pd.read_pickle(path + split_name)
    for split_name in ("df_train", "df_dev", "df_test")
)

In [None]:
# Peek at the first rows to confirm the frame loaded with the expected columns.
df_train.head()

Unnamed: 0,index,master,book_idx,book,chapter_idx,chapter,author,content,vanilla_embedding
8114,8114,Dubliners,3,Dubliners,31,GRACE,Joyce,“Is it John of Tuam?” “Are you sure of that ...,"[-0.012913608, -0.026916211, 0.0023321153, -0...."
4951,4951,Ulysses,2,Nostos,15,Eumaeus,Joyce,sibly there were several others. He personally...,"[-0.019626686, -0.035692617, -0.034875672, 0.0..."
4629,4629,Ulysses,2,Nostos,15,Eumaeus,Joyce,"Stephen, who was trying his dead best to yawn ...","[0.015934143, -0.0034991587, 0.0035751674, 0.0..."
11556,11556,Dracula,4,Dracula,59,CHAPTER XXVII: MINA HARKER’S JOURNAL,Bram Stoker,"Now to the historical, for as Madam Mina write...","[-4.009433e-05, -0.0041142944, 0.026873538, -0..."
12262,12262,Republic,5,Republic,62,Book III,Plato,The harmonies which you mean are the mixed or ...,"[0.0048890463, -0.0060007297, 0.0054147574, -0..."


## $\color{blue}{Dataset:}$

In [None]:
def _stack_embeddings(frame):
    """Return (list of per-row tensors, those tensors stacked and moved to `device`)
    built from the frame's 'vanilla_embedding' column."""
    vectors = [torch.tensor(vector) for vector in frame['vanilla_embedding']]
    return vectors, torch.stack(vectors).to(device)


train_embeddings, train_x = _stack_embeddings(df_train)
dev_embeddings, dev_x = _stack_embeddings(df_dev)
test_embeddings, test_x = _stack_embeddings(df_test)

In [None]:
# Targets are chapter indices. (Use the 'book_idx' column instead for
# book-level labels.)
train_y, dev_y, test_y = (
    torch.LongTensor(list(frame['chapter_idx'])).to(device)
    for frame in (df_train, df_dev, df_test)
)

In [None]:
from torch.utils.data import Dataset, DataLoader
# Assumes x and y are already tensors and already on the target device.
class VanillaDataset(Dataset):
  """Pairs a feature container `x` with a label container `y`, sample by sample.

  Items are fetched with plain indexing, so `x` and `y` only need to support
  `[]`; the dataset length is taken from `y`.
  """

  def __init__(self, x, y):
    self.x, self.y = x, y

  def __len__(self):
    # The number of labels defines the number of samples.
    return len(self.y)

  def __getitem__(self, index):
    # Return the (features, label) pair stored at `index`.
    return self.x[index], self.y[index]


In [None]:
# Wrap each split's (features, labels) pair in a Dataset.
train_dataset, dev_dataset, test_dataset = (
    VanillaDataset(features, labels)
    for features, labels in ((train_x, train_y), (dev_x, dev_y), (test_x, test_y))
)

In [None]:
# Sanity check: size of the feature vector of the first training sample.
train_dataset[0][0].size()

torch.Size([768])

## $\color{blue}{Model:}$

modify below

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Router(nn.Module):
    """Gating network: maps an input vector to a probability distribution over experts.

    A two-layer MLP (768 -> 128 -> num_experts) produces logits, which are
    divided by a temperature before the softmax. While the module is in
    training mode the temperature is multiplied by 0.99 after every forward
    pass until it reaches 1; in eval mode it is left untouched, so repeated
    evaluation calls are consistent with each other.

    Args:
        num_experts: Number of experts to route between (size of the output).
        temperature: Initial softmax temperature.
    """

    def __init__(self, num_experts, temperature=2):
        super().__init__()
        self.num_experts = num_experts
        self.fc1 = nn.Linear(768, 128)
        self.fc2 = nn.Linear(128, self.num_experts)
        # Plain Python attribute (not a parameter or buffer), so it is not part
        # of the state dict: a freshly built Router starts from this value.
        self.temperature = temperature

    def forward(self, x):
        """Return softmax routing probabilities over the last dimension of the logits."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden) / self.temperature

        # Anneal only while training; evaluation must not mutate the module.
        # Clamp at 1 so the decay can never overshoot below the floor.
        if self.training:
            self.temperature = max(1, self.temperature * 0.99)

        return F.softmax(logits, dim=-1)

class MoE(nn.Module):
    """Mixture of experts with sampled ("hard") routing over three given experts.

    For each input row the router produces a probability per expert; `top_k`
    experts are then sampled (without replacement) from that distribution and
    their outputs are summed, each scaled by its routing probability.

    Args:
        expert_joyce, expert_stoker, expert_plato: Expert modules, stored in
            this order as experts 0, 1 and 2.
        temperature: Stored on the module (see the note in `__init__`).
        num_experts: Number of router outputs / experts iterated over.
        output_size: Width of each expert's output.
        dropout_rate: Stored on the module; not used by this class itself.
        top_k: Number of experts sampled per input row.
    """

    def __init__(self, expert_joyce, expert_stoker, expert_plato, temperature=1.2, num_experts=3, output_size=70, dropout_rate=0.11, top_k=1):
        super().__init__()
        self.num_experts = num_experts
        self.dropout_rate = dropout_rate
        self.k = top_k
        self.output_size = output_size
        self.temperature = temperature
        self.experts = nn.ModuleList([expert_joyce, expert_stoker, expert_plato])
        # NOTE(review): `temperature` is stored above but not forwarded, so the
        # router runs with its own default. Left unchanged because passing it
        # would alter routing for already-trained checkpoints — confirm intent.
        self.router = Router(self.num_experts)

    def forward(self, x):
        """Route a batch through the sampled experts.

        Returns:
            outputs: Tensor of shape (batch, output_size).
            expert_weights: List of `top_k * num_experts` tensors of shape
                (batch, 1), ordered by sample slot and then by expert. Each
                holds the routing probability on rows where that expert was
                sampled and 0 elsewhere. All live on the same device and have
                the same dtype as the routing probabilities.
        """
        # Routing probabilities, shape (batch, num_experts).
        routing_weights = self.router(x)

        # Re-normalise so each row sums to 1 (required for sampling).
        routing_weights = F.normalize(routing_weights, p=1, dim=-1)

        # Sample k expert indices per row according to the probabilities,
        # then look up the probability of each sampled expert.
        topk_indices = torch.multinomial(routing_weights, num_samples=self.k, replacement=False)
        topk_vals = routing_weights.gather(1, topk_indices)

        outputs = torch.zeros(x.size(0), self.output_size, device=x.device)  # Shape (bs, c)
        expert_weights = []
        for i in range(self.k):
            expert_indices = topk_indices[:, i]
            slot_vals = topk_vals[:, i].view(-1, 1)

            for j in range(self.num_experts):
                # Rows of the batch for which expert j was sampled in slot i.
                expert_mask = (expert_indices == j)
                if expert_mask.any():
                    expert_weight = slot_vals * expert_mask.float().view(-1, 1)

                    # The expert runs on the whole batch; the zero weights
                    # remove the rows it was not selected for.
                    expert_output = self.experts[j](x)  # Shape (bs, c)

                    outputs += expert_output * expert_weight  # Shape (bs, c)
                    expert_weights.append(expert_weight)
                else:
                    # Placeholder for an unselected expert: match the device
                    # and dtype of the real weights instead of defaulting to a
                    # float32 CPU tensor.
                    expert_weights.append(torch.zeros_like(slot_vals))

        return outputs, expert_weights


import torch.nn as nn
import torch.nn.functional as F

class DenseBlock(nn.Module):
    """Linear -> BatchNorm1d -> ReLU -> Dropout, applied in that order.

    Args:
        input_size: Number of input features of the linear layer.
        output_size: Number of output features (also the batch-norm width).
        dropout_rate: Drop probability of the final dropout layer.
    """

    def __init__(self, input_size, output_size, dropout_rate):
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)
        self.batch_norm = nn.BatchNorm1d(num_features=output_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        normalised = self.batch_norm(self.linear(x))
        return self.dropout(self.activation(normalised))

class FeedForwardExpert(nn.Module):
    """Expert network: two DenseBlocks (768 -> 400 -> 200) and a linear output layer.

    Every nn.Linear in the module, including those nested inside the
    DenseBlocks, is re-initialised with Xavier-uniform weights and zero biases.

    Args:
        output_size: Width of the final linear layer.
        dropout_rate: Dropout rate passed to both DenseBlocks.
    """

    def __init__(self, output_size, dropout_rate):
        super().__init__()
        self.output_size = output_size

        self.block1 = DenseBlock(768, 400, dropout_rate)
        self.block2 = DenseBlock(400, 200, dropout_rate)
        self.final_layer = nn.Linear(200, self.output_size)

        self.initialize_weights()

    def forward(self, x):
        hidden = self.block2(self.block1(x))  # B x 768 -> B x 400 -> B x 200
        return self.final_layer(hidden)       # B x 200 -> B x output_size

    def initialize_weights(self):
        """Xavier-uniform weights and zero biases for every linear layer."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)


## $\color{blue}{Helper:}$

In [None]:
def accuracy(outputs, labels):
    """Fraction of rows in `outputs` whose highest-scoring column equals the label.

    Args:
        outputs: 2-D score tensor, one row per sample.
        labels: 1-D tensor of class indices, one per row of `outputs`.

    Returns:
        Python float in [0, 1].
    """
    predicted = torch.max(outputs, 1).indices
    n_correct = predicted.eq(labels).sum().item()
    return n_correct / labels.size(0)

In [None]:
def validate(model, dev_loader, criterion):
    """Evaluate `model` on every batch of `dev_loader` without tracking gradients.

    Args:
        model: Module whose forward returns `(outputs, weights)`, where
            `weights` is a list of per-sample tensors (first dim = batch).
        dev_loader: Iterable of `(x, y)` batches.
        criterion: Loss callable applied as `criterion(outputs, y)`.

    Returns:
        Tuple of
        - mean of the per-batch losses,
        - mean of the per-batch accuracies (an unweighted mean over batches),
        - list of predicted class indices for all samples,
        - list of true class indices for all samples,
        - list of weight tensors, each concatenated over ALL batches so it
          lines up with the predictions and labels,
        - tensor of all labels, concatenated over all batches.
        For an empty loader the means are NaN, the lists are empty and the
        label tensor has zero elements.
    """
    model.eval()
    dev_losses = []
    dev_accuracy = []
    pred_holder = []
    real_holder = []
    weights_holder = []
    y_holder = []

    with torch.no_grad():
        for x, y in dev_loader:
            out, weights = model(x)

            dev_losses.append(criterion(out, y).item())
            dev_accuracy.append(accuracy(out, y))

            pred_holder += torch.max(out, 1).indices.tolist()
            real_holder += y.tolist()

            # Put every weight tensor on one device so that the same slot can
            # be concatenated across batches below.
            weights_holder.append([w.to(out.device) for w in weights])
            y_holder.append(y)

    # zip(*...) regroups "per batch -> per slot" into "per slot -> per batch".
    all_weights = [torch.cat(per_batch, dim=0) for per_batch in zip(*weights_holder)]
    all_ys = torch.cat(y_holder, dim=0) if y_holder else torch.empty(0, dtype=torch.long)

    return np.mean(dev_losses), np.mean(dev_accuracy), pred_holder, real_holder, all_weights, all_ys

## $\color{blue}{Choices:}$

In [None]:
# Rebuild the architecture (three experts with 70 outputs, dropout 0.11) so the
# keys and shapes of the saved state dict match the module.
model = MoE(FeedForwardExpert(70,.11),FeedForwardExpert(70,.11),FeedForwardExpert(70,.11))
path = 'class/models/vanilla_moe_hard_pre.pt'
# map_location lets a checkpoint saved from the GPU load on a CPU-only runtime.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs, and avoids the torch.load FutureWarning.
model.load_state_dict(torch.load(path, map_location=device, weights_only=True))

def count_parameters(model):
    """Total number of elements across the model's trainable parameters
    (those with `requires_grad` set)."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Number of trainable parameters in the loaded model (router plus experts).
count_parameters(model)

  model.load_state_dict(torch.load(path))


1308029

In [None]:
def run(model):
  """Evaluate `model` on the dev set and return what `validate` reports.

  The model is moved to `device` and scored with cross-entropy loss on a
  single batch that contains the entire dev set.

  Returns:
    (dev_loss, dev_acc, preds, reals, weights, ys) as produced by `validate`.
  """
  model_on_device = model.to(device)
  loss_fn = nn.CrossEntropyLoss()

  # One batch holding the whole dev set.
  full_dev_loader = DataLoader(dev_dataset, batch_size=len(dev_dataset))

  return validate(model_on_device, full_dev_loader, loss_fn)

In [None]:
# Expert selection inside the model is sampled (torch.multinomial), so fix the
# RNG seed first to make the numbers produced by this cell repeatable.
torch.manual_seed(0)
dev_loss, dev_acc, preds, reals, weights, ys = run(model)

In [None]:
# Invert the book -> chapters grouping of the training frame into a
# chapter -> book lookup table.
# NOTE(review): built from df_train only — a chapter that appears solely in
# another split would be missing from the table.
D = dict(df_train.groupby('book_idx')['chapter_idx'].unique())
chapter2book = {
  chapter: book
  for book, chapters in D.items()
  for chapter in chapters
}

In [None]:
# Translate each dev label (a chapter index) into the index of its book.
_dev_chapters = ys.cpu().numpy()
ys_books = torch.Tensor([chapter2book[chapter] for chapter in _dev_chapters])

In [None]:
# Boolean masks over the dev samples, one per author: book indices 0-3 are
# treated as Joyce, 4 as Stoker and 5 as Plato.
joyce_mask = torch.isin(ys_books, torch.Tensor([0, 1, 2, 3]))
stoker_mask = ys_books.eq(4)
plato_mask = ys_books.eq(5)

In [None]:
# Routing-weight tensors returned by the model, one per expert.
ex_1, ex_2, ex_3 = weights[0], weights[1], weights[2]

In [None]:
# For every sample, mark which of the three experts carries the largest
# routing weight: chosen[j][i] is 1.0 if expert j wins sample i, else 0.0.
# Each weight tensor is copied to the CPU exactly once, and the winner is found
# for all samples in a single vectorised argmax (first index wins on ties).
weight_matrix = np.stack(
  [w.detach().cpu().numpy().reshape(-1) for w in weights[:3]]
)  # shape (3, n_samples)
winning_expert = weight_matrix.argmax(axis=0)

chosen = [
  torch.from_numpy((winning_expert == j).astype(np.float32))
  for j in range(3)
]

In [None]:
# Restrict each expert's chosen-indicator vector to the samples of one author.
chosen_joyce, chosen_stoker, chosen_plato = (
  [indicator[mask] for indicator in chosen]
  for mask in (joyce_mask, stoker_mask, plato_mask)
)

In [None]:
# Number of dev samples selected by the Joyce mask.
chosen_joyce[0].size()

torch.Size([600])

In [None]:
def analyse(expert):
  """Print the mean of `expert`'s values over the Joyce, Stoker and Plato samples.

  `expert` is indexed with the module-level boolean masks `joyce_mask`,
  `stoker_mask` and `plato_mask`, so it must have one entry per masked sample.
  """
  reports = (
    ("Experts average on Joyce Poitns: ", joyce_mask),
    ("Experts average on Stoker Poitns: ", stoker_mask),
    ("Experts average on Plato Poitns: ", plato_mask),
  )
  for label, mask in reports:
    print(label, expert[mask].mean())


In [None]:
# Expert 1
analyse(ex_1)
print()
print(f'Expert chosen on Joyce Points: {chosen_joyce[0].mean()}')
print(f'Expert chosen on Stoker Points: {chosen_stoker[0].mean()}')
print(f'Expert chosen on Plato Points: {chosen_plato[0].mean()}')

Experts average on Joyce Poitns:  tensor(0.6082, device='cuda:0')
Experts average on Stoker Poitns:  tensor(0.0467, device='cuda:0')
Experts average on Plato Poitns:  tensor(0.0157, device='cuda:0')

Expert chosen on Joyce Points: 0.7549999952316284
Expert chosen on Stoker Points: 0.1785714328289032
Expert chosen on Plato Points: 0.05714285746216774


In [None]:
# Expert 2
analyse(ex_2)
print()
print(f'Expert chosen on Joyce Points: {chosen_joyce[1].mean()}')
print(f'Expert chosen on Stoker Points: {chosen_stoker[1].mean()}')
print(f'Expert chosen on Plato Points: {chosen_plato[1].mean()}')

Experts average on Joyce Poitns:  tensor(0.0058, device='cuda:0')
Experts average on Stoker Poitns:  tensor(0.3664, device='cuda:0')
Experts average on Plato Poitns:  tensor(0.0013, device='cuda:0')

Expert chosen on Joyce Points: 0.03333333507180214
Expert chosen on Stoker Points: 0.5848214030265808
Expert chosen on Plato Points: 0.02142857201397419


In [None]:
# Expert 3
analyse(ex_3)
print()
print(f'Expert chosen on Joyce Points: {chosen_joyce[2].mean()}')
print(f'Expert chosen on Stoker Points: {chosen_stoker[2].mean()}')
print(f'Expert chosen on Plato Points: {chosen_plato[2].mean()}')

Experts average on Joyce Poitns:  tensor(0.0658, device='cuda:0')
Experts average on Stoker Poitns:  tensor(0.0701, device='cuda:0')
Experts average on Plato Poitns:  tensor(0.7994, device='cuda:0')

Expert chosen on Joyce Points: 0.21166667342185974
Expert chosen on Stoker Points: 0.2366071492433548
Expert chosen on Plato Points: 0.9214285612106323
