# Text Classification - Vanilla Mixture of Experts (End to End, Hard) Predictions

----

## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Load - Loading our model and state dict and data
* Dataset - Create PyTorch Dataset
* Model - Create PyTorch Vanilla Model
* Helper - helper functions
* Predictions - get and save predictions


## $\color{blue}{Preamble:}$

This notebook loads the best saved parameters of the Vanilla Mixture of Experts (End to End, Hard) model, runs it on the dev split, and stores the resulting list of predictions as a new column of `df_dev` for further analysis.

## $\color{blue}{Admin:}$

In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive in the Colab runtime, then switch the working directory
# to the Drive root so the relative "class/..." paths used later in this
# notebook resolve against it.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/'

Mounted at /content/drive
/content/drive/MyDrive


In [3]:
%%capture
# `%%capture` has to remain the first line of the cell; it hides pip's output.
# NOTE(review): `dill` is not imported in any cell visible here - confirm it is
# still required (e.g. by how the checkpoint or pickles were written).
!pip install torch
!pip install dill

In [4]:
import torch
import numpy as np
# Use the GPU when the runtime exposes one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [5]:
import pandas as pd
# Read the pickled train / dev / test DataFrames from the shared dataset folder
# (relative to the working directory set when Drive was mounted).
path = "class/datasets/"
df_train, df_dev, df_test = (
    pd.read_pickle(path + split_file)
    for split_file in ("df_train", "df_dev", "df_test")
)

In [6]:
# Preview the first rows to confirm the expected columns were loaded.
df_train.head()

Unnamed: 0,index,master,book_idx,book,chapter_idx,chapter,author,content,vanilla_embedding
8114,8114,Dubliners,3,Dubliners,31,GRACE,Joyce,“Is it John of Tuam?” “Are you sure of that ...,"[-0.012913608, -0.026916211, 0.0023321153, -0...."
4951,4951,Ulysses,2,Nostos,15,Eumaeus,Joyce,sibly there were several others. He personally...,"[-0.019626686, -0.035692617, -0.034875672, 0.0..."
4629,4629,Ulysses,2,Nostos,15,Eumaeus,Joyce,"Stephen, who was trying his dead best to yawn ...","[0.015934143, -0.0034991587, 0.0035751674, 0.0..."
11556,11556,Dracula,4,Dracula,59,CHAPTER XXVII: MINA HARKER’S JOURNAL,Bram Stoker,"Now to the historical, for as Madam Mina write...","[-4.009433e-05, -0.0041142944, 0.026873538, -0..."
12262,12262,Republic,5,Republic,62,Book III,Plato,The harmonies which you mean are the mixed or ...,"[0.0048890463, -0.0060007297, 0.0054147574, -0..."


## $\color{blue}{Dataset:}$

In [7]:
# For every split: turn each row's `vanilla_embedding` into a tensor, stack the
# rows into one 2-D feature tensor, and move that tensor to `device`.
train_embeddings = list(map(torch.tensor, df_train['vanilla_embedding']))
train_x = torch.stack(train_embeddings).to(device)

dev_embeddings = list(map(torch.tensor, df_dev['vanilla_embedding']))
dev_x = torch.stack(dev_embeddings).to(device)

test_embeddings = list(map(torch.tensor, df_test['vanilla_embedding']))
test_x = torch.stack(test_embeddings).to(device)

In [8]:
# Alternative target (disabled): book-level labels taken from `book_idx`.
# train_y = torch.LongTensor(list(df_train['book_idx'])).to(device)
# dev_y = torch.LongTensor(list(df_dev['book_idx'])).to(device)
# test_y = torch.LongTensor(list(df_test['book_idx'])).to(device)

# Active target: chapter-level labels taken from `chapter_idx`, stored as
# integer (Long) tensors on `device`.
train_y = torch.LongTensor(list(df_train['chapter_idx'])).to(device)
dev_y = torch.LongTensor(list(df_dev['chapter_idx'])).to(device)
test_y = torch.LongTensor(list(df_test['chapter_idx'])).to(device)

In [9]:
from torch.utils.data import Dataset, DataLoader
# Assumes x and y are already tensors and already on the target device.
class VanillaDataset(Dataset):
  """Index-aligned (features, labels) pairs exposed through the Dataset API.

  `x` and `y` are stored exactly as given; no copy, conversion, or device
  transfer happens here.
  """

  def __init__(self, x, y):
    self.x, self.y = x, y

  def __len__(self):
    # The label container defines how many examples there are.
    return len(self.y)

  def __getitem__(self, index):
    # Return the feature entry and the label stored at the same position.
    return self.x[index], self.y[index]


In [10]:
# Wrap each split's (features, labels) tensors so a DataLoader can index them.
train_dataset = VanillaDataset(train_x, train_y)
dev_dataset = VanillaDataset(dev_x, dev_y)
test_dataset = VanillaDataset(test_x, test_y)

In [11]:
# Sanity check: size of the first training example's feature vector. It has to
# match the 768-wide input of the model's first Linear layers defined below.
train_dataset[0][0].size()

torch.Size([768])

## $\color{blue}{Model:}$

modify below

In [12]:
import torch.nn as nn
import torch.nn.functional as F

class DenseBlock(nn.Module):
    """Linear -> BatchNorm1d -> ReLU -> Dropout, applied in that order.

    Args:
        input_size: number of input features of the linear layer.
        output_size: number of output features (also the BatchNorm width).
        dropout_rate: probability passed to ``nn.Dropout``.
    """

    def __init__(self, input_size, output_size, dropout_rate):
        super().__init__()
        # The attribute names are also the state-dict key prefixes, so they
        # must stay as they are for saved checkpoints to keep loading.
        self.linear = nn.Linear(input_size, output_size)
        self.batch_norm = nn.BatchNorm1d(output_size)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        normalized = self.batch_norm(self.linear(x))
        return self.dropout(self.activation(normalized))

class FeedForwardExpert(nn.Module):
    """One expert: two DenseBlocks followed by a linear layer producing logits.

    Args:
        output_size: number of logits produced by the final linear layer.
        dropout_rate: dropout probability handed to both DenseBlocks.
        input_size: width of the incoming feature vectors. Defaults to 768,
            the value that used to be hard-coded, so existing callers and
            saved checkpoints are unaffected.
    """

    def __init__(self, output_size, dropout_rate, input_size=768):
        super().__init__()
        self.output_size = output_size
        self.input_size = input_size

        # Attribute names are state-dict key prefixes; keep them unchanged.
        self.block1 = DenseBlock(input_size, 400, dropout_rate)
        self.block2 = DenseBlock(400, 200, dropout_rate)
        self.final_layer = nn.Linear(200, self.output_size)

        self.initialize_weights()

    def forward(self, x):
        x = self.block1(x)  # B x input_size -> B x 400
        x = self.block2(x)  # B x 400 -> B x 200
        x = self.final_layer(x)  # B x 200 -> B x output_size
        return x

    def initialize_weights(self):
        """Xavier-uniform weights and zero biases for every nn.Linear inside."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

class Router(nn.Module):
    """Gating network: 768 -> 128 -> num_experts, softmax over the experts.

    Args:
        num_experts: number of experts to produce a probability for.
    """

    def __init__(self, num_experts):
        super().__init__()
        self.num_experts = num_experts
        self.fc1 = nn.Linear(768, 128)
        self.fc2 = nn.Linear(128, self.num_experts)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        expert_logits = self.fc2(hidden)
        # Normalise over the last dimension so each row sums to one.
        return F.softmax(expert_logits, dim=-1)

class MoE(nn.Module):
    """Mixture of experts that combines the top-k routed experts per example.

    Args:
        num_experts: number of FeedForwardExpert modules to create.
        output_size: number of logits each expert (and the mixture) produces.
        dropout_rate: dropout probability handed to every expert.
        top_k: how many experts are kept per example after routing.
    """

    def __init__(self, num_experts, output_size=70, dropout_rate=0.11, top_k=2):
        super().__init__()
        self.num_experts = num_experts
        self.dropout_rate = dropout_rate
        self.k = top_k
        self.output_size = output_size

        # Experts are created before the router, in index order.
        expert_modules = []
        for _ in range(num_experts):
            expert_modules.append(FeedForwardExpert(self.output_size, self.dropout_rate))
        self.experts = nn.ModuleList(expert_modules)
        self.router = Router(self.num_experts)

    def forward(self, x):
        # Router probabilities -> keep the k largest per row and rescale them
        # so the kept weights of each row sum to one.
        gate_probs = self.router(x)
        top_probs, top_ids = torch.topk(gate_probs, self.k, dim=1)
        top_probs_scaled = top_probs / top_probs.sum(dim=1, keepdim=True)

        # Accumulator for the weighted expert outputs, shape (bs, c).
        combined = torch.zeros(x.size(0), self.output_size, device=x.device)

        for expert_id, expert in enumerate(self.experts):
            picked = (top_ids == expert_id)
            if not picked.any():
                # No example in this batch routed to this expert.
                continue

            # Per-row weight of this expert; zero for rows that did not pick it. (bs, 1)
            gate = (top_probs_scaled * picked).sum(dim=1).view(-1, 1)

            # The expert runs on the whole batch; the gate zeroes unrouted rows.
            combined += (expert(x) * gate)  # (bs, c)
        return combined



## $\color{blue}{Helper:}$

In [13]:
def accuracy(outputs, labels):
    """Fraction of rows whose highest-scoring class equals the label.

    Args:
        outputs: 2-D tensor of per-class scores, one row per example.
        labels: tensor of class indices, one per example.

    Returns:
        Python float: number of correct predictions divided by ``labels.size(0)``.
    """
    predicted = torch.max(outputs, 1).indices
    n_correct = (predicted == labels).sum().item()
    return n_correct / labels.size(0)

In [14]:
def validate(model, dev_loader, criterion):
    """Evaluate `model` on every batch of `dev_loader` without tracking gradients.

    Loss and accuracy are averaged per example (each batch is weighted by its
    size), so a smaller final batch no longer skews the reported numbers the
    way a plain mean over per-batch values does.

    Args:
        model: module called as ``model(x)``; it is switched to eval mode.
        dev_loader: iterable yielding ``(x, y)`` batches.
        criterion: callable ``criterion(out, y)`` returning a scalar tensor.
            The weighting assumes it returns the batch *mean* (the default
            reduction of ``nn.CrossEntropyLoss``).

    Returns:
        (mean_loss, mean_accuracy, predictions, labels): two ``np.float64``
        values (NaN when the loader yields no examples) and two flat lists of
        Python ints in the order the loader produced them.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_seen = 0
    pred_holder = []
    real_holder = []

    with torch.no_grad():
        for x, y in dev_loader:
            batch_size = y.size(0)
            if batch_size == 0:
                # Nothing to score; also avoids a NaN loss from an empty batch.
                continue

            out = model(x)
            _, predicted = torch.max(out, 1)

            # Undo the per-batch mean so the final division is per example.
            total_loss += criterion(out, y).item() * batch_size
            total_correct += (predicted == y).sum().item()
            total_seen += batch_size

            # One transfer per batch instead of one `.item()` call per element.
            pred_holder.extend(predicted.tolist())
            real_holder.extend(y.tolist())

    if total_seen == 0:
        return np.float64("nan"), np.float64("nan"), pred_holder, real_holder

    mean_loss = np.float64(total_loss / total_seen)
    mean_accuracy = np.float64(total_correct / total_seen)
    return mean_loss, mean_accuracy, pred_holder, real_holder

## $\color{blue}{Predictions:}$

modify below

In [15]:
# Rebuild the architecture (3 experts, same dropout rate) so the saved
# state dict has matching parameter names to load into.
model = MoE(3, dropout_rate=0.11)
path = 'class/models/vanilla_moe_e2e_hard.pt'
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime;
# weights_only=True restricts unpickling to tensors/containers instead of
# executing arbitrary pickled code.
model.load_state_dict(torch.load(path, map_location=device, weights_only=True))

  model.load_state_dict(torch.load(path))


<All keys matched successfully>

In [16]:
def run(model):
  """Evaluate `model` on the dev split.

  The model is moved to `device`, `dev_dataset` is served in batches of 128,
  and cross-entropy is used as the loss. No training takes place here.

  Returns:
    The tuple produced by `validate`: (dev_loss, dev_acc, preds, reals).
  """
  model_on_device = model.to(device)
  loss_fn = nn.CrossEntropyLoss()

  # Batched view of the dev split.
  loader = DataLoader(dev_dataset, batch_size=128)

  loss_value, acc_value, predictions, targets = validate(model_on_device, loader, loss_fn)
  return loss_value, acc_value, predictions, targets

In [17]:
# Score the loaded model on the dev split; `preds` and `reals` are flat lists
# with one predicted / true class index per dev example.
dev_loss, dev_acc, preds, reals = run(model)

modify below

In [19]:
# Attach the predictions as a new column. The assignment is positional, so it
# relies on `preds` being in df_dev row order - the dev DataLoader is built
# without a shuffle argument; confirm that if the loader setup changes.
df_dev['vanilla_moe_e2e_hard_preds'] = preds

In [20]:
# Write df_dev (now including the prediction column) back to the same pickle
# it was loaded from, overwriting the previous file.
path = "class/datasets/"
df_dev.to_pickle(path + "df_dev")