In [None]:
!unzip tacred.zip

In [None]:
!pip install git+https://github.com/huggingface/trl
!pip install --upgrade huggingface_hub

In [None]:
!huggingface-cli login --token 'your_token_here'

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `gmm` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `gmm`


## Libraries

In [2]:
import os
import json
import random
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import BertModel, PreTrainedModel, BertTokenizer, BertConfig
from transformers import BertForSequenceClassification
from tqdm import tqdm
from sklearn.metrics import accuracy_score
# Set random seeds for reproducibility
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)
torch.cuda.empty_cache()

## Source Codes.

In [3]:

def get_phi(m, D, which_phi='performer', device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Function that returns the random feature map, phi.
    Since our neuron-astrocyte model is equivalent to using Random Feature Attention,
    we use this representation for simplicity. Different phi functions lead to different feature maps.
    """
    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Random weight matrix for random feature map
    W = torch.randn((m, D), device=device) # Changed W initialization to (m, D)

    if which_phi == 'cosine':
        # Random biases for cosine feature map
        rand_b = torch.rand(m, device=device) * 2 * torch.pi

        def phi(x, c=0):
            """Uses a cosine random feature map to approximate softmax attention."""

            projected_x = nn.Linear(x.shape[-1], D, bias=False).to(device)(x) # Project x to dimension D using a linear layer with no bias
            # projected_x.shape is now (sequence_length, D) which is (512, 512)

            return torch.sqrt(torch.tensor(2 / m, device=device)) * torch.cos(projected_x @ W + rand_b) * torch.exp(0.5 * (torch.norm(projected_x) ** 2) - c) # Changed W @ projected_x to projected_x @ W

    elif which_phi == 'performer':
        def phi(x, c=0):
            """Uses an exponential random feature map to approximate softmax attention."""
            # Transpose W to ensure correct dimensions for multiplication
            return torch.exp(-0.5 * torch.log(torch.tensor(m, device=device)) + x @ W - 0.5 * (torch.norm(x) ** 2)) # Changed W.T @ x to x @ W

    elif which_phi == 'linear':
        def phi(x, c=0):
            """Uses a linear random feature map to approximate softmax attention."""
            h = -0.5 * torch.log(torch.tensor(m, device=device)) + W @ x - 0.5 * (torch.norm(x) ** 2)
            return 1 + h

    elif which_phi == 'truncated_performer':
        def phi(x, thresh=150):
            """Uses an exponential random feature map to approximate softmax attention."""
            scaling_factors = torch.exp(-0.5 * torch.log(torch.tensor(m, device=device)) - 0.5 * (torch.norm(x) ** 2))
            h = torch.exp(W @ x) # Multiplication should now work correctly
            return scaling_factors * torch.clamp(h, min=0, max=thresh)

    elif which_phi == 'positive_cosine':
        # Random biases for cosine feature map
        rand_b = torch.rand(m, device=device) * 2 * torch.pi

        def phi(x, thresh=10):
            """Uses a positive cosine random feature map to approximate softmax attention."""
            # Convert m to a tensor to ensure correct type for division
            m_tensor = torch.tensor(m, device=device, dtype=torch.float32)
            # Project x to dimension D using a linear layer with no bias
            projected_x = nn.Linear(x.shape[-1], D, bias=False).to(device)(x)
            scaling_factors = torch.sqrt(2.0 / (torch.pi * m_tensor)) * torch.exp(0.5 * (torch.norm(projected_x) ** 2))
            h = torch.cos(W @ projected_x + rand_b)  # Use projected_x for matrix multiplication
            return torch.clamp(scaling_factors * h, min=0)

    elif which_phi == 'dima_sin':
        # Random biases for cosine feature map
        rand_b = torch.rand(m, device=device) * 2 * torch.pi

        def clipped_sin(x):
            """Clips the sine values."""
            return torch.where(x > torch.pi / 2, 1, torch.where(x < -torch.pi / 2, -1, torch.sin(x)))

        def phi(x, thresh=10):
            """Uses a sine-based random feature map to approximate softmax attention."""
            # Convert m to a tensor to ensure correct type for division
            m_tensor = torch.tensor(m, device=device, dtype=torch.float32) # Convert m to a tensor

            # Project x to dimension D using a linear layer with no bias
            projected_x = nn.Linear(x.shape[-1], D, bias=False).to(device)(x)  # Project x to D dimensions

            scaling_factors = torch.sqrt(2.0 / m_tensor) * torch.exp(0.5 * (torch.norm(projected_x) ** 2)) # Use m_tensor for division
            h = clipped_sin(W @ projected_x + rand_b) # Use projected_x for matrix multiplication
            return scaling_factors * h

    else:
        raise ValueError(f"Unknown phi type: {which_phi}")

    return phi

def get_astro_responses(query_layer, key_layer, nhead, phi):
    """
    Computes astrocyte response given a random feature map, queries, and keys.

    Args:
        query_layer: Tensor of shape (n_sample, ntokens, dim)
        key_layer: Tensor of shape (nhead, ntokens, dim)
        nhead: Integer index for the current head
        phi: Function to apply to the keys and queries

    Returns:
        Tensor of shape (n_sample, ntokens, ntokens) representing astro_pulses.
    """
    # Get the device of the query_layer
    device = query_layer.device

    rfa_normalized_keys = phi(key_layer[nhead])
    transformed_queries = phi(query_layer)


    rfa_normalized_keys = rfa_normalized_keys.T

    # Perform batched matrix multiplication
    astro_pulses = torch.matmul(transformed_queries, rfa_normalized_keys)

    return astro_pulses


def neurogenesis(head_size, query_layer, key_layer, nhead, phi='performer'):
    # Normalize Q and K matrices appropriately
    query_layer = query_layer / head_size ** (1/4)
    key_layer = key_layer / head_size ** (1/4)

    # Ensure tensors are on GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    query_layer = query_layer.to(device)
    key_layer = key_layer.to(device)


    phi_low_m = get_phi(m=1024, D=1024, which_phi=phi) #changed value of D from 512 to 1024


    astro_ps_low_m = get_astro_responses(query_layer, key_layer, 0, phi_low_m)

    # Move the result back to CPU for further processing or conversion to NumPy
    return astro_ps_low_m.cpu().detach().numpy()

In [4]:


class CustomBERTEncoderBlock(nn.Module):
    def __init__(self, embed_size, ff_hidden_size, dropout=0.1, neurogenesis=True, phi='performer'):

        super(CustomBERTEncoderBlock, self).__init__()
        self.query_fc = nn.Linear(embed_size, embed_size)
        self.key_fc = nn.Linear(embed_size, embed_size)
        self.value_fc = nn.Linear(embed_size, embed_size)
        self.attn_out_fc = nn.Linear(embed_size, embed_size)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.ffn = nn.Sequential(
            nn.Linear(embed_size, ff_hidden_size),
            nn.ReLU(),
            nn.Linear(ff_hidden_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)
        self.neurogenesis = neurogenesis
        self.phi = phi

    def forward(self, x):
        query = self.query_fc(x)
        key = self.key_fc(x)
        value = self.value_fc(x)
        attn_scores = torch.matmul(query, key.transpose(-2, -1)) / (query.size(-1) ** 0.5)
        # Apply neurogenesis only if enabled
        if self.neurogenesis:
            low = neurogenesis(512, query, key, 4, phi=self.phi)
            # # Convert low and high to PyTorch tensors before applying softmax
            low = torch.tensor(low, device=x.device, dtype=torch.float32)  # Ensure correct data type
            attn_weights = F.softmax(low, dim=-1)
        else:
            attn_weights = F.softmax(attn_scores, dim=-1)
        attn_output = torch.matmul(attn_weights, value)

        x = self.norm1(x + self.dropout(attn_output))
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))

        return x, key, query, value


class BertForSequenceClassification_Neuro(PreTrainedModel): # Inherit from PreTrainedModel
    def __init__(self, config, pretrained_model_name='bert-large-uncased', num_classes=8, ff_hidden_size=2048, dropout=0.1, use_custom_encoder=True, neuro_genesis=True, phi='performer'):

        super(BertForSequenceClassification_Neuro, self).__init__(config) # Pass config to super()
        self.bert = BertModel.from_pretrained(pretrained_model_name, config=config) # Pass config to BertModel
        self.use_custom_encoder = use_custom_encoder
        self.num_labels = num_classes
        embed_size = self.bert.config.hidden_size

        if use_custom_encoder:
            self.custom_encoder = CustomBERTEncoderBlock(embed_size, ff_hidden_size, dropout,neurogenesis=neurogenesis, phi=phi)

        self.classifier = nn.Linear(embed_size, num_classes)
        self.dropout = nn.Dropout(dropout)


    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        sequence_output = bert_output.last_hidden_state
        pooled_output = bert_output.pooler_output

        if self.use_custom_encoder:
            sequence_output, key, query, value = self.custom_encoder(sequence_output)
            pooled_output = sequence_output[:, 0, :]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if self.use_custom_encoder:
            return logits
        return logits


In [5]:

def read_json(path):
    """ Read a json file from the given path."""
    with open(path, 'r') as f:
        data = json.load(f)
    return data

def write_json(data, path):
    """ Write a json file to the given path."""
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))

    with open(path, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)



def data_preparation(data):
  data_prepared = []
  for item in data:

    sentence = item["sentence"]
    entity1 = item["subject"]
    entity2 = item["object"]
    relation = item["relation"]
    sentence_e1 = sentence.replace(entity1, f"[E1]{entity1}[/E1]")
    sentence_e2 = sentence_e1.replace(entity2, f"[E2]{entity2}[/E2]")
    row = {"sentence": "[CLS] "+sentence_e2, "relation": relation, "e1":entity1, "e2":entity2}
    data_prepared.append(row)
  return data_prepared


class RelationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [6]:
class TaskRouter(nn.Module):
    def __init__(self, input_dim, num_tasks, pretrained_model=None):
        super(TaskRouter, self).__init__()
        self.fc = nn.Linear(input_dim, num_tasks)

        if pretrained_model:
            # Initialize router weights using a pretrained model
            with torch.no_grad():
                pretrained_weights = pretrained_model.state_dict()
                if "classifier.weight" in pretrained_weights and "classifier.bias" in pretrained_weights:
                    self.fc.weight.copy_(pretrained_weights["classifier.weight"][:num_tasks, :input_dim])
                    self.fc.bias.copy_(pretrained_weights["classifier.bias"][:num_tasks])
                else:
                    print("Pretrained weights not compatible; using default initialization.")

    def forward(self, x):
        return torch.softmax(self.fc(x), dim=1)  # Probabilities for each task

class DynamicMixtureOfTasks(nn.Module):

    def __init__(self, initial_tasks, neuro_genesis, neuro_phi, num_classes=4):
        super(DynamicMixtureOfTasks, self).__init__()
        self.neuro_genesis = neuro_genesis
        self.neuro_phi = neuro_phi

        config = BertConfig.from_pretrained("bert-large-uncased", num_labels=num_classes)


        self.tasks = nn.ModuleList([
            BertForSequenceClassification_Neuro(config,
                                                pretrained_model_name='bert-large-uncased',
                                                num_classes=num_classes,
                                                use_custom_encoder=True,
                                                neuro_genesis=neuro_genesis,
                                                phi=neuro_phi) for _ in range(len(initial_tasks))])

        self.bert = BertModel.from_pretrained("bert-large-uncased", config=config)
        pretrained_classifier = BertForSequenceClassification.from_pretrained(
            "bert-large-uncased", num_labels=len(initial_tasks)
        )
        self.router = TaskRouter(1024, len(initial_tasks), pretrained_classifier)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        router_input = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        ).pooler_output
        task_probs = self.router(router_input)
        selected_tasks = torch.argmax(task_probs, dim=1)
        batch_outputs = []

        # Iterate through each sample in the batch
        for i, selected_task in enumerate(selected_tasks):
            # Get the output of the selected task model for the current sample
            task_output = self.tasks[selected_task](input_ids[i].unsqueeze(0), attention_mask[i].unsqueeze(0))

            # Append the task output to the batch_outputs list
            batch_outputs.append(task_output)

        # Stack the outputs of all task models to form the final output
        batch_outputs = torch.cat(batch_outputs, dim=0)

        return batch_outputs, task_probs

    def add_task(self, num_classes):
        config = BertConfig.from_pretrained("bert-large-uncased", num_labels=num_classes)
        new_task = BertForSequenceClassification_Neuro(config,
                                                pretrained_model_name='bert-large-uncased',
                                                num_classes=num_classes,
                                                use_custom_encoder=True,
                                                neuro_genesis=self.neuro_genesis,
                                                phi=self.neuro_phi)
        self.tasks.append(new_task)
        num_tasks = len(self.tasks)
        self.router.fc = nn.Linear(1024, num_tasks)
        self.router.bn = nn.BatchNorm1d(num_tasks)

In [7]:
def compute_task_ids(labels):
    task_clusters = {
        1: [0, 1, 2, 3],
        2: [4, 5, 6, 7],
        3: [8, 9, 10, 11],
        4: [12, 13, 14, 15],
        5: [16, 17, 18, 19],
        6: [20, 21, 22, 23],
        7: [24, 25, 26, 27],
        8: [28, 29, 30, 31],
        9: [32, 33, 34, 35],
        10: [36, 37, 38, 39]
    }
    task_ids = []
    for key, values in task_clusters.items(): # Unpack the tuple into key, values
        for label in labels:
            if label in values:
                task_ids.append(key) # Append the key to task_ids
    return task_ids


In [9]:
def train_model(epochs, batch_size, val_dataset, train_dataset, model, run_id, task_id, patience=3):
    """
    Trains a BERT-based model on a given dataset with validation, early stopping, and best model loading.

    Args:
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training and validation.
        val_dataset: Validation dataset.
        train_dataset: Training dataset.
        model: Pretrained model to fine-tune.
        run_id (str): Unique identifier for the training run.
        task_id (str): Task identifier for model saving.
        patience (int): Number of epochs to wait for validation loss improvement before stopping.

    Returns:
        model: The best trained model (based on validation loss).
        tokenizer: The tokenizer associated with the model.
        train_hist: Training history containing loss, accuracy, and learning rate for each epoch.
    """
    torch.cuda.empty_cache()
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    model = model.to("cuda")
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)  # Adjust learning rate
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)
    criterion = torch.nn.CrossEntropyLoss()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    train_hist = []
    best_val_loss = float('inf')
    best_model_state = None
    no_improvement_epochs = 0
    print(epochs)
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        # Training phase
        model.train()
        train_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc="Training", leave=False)

        for batch in train_loader_tqdm:

            task_ids = compute_task_ids(batch['label'])
            task_ids_tensor = torch.tensor(task_ids)
            task_ids_tensor = task_ids_tensor.to("cuda")

            # print(f"task_ids: {task_ids}")
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to("cuda")
            attention_mask = batch['attention_mask'].to("cuda")
            labels = batch['label'].to("cuda")

            outputs, task_prob_router = model(input_ids, attention_mask)
            logits = outputs.squeeze(1)
            task_prob_router_logits = task_prob_router
            loss_router = criterion(task_prob_router_logits, task_ids_tensor)

            model_loss = criterion(logits, labels)

            loss = model_loss + loss_router
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            train_loader_tqdm.set_postfix({"Batch Loss": loss.item(), "Router Loss": loss_router.item(), "Model Loss":model_loss.item()})


        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        val_preds, val_labels = [], []
        val_router_preds, val_router_labels = [], []
        val_loader_tqdm = tqdm(val_loader, desc="Validation", leave=False)
        with torch.no_grad():
            for batch in val_loader_tqdm:
                input_ids = batch['input_ids'].to("cuda")
                attention_mask = batch['attention_mask'].to("cuda")
                labels = batch['label'].to("cuda")
                task_ids = compute_task_ids(batch['label'])
                task_ids_tensor = torch.tensor(task_ids)
                task_ids_tensor = task_ids_tensor.to("cuda")

                outputs, task_prob = model(input_ids=input_ids, attention_mask=attention_mask)
                logits  = outputs.squeeze(1)
                model_loss = criterion(logits, labels)
                loss_router = criterion(task_prob, task_ids_tensor)
                loss = model_loss + loss_router
                val_loss += loss.item()

                val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
                val_router_preds.extend(torch.argmax(task_prob, dim=1).cpu().numpy())
                val_router_labels.extend(task_ids_tensor.cpu().numpy())

                val_loader_tqdm.set_postfix({"Batch Loss": loss.item(), "Router Loss":loss_router.item(), "Model Loss":model_loss.item()})

        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(val_labels, val_preds)
        val_router_accuracy = accuracy_score(val_router_labels, val_router_preds)

        scheduler.step(val_loss)

        print(f"Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f},  Router Val Accuracy = {val_router_accuracy:.4f}, Learning Rate = {scheduler.optimizer.param_groups[0]['lr']:.6f}")

        epoch_log = {
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
            'val_router_accuracy': val_router_accuracy,
            'router_loss': loss_router.item(),
            'model_loss': model_loss.item(),
            'learning_rate': scheduler.optimizer.param_groups[0]['lr']
        }
        train_hist.append(epoch_log)

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict()
            no_improvement_epochs = 0
        else:
            no_improvement_epochs += 1

        # Early stopping
        if no_improvement_epochs >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs.")
            break

    # Load the best model state into the original model
    if best_model_state is not None:
        model.load_state_dict(best_model_state) # Changed this line
        print("Loaded the best model based on validation loss.")

    print("Training complete.")
    return model, tokenizer, train_hist

In [10]:

def test_model(model, test_dataset, batch_size,run_id, task_id, out_dir):
  model.eval()
  test_preds, test_labels = [], []
  test_loader = DataLoader(test_dataset, batch_size=batch_size)
  test_loader_tqdm = tqdm(test_loader, desc="Testing", leave=False)
  test_router_preds, test_router_labels = [], []
  with torch.no_grad():
      for batch in test_loader_tqdm:
          input_ids = batch['input_ids'].to("cuda")
          attention_mask = batch['attention_mask'].to("cuda")
          labels = batch['label'].to("cuda")
          outputs, task_prob = model(input_ids=input_ids, attention_mask=attention_mask) # Pass attention_mask to the model
          task_ids = compute_task_ids(batch['label'])
          task_ids_tensor = torch.tensor(task_ids)
          task_ids_tensor = task_ids_tensor.to("cuda")

          logits = outputs
          test_router_preds.extend(torch.argmax(task_prob, dim=1).cpu().numpy())
          test_router_labels.extend(task_ids_tensor.cpu().numpy())
          test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())  # Use logits for argmax
          test_labels.extend(labels.cpu().numpy())
  # print(test_labels)
  # print(test_preds)


  test_accuracy = accuracy_score(test_labels, test_preds)
  test_router_accuracy = accuracy_score(test_router_labels, test_router_preds)
  print(f"Test Router Accuracy: {test_router_accuracy:.4f}")

  test_preds = [{i:int(pred)} for i, pred in enumerate(test_preds)]
  test_labels = [{i:int(label)} for i, label in enumerate(test_labels)]
  write_json(test_labels, f"{out_dir}/test_labels_{run_id}_{task_id}.json")
  write_json(test_preds, f"{out_dir}/test_preds_{run_id}_{task_id}.json")

  print(f"Test Accuracy: {test_accuracy:.4f}")
  print(f"Len of Test set: {len(test_preds)}")
  return test_accuracy, test_router_accuracy

In [11]:
def retun_new_task_data(run_id, task_id, dataset_path):


    train_data = read_json(f"{dataset_path}/train/run_{run_id}/task{task_id}/train_1.json")
    val_data = read_json(f"{dataset_path}/train/run_{run_id}/task{task_id}/dev_1.json")
    test_data = read_json(f"{dataset_path}/test/run_{run_id}/task{task_id}/test_1.json")

    train_prepared = data_preparation(train_data)
    val_prepared = data_preparation(val_data)
    test_prepared = data_preparation(test_data)

    train_labels = [item['relation'] for item in train_prepared]
    val_labels = [item['relation'] for item in val_prepared]
    test_labels = [item['relation'] for item in test_prepared]

    label_to_int = {label: idx for idx, label in enumerate(set(train_labels))}
    train_labels = [label_to_int[label] for label in train_labels]
    val_labels = [label_to_int[label] for label in val_labels]
    test_labels = [label_to_int[label] for label in test_labels]

    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    max_length = 128
    train_dataset = RelationDataset([item['sentence'] for item in train_prepared], train_labels, tokenizer, max_length)
    val_dataset = RelationDataset([item['sentence'] for item in val_prepared], val_labels, tokenizer, max_length)
    test_dataset = RelationDataset([item['sentence'] for item in test_prepared], test_labels, tokenizer, max_length)

    return train_dataset, val_dataset, test_dataset, label_to_int


In [None]:


def main(out_dir, dataset_path, neuro_genesis, baseline_name, epoch_list, batch_list, neuro_phi):


    torch.cuda.empty_cache()
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    for run_id in range(1,2):
        torch.cuda.empty_cache()
        results = []
        all_train_texts, all_train_labels = [], []  # Initialize empty lists for texts and labels
        all_val_texts, all_val_labels = [], []
        all_test_texts, all_test_labels = [], []

        initial_task_id = 1
        model = DynamicMixtureOfTasks(
            initial_tasks=[4,4],
            neuro_genesis=neuro_genesis,
            neuro_phi=neuro_phi,
            num_classes=4
        )
        train_dataset, val_dataset, test_dataset, label_to_int = retun_new_task_data(run_id, initial_task_id, dataset_path)

        # Get the texts and labels from the initial dataset
        all_train_texts.extend(train_dataset.texts)
        all_train_labels.extend(train_dataset.labels)  # Use the labels attribute
        all_val_texts.extend(val_dataset.texts)  # Access the texts attribute for val_dataset
        all_val_labels.extend(val_dataset.labels)
        all_test_texts.extend(test_dataset.texts)  # Access the texts attribute for test_dataset
        all_test_labels.extend(test_dataset.labels)

        for task_id in range(2, 11):
            torch.cuda.empty_cache()

            train_dataset, val_dataset, test_dataset, label_to_int = retun_new_task_data(run_id, task_id, dataset_path)

            # Extend the combined lists with data from the current task
            all_train_texts.extend(train_dataset.texts)  # Access texts attribute
            all_train_labels.extend(train_dataset.labels)
            all_val_texts.extend(val_dataset.texts)  # Access texts attribute
            all_val_labels.extend(val_dataset.labels)
            all_test_texts.extend(test_dataset.texts)  # Access texts attribute
            all_test_labels.extend(test_dataset.labels)

            print(f"len(all_train_dataset): {len(all_train_texts)}") # Print the length of the combined dataset

            for i, epoch in enumerate(epoch_list):

                for batch in batch_list:
                    train_dataset = RelationDataset([item for item in all_train_texts], all_train_labels, tokenizer, 512)
                    val_dataset = RelationDataset([item for item in all_val_texts], all_val_labels, tokenizer, 512)
                    test_dataset = RelationDataset([item for item in all_test_texts], all_test_labels, tokenizer, 512)
                    model, tokenizer, train_hist = train_model(epoch, batch, val_dataset, train_dataset, model, run_id, task_id)

                    write_json(train_hist, f"{out_dir}/train_hist_{run_id}_{task_id}.json")
                    # model_name_hf = f"{baseline_name}_{run_id}_{task_id}"
                    # base_model_hf = f"Sefika/bert_large_baseline_{run_id}_{task_id}"
                    # model.push_to_hub(model_name_hf, private=True)
                    test_acc, test_router_acc = test_model(model, test_dataset, batch,run_id, task_id, out_dir)
                    # test_seen_acc = test_model(model, all_seen_test_data, batch)
                    result = {"run_id":run_id, "task_id": task_id, "epoch": epoch, "batch_size": batch, "test_acc": test_acc, "test_router_acc":test_router_acc}
                    results.append(result)
                    write_json(results, f"{out_dir}/results_{run_id}.json")

                    torch.cuda.empty_cache()
            all_train_texts, all_train_labels = [], []  # Initialize empty lists for texts and labels
            all_val_texts, all_val_labels = [], []
            # all_test_texts, all_test_labels = [], []
            if task_id != 10:
              model.add_task(len(label_to_int))
              print(f"len(model.tasks): {len(model.tasks)}")

                # del model
    del tokenizer
    del model
    torch.cuda.empty_cache()

if __name__ == "__main__":
    #drive folde = EMNLP-neurogenesis

    # phi = ['cosine','performer', 'linear', 'truncated_performer', 'positive_cosine', 'dima_sin']
    phi = ['performer']
    neuro_genesis = True
    epoch_list = [5]
    batch_list = [4]
    for neuro_genesis_type_phi in phi:
      baseline_name = "bert_large_performer_neurogenesis"
      dataset_path = "/content/tacred/final"
      output_dir = f"drive/MyDrive/EMNLP-neurogenesis/mix_of_tasks"

      main(output_dir, dataset_path, neuro_genesis, baseline_name, epoch_list, batch_list, neuro_genesis_type_phi)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


len(all_train_dataset): 2984
5
Epoch 1/5




Train Loss = 0.6679, Val Loss = 0.4461, Val Accuracy = 0.9495, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 2/5




Train Loss = 0.3930, Val Loss = 0.6713, Val Accuracy = 0.9170, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 3/5




Train Loss = 0.3753, Val Loss = 0.5400, Val Accuracy = 0.9495, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 4/5




Train Loss = 0.3715, Val Loss = 0.6042, Val Accuracy = 0.9206, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000002
Early stopping triggered after 4 epochs.
Loaded the best model based on validation loss.
Training complete.




Test Router Accuracy: 1.0000
Test Accuracy: 0.9572
Len of Test set: 257
len(model.tasks): 3
len(all_train_dataset): 1482
5
Epoch 1/5




Train Loss = 0.9055, Val Loss = 0.9786, Val Accuracy = 0.8446, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 2/5




Train Loss = 0.5980, Val Loss = 0.9696, Val Accuracy = 0.8784, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 3/5




Train Loss = 0.5726, Val Loss = 1.2472, Val Accuracy = 0.8514, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 4/5




Train Loss = 0.5734, Val Loss = 1.0981, Val Accuracy = 0.8514, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 5/5




Train Loss = 0.6237, Val Loss = 0.9958, Val Accuracy = 0.8851, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000002
Early stopping triggered after 5 epochs.
Loaded the best model based on validation loss.
Training complete.




Test Router Accuracy: 1.0000
Test Accuracy: 0.8015
Len of Test set: 388
len(model.tasks): 4
len(all_train_dataset): 1976
5
Epoch 1/5




Train Loss = 0.8946, Val Loss = 0.9007, Val Accuracy = 0.9500, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 2/5




Train Loss = 0.7661, Val Loss = 0.8750, Val Accuracy = 0.9714, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 3/5




Train Loss = 0.7728, Val Loss = 0.8534, Val Accuracy = 0.9786, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 4/5




Train Loss = 0.7804, Val Loss = 0.8931, Val Accuracy = 0.9500, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 5/5




Train Loss = 0.7483, Val Loss = 0.8129, Val Accuracy = 0.9857, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Loaded the best model based on validation loss.
Training complete.




Test Router Accuracy: 1.0000
Test Accuracy: 0.6433
Len of Test set: 513
len(model.tasks): 5
len(all_train_dataset): 1436
5
Epoch 1/5




Train Loss = 1.1227, Val Loss = 0.9953, Val Accuracy = 0.9735, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 2/5




Train Loss = 0.9097, Val Loss = 1.0811, Val Accuracy = 0.9603, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 3/5




Train Loss = 0.9050, Val Loss = 1.0493, Val Accuracy = 0.9801, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 4/5




Train Loss = 0.9049, Val Loss = 1.0606, Val Accuracy = 0.9801, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000002
Early stopping triggered after 4 epochs.
Loaded the best model based on validation loss.
Training complete.




Test Router Accuracy: 1.0000
Test Accuracy: 0.3146
Len of Test set: 639
len(model.tasks): 6
len(all_train_dataset): 1440
5
Epoch 1/5




Train Loss = 1.5477, Val Loss = 1.4093, Val Accuracy = 0.9154, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 2/5




Train Loss = 1.1354, Val Loss = 1.4358, Val Accuracy = 0.9231, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 3/5




Train Loss = 1.0544, Val Loss = 1.4484, Val Accuracy = 0.9308, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 4/5




Train Loss = 1.0439, Val Loss = 1.4816, Val Accuracy = 0.9231, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000002
Early stopping triggered after 4 epochs.
Loaded the best model based on validation loss.
Training complete.




Test Router Accuracy: 1.0000
Test Accuracy: 0.3720
Len of Test set: 750
len(model.tasks): 7
len(all_train_dataset): 1318
5
Epoch 1/5




Train Loss = 1.4076, Val Loss = 1.2289, Val Accuracy = 0.9726, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 2/5




Train Loss = 1.1733, Val Loss = 1.2244, Val Accuracy = 0.9726, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 3/5




Train Loss = 1.1655, Val Loss = 1.2301, Val Accuracy = 0.9726, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 4/5




Train Loss = 1.1654, Val Loss = 1.2322, Val Accuracy = 0.9726, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 5/5




Train Loss = 1.1654, Val Loss = 1.2334, Val Accuracy = 0.9726, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000002
Early stopping triggered after 5 epochs.
Loaded the best model based on validation loss.
Training complete.




Test Router Accuracy: 1.0000
Test Accuracy: 0.4328
Len of Test set: 878
len(model.tasks): 8
len(all_train_dataset): 926
5
Epoch 1/5




Train Loss = 1.6044, Val Loss = 1.6501, Val Accuracy = 0.8503, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 2/5




Train Loss = 1.3075, Val Loss = 1.6016, Val Accuracy = 0.9048, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 3/5




Train Loss = 1.2748, Val Loss = 1.6232, Val Accuracy = 0.8844, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 4/5




Train Loss = 1.2742, Val Loss = 1.6462, Val Accuracy = 0.8912, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 5/5




Train Loss = 1.2741, Val Loss = 1.6747, Val Accuracy = 0.8912, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000002
Early stopping triggered after 5 epochs.
Loaded the best model based on validation loss.
Training complete.




Test Router Accuracy: 1.0000
Test Accuracy: 0.4402
Len of Test set: 979
len(model.tasks): 9
len(all_train_dataset): 1260
5
Epoch 1/5




Train Loss = 1.5287, Val Loss = 1.3788, Val Accuracy = 1.0000, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 2/5




Train Loss = 1.3798, Val Loss = 1.4172, Val Accuracy = 0.9869, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 3/5




Train Loss = 1.3866, Val Loss = 1.4016, Val Accuracy = 0.9935, Router Loss= 1.0000, Router Val Accuracy = 1.0000, Learning Rate = 0.000020
Epoch 4/5


Validation:  88%|████████▊ | 68/77 [00:16<00:02,  4.04it/s, Batch Loss=1.37, Router Loss=1.37, Model Loss=4.33e-5]

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Mix of Tasks Model

task_labels