In [None]:
!unzip tacred.zip

In [None]:
!pip install git+https://github.com/huggingface/trl
!pip install --upgrade huggingface_hub

In [None]:
!huggingface-cli login --token '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'

### Libraries

In [16]:
import os
import json
import random
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, PreTrainedModel, BertTokenizer, BertConfig
from tqdm import tqdm
from sklearn.metrics import accuracy_score
# Set random seeds for reproducibility
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)


### source code

In [17]:

def get_phi(m, D, which_phi='performer', device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Function that returns the random feature map, phi.
    Since our neuron-astrocyte model is equivalent to using Random Feature Attention,
    we use this representation for simplicity. Different phi functions lead to different feature maps.
    """
    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Random weight matrix for random feature map
    W = torch.randn((m, D), device=device)

    if which_phi == 'cosine':
        # Random biases for cosine feature map
        rand_b = torch.rand(m, device=device) * 2 * torch.pi

        def phi(x, c=0):
            """Uses a cosine random feature map to approximate softmax attention."""
            return torch.sqrt(2 / m) * torch.cos(W @ x + rand_b) * torch.exp(0.5 * (torch.norm(x) ** 2) - c)

    elif which_phi == 'performer':
        def phi(x, c=0):
            """Uses an exponential random feature map to approximate softmax attention."""
            return torch.exp(-0.5 * torch.log(torch.tensor(m, device=device)) + W @ x - 0.5 * (torch.norm(x) ** 2))

    elif which_phi == 'linear':
        def phi(x, c=0):
            """Uses a linear random feature map to approximate softmax attention."""
            h = -0.5 * torch.log(torch.tensor(m, device=device)) + W @ x - 0.5 * (torch.norm(x) ** 2)
            return 1 + h

    elif which_phi == 'truncated_performer':
        def phi(x, thresh=150):
            """Uses an exponential random feature map to approximate softmax attention."""
            scaling_factors = torch.exp(-0.5 * torch.log(torch.tensor(m, device=device)) - 0.5 * (torch.norm(x) ** 2))
            h = torch.exp(W @ x)
            return scaling_factors * torch.clamp(h, min=0, max=thresh)

    elif which_phi == 'positive_cosine':
        # Random biases for cosine feature map
        rand_b = torch.rand(m, device=device) * 2 * torch.pi

        def phi(x, thresh=10):
            """Uses a positive cosine random feature map to approximate softmax attention."""
            scaling_factors = torch.sqrt(2 / (torch.pi * m)) * torch.exp(0.5 * (torch.norm(x) ** 2))
            h = torch.cos(W @ x + rand_b)
            return torch.clamp(scaling_factors * h, min=0)

    elif which_phi == 'dima_sin':
        # Random biases for cosine feature map
        rand_b = torch.rand(m, device=device) * 2 * torch.pi

        def clipped_sin(x):
            """Clips the sine values."""
            return torch.where(x > torch.pi / 2, 1, torch.where(x < -torch.pi / 2, -1, torch.sin(x)))

        def phi(x, thresh=10):
            """Uses a sine-based random feature map to approximate softmax attention."""
            scaling_factors = torch.sqrt(2 / m) * torch.exp(0.5 * (torch.norm(x) ** 2))
            h = clipped_sin(W @ x + rand_b)
            return scaling_factors * h

    else:
        raise ValueError(f"Unknown phi type: {which_phi}")

    return phi




def get_astro_responses(query_layer, key_layer, nhead, phi):
    """
    Computes astrocyte response given a random feature map, queries, and keys.

    Args:
        query_layer: Tensor of shape (n_sample, ntokens, dim)
        key_layer: Tensor of shape (nhead, ntokens, dim)
        nhead: Integer index for the current head
        phi: Function to apply to the keys and queries

    Returns:
        Tensor of shape (n_sample, ntokens, ntokens) representing astro_pulses.
    """
    # Get the device of the query_layer
    device = query_layer.device

    rfa_normalized_keys = phi(key_layer[nhead])
    transformed_queries = phi(query_layer)


    rfa_normalized_keys = rfa_normalized_keys.T

    # Perform batched matrix multiplication
    astro_pulses = torch.matmul(transformed_queries, rfa_normalized_keys)

    return astro_pulses


def neurogenesis(head_size, query_layer, key_layer, nhead, phi='performer'):
    # Normalize Q and K matrices appropriately
    query_layer = query_layer / head_size ** (1/4)
    key_layer = key_layer / head_size ** (1/4)

    # Ensure tensors are on GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    query_layer = query_layer.to(device)
    key_layer = key_layer.to(device)


    phi_low_m = get_phi(m=512, D=head_size, which_phi=phi)


    astro_ps_low_m = get_astro_responses(query_layer, key_layer, 0, phi_low_m)

    # Move the result back to CPU for further processing or conversion to NumPy
    return astro_ps_low_m.cpu().detach().numpy()

In [18]:


class CustomBERTEncoderBlock(nn.Module):
    def __init__(self, embed_size, ff_hidden_size, dropout=0.1, neurogenesis=True, phi='performer'):

        super(CustomBERTEncoderBlock, self).__init__()
        self.query_fc = nn.Linear(embed_size, embed_size)
        self.key_fc = nn.Linear(embed_size, embed_size)
        self.value_fc = nn.Linear(embed_size, embed_size)
        self.attn_out_fc = nn.Linear(embed_size, embed_size)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.ffn = nn.Sequential(
            nn.Linear(embed_size, ff_hidden_size),
            nn.ReLU(),
            nn.Linear(ff_hidden_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)
        self.neurogenesis = neurogenesis
        self.phi = phi

    def forward(self, x):
        query = self.query_fc(x)
        key = self.key_fc(x)
        value = self.value_fc(x)
        attn_scores = torch.matmul(query, key.transpose(-2, -1)) / (query.size(-1) ** 0.5)
        # Apply neurogenesis only if enabled
        if self.neurogenesis:
            low = neurogenesis(512, query, key, 4, phi=self.phi)
            # # Convert low and high to PyTorch tensors before applying softmax
            low = torch.tensor(low, device=x.device, dtype=torch.float32)  # Ensure correct data type
            attn_weights = F.softmax(low, dim=-1)
        else:
            attn_weights = F.softmax(attn_scores, dim=-1)
        attn_output = torch.matmul(attn_weights, value)

        x = self.norm1(x + self.dropout(attn_output))
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))

        return x, key, query, value


class BertForSequenceClassification_Neuro(PreTrainedModel): # Inherit from PreTrainedModel
    def __init__(self, config, pretrained_model_name='bert-large-uncased', num_classes=8, ff_hidden_size=2048, dropout=0.1, use_custom_encoder=True, neuro_genesis=True, phi='performer'):

        super(BertForSequenceClassification_Neuro, self).__init__(config) # Pass config to super()
        self.bert = BertModel.from_pretrained(pretrained_model_name, config=config) # Pass config to BertModel
        self.use_custom_encoder = use_custom_encoder
        self.num_labels = num_classes
        embed_size = self.bert.config.hidden_size

        if use_custom_encoder:
            self.custom_encoder = CustomBERTEncoderBlock(embed_size, ff_hidden_size, dropout,neurogenesis=neurogenesis, phi=phi)

        self.classifier = nn.Linear(embed_size, num_classes)
        self.dropout = nn.Dropout(dropout)


    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        sequence_output = bert_output.last_hidden_state
        pooled_output = bert_output.pooler_output

        if self.use_custom_encoder:
            sequence_output, key, query, value = self.custom_encoder(sequence_output)
            pooled_output = sequence_output[:, 0, :]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if self.use_custom_encoder:
            return logits
        return logits


In [19]:

def read_json(path):
    """ Read a json file from the given path."""
    with open(path, 'r') as f:
        data = json.load(f)
    return data

def write_json(data, path):
    """ Write a json file to the given path."""
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))

    with open(path, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)



def data_preparation(data):
  data_prepared = []
  for item in data:

    sentence = item["sentence"]
    entity1 = item["subject"]
    entity2 = item["object"]
    relation = item["relation"]
    sentence_e1 = sentence.replace(entity1, f"[E1]{entity1}[/E1]")
    sentence_e2 = sentence_e1.replace(entity2, f"[E2]{entity2}[/E2]")
    row = {"sentence": "[CLS] "+sentence_e2, "relation": relation, "e1":entity1, "e2":entity2}
    data_prepared.append(row)
  return data_prepared


class RelationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }





In [20]:

def train_model(epochs, batch_size, val_dataset, train_dataset, model, run_id, task_id, patience=3):
    """
    Trains a BERT-based model on a given dataset with validation, early stopping, and best model loading.

    Args:
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training and validation.
        val_dataset: Validation dataset.
        train_dataset: Training dataset.
        model: Pretrained model to fine-tune.
        run_id (str): Unique identifier for the training run.
        task_id (str): Task identifier for model saving.
        patience (int): Number of epochs to wait for validation loss improvement before stopping.

    Returns:
        model: The best trained model (based on validation loss).
        tokenizer: The tokenizer associated with the model.
        train_hist: Training history containing loss, accuracy, and learning rate for each epoch.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    model = model.to("cuda")
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)  # Adjust learning rate
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)
    criterion = torch.nn.CrossEntropyLoss()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    train_hist = []
    best_val_loss = float('inf')
    best_model_state = None
    no_improvement_epochs = 0

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        # Training phase
        model.train()
        train_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc="Training", leave=False)
        for batch in train_loader_tqdm:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to("cuda")
            attention_mask = batch['attention_mask'].to("cuda")
            labels = batch['label'].to("cuda")

            outputs = model(input_ids, attention_mask)
            logits = outputs

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            train_loader_tqdm.set_postfix({"Batch Loss": loss.item()})

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        val_preds, val_labels = [], []
        val_loader_tqdm = tqdm(val_loader, desc="Validation", leave=False)
        with torch.no_grad():
            for batch in val_loader_tqdm:
                input_ids = batch['input_ids'].to("cuda")
                attention_mask = batch['attention_mask'].to("cuda")
                labels = batch['label'].to("cuda")

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs
                loss = criterion(logits, labels)
                val_loss += loss.item()

                val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

                val_loader_tqdm.set_postfix({"Batch Loss": loss.item()})

        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(val_labels, val_preds)

        scheduler.step(val_loss)

        print(f"Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}, Learning Rate = {scheduler.optimizer.param_groups[0]['lr']:.6f}")

        epoch_log = {
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'val_accuracy': val_accuracy,
            'learning_rate': scheduler.optimizer.param_groups[0]['lr']
        }
        train_hist.append(epoch_log)

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict()
            no_improvement_epochs = 0
        else:
            no_improvement_epochs += 1

        # Early stopping
        if no_improvement_epochs >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs.")
            break

    # Load the best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print("Loaded the best model based on validation loss.")

    print("Training complete.")
    return model, tokenizer, train_hist


In [21]:

def test_model(model, test_dataset, batch_size):
  model.eval()
  test_preds, test_labels = [], []
  test_loader = DataLoader(test_dataset, batch_size=batch_size)
  test_loader_tqdm = tqdm(test_loader, desc="Testing", leave=False)
  with torch.no_grad():
      for batch in test_loader_tqdm:
          input_ids = batch['input_ids'].to("cuda")
          attention_mask = batch['attention_mask'].to("cuda")
          labels = batch['label'].to("cuda")
          outputs = model(input_ids=input_ids, attention_mask=attention_mask) # Pass attention_mask to the model

          logits = outputs
          test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())  # Use logits for argmax
          test_labels.extend(labels.cpu().numpy())
  print(test_labels)
  print(test_preds)
  test_accuracy = accuracy_score(test_labels, test_preds)

  print(f"Test Accuracy: {test_accuracy:.4f}")
  return test_accuracy

In [None]:


def main(out_dir, dataset_path, neuro_genesis, baseline_name,  batch_list, epoch_list, neuro_phi):


    # torch.cuda.empty_cache()
    for run_id in range(1,6):
        results = []

        # all_labels = []
        # all_seen_test_data = []
        # all_test_sentences = []
        # all_test_labels = []

        for task_id in range(1, 11):

            train_data = read_json(f"{dataset_path}/train/run_{run_id}/task{task_id}/train_1.json")
            train_prepared = data_preparation(train_data)
            test_data = read_json(f"{dataset_path}/test/run_{run_id}/task{task_id}/test_1.json")
            test_prepared = data_preparation(test_data)
            val_data = read_json(f"{dataset_path}/train/run_{run_id}/task{task_id}/dev_1.json")
            val_prepared = data_preparation(val_data)
            # all_labels.extend([item['relation'] for item in train_prepared])
            # Create unique mapping for all labels


            train_labels = [item['relation'] for item in train_prepared]
            train_sentences = [item['sentence'] for item in train_prepared]
            test_labels = [item['relation'] for item in test_prepared]
            test_sentences = [item['sentence'] for item in test_prepared]
            val_labels = [item['relation'] for item in val_prepared]
            val_sentences = [item['sentence'] for item in val_prepared]
            label_to_int = {label: idx for idx, label in enumerate(set(train_labels))}
            # all_test_sentences.extend(test_sentences)
            # all_test_labels.extend(test_labels)

            # # Convert labels to integers using the pre-calculated mapping
            train_labels = [label_to_int[label] for label in train_labels]
            val_labels = [label_to_int[label] for label in val_labels]
            test_labels = [label_to_int[label] for label in test_labels]

            # test_labels_seen = [label_to_int[label] for label in all_test_labels]

            # Create Dataset and DataLoader
            tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')


                # Load the configuration first
            config = BertConfig.from_pretrained('bert-large-uncased', num_labels=len(label_to_int))
            # Then pass it to the model constructor
            model = BertForSequenceClassification_Neuro(config, pretrained_model_name='bert-large-uncased', num_classes=len(label_to_int), use_custom_encoder=True, neuro_genesis=neuro_genesis, phi=neuro_phi)
            # else:

            #     #The old model should have the total number of labels encountered till the last task.
            #     old_labels = model.num_labels
            #     print(f"Old labels: {old_labels}")
            #     model = increment_class_labels(model, new_num_labels=len(label_to_int)) #Use total number of labels for current task

            train_dataset = RelationDataset(train_sentences, train_labels, tokenizer, max_length=512)
            val_dataset = RelationDataset(val_sentences, val_labels, tokenizer, max_length=512)

            test_dataset = RelationDataset(test_sentences, test_labels, tokenizer, max_length=512)
            # all_seen_test_data = RelationDataset(all_test_sentences, test_labels_seen, tokenizer, max_length=512)

            for i, epoch in enumerate(epoch_list):

                for batch in batch_list:
                    model, tokenizer, train_hist = train_model(epoch, batch, val_dataset, train_dataset, model, run_id, task_id)

                    write_json(train_hist, f"{out_dir}/train_hist_{run_id}_{task_id}.json")
                    model_name_hf = f"{baseline_name}_{run_id}_{task_id}"
                    # base_model_hf = f"Sefika/bert_large_baseline_{run_id}_{task_id}"
                    model.push_to_hub(model_name_hf, private=True)
                    test_acc = test_model(model, test_dataset, batch)
                    # test_seen_acc = test_model(model, all_seen_test_data, batch)

                    print(f"Epoch-------------{epoch}=={i}")
                    result = {"run_id":run_id, "task_id": task_id, "epoch": epoch, "batch_size": batch, "test_acc": test_acc}
                    results.append(result)
                    write_json(results, f"{out_dir}/results_{run_id}.json")

                    torch.cuda.empty_cache()
                # del model
        del tokenizer
        del model

if __name__ == "__main__":
    output_dir = "./neurogenesis_results_low"
    neuro_genesis = True
    epoch_list = [1]
    batch_list = [16]
    baseline_name = "bert_large_performer_neurogenesis"
    dataset_path = "/content/tacred/final"
    neuro_genesis_type_phi = "performer"
    main(output_dir, dataset_path, neuro_genesis,baseline_name, epoch_list, batch_list, neuro_genesis_type_phi)


Epoch 1/16


Training:  48%|████▊     | 882/1820 [01:46<02:00,  7.82it/s, Batch Loss=1.03]