<a href="https://www.kaggle.com/code/skshmjn/long-former-fine-tuning-multi-label-classification?scriptVersionId=215228298" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
from datasets import load_dataset
import numpy as np
import torch 

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from transformers import (
    LongformerTokenizer,
    LongformerForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
LongformerConfig,


)

In [2]:
dataset = load_dataset("owaiskha9654/PubMed_MultiLabel_Text_Classification_Dataset_MeSH")

README.md:   0%|          | 0.00/960 [00:00<?, ?B/s]

(…)ext Classification Dataset Processed.csv:   0%|          | 0.00/120M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

## First we will pretrain our longformer for masked word on textual data, so weights are updated for biomedical language


In [3]:
list(dataset['train'].features.keys())

['Title',
 'abstractText',
 'meshMajor',
 'pmid',
 'meshid',
 'meshroot',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'L',
 'M',
 'N',
 'Z']

In [4]:
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

In [5]:
def tokenize_function(examples):
    return tokenizer(
        examples["abstractText"],
        # padding="max_length",
        truncation=True,
        max_length=2560,  
    )

In [6]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=list(dataset['train'].features.keys()))

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
print(tokenized_datasets['train'][0:1])

{'input_ids': [[0, 597, 22129, 12, 10231, 2242, 3707, 179, 14224, 11576, 9042, 31, 1484, 19, 25599, 2911, 19036, 36, 2146, 1200, 43, 8, 19, 29851, 1668, 36, 3103, 1200, 43, 58, 24305, 4, 33264, 21, 12333, 8, 2006, 11, 80, 5612, 4, 30800, 6, 634, 4281, 9770, 6, 4986, 41217, 5708, 26929, 58, 32755, 131, 36013, 5, 1468, 4634, 4756, 21, 13773, 30, 9284, 1938, 5448, 634, 25655, 261, 40143, 1242, 219, 2794, 1105, 12, 510, 22434, 4513, 4, 13064, 13436, 9, 1907, 231, 6, 365, 6, 545, 6, 504, 6, 2357, 58, 2006, 4, 20, 181, 12, 4540, 8151, 21, 8446, 12198, 30, 13998, 2678, 661, 43941, 5448, 4, 33264, 7910, 21, 747, 3059, 19, 25599, 2911, 19036, 8, 29851, 1668, 4, 96, 29851, 1668, 4412, 33264, 545, 8, 504, 19, 239, 15, 438, 23982, 801, 58, 303, 4, 20, 181, 12, 4540, 21, 1455, 7154, 6, 8, 11, 2289, 20631, 4, 440, 22792, 21, 6373, 227, 2621, 9, 181, 12, 4540, 8, 13064, 13436, 5708, 4, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True, 
    mlm_probability=0.15,
)

In [9]:
def compute_metrics(eval_preds):
    """
    Computes perplexity based on the predictions.
    """
    loss = eval_preds.metrics["eval_loss"]  # Hugging Face provides loss directly
    perplexity = math.exp(loss) if loss < 300 else float("inf")  # Avoid overflow
    return {"perplexity": perplexity}

In [10]:
training_args = TrainingArguments(
    output_dir="./pretrained_mlm_longformer_medical_data",  # Directory to save the model
    overwrite_output_dir=True,
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=1,  # Smaller batch size due to larger sequences
    save_steps=25000,  # Save checkpoint every 500 steps
    save_total_limit=2,  # Keep only the last two checkpoints
    logging_dir="./logs",  # Directory for logs
    logging_steps=10000,  # Log metrics every 100 steps
    prediction_loss_only=True, 
    learning_rate=5e-5,  # Learning rate
    report_to="none",  # Disable default reporting
    fp16=True, 
)


In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    compute_metrics=compute_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [12]:
# trainer.train()

## FineTuning

In [13]:
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")



In [14]:
labels_class = ['A','B','C','D','E','F','G','H','I','J','L','M','N','Z']
def preprocess_data(examples):

    ## This is supposed to be used with Batch data, 
    text = examples["abstractText"]
    
    encoding = tokenize_function(examples)
    
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels_class}
    
    
    labels_matrix = np.zeros((len(text), len(labels_class)), dtype=float)
    # # fill numpy array
    for idx, label in enumerate(labels_class):
        labels_matrix[:, idx] = labels_batch[label]
    
    encoding["labels"] = labels_matrix.tolist()
    
    
    return encoding

In [15]:
tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=list(dataset['train'].features.keys()))

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [16]:
print(tokenized_dataset['train'][0])

{'input_ids': [0, 597, 22129, 12, 10231, 2242, 3707, 179, 14224, 11576, 9042, 31, 1484, 19, 25599, 2911, 19036, 36, 2146, 1200, 43, 8, 19, 29851, 1668, 36, 3103, 1200, 43, 58, 24305, 4, 33264, 21, 12333, 8, 2006, 11, 80, 5612, 4, 30800, 6, 634, 4281, 9770, 6, 4986, 41217, 5708, 26929, 58, 32755, 131, 36013, 5, 1468, 4634, 4756, 21, 13773, 30, 9284, 1938, 5448, 634, 25655, 261, 40143, 1242, 219, 2794, 1105, 12, 510, 22434, 4513, 4, 13064, 13436, 9, 1907, 231, 6, 365, 6, 545, 6, 504, 6, 2357, 58, 2006, 4, 20, 181, 12, 4540, 8151, 21, 8446, 12198, 30, 13998, 2678, 661, 43941, 5448, 4, 33264, 7910, 21, 747, 3059, 19, 25599, 2911, 19036, 8, 29851, 1668, 4, 96, 29851, 1668, 4412, 33264, 545, 8, 504, 19, 239, 15, 438, 23982, 801, 58, 303, 4, 20, 181, 12, 4540, 21, 1455, 7154, 6, 8, 11, 2289, 20631, 4, 440, 22792, 21, 6373, 227, 2621, 9, 181, 12, 4540, 8, 13064, 13436, 5708, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [17]:
labels = [label for label in dataset['train'].features.keys() if label in labels_class]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
label2id

{'A': 0,
 'B': 1,
 'C': 2,
 'D': 3,
 'E': 4,
 'F': 5,
 'G': 6,
 'H': 7,
 'I': 8,
 'J': 9,
 'L': 10,
 'M': 11,
 'N': 12,
 'Z': 13}

In [18]:
# config = LongformerConfig(attention_window=2560)
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/pretrained_mlm_longformer_medical_data_check_point/transformers/default/1",  
                                                           num_labels=len(labels_class), 
                                                           id2label=id2label, 
                                                           label2id=label2id,
                                                           attention_window = 512,
                
                                                           problem_type = "multi_label_classification" )

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/pretrained_mlm_longformer_medical_data_check_point/transformers/default/1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def multi_label_metrics(eval, threshold=0.5):
    # Apply sigmoid to raw logits
    logits, labels = eval
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(logits))
    
    # Convert probabilities to binary predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    
    # Compute metrics
    f1_micro_average = f1_score(y_true=labels, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(labels, y_pred, average='micro')
    accuracy = accuracy_score(labels, y_pred)
    
    # Return metrics
    metrics = {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }
    return metrics

In [20]:
tokenized_dataset.set_format("torch")


In [21]:
train_split, temp_split = tokenized_dataset["train"].train_test_split(test_size=0.2, seed=42).values()

In [22]:
validation_split, test_split = temp_split.train_test_split(
    test_size=0.5, seed=42
).values()

In [23]:
validation_split[0]['labels']

tensor([0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1.])

In [24]:
test_split[0]

{'input_ids': tensor([    0, 40167, 45604,  4248,    83,  3755,   104,    35, 21658,   415,
         10100,   163,  6793,    36, 31529,   846,    43, 12358, 42190,   230,
            16,  3059,    19,     5,   709,     9, 45441,  3938,  7841,  8244,
         30708,  4982,    36,   725,  3376,   238,  1118,    19, 12358, 42190,
           163,     4,   152,   892,  5026,     7,  4830,   549, 22783,   846,
         12358, 43981,  2712,     5,  8474, 43671, 26374,  1575,     8,   251,
            12,  1279, 17618, 32444,     9,  1484,    71,  5350,  3693, 25806,
         14970,     9,   289,  3376,     4, 49767,   104,    35,   312,  3995,
         38994,  7931,    31,  5356,  1484,    19, 22783,   846,    12,  3368,
           289,  3376,    58,  4776,    13, 22783,   846, 12358, 42190,   634,
            10, 22481,  5448,     4, 49043,    35,   208, 29262,     9,  5356,
          1484,    36,  5607,     4,   398,  8871, 11793,  5350,  3693, 25806,
         14970,     9,   289,  3376,   

In [25]:
metric_name = "f1"
data_collator = DataCollatorWithPadding(tokenizer)
args = TrainingArguments(
    f"longformer-finetuned-multi-class-sentiment-analysis",
    eval_strategy = "epoch",
    report_to="none",
    save_strategy = "epoch",
    learning_rate=2e-5,
    auto_find_batch_size="True",
    # per_device_train_batch_size=2,
    # per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [26]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=multi_label_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [27]:
trainer.train()

Initializing global attention on CLS token...
Initializing global attention on CLS token...
Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.2702,0.270517,0.862332,0.884191,0.1932
2,0.2324,0.263576,0.865935,0.887028,0.2114




TrainOutput(global_step=20000, training_loss=0.26201269187927245, metrics={'train_runtime': 15454.0022, 'train_samples_per_second': 5.177, 'train_steps_per_second': 1.294, 'total_flos': 2.302315047038568e+16, 'train_loss': 0.26201269187927245, 'epoch': 2.0})

In [28]:
trainer.evaluate(test_split)



{'eval_loss': 0.2620165944099426,
 'eval_f1': 0.8665859266670177,
 'eval_roc_auc': 0.8870936145068316,
 'eval_accuracy': 0.2072,
 'eval_runtime': 378.3044,
 'eval_samples_per_second': 13.217,
 'eval_steps_per_second': 0.827,
 'epoch': 2.0}