In [1]:
import pandas as pd 
import numpy as np 
import torch
torch.cuda.empty_cache()
from torch.cuda.amp import autocast
from datasets import load_dataset 
from datasets import Dataset 
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from transformers import AutoTokenizer, XLNetForSequenceClassification, DistilBertForSequenceClassification, TrainingArguments, Trainer
# AutoTokenizer : Automatically loads the appropriate tokenizer for a given pre-trained model.
# AutoModelForSequenceClassification:  Loads a pre-trained transformer model for sequence classification tasks (e.g., sentiment analysis, spam detection, etc.)
# XLNetTokenizer : Loads the XLNet tokenizer
# XLNetForSequenceClassification : Loads the XLNet model for sequence classification
# TrainingArguments: Contains all the hyperparameters needed for training a model
# Trainer: A high-level interface for training models

  from .autonotebook import tqdm as notebook_tqdm


## 1- Load Dataset
The dataset is downlaoded form this page:

https://huggingface.co/owaiskha9654/Multi-Label-Classification-of-PubMed-Articles

In [2]:
path = "owaiskha9654/PubMed_MultiLabel_Text_Classification_Dataset_MeSH"
dataset = load_dataset(path, split = 'train')
dataset= pd.DataFrame(dataset)
dataset

Unnamed: 0,Title,abstractText,meshMajor,pmid,meshid,meshroot,A,B,C,D,E,F,G,H,I,J,L,M,N,Z
0,Expression of p53 and coexistence of HPV in pr...,Fifty-four paraffin embedded tissue sections f...,"['DNA Probes, HPV', 'DNA, Viral', 'Female', 'H...",8549602,"[['D13.444.600.223.555', 'D27.505.259.750.600....","['Chemicals and Drugs [D]', 'Organisms [B]', '...",0,1,1,1,1,0,0,1,0,0,0,0,0,0
1,Vitamin D status in pregnant Indian women acro...,The present cross-sectional study was conducte...,"['Adult', 'Alkaline Phosphatase', 'Breast Feed...",21736816,"[['M01.060.116'], ['D08.811.277.352.650.035'],...","['Named Groups [M]', 'Chemicals and Drugs [D]'...",0,1,1,1,1,1,1,0,1,1,0,1,1,1
2,[Identification of a functionally important di...,The occurrence of individual amino acids and d...,"['Amino Acid Sequence', 'Analgesics, Opioid', ...",19060934,"[['G02.111.570.060', 'L01.453.245.667.060'], [...","['Phenomena and Processes [G]', 'Information S...",1,1,0,1,1,0,1,0,0,0,1,0,0,0
3,Multilayer capsules: a promising microencapsul...,"In 1980, Lim and Sun introduced a microcapsule...","['Acrylic Resins', 'Alginates', 'Animals', 'Bi...",11426874,"[['D05.750.716.822.111', 'D25.720.716.822.111'...","['Chemicals and Drugs [D]', 'Technology, Indus...",1,1,1,1,1,0,1,0,0,1,0,0,0,0
4,"Nanohydrogel with N,N'-bis(acryloyl)cystine cr...",Substantially improved hydrogel particles base...,"['Antineoplastic Agents', 'Cell Proliferation'...",28323099,"[['D27.505.954.248'], ['G04.161.750', 'G07.345...","['Chemicals and Drugs [D]', 'Phenomena and Pro...",1,1,0,1,1,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Five donors-one recipient: modeling a mosaic o...,BACKGROUND: A 21-year-old man was admitted to ...,"['Adult', 'Cell Transplantation', 'Cord Blood ...",18364724,"[['M01.060.116'], ['E02.095.147.500', 'E04.936...","['Named Groups [M]', 'Analytical, Diagnostic a...",1,1,1,0,1,0,1,0,0,0,0,1,0,0
49996,The role of eicosanoids in cyclosporine nephro...,Nephrotoxicity is the most troublesome complic...,"['Animals', 'Cyclosporins', 'In Vitro Techniqu...",2735953,"[['B01.050'], ['D04.345.566.235', 'D12.644.641...","['Organisms [B]', 'Chemicals and Drugs [D]', '...",1,1,0,1,1,0,0,0,0,0,0,0,0,0
49997,Impact of pancreaticoduodenal arcade dilation ...,BACKGROUND: The aim of this study was to inves...,"['Adult', 'Aged', 'Aged, 80 and over', 'Dilata...",28919282,"[['M01.060.116'], ['M01.060.116.100'], ['M01.0...","['Named Groups [M]', 'Diseases [C]', 'Anatomy ...",1,1,1,0,1,0,0,0,0,0,0,1,1,0
49998,Outcomes of Preterm Infants following Discussi...,OBJECTIVES: To describe the frequency of postn...,"['Decision Making', 'Female', 'Humans', 'Infan...",28647272,"[['F02.463.785.373'], ['B01.050.150.900.649.31...","['Psychiatry and Psychology [F]', 'Organisms [...",0,1,0,0,1,1,0,1,0,0,0,1,1,0


In [3]:
text_col = 'abstractText'
label_names = list(dataset.columns[6:])
dataset = dataset[[text_col]+ label_names ]
num_labels = len(label_names) 
print(f"Label names: {label_names}")
print(f"Number of labels: {num_labels}")
print(dataset.head())

Label names: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']
Number of labels: 14
                                        abstractText  A  B  C  D  E  F  G  H  \
0  Fifty-four paraffin embedded tissue sections f...  0  1  1  1  1  0  0  1   
1  The present cross-sectional study was conducte...  0  1  1  1  1  1  1  0   
2  The occurrence of individual amino acids and d...  1  1  0  1  1  0  1  0   
3  In 1980, Lim and Sun introduced a microcapsule...  1  1  1  1  1  0  1  0   
4  Substantially improved hydrogel particles base...  1  1  0  1  1  0  1  0   

   I  J  L  M  N  Z  
0  0  0  0  0  0  0  
1  1  1  0  1  1  1  
2  0  0  1  0  0  0  
3  0  1  0  0  0  0  
4  0  1  0  0  0  0  


In [4]:
# convert the labels to a list of integers
dataset["labels"] = dataset[label_names].apply(lambda x: list(x), axis=1)
# drop the label columns
dataset = dataset.drop(columns=label_names)
dataset  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["labels"] = dataset[label_names].apply(lambda x: list(x), axis=1)


Unnamed: 0,abstractText,labels
0,Fifty-four paraffin embedded tissue sections f...,"[0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
1,The present cross-sectional study was conducte...,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1]"
2,The occurrence of individual amino acids and d...,"[1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0]"
3,"In 1980, Lim and Sun introduced a microcapsule...","[1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]"
4,Substantially improved hydrogel particles base...,"[1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]"
...,...,...
49995,BACKGROUND: A 21-year-old man was admitted to ...,"[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]"
49996,Nephrotoxicity is the most troublesome complic...,"[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
49997,BACKGROUND: The aim of this study was to inves...,"[1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0]"
49998,OBJECTIVES: To describe the frequency of postn...,"[0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0]"


In [5]:
# divide the dataset into train (50%), val (25%), and test(25%) sets 
#copy_dataset = dataset.copy()
train_samples_numbers = int(dataset.shape[0]*0.5)
val_samples_numbers = int(dataset.shape[0]*0.75)

train_df = dataset[:train_samples_numbers] 
val_df = dataset[train_samples_numbers:val_samples_numbers]
test_df = dataset[val_samples_numbers:]

# Dataset.from_pandas is a method from the Hugging Face datasets library, used to convert a Pandas DataFrame into a Dataset object
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

## 2- Fine-tune XLNet model and its tokenizer

In [6]:
def tokenizing(batched_text):
    return tokenizer(batched_text[text_col], padding = 'max_length', truncation = True)

# pre_train_weights = 'xlnet-base-cased'
# tokenizer = AutoTokenizer.from_pretrained(pre_train_weights)
# model = XLNetForSequenceClassification.from_pretrained(pre_train_weights, num_labels=num_labels)

pretrain_model = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
model = DistilBertForSequenceClassification.from_pretrained(pretrain_model, num_labels=num_labels)

# tokenize the datasets, convert text into tokens
train_encoded = train_dataset.map(tokenizing, batched=True, batch_size=100)
val_encoded = val_dataset.map(tokenizing, batched=True, batch_size=100)
test_encoded = test_dataset.map(tokenizing, batched=True, batch_size=100)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 25000/25000 [00:04<00:00, 5496.39 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 12500/12500 [00:02<00:00, 5622.37 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 12500/12500 [00:02<00:00, 5522.68 examples/s]


In [7]:
def compute_metrics(pred):
    y_pred, y_true = pred
    y_pred =torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    y_pred = y_pred.sigmoid()>0.5
    y_true= y_true.bool()
    precision= recall_score(y_true, y_pred, average='micro', pos_label=1)
    precision= precision_score(y_true, y_pred, average='micro', pos_label=1)
    f1= f1_score(y_true, y_pred, average='micro', pos_label=1)
    return {
        'f1': f1,
        'precision': precision,
        'recall': precision
    }

### - For single-label multi-class classification, we use softmax activation followed by a cross-entropy loss function.

### - However, for multi-label mode, we need to use different activation and loss fucntion.

### - We use torch.n.BCEWithLogitsLoss() instead of torch.nn.CrossEntropyLoss()

### - BCEWithLogitsLoss() is a loss function in PyTorch that combines binary cross-entropy (BCE) loss with a sigmoid activation function. It is commonly used for binary classification tasks and multi-label classification.

In [8]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        preds_ = logits.view(-1, self.model.config.num_labels)
        labels_ = labels.float().view(-1, self.model.config.num_labels)
        loss = loss_fct(preds_, labels_)
        return (loss, outputs) if return_outputs else loss


In [9]:
training_args = TrainingArguments(
    output_dir='./results', # output directory for model predictions and checkpoints
    num_train_epochs=30,  # total number of training epochs
    per_device_train_batch_size=8,  # reduced batch size to prevent CUDA OOM errors
    per_device_eval_batch_size=8,  # batch size for evaluation
    warmup_ratio=0.1,  # ratio of warmup steps - more flexible than fixed steps
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs', # directory to save logs
    do_eval=True,  # whether to evaluate during training
    do_train=True,  # whether to train the model
    save_strategy='epoch',  # save the model after each epoch
    evaluation_strategy='epoch',  # evaluate the model after each epoch
    logging_strategy='steps',  # log steps instead of epochs for more frequent updates
    report_to='tensorboard',  # report logs to TensorBoard
    logging_steps=100,  # how often to log the training loss
    fp16=True if torch.cuda.is_available() else False,  # whether to use mixed precision training
    load_best_model_at_end=True,  # load the best model when finished training
    metric_for_best_model='f1',  # use F1 score to determine best model
    greater_is_better=True,  # higher F1 is better
    seed=42,
    dataloader_drop_last=True,  # drop last incomplete batch
    gradient_accumulation_steps=2,  # accumulate gradients for effective larger batch size
    save_total_limit=3,  # limit the total amount of checkpoints saved
)



In [10]:
trainer = MultilabelTrainer(model=model, 
                            args=training_args, 
                            train_dataset=train_encoded, 
                            eval_dataset=val_encoded, 
                            compute_metrics=compute_metrics, 
                            tokenizer=tokenizer)

trainer.train()
   


  trainer = MultilabelTrainer(model=model,


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3361,0.316611,0.836207,0.842585,0.842585
2,0.2805,0.286011,0.85313,0.859742,0.859742
3,0.2732,0.285596,0.853958,0.857415,0.857415
4,0.242,0.282578,0.856274,0.86204,0.86204
5,0.192,0.312477,0.846687,0.858307,0.858307
6,0.1525,0.361015,0.845211,0.84817,0.84817
7,0.1117,0.405244,0.842708,0.847916,0.847916
8,0.085,0.469707,0.845564,0.846967,0.846967
9,0.0576,0.551106,0.844255,0.840125,0.840125
10,0.0461,0.597017,0.841422,0.844816,0.844816


TrainOutput(global_step=46860, training_loss=0.06802474762131415, metrics={'train_runtime': 9676.3928, 'train_samples_per_second': 77.508, 'train_steps_per_second': 4.843, 'total_flos': 9.930927231138202e+16, 'train_loss': 0.06802474762131415, 'epoch': 29.98112})

## 3- Test the performance of the fine-tuned model

In [11]:
test_res = trainer.predict(test_encoded)


In [12]:
pd.Series(compute_metrics(test_res[:2])).to_frame().transpose()

Unnamed: 0,f1,precision,recall
0,0.85702,0.862079,0.862079
