In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

#set device to cuda, if not available check mps else use cpu
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
else:
    device = torch.device('mps')

    
#print device type
print("Current Device", device)
torch.manual_seed(0)
KAGGLE = 1



MPS not available because the current PyTorch install was not built with MPS enabled.
Current Device cuda


In [3]:

dataset_path = ['data/Multi-Label Text Classification Dataset.csv', '/kaggle/input/multi-label-text-cls/Multi-Label Text Classification Dataset.csv'][KAGGLE]
interrupt_save_folder = ['interrupt', '/kaggle/working'][KAGGLE]
save_folder = ['saved', '/kaggle/working'][KAGGLE]

In [4]:
data_df = pd.read_csv(dataset_path)
labels = "A,B,C,D,E,F,G,H,I,J,L,M,N,Z".split(',')
num_labels = len(labels)
print(labels, num_labels)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z'] 14


### Testing

In [5]:
from transformers import BertTokenizer, BertModel

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [6]:
# tokens = tokenizer.encode("Hello, my [MASK] is John.")
# mask_pos = tokens.index(tokenizer.mask_token_id)
# print(mask_pos)
# out = bert_model(torch.tensor([tokens]).to(device))
# print(out.last_hidden_state.shape)

## Class Definitions

In [20]:
import time
from tqdm import tqdm

class Dataset(torch.utils.data.Dataset):
    def __init__(self, sequences, labels:torch.Tensor, tokenizer):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        char = self.tokenizer(self.sequences[idx], add_special_tokens = True,return_tensors='pt', padding='max_length', truncation=True)
        encoded_seq = char['input_ids']
        attention_mask = char['attention_mask']
        #reshape the pytorch tensor to be flattened because single element
        return encoded_seq[0], attention_mask[0], self.labels[idx]
        # return self.encoded_seqs[idx],self.attention_masks[idx], self.labels[idx]

class BERT_Base_Multilabel(torch.nn.Module):
    def __init__(self, num_labels): 
        """num_labels: number of labels to classify
           database: tuple of (X, Y) where X is a list of sentences and Y is a tensor of labels
        """
        super().__init__()
        print("Initializing BERT_Base_Multilabel...")

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.cls_head = torch.nn.Sequential(
            torch.nn.Linear(768, 1024),
            torch.nn.ReLU(),
            torch.nn.Linear(1024, num_labels),
            torch.nn.Sigmoid()
        )
        self.loss_fn = torch.nn.BCELoss()
        print("Initialized.")
    
    def forward(self, encoded_seqs, attention_masks):
        """Input: sequence (str) of shape (batch_size, seq_len)"""
        bert_out = self.bert(encoded_seqs, attention_mask=attention_masks)
        clshead_output = self.cls_head(bert_out.last_hidden_state[:, 0, :]) #use the first token to classify
        return clshead_output
    
    def predict(self, sequence):
        with torch.no_grad():
            self.eval()
            return self.forward(sequence)
    
    def save(self, path):
        torch.save(self.state_dict(), path) #save the model state dict

    def load(self, path):
        self.load_state_dict(torch.load(path))

    def fit(self, epochs, batch_size, lr, dataset:torch.utils.data.Dataset, epochs_done = 0):
        self.train()
        
        optimizer = torch.optim.AdamW(self.parameters(), lr=lr)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
        completed = epochs_done
        try:
            for epoch in range(epochs_done, epochs):
                print(f"Epoch {epoch}")
                pbar = tqdm(dataloader)
                for batch in pbar:
                    # print("here1")
                    optimizer.zero_grad()
                    encoded_seqs, attention_masks, labels = batch
                    encoded_seqs = encoded_seqs.to(device)
                    attention_masks = attention_masks.to(device)
                    labels = labels.to(device)
                    # print("here2")
                    output = self.forward(encoded_seqs, attention_masks)
                    loss = self.loss_fn(output, labels)
                    # print("here3")
                    loss.backward()
                    optimizer.step()
                    # print("here4")
                    pbar.set_description(f"Loss: {loss.item()}")
                print(f"Epoch {epoch+1} completed. Training Loss: {loss.item()}")
                completed += 1
            self.save(f"{save_folder}/bertbaseuncased_{completed}_{time.strftime('%Y-%m-%d_%H:%M:%S')}.pt")
            
        except KeyboardInterrupt:
            print("Training interrupted.")
            #save the model by date and time of interruption
            self.save(f"{interrupt_save_folder}/bertbaseuncased_interrupt_{epoch}_{time.strftime('%Y-%m-%d_%H:%M:%S')}.pt")

class BERT_BaseCased_Multilabel(torch.nn.Module):
    def __init__(self, num_labels): 
        """num_labels: number of labels to classify
           database: tuple of (X, Y) where X is a list of sentences and Y is a tensor of labels
        """
        super().__init__()
        print("Initializing BERT_BaseCased_Multilabel...")

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.cls_head = torch.nn.Sequential(
            torch.nn.Linear(768, 1024),
            torch.nn.ReLU(),
            torch.nn.Linear(1024, num_labels),
            torch.nn.Sigmoid()
        )
        self.loss_fn = torch.nn.BCELoss()
        print("Initialized.")
    
    def forward(self, encoded_seqs, attention_masks):
        """Input: sequence (str) of shape (batch_size, seq_len)"""
        bert_out = self.bert(encoded_seqs, attention_mask=attention_masks)
        clshead_output = self.cls_head(bert_out.last_hidden_state[:, 0, :]) #use the first token to classify
        return clshead_output
    
    def predict(self, sequence):
        with torch.no_grad():
            self.eval()
            return self.forward(sequence)
    
    def save(self, path):
        torch.save(self.state_dict(), path) #save the model state dict

    def load(self, path):
        self.load_state_dict(torch.load(path))

    def fit(self, epochs, batch_size, lr, dataset:torch.utils.data.Dataset, epochs_done = 0):
        self.train()

        optimizer = torch.optim.AdamW(self.parameters(), lr=lr)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
        completed = epochs_done
        try:
            for epoch in range(epochs_done, epochs):
                print(f"Epoch {epoch}")
                pbar = tqdm(dataloader)
                for batch in pbar:
                    # print("here1")
                    optimizer.zero_grad()
                    encoded_seqs, attention_masks, labels = batch
                    encoded_seqs = encoded_seqs.to(device)
                    attention_masks = attention_masks.to(device)
                    labels = labels.to(device)
                    # print("here2")
                    output = self.forward(encoded_seqs, attention_masks)
                    loss = self.loss_fn(output, labels)
                    # print("here3")
                    loss.backward()
                    optimizer.step()
                    # print("here4")
                    pbar.set_description(f"Loss: {loss.item()}")
                print(f"Epoch {epoch+1} completed. Training Loss: {loss.item()}")
                completed += 1
            self.save(f"{save_folder}/bertbasecased_{completed}_{time.strftime('%Y-%m-%d_%H:%M:%S')}.pt")
            
        except KeyboardInterrupt:
            print("Training interrupted.")
            #save the model by date and time of interruption
            self.save(f"{interrupt_save_folder}/bertbasecased_interrupt_{epoch}_{time.strftime('%Y-%m-%d_%H:%M:%S')}.pt")


In [8]:
#Prepare database
concat_text = []
for i in range(data_df.shape[0]):
    concat_text.append(f"Title : {data_df.iloc[i].Title}; Abstract : {data_df.iloc[i].abstractText}")
data_df['text'] = pd.Series(concat_text)


In [9]:
print("Number of sequences:", len(data_df))
texts = list(data_df['text'].values)
target = torch.tensor(data_df[labels].values).float()

Number of sequences: 50000


In [10]:
# cls_model = BERT_Base_Multilabel(num_labels, (texts, data_df[labels].values))
# short = texts[1]
# encoded = tokenizer(short,add_special_tokens=True ,return_tensors='pt', padding='max_length', truncation=True)
# out = bert_model(encoded['input_ids'].to(device), encoded['attention_mask'].to(device))
# print(out[1].shape)

# short_dataset = Dataset(texts[:10], target[:10], tokenizer)


# Trying Variations

## Dataset Prep

In [21]:
def construct_dataset(texts, target, tokenizer):
    """
    Input: texts: list of strings, target: tensor of shape (num_samples, num_labels)
    Output: full_dataset, train_dataset, val_dataset, test_dataset
    """
    full_dataset = Dataset(texts, target, tokenizer)
    train_size = int(0.8 * len(full_dataset))
    val_size = int(0.1 * len(full_dataset))
    test_size = len(full_dataset) - train_size - val_size
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size, test_size])
    train_dataset.tokenizer = tokenizer
    val_dataset.tokenizer = tokenizer
    test_dataset.tokenizer = tokenizer
    return full_dataset, train_dataset, val_dataset, test_dataset

In [22]:
#evaluate the model
def evaluate(model, dataset):
    model.eval()
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)
    with torch.no_grad():
        accuracies = []
        pb = tqdm(dataloader, desc="Evaluating...")
        for batch in pb:
            encoded_seqs, attention_masks, labels = batch
            encoded_seqs = encoded_seqs.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)
            output = model(encoded_seqs, attention_masks)
            #calculate accuracy
            # print(output.round(), labels)
            accuracy = (output.round() == labels).sum().item() / labels.numel()
            accuracies.append(accuracy)
        mean_accuracy = np.mean(accuracies)
        print(f"Mean accuracy: {mean_accuracy}")
    return mean_accuracy


## BERT Base Uncased [BERT Freezed]

In [46]:
cls_model = BERT_Base_Multilabel(num_labels).to(device)
full_dataset, train_dataset, val_dataset, test_dataset = construct_dataset(texts, target, tokenizer)


Initializing BERT_Base_Multilabel...
Initialized.


In [47]:
#freeze all BERT params
for param in cls_model.bert.parameters():
    param.requires_grad = False

In [48]:
#continue training
cls_model.fit(1, 32, 1e-4, train_dataset)

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Epoch 0


  0%|          | 0/1250 [00:00<?, ?it/s]


TypeError: 'NoneType' object is not callable

In [None]:
# evaluate(cls_model, test_dataset)

## BERT Base Cased - [Bert Freezed]

In [23]:
# Trying BERT_cased for 1 epoch

cls_bertcased = BERT_BaseCased_Multilabel(num_labels).to(device)
full_dataset, train_dataset, val_dataset, test_dataset = construct_dataset(texts, target, cls_bertcased.tokenizer)
print(cls_bertcased)
#freeze all BERT params
for params in cls_bertcased.bert.parameters():
    params.requires_grad = False

Initializing BERT_BaseCased_Multilabel...
Initialized.
BERT_BaseCased_Multilabel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
    

In [None]:
cls_bertcased.fit(1, 32, 1e-4, train_dataset)

Epoch 0


Loss: 0.4240747392177582:  96%|█████████▋| 1205/1250 [17:04<00:38,  1.17it/s] 

In [25]:
test_accuracy = evaluate(cls_bertcased, test_dataset)

Evaluating...: 100%|██████████| 5000/5000 [02:25<00:00, 34.47it/s]

Mean accuracy: 0.8383





## Uncased BERTBase [Last Encoder Layer Unfrozen]

In [34]:
base_unfreeze1 =  BERT_BaseCased_Multilabel(num_labels).to(device)
full_dataset, train_dataset, val_dataset, test_dataset = construct_dataset(texts, target, base_unfreeze1.tokenizer)

cnt = 0
for param in base_unfreeze1.bert.parameters():
    param.requires_grad = False
    
for layer in base_unfreeze1.bert.encoder.layer:
    cnt += 1
    if cnt >= 12:
        for param in layer.parameters():
            param.required_grad = True

print("Last 1 Encoder Layer Unfreezed")


Initializing BERT_BaseCased_Multilabel...
Initialized.
Last 1 Encoder Layer Unfreezed


In [35]:
base_unfreeze1.fit(1, 32, 1e-4, train_dataset)

Epoch 0


Loss: 0.4362364709377289: 100%|██████████| 1250/1250 [17:41<00:00,  1.18it/s] 


Epoch 1 completed. Training Loss: 0.4362364709377289


In [36]:
test_accuracy_base_unfreeze1 = evaluate(base_unfreeze1, test_dataset)

Evaluating...: 100%|██████████| 5000/5000 [02:26<00:00, 34.04it/s]

Mean accuracy: 0.8380285714285713





In [37]:
# Lower the learning rate and train for some more time
base_unfreeze1.load("/kaggle/working/bertbasecased_1_2024-04-14_15:14:06.pt")

In [38]:
base_unfreeze1.fit(2, 32, 2e-5, train_dataset, epochs_done=1)

Epoch 1


Loss: 0.3768368065357208: 100%|██████████| 1250/1250 [17:44<00:00,  1.17it/s] 


Epoch 2 completed. Training Loss: 0.3768368065357208


In [39]:
accuracy_3 = evaluate(base_unfreeze1, test_dataset)

Evaluating...: 100%|██████████| 5000/5000 [02:25<00:00, 34.43it/s]

Mean accuracy: 0.844





## Unfreezing Embedding Layer

In [44]:

uncased_uf_emb_enc = BERT_Base_Multilabel(num_labels).to(device)
full_dataset, train_dataset, val_dataset, test_dataset = construct_dataset(texts, target, uncased_uf_emb_enc.tokenizer)

cnt = 0
for param in uncased_uf_emb_enc.bert.parameters():
    param.requires_grad = False
    
for layer in uncased_uf_emb_enc.bert.encoder.layer:
    cnt += 1
    if cnt >= 12:
        for param in layer.parameters():
            param.required_grad = True

print("Last 1 Encoder Layer Unfreezed")
for param in uncased_uf_emb_enc.bert.embeddings.parameters():
    param.requires_grad = True

print("Embedding Layer Unfreezed")

Initializing BERT_Base_Multilabel...
Initialized.
Last 1 Encoder Layer Unfreezed
Embedding Layer Unfreezed


In [45]:
uncased_uf_emb_enc.fit(1,16,1e-4, train_dataset)

Epoch 0


Loss: 0.27036356925964355: 100%|██████████| 2500/2500 [31:48<00:00,  1.31it/s]


Epoch 1 completed. Training Loss: 0.27036356925964355


In [46]:
evaluate(uncased_uf_emb_enc, test_dataset)

Evaluating...: 100%|██████████| 5000/5000 [02:33<00:00, 32.51it/s]

Mean accuracy: 0.8775142857142859





0.8775142857142859

In [48]:
# unfreeze one more layer and do 2 more epochs
uncased_uf_emb_enc.load("/kaggle/working/bertbaseuncased_1_2024-04-14_17:04:49.pt")
cnt = 0
for param in uncased_uf_emb_enc.bert.parameters():
    param.requires_grad = False
    
for layer in uncased_uf_emb_enc.bert.encoder.layer:
    cnt += 1
    if cnt >= 11:
        for param in layer.parameters():
            param.required_grad = True

print("Last 2 Encoder Layer Unfreezed")
for param in uncased_uf_emb_enc.bert.embeddings.parameters():
    param.requires_grad = True

print("Embedding Layer Unfreezed")

Last 2 Encoder Layer Unfreezed
Embedding Layer Unfreezed


In [None]:
uncased_uf_emb_enc.fit(3,16,4e-5, train_dataset, epochs_done=1)

Epoch 1


Loss: 0.2806262969970703:   6%|▌         | 154/2500 [01:57<29:33,  1.32it/s] 

In [None]:
evaluate(uncased_uf_emb_enc, test_dataset)

## Adding Domain Specific Tokens to the vocabulary and fine-tuning