Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import csv
import torch.nn.functional as F

Setting up device

In [2]:
from torch import cuda

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Loading data

In [3]:
# Load the training CSV and discard every row that contains a missing value.
# The index is rebuilt right after dropna(): later cells combine this frame
# with RangeIndex-based frames (column assignment, pd.concat(axis=1)), and
# pandas aligns those operations on index labels. Gaps left by the dropped
# rows would otherwise shift or blank out rows silently.
df = pd.read_csv("/kaggle/input/train-data/train_dataset.csv")
df = df.dropna().reset_index(drop=True)
print(df.head())

                                               Title  \
0       [Recommendations in relation to shift work].   
1  Orbital venous approach to the cavernous sinus...   
2  Effects of cadmium on photosynthetic oxygen ev...   
3  Captopril in congestive heart failure: improve...   
4  Glycerol-induced development of catalytically ...   

                                        abstractText  \
0  The purpose of the recommendations is to decre...   
1  Carotid-cavernous fistulas are abnormal commun...   
2  Scanning electrochemical microscopy (SECM) was...   
3  The oral angiotensin-converting enzyme inhibit...   
4  The reconstitutable apoprotein of Crotalus ada...   

                                           meshMajor      pmid  \
0              ['Humans', 'Work Schedule Tolerance']   1417504   
1  ['Aged', 'Arteriovenous Fistula', 'Carotid Art...   7810953   
2  ['Cadmium', 'Mustard Plant', 'Oxygen', 'Photos...  19360968   
3  ['Angiotensin-Converting Enzyme Inhibitors', '...   6291359

Combining the Title and abstractText columns into a single text field, and collecting each row's ground-truth labels into a list

In [4]:
# Build the modelling frame with one row per row of `df`:
#   'list'     - the values of the columns at positions 6..19 of `df`,
#                collected into one Python list per row (the label vector);
#   'Combined' - Title and abstractText joined with a single space.
# Both columns are materialised as plain lists so they are paired strictly by
# position. Assigning the concatenated Series directly would align it on
# `df`'s index labels, which desynchronises text and labels whenever that
# index is not 0..n-1 (e.g. after dropna()).
train_dataset = pd.DataFrame({
    'list': df[df.columns[6:20]].values.tolist(),
    'Combined': df['Title'].str.cat(df['abstractText'], sep=' ').tolist(),
})

print(train_dataset.head())
print(train_dataset.shape)

                                         list  \
0  [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]   
1  [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]   
2  [1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]   
3  [1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]   
4  [0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]   

                                            Combined  
0  [Recommendations in relation to shift work]. T...  
1  Orbital venous approach to the cavernous sinus...  
2  Effects of cadmium on photosynthetic oxygen ev...  
3  Captopril in congestive heart failure: improve...  
4  Glycerol-induced development of catalytically ...  
(34999, 2)


In [5]:
# Independent copy of the first six columns of `df`; this frame is later
# written out next to the model's predictions.
leading_columns = df.columns[:6]
train_data = df.loc[:, leading_columns].copy()
del leading_columns  # keep the notebook namespace unchanged

print(train_data.head())
print(train_data.shape)

                                               Title  \
0       [Recommendations in relation to shift work].   
1  Orbital venous approach to the cavernous sinus...   
2  Effects of cadmium on photosynthetic oxygen ev...   
3  Captopril in congestive heart failure: improve...   
4  Glycerol-induced development of catalytically ...   

                                        abstractText  \
0  The purpose of the recommendations is to decre...   
1  Carotid-cavernous fistulas are abnormal commun...   
2  Scanning electrochemical microscopy (SECM) was...   
3  The oral angiotensin-converting enzyme inhibit...   
4  The reconstitutable apoprotein of Crotalus ada...   

                                           meshMajor      pmid  \
0              ['Humans', 'Work Schedule Tolerance']   1417504   
1  ['Aged', 'Arteriovenous Fistula', 'Carotid Art...   7810953   
2  ['Cadmium', 'Mustard Plant', 'Oxygen', 'Photos...  19360968   
3  ['Angiotensin-Converting Enzyme Inhibitors', '...   6291359

Setting up the parameters and importing tokenizer

In [6]:
# Token budget per example: handed to CustomDataset, which forwards it to the
# tokenizer as `max_length`.
MAX_LEN = 512
# Number of examples per batch produced by the DataLoader.
TRAIN_BATCH_SIZE = 1
# Tokenizer for the 'bert-base-uncased' checkpoint (the same checkpoint name
# BERTClass loads its encoder from).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Defining dataset class

In [7]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text row and pairs it with its labels.

    Args:
        dataframe: Frame with a ``Combined`` column (the text) and a ``list``
            column (one sequence of label values per row).
        tokenizer: Callable tokenizer (e.g. a Hugging Face ``BertTokenizer``)
            whose result exposes ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_len: Length every encoded example is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe['Combined']
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded text and label tensor for the row at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Samplers hand out positions, not index labels, so read rows with
        # .iloc; plain [] lookup fails or picks the wrong row when the frame's
        # index is not 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(text.split())

        # Call the tokenizer directly (encode_plus is deprecated) and request
        # padding/truncation explicitly: `pad_to_max_length` is deprecated and
        # leaving `truncation` unset only works through a warning fallback.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

Creating datasets

In [8]:
# Wrap the prepared frame; rows are tokenized lazily, one per __getitem__ call.
training_set = CustomDataset(
    dataframe=train_dataset,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

Setting up parameters and making dataloaders

In [9]:
# Loader settings for the evaluation pass.
#   shuffle=False   - predictions must come out in row order, because they are
#                     later written next to the source rows.
#   drop_last=False - every sample has to be scored. With drop_last=True the
#                     final partial batch is discarded as soon as the batch
#                     size exceeds 1, which skews the metrics and leaves the
#                     prediction frame shorter than the source data.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0,
                'drop_last': False
                }

training_loader = DataLoader(training_set, **train_params)

Defining the BERT neural network class

In [10]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a 14-output linear layer.

    Args:
        fine_tune_layers: Number of encoder layers, counted from the top, whose
            parameters stay trainable. Every other parameter of the encoder
            module is frozen. Values <= 0 leave the whole encoder frozen.
    """

    def __init__(self, fine_tune_layers=1):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 14)

        # Begin with the entire encoder frozen ...
        self.l1.requires_grad_(False)

        # ... then re-enable gradients for the last `fine_tune_layers` blocks.
        if fine_tune_layers > 0:
            for encoder_block in self.l1.encoder.layer[-fine_tune_layers:]:
                encoder_block.requires_grad_(True)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) outputs of the linear head for a batch."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output in the Hugging Face API) feeds the head.
        _sequence_output, pooled = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.l3(self.l2(pooled))

Loading the model

In [11]:
# Build the network with its default `fine_tune_layers` setting.
model = BERTClass()
# Read the saved state dict, remapping tensors onto the selected device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
model_state = torch.load("/kaggle/input/bertmodel/bert_model.pth", map_location=torch.device(device))
model.load_state_dict(model_state)
model.to(device)  # Move the model to the appropriate device

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

Defining loss function as Binary Cross Entropy with Logits Loss 

In [12]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``."""
    bce_with_logits = torch.nn.functional.binary_cross_entropy_with_logits
    return bce_with_logits(outputs, targets)

Defining evaluation function

In [13]:
def evaluate(data_loader, train_data, output_path='train_inference.csv'):
    """Score every batch of ``data_loader``, print metrics and save the predictions.

    Relies on the module-level ``model``, ``device`` and ``loss_fn``.

    Args:
        data_loader: Iterable of batches with the keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``. It must yield samples in the
            same order as the rows of ``train_data`` (i.e. not shuffled).
        train_data: DataFrame of source columns written next to the predictions.
        output_path: CSV file that receives the source columns plus one 0/1
            column per label. Defaults to ``'train_inference.csv'``.

    Returns:
        Tuple ``(total_loss, accuracy, precision, precision_micro,
        precision_macro, recall, recall_micro, recall_macro, f1_score,
        f1_score_micro, f1_score_macro)``. ``total_loss`` is the mean per-batch
        loss; the un-suffixed precision/recall/f1 entries are per-label arrays.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []

    total_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            if i % 1000 == 0:
                print("Data No.:", i)
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            total_loss += loss_fn(outputs, targets).item()
            # Under no_grad nothing is attached to a graph, so no detach() is needed.
            fin_targets.extend(targets.cpu().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
    total_loss = total_loss / len(data_loader)

    # A label counts as predicted when its sigmoid probability reaches 0.5.
    fin_outputs = np.array(fin_outputs) >= 0.5

    accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
    print('accuracy:', accuracy)

    target_names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']
    print(metrics.classification_report(fin_targets, fin_outputs, target_names=target_names))

    precision = metrics.precision_score(fin_targets, fin_outputs, average=None)
    precision_micro = metrics.precision_score(fin_targets, fin_outputs, average='micro')
    precision_macro = metrics.precision_score(fin_targets, fin_outputs, average='macro')

    recall = metrics.recall_score(fin_targets, fin_outputs, average=None)
    recall_micro = metrics.recall_score(fin_targets, fin_outputs, average='micro')
    recall_macro = metrics.recall_score(fin_targets, fin_outputs, average='macro')

    f1_score = metrics.f1_score(fin_targets, fin_outputs, average=None)
    f1_score_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro')
    f1_score_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro')

    new_df = pd.DataFrame(fin_outputs.astype(int), columns=target_names)

    # pd.concat(axis=1) aligns on index labels. Pair the rows by position
    # instead: keep the leading rows that were actually scored and give them
    # the same 0..n-1 index as the prediction frame, so a filtered source
    # index or a shorter prediction set cannot attach predictions to the
    # wrong rows or introduce NaN rows.
    source_rows = train_data.iloc[:len(new_df)].reset_index(drop=True)
    inference = pd.concat([source_rows, new_df], axis=1)

    inference.to_csv(output_path, index=False, header=True)

    return total_loss, accuracy, precision, precision_micro, precision_macro, recall, recall_micro, recall_macro, f1_score, f1_score_micro, f1_score_macro

Evaluation loop

In [14]:
# Run the full evaluation pass and unpack its eleven results, in the order
# evaluate() returns them.
(
    train_loss,
    accuracy,
    precision,
    precision_micro,
    precision_macro,
    recall,
    recall_micro,
    recall_macro,
    f1_score,
    f1_score_micro,
    f1_score_macro,
) = evaluate(training_loader, train_data)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Data No.: 0
Data No.: 1000
Data No.: 2000
Data No.: 3000
Data No.: 4000
Data No.: 5000
Data No.: 6000
Data No.: 7000
Data No.: 8000
Data No.: 9000
Data No.: 10000
Data No.: 11000
Data No.: 12000
Data No.: 13000
Data No.: 14000
Data No.: 15000
Data No.: 16000
Data No.: 17000
Data No.: 18000
Data No.: 19000
Data No.: 20000
Data No.: 21000
Data No.: 22000
Data No.: 23000
Data No.: 24000
Data No.: 25000
Data No.: 26000
Data No.: 27000
Data No.: 28000
Data No.: 29000
Data No.: 30000
Data No.: 31000
Data No.: 32000
Data No.: 33000
Data No.: 34000
accuracy: 0.23589245407011628


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           A       0.86      0.82      0.84     16229
           B       0.98      0.99      0.98     32639
           C       0.91      0.92      0.92     18547
           D       0.94      0.94      0.94     21757
           E       0.84      0.94      0.89     27487
           F       0.89      0.76      0.82      6214
           G       0.87      0.87      0.87     23478
           H       0.67      0.15      0.24      4248
           I       0.75      0.63      0.69      3935
           J       0.80      0.57      0.66      3922
           L       0.83      0.47      0.60      5274
           M       0.90      0.94      0.92     15032
           N       0.83      0.83      0.83     16079
           Z       0.83      0.82      0.82      5666

   micro avg       0.89      0.87      0.88    200507
   macro avg       0.85      0.76      0.79    200507
weighted avg       0.88      0.87      0.87    200507
 samples avg       0.89   

Train Dataset Results:

              precision    recall  f1-score   support

           A       0.86      0.82      0.84     16229
           B       0.98      0.99      0.98     32639
           C       0.91      0.92      0.92     18547
           D       0.94      0.94      0.94     21757
           E       0.84      0.94      0.89     27487
           F       0.89      0.76      0.82      6214
           G       0.87      0.87      0.87     23478
           H       0.67      0.15      0.24      4248
           I       0.75      0.63      0.69      3935
           J       0.80      0.57      0.66      3922
           L       0.83      0.47      0.60      5274
           M       0.90      0.94      0.92     15032
           N       0.83      0.83      0.83     16079
           Z       0.83      0.82      0.82      5666

   micro avg:       precision: 0.89      recall: 0.87      f1-score: 0.88    support: 200507

   macro avg:       precision: 0.85      recall: 0.76      f1-score: 0.79    support: 200507

weighted avg:       precision: 0.88      recall: 0.87      f1-score: 0.87    support: 200507

 samples avg:       precision: 0.89      recall: 0.87      f1-score: 0.87    support: 200507

In [15]:
# One-row summary table: every metric is wrapped in a single-element list so
# the dict converts to a DataFrame with exactly one row.
train_results = {
    metric_name: [metric_value]
    for metric_name, metric_value in (
        ('train_loss', train_loss),
        ('accuracy', accuracy),
        ('precision', precision),
        ('precision_micro', precision_micro),
        ('precision_macro', precision_macro),
        ('recall', recall),
        ('recall_micro', recall_micro),
        ('recall_macro', recall_macro),
        ('f1_score', f1_score),
        ('f1_score_micro', f1_score_micro),
        ('f1_score_macro', f1_score_macro),
    )
}

In [16]:
# Write the one-row metrics summary to an Excel file.
# The frame gets its own name instead of rebinding `df`: `df` holds the loaded
# dataset that earlier cells slice by column position, so overwriting it makes
# those cells fail or misbehave when they are re-run in the same kernel.
train_results_df = pd.DataFrame(train_results)
file_path = '/kaggle/working/train_results_list.xlsx'
train_results_df.to_excel(file_path, index=False, header=True)