In [1]:
!pip install transformers==2.6.0

Collecting transformers==2.6.0
  Downloading transformers-2.6.0-py3-none-any.whl (540 kB)
[K     |████████████████████████████████| 540 kB 3.4 MB/s eta 0:00:01
Collecting tokenizers==0.5.2
  Downloading tokenizers-0.5.2-cp37-cp37m-manylinux1_x86_64.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 49.1 MB/s eta 0:00:01
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.7.0
    Uninstalling tokenizers-0.7.0:
      Successfully uninstalled tokenizers-0.7.0
  Attempting uninstall: transformers
    Found existing installation: transformers 2.9.0
    Uninstalling transformers-2.9.0:
      Successfully uninstalled transformers-2.9.0
Successfully installed tokenizers-0.5.2 transformers-2.6.0


In [2]:
import transformers
print(transformers.__version__)

2.6.0


In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [None]:
# !pip install transformers

In [4]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,WeightedRandomSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [None]:
# import os
# os.listdir('/kaggle/input')

In [5]:
# Load the token-per-row NER corpus and forward-fill missing cells so that each
# row inherits the last seen value of its column.
# NOTE(review): presumably only the first token of a sentence carries the
# "Sentence #" value in the raw CSV -- confirm against the file.
# `DataFrame.ffill()` replaces `fillna(method="ffill")`, which is deprecated in
# recent pandas releases; the result is the same.
data = pd.read_csv("/kaggle/input/ner-datasetcsv/ner_dataset.csv", encoding= 'unicode_escape')
data = data.ffill()


In [6]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [7]:
data['Tag'].nunique()

17

In [8]:
class SentenceGetter(object):
    """Group a token-per-row frame into sentences.

    The frame must provide the columns "Sentence #", "Word", "POS" and "Tag".
    Rows sharing the same "Sentence #" value are collected into one list of
    ``(word, pos, tag)`` tuples.

    Attributes:
        n_sent: 1-based number of the sentence `get_next` will return next.
        data: the frame passed to the constructor.
        grouped: Series indexed by "Sentence #" holding one tuple list each.
        sentences: the tuple lists in the iteration order of `grouped`
            (groupby sorts its keys, so this is the sorted order of the
            "Sentence #" strings, not necessarily numeric order).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) triple per row of the sentence group.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or None once the sentences are exhausted.

        Sentences are looked up by the key ``"Sentence: <n_sent>"``; the
        counter only advances when the lookup succeeds.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and must not be swallowed.
            return None
        self.n_sent += 1
        return s

In [9]:
getter = SentenceGetter(data)

In [10]:
sent = getter.get_next()

In [11]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [12]:
# Each sentence becomes one space-separated string built from the word field
# (position 0) of its (word, pos, tag) triples.
sentences = [" ".join(triple[0] for triple in triples) for triples in getter.sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [13]:
# Per-sentence tag lists: the tag is the third field of every (word, pos, tag) triple.
labels = [[triple[2] for triple in triples] for triples in getter.sentences]
print(labels[0])


['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [14]:
# Tag vocabulary in first-appearance order. `data["Tag"].unique()` is
# deterministic, whereas `set(...)` ordering is arbitrary, and it is the same
# ordering the notebook uses to build `dict_tag_to_id` / `dict_id_to_tag`.
# This keeps `tag_values[i]` in agreement with the ids the labels are encoded with.
tag_values = list(data["Tag"].unique())
# Extra synthetic entry kept at the end so indices of real tags are unaffected.
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

+ Apply Bert
+ Prepare the sentences and labels
+ Before we can start fine-tuning the model, we have to prepare the data set for the use with pytorch and BERT.

In [15]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [16]:
torch.__version__

'1.5.0'

Here we fix some configurations. We limit our sequence length to 150 tokens (`MAX_LEN`) and use a batch size of 32, as suggested by the BERT paper. Note that BERT supports sequences of up to 512 tokens.


In [17]:
# Length the label sequences are padded/truncated to (see the `pad_sequences` call).
MAX_LEN = 150
# Mini-batch size handed to the training and validation DataLoaders.
bs = 32

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

device

device(type='cuda')

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




Now we tokenize all sentences. Since the BERT tokenizer is based on a WordPiece tokenizer, it will split tokens into subword tokens. For example ‘gunships’ will be split into the two tokens ‘guns’ and ‘##hips’. We have to deal with the issue of splitting our token-level labels across the related subtokens. In practice you would solve this with a specialized data structure based on label spans, but for simplicity I do it explicitly here.



In [20]:
# Scan the corpus once to find the longest encoded sentence
# ([CLS] and [SEP] are included in the count).
max_len = 0

for sent in tqdm(sentences):
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    encoded_length = len(input_ids)
    if encoded_length > max_len:
        max_len = encoded_length

print('Max sentence length: ', max_len)

100%|██████████| 47959/47959 [00:43<00:00, 1102.22it/s]

Max sentence length:  154





#max_len
The tokenizer.encode_plus function combines multiple steps for us:

+ Split the sentence into tokens.
+ Add the special [CLS] and [SEP] tokens.
+ Map the tokens to their IDs.
+ Pad or truncate all sentences to the same length.
+ Create the attention masks which explicitly differentiate real tokens from [PAD] tokens.

In [None]:
# sentences[1]

In [21]:
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in tqdm(sentences):
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]' (stated explicitly).
                        max_length = MAX_LEN,      # Same length the label sequences are padded to.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors (one row per sentence).
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

100%|██████████| 47959/47959 [00:52<00:00, 921.22it/s] 


Original:  Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Token IDs: tensor([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
         1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
         3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,   

In [22]:
attention_masks[0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])

In [23]:
tokenizer.convert_ids_to_tokens(101)


'[CLS]'

In [24]:
tokenizer.convert_ids_to_tokens(102)

'[SEP]'

In [25]:
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [26]:
input_ids[0]

tensor([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,  6186,
         1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,  1997,  2329,
         3629,  2013,  2008,  2406,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [27]:
#Modify labels accordingly
def reg_encoding_generic(sentences,labels):
    """Expand word-level labels to WordPiece level.

    Each sentence is split on whitespace and every word is tokenized with the
    notebook-level `tokenizer`. The first sub-token of a word keeps the word's
    label; every further sub-token, as well as the leading [CLS] and trailing
    [SEP] positions, receives the placeholder 'pad_token_label_id'. Words that
    produce no sub-tokens are skipped.

    Args:
        sentences: iterable of whitespace-separated sentence strings.
        labels: iterable of per-word label lists, parallel to `sentences`.

    Returns:
        A list with one label list per sentence.

    Note:
        Words and labels are paired with `zip`, so if a sentence splits into a
        different number of words than it has labels, the surplus is dropped
        silently.
    """
    pad_token_label_id='pad_token_label_id'
    labels_all=[]
    # Give tqdm a total when the input is sized, so it can show real progress.
    total = len(sentences) if hasattr(sentences, "__len__") else None
    for (sentence,word_labels) in tqdm(zip(sentences,labels), total=total):
        # Position 0 belongs to [CLS].
        labels_per_sentence=[pad_token_label_id]
        for word, word_label in zip(sentence.split(), word_labels):
            n_pieces = len(tokenizer.tokenize(word))
            if n_pieces > 0:
                labels_per_sentence.append(word_label)
                labels_per_sentence.extend([pad_token_label_id] * (n_pieces - 1))
        # Final position belongs to [SEP].
        labels_per_sentence.append(pad_token_label_id)
        labels_all.append(labels_per_sentence)
    return labels_all

In [None]:
#sentences[0]
#labels[0]

In [28]:
labels_new = reg_encoding_generic(sentences,labels) 

47959it [01:23, 572.59it/s]


In [55]:
dict_tag_to_id = {}
dict_tag_to_id['pad_token_label_id'] = -100


In [56]:
dict_id_to_tag={}
dict_id_to_tag[-100]='pad_token_label_id'


In [57]:
# Number the tags in order of first appearance and record the mapping in both
# directions (the placeholder entry added earlier stays untouched).
list_tags = list(data['Tag'].unique())
dict_tag_to_id.update({tag: i for i, tag in enumerate(list_tags)})
dict_id_to_tag.update({i: tag for i, tag in enumerate(list_tags)})
    

In [58]:
dict_tag_to_id,dict_id_to_tag

({'pad_token_label_id': -100,
  'O': 0,
  'B-geo': 1,
  'B-gpe': 2,
  'B-per': 3,
  'I-geo': 4,
  'B-org': 5,
  'I-org': 6,
  'B-tim': 7,
  'B-art': 8,
  'I-art': 9,
  'I-per': 10,
  'I-gpe': 11,
  'I-tim': 12,
  'B-nat': 13,
  'B-eve': 14,
  'I-eve': 15,
  'I-nat': 16},
 {-100: 'pad_token_label_id',
  0: 'O',
  1: 'B-geo',
  2: 'B-gpe',
  3: 'B-per',
  4: 'I-geo',
  5: 'B-org',
  6: 'I-org',
  7: 'B-tim',
  8: 'B-art',
  9: 'I-art',
  10: 'I-per',
  11: 'I-gpe',
  12: 'I-tim',
  13: 'B-nat',
  14: 'B-eve',
  15: 'I-eve',
  16: 'I-nat'})

In [31]:
# Convert every label string to its integer id, then pad (at the end) or
# truncate (at the end) each sequence to MAX_LEN. Padding positions get the id
# of "pad_token_label_id".
tags = pad_sequences([[dict_tag_to_id[l] for l in lab] for lab in labels_new],
                     maxlen= MAX_LEN, value= dict_tag_to_id["pad_token_label_id"], padding="post",
                     dtype="long", truncating="post")

In [32]:
'''
geo = Geographical Entity,org = Organization,per = Person,gpe = Geopolitical Entity,tim = Time indicator,art = Artifact,eve = Event
nat = Natural Phenomenon
'''
dict_tag_to_id

{'pad_token_label_id': -100,
 'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [33]:
tags[0]

array([-100,    0,    0,    0,    0,    0,    0,    1,    0,    0,    0,
          0,    0,    1,    0,    0,    0,    0,    0,    2,    0,    0,
          0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100])

In [34]:
len(tags[0])

150

In [35]:
len(input_ids[0])

150

In [39]:
# Create the train and validation sets.
# All three arrays are split in ONE call so inputs, masks and tags are
# guaranteed to be partitioned with the same indices (previously this relied
# on two separate calls sharing a random_state).
tr_inputs, val_inputs, tr_masks, val_masks, tr_tags, val_tags = train_test_split(
    input_ids, attention_masks, tags,
    random_state=2020, test_size=0.1)

Since we’re operating in pytorch, we have to convert the dataset to torch tensors.


In [40]:
# tr_inputs = torch.tensor(tr_inputs)
# val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
# tr_masks = torch.tensor(tr_masks)
# val_masks = torch.tensor(val_masks)

+ The last step is to define the dataloaders. 
+ We shuffle the data at training time with the RandomSampler and at test time we just pass them sequentially with the SequentialSampler.


In [41]:
# Training set: each item is an (input ids, attention mask, tags) triple;
# batches of size `bs` are drawn in random order.
train_data =  TensorDataset(tr_inputs, tr_masks, tr_tags)  # bundle the three aligned tensors
train_sampler = RandomSampler(train_data)   # shuffles the training examples
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs) # yields batches of `bs` items

# Validation set: same structure, iterated in a fixed sequential order.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

+ The transformer package provides a BertForTokenClassification class for token-level predictions. 
+ BertForTokenClassification is a fine-tuning model that wraps BertModel and adds token-level classifier on top of the BertModel. 
+ The token-level classifier is a linear layer that takes as input the last hidden state of the sequence. 
+ We load the pre-trained bert-base-uncased model and provide the number of possible labels.

In [42]:
!pip install transformers==2.6.0



In [45]:

import transformers
from transformers import BertForTokenClassification, AdamW

print(transformers.__version__)

2.6.0


In [44]:
# Define the model.
# The classifier needs one output per real tag: labels are encoded as
# 0 .. len(list_tags) - 1 via `dict_tag_to_id`, and padded / sub-token
# positions use -100, which is not a class. `len(tag2idx)` also counted the
# synthetic "PAD" entry and therefore allocated one unused output.
model = BertForTokenClassification.from_pretrained(
            "bert-base-uncased",           # pre-trained checkpoint to start from
            num_labels = len(list_tags),   # number of distinct NER tags
            output_attentions = False,     # do not return attention weights
            output_hidden_states = False   # do not return all hidden states
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
# for param in model.bert.parameters():
#     param.requires_grad = False
# len(list(model.bert.parameters())[197])

Now we have to pass the model parameters to the GPU.

In [82]:
# Move the model to the device selected earlier (falls back to CPU when no GPU
# is available, unlike an unconditional `model.cuda()`).
model.to(device)

# True  -> fine-tune every weight of BERT plus the classifier.
# False -> freeze the BERT encoder and train only the classification head.
FULL_FINETUNING = False

if FULL_FINETUNING:
    # Undo any freezing left over from a previous run of this cell.
    for param in model.parameters():
        param.requires_grad = True

    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm weights are conventionally excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # AdamW reads the per-group key 'weight_decay'; the previous key
        # 'weight_decay_rate' was ignored, leaving decay at its default of 0.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Freeze the encoder so no gradients are computed for it.
    for param in model.bert.parameters():
        param.requires_grad = False

    # Only the classification head is handed to the optimizer.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]


optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

We also add a scheduler to linearly reduce the learning rate throughout the epochs.

In [47]:
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0



+ First we define some metrics, we want to track while training. 
+ We use the f1_score from the seqeval package. 
+ You can find more details here. 
+ And we use simple accuracy on a token level comparable to the accuracy in keras.

In [48]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-0.0.12.tar.gz (21 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-0.0.12-py3-none-any.whl size=7423 sha256=868bb49ac77cb98b9dedeab2ff2c412602a47aa073f18138d013250acf4f8d2e
  Stored in directory: /root/.cache/pip/wheels/dc/cc/62/a3b81f92d35a80e39eb9b2a9d8b31abac54c02b21b2d466edc
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [49]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    """Token-level accuracy that ignores positions labelled -100.

    Args:
        preds: array of per-token class scores; the class axis is axis 2.
        labels: integer array of gold label ids with the same leading shape;
            the value -100 marks positions to ignore (padding, special
            tokens, sub-word continuations).

    Returns:
        Fraction of non-ignored positions whose arg-max prediction equals the
        label, or 0.0 when every position is ignored.
    """
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()

    # Keep only real tokens; -100 is the ignore label used for padding.
    valid = labels_flat != -100
    n_valid = np.sum(valid)
    if n_valid == 0:
        return 0.0

    return np.sum(pred_flat[valid] == labels_flat[valid]) / n_valid

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

# Linearly decay the learning rate to zero over all optimisation steps
# (no warm-up phase).
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Label id that marks positions excluded from the loss and the metrics.
pad_label_id = dict_tag_to_id["pad_token_label_id"]

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # The first output is the loss because `labels` is provided.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # advance the learning-rate schedule by one step
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass. Labels are provided, so the outputs are
            # (loss, logits, ...).
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate loss / accuracy and collect predictions for this batch.
        eval_loss += outputs[0].mean().item()
        eval_accuracy += flat_accuracy(logits, label_ids)
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
#     print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

    # Decode ids with the same mapping that encoded the labels
    # (`dict_id_to_tag`), one tag list per sentence, skipping ignored
    # positions. A predicted id without a tag (possible when the classifier
    # has more outputs than there are tags) is treated as "O".
    pred_tags = [[dict_id_to_tag.get(int(p_i), "O")
                  for p_i, l_i in zip(p, l) if l_i != pad_label_id]
                 for p, l in zip(predictions, true_labels)]
    valid_tags = [[dict_id_to_tag[int(l_i)] for l_i in l if l_i != pad_label_id]
                  for l in true_labels]
    # seqeval expects (y_true, y_pred).
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
    print()


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#Class --------

# class BertForTokenClassification(BertPreTrainedModel):
    
    
#     def __init__(self, config):
#         super().__init__(config)      ##### ----> using the config, pre_trained model config intialization 
#         self.num_labels = config.num_labels ### no of labels unique
#         self.bert = BertModel(config) ##### loading the bert model
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
#         self.init_weights()
    
    
#     def forward(
#         self,
#         input_ids=None,
#         attention_mask=None,
#         token_type_ids=None,
#         position_ids=None,
#         head_mask=None,
#         inputs_embeds=None,
#         labels=None,
#     ):
#         outputs = self.bert(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids,
#             position_ids=position_ids,
#             head_mask=head_mask,
#             inputs_embeds=inputs_embeds,)
        
#         sequence_output = outputs[0]
#         sequence_output = self.dropout(sequence_output)
#         logits = self.classifier(sequence_output)
#         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        
#         if labels is not None:
#             loss_fct = CrossEntropyLoss()
#             # Only keep active parts of the loss
#             if attention_mask is not None:
#                 active_loss = attention_mask.view(-1) == 1
#                 active_logits = logits.view(-1, self.num_labels)
#                 active_labels = torch.where(
#                     active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
#                 )
#                 loss = loss_fct(active_logits, active_labels)
#             else:
#                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
#             outputs = (loss,) + outputs
#         return outputs  # (loss), scores, (hidden_states), (attentions)