# Train an NER model

Contents:

0. Configure
1. Load & Process Data
2. Set up training & evaluation
3. Train
4. Push to Hugging Face model hub
5. Save to ONNX

### Imports and installs

In [1]:
import sys

sys.version

'3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]'

In [None]:
!conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import os, sys
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
import ast

from keras.preprocessing.sequence import pad_sequences
# from sklearn.model_selection import train_test_split
import random

torch.__version__

'2.1.0+cu121'

In [3]:
torch.cuda.is_available()

True

In [4]:
# Load model directly
import transformers
from transformers import AutoTokenizer

In [5]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__


'4.38.2'

### Configuration

In [6]:
MAX_LEN = 50
LR = 3e-5
bs = 750
epochs = 5
max_grad_norm = 1.0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

#path to save model
MODEL_PATH = "models/bert_subsampled_model_v8"

In [7]:
n_gpu

1

In [8]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
os.chdir( "/content/drive/MyDrive/datasnipper")

### Load and preproc

In [10]:
import pandas as pd
train = pd.read_csv('subsampled_train.csv')

val = pd.read_csv('valid_proc.csv',nrows=10000)
# holdout = pd.read_csv('restaurants_holdout.csv')

In [11]:
print(train.shape, val.shape)

(159672, 3) (10000, 3)


In [12]:
train["toks_exp"] = train["toks_exp"].apply(lambda x: ast.literal_eval(x))

train["ner_expanded"] = train["ner_expanded"].apply(lambda x: ast.literal_eval(x))

In [13]:
val["toks_exp"] = val["toks_exp"].apply(lambda x: ast.literal_eval(x))

val["ner_expanded"] = val["ner_expanded"].apply(lambda x: ast.literal_eval(x))

In [14]:
train["ner_expanded"].explode().value_counts()

O                                                 7545947
I-DebtInstrumentInterestRateStatedPercentage        28842
I-DebtInstrumentBasisSpreadOnVariableRate1          20898
I-LineOfCreditFacilityMaximumBorrowingCapacity      17488
I-DebtInstrumentFaceAmount                          15441
B-DebtInstrumentInterestRateStatedPercentage        15208
B-LineOfCreditFacilityMaximumBorrowingCapacity      11693
B-DebtInstrumentBasisSpreadOnVariableRate1          11445
B-DebtInstrumentFaceAmount                          10749
I-DebtInstrumentMaturityDate                         4320
B-DebtInstrumentMaturityDate                         1440
Name: ner_expanded, dtype: int64

In [15]:
val["ner_expanded"].explode().value_counts()

O                                                 465918
I-DebtInstrumentInterestRateStatedPercentage         324
I-LineOfCreditFacilityMaximumBorrowingCapacity       218
I-DebtInstrumentFaceAmount                           206
I-DebtInstrumentBasisSpreadOnVariableRate1           166
B-DebtInstrumentInterestRateStatedPercentage         162
B-DebtInstrumentFaceAmount                           155
B-LineOfCreditFacilityMaximumBorrowingCapacity       150
B-DebtInstrumentBasisSpreadOnVariableRate1            88
I-DebtInstrumentMaturityDate                          24
B-DebtInstrumentMaturityDate                           8
Name: ner_expanded, dtype: int64

In [16]:
# tokenizer = BertTokenizer.from_pretrained('dist', do_lower_case=True)

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
# model = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


#### Apply labels to subwords as well

In [17]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """
    Handles Bert's sub word encoding
    """
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [18]:
train_sents = train["toks_exp"].tolist()

train_labels = train["ner_expanded"].tolist()

val_sents = val.sample(10000, random_state=121)["toks_exp"].tolist()

val_labels = val.sample(10000, random_state=121)["ner_expanded"].tolist()


In [None]:
# train_tokenized_texts_and_labels = [
#     tokenize_and_preserve_labels(sent, labs)
#     for sent, labs in zip(train_sents, train_labels)
# ]

In [19]:
!ls

04_evaluation_v2.ipynb	test.csv			      valid_proc.csv
models			train_proc.csv			      val_tokenized_texts_and_labels.pkl
subsampled_train.csv	train_tokenized_texts_and_labels.pkl


In [20]:
import pickle

# with open('train_tokenized_texts_and_labels.pkl', 'wb') as f:
#     pickle.dump(train_tokenized_texts_and_labels, f)

# Load the previously pickled data
with open('train_tokenized_texts_and_labels.pkl', 'rb') as f:
    train_tokenized_texts_and_labels = pickle.load(f)







In [21]:
val_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(val_sents, val_labels)
]

In [26]:

with open('val_tokenized_texts_and_labels.pkl', 'wb') as f:
    pickle.dump(val_tokenized_texts_and_labels, f)







In [22]:
train_tokenized_texts = [token_label_pair[0] for token_label_pair in train_tokenized_texts_and_labels]

train_labels = [token_label_pair[1] for token_label_pair in train_tokenized_texts_and_labels]

In [23]:
val_tokenized_texts = [token_label_pair[0] for token_label_pair in val_tokenized_texts_and_labels]

val_labels = [token_label_pair[1] for token_label_pair in val_tokenized_texts_and_labels]


In [26]:
from keras.preprocessing.sequence import pad_sequences

In [27]:
# from torch.nn.utils.rnn import pad_sequence

In [28]:
input_ids_train = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in train_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

input_ids_val = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in val_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")


In [29]:
tag_values = sorted(['B-DebtInstrumentBasisSpreadOnVariableRate1',
 'B-DebtInstrumentFaceAmount',
 'I-DebtInstrumentFaceAmount',
 'B-DebtInstrumentInterestRateStatedPercentage',
 'B-DebtInstrumentMaturityDate',
 'I-DebtInstrumentMaturityDate',
 'B-LineOfCreditFacilityMaximumBorrowingCapacity',
 'I-DebtInstrumentInterestRateStatedPercentage',
 'I-LineOfCreditFacilityMaximumBorrowingCapacity',
 'I-DebtInstrumentBasisSpreadOnVariableRate1', 'O', 'PAD'])

tag_values

['B-DebtInstrumentBasisSpreadOnVariableRate1',
 'B-DebtInstrumentFaceAmount',
 'B-DebtInstrumentInterestRateStatedPercentage',
 'B-DebtInstrumentMaturityDate',
 'B-LineOfCreditFacilityMaximumBorrowingCapacity',
 'I-DebtInstrumentBasisSpreadOnVariableRate1',
 'I-DebtInstrumentFaceAmount',
 'I-DebtInstrumentInterestRateStatedPercentage',
 'I-DebtInstrumentMaturityDate',
 'I-LineOfCreditFacilityMaximumBorrowingCapacity',
 'O',
 'PAD']

In [30]:

tag2idx = {t: i for i, t in enumerate(tag_values)}

tag2idx

{'B-DebtInstrumentBasisSpreadOnVariableRate1': 0,
 'B-DebtInstrumentFaceAmount': 1,
 'B-DebtInstrumentInterestRateStatedPercentage': 2,
 'B-DebtInstrumentMaturityDate': 3,
 'B-LineOfCreditFacilityMaximumBorrowingCapacity': 4,
 'I-DebtInstrumentBasisSpreadOnVariableRate1': 5,
 'I-DebtInstrumentFaceAmount': 6,
 'I-DebtInstrumentInterestRateStatedPercentage': 7,
 'I-DebtInstrumentMaturityDate': 8,
 'I-LineOfCreditFacilityMaximumBorrowingCapacity': 9,
 'O': 10,
 'PAD': 11}

In [31]:
train_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in train_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

val_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in val_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")


In [32]:
train_attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids_train]

val_attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids_val]


In [33]:
train_inputs = torch.tensor(input_ids_train)

val_inputs = torch.tensor(input_ids_val)

train_tags = torch.tensor(train_tags, dtype=torch.long )

val_tags = torch.tensor(val_tags, dtype=torch.long)

train_attention_masks = torch.tensor(train_attention_masks)

val_attention_masks = torch.tensor(val_attention_masks)

In [34]:
val_inputs.shape, val_tags.shape, val_attention_masks.shape


(torch.Size([10000, 50]), torch.Size([10000, 50]), torch.Size([10000, 50]))

In [35]:
train_inputs.shape, train_tags.shape, train_attention_masks.shape


(torch.Size([159672, 50]), torch.Size([159672, 50]), torch.Size([159672, 50]))

In [37]:
# bs=512

# bs

In [55]:
train_data = TensorDataset(train_inputs, train_attention_masks, train_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_attention_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [56]:
from transformers import AutoModelForTokenClassification, AutoTokenizer


In [40]:
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=LR,
    eps=1e-8
)



In [58]:
from transformers import get_linear_schedule_with_warmup

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

num_warmup_steps = int(0.1 * total_steps)

print(num_warmup_steps, total_steps)
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps
)


156 1560


In [59]:
model.to(device)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [44]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


### Setup Evaluation

In [61]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [62]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import torch
import time
import re
def expand_tags(tokenizer, token_list, tag_list):

    final_tags = []

    counter = 0

    for t, l in zip(token_list, tag_list):

        t = re.sub(r'[^\x00-\x7F]+', '', t)
        if t == "":
            continue
        counter +=1


#         print(counter, l, t, counter)

        temp_counts = 0

        for toks in tokenizer.tokenize(t):
#             print(t, toks)


#             print(toks, temp_counts)
            if toks.startswith("##"):
                continue
            else:
                temp_counts +=1
                if temp_counts > 1:
                    if l != "O":
                        l = l.replace("B-", "I-")
#                 print(counter, "append", l, counter)

                final_tags.append(l)
    return final_tags

def expand_toks(tokenizer, token_list):

    final_toks = []
    for i, t in enumerate(token_list):
        t = re.sub(r'[^\x00-\x7F]+', '', t)
        if t == "":
            continue

        final_toks += combine_subtoks(tokenizer.tokenize(t))

    return final_toks


def combine_subtoks(toks):

    comb =[]

    curr = []
    for i,t in enumerate(toks):
#         print(i, curr)

        if t.startswith("##"):
            t = t[2:]
            curr.append(t)
        else:
            if len(curr) > 0:
                comb.append("".join(curr))
            curr= [t]
#             if i == len(toks)-1:
#                 print(i, comb)
    comb.append("".join(curr))



    return comb


from transformers import AutoModelForTokenClassification, AutoTokenizer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import torch
from collections import defaultdict
import copy

# # Example setup (replace these with your model and tokenizer)
# model_name = "your_model_name_here"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForTokenClassification.from_pretrained(model_name)

# Function to align tokens and labels

def predict(model,input_ids,attention_mask, mtype="pytorch"):

    if mtype=="pytorch":


        with torch.no_grad():

            outputs = model(input_ids, attention_mask=attention_mask)

            preds = torch.argmax(outputs.logits, axis=2)

            preds = [output[mask == 1] for output, mask in zip(preds, attention_mask)]

#             print(outputs.logits, outputs.logits.shape)

    elif mtype=="onnx":
        # Prepare the input as a dictionary

        inputs_onnx = {model.get_inputs()[0].name: input_ids.numpy()}

        # Run inference
        outputs = model.run(None, inputs_onnx)

#         print(len(outputs[0]), "lennn")



        preds = np.argmax(outputs[0], axis=2)

        preds = [output[mask == 1] for output, mask in zip(preds, attention_mask)]

    return preds



def pred_pipeline(text, model,mtype="pytorch", bs=100, device="cpu"):

#     print(tokenizer.encode_plus(text))

    all_outs = []

    latency = []

    with torch.no_grad():
        for bn in range(0,len(text), bs):

            max_len = min(200,max([len(x.split()) for x in text[bn:bn+bs]]))

            st = time.time()

            tok_enc = tokenizer.batch_encode_plus(text[bn:bn+bs], padding='longest', return_tensors="pt", truncation=True,max_length=500)

            #     print(tok_enc)

            bids = tok_enc["input_ids"].to(device)

            bams = tok_enc["attention_mask"].to(device)

            print(bn, max_len, bids.shape)

            # bids = input_ids

            # bams = attention_mask[bn:bn+bs]

#             print(bams.shape, bids.shape, bn, bn+bs)

            outputs = predict(model, bids,bams, mtype=mtype)

            et = time.time()

            latency.append(et-st)

        # After getting all_outs


#             print(outputs.logits.shape)

            all_outs.extend(outputs)

    print("latencies")
    print(pd.Series(latency).quantile([0.5, 0.75, 0.9, 0.99]))

    print("preds done")

#     logits = torch.cat(all_outs, dim=0)

#     print(logits.shape)

#     preds = torch.argmax(logits, axis=2)


    return all_outs


def align_predictions_with_original_words(tokenizer, text, sub_token_predictions):
    word_level_predictions = []

    for sentence, predictions in zip(text, sub_token_predictions):

        tokenized_sentence = tokenizer.tokenize(sentence)
        aligned_labels = []
        current_word = None

        for token, label_idx in zip(tokenized_sentence, predictions[1:]):
            if token.startswith("##"):
                continue  # Skip sub-tokens
            else:
                # if current_word is not None:
                  aligned_labels.append(label_idx)
                  current_word = token
        word_level_predictions.append(aligned_labels)
    return word_level_predictions

def expand_tags(tokenizer, token_list, tag_list):

    final_tags = []

    for t, l in zip(token_list, tag_list):

        for toks in tokenizer.tokenize(t):

            temp_counts = 0
#             print(toks, temp_counts)
            if toks.startswith("##"):
                continue
            else:
                temp_counts +=1
                final_tags.append(l)
    return final_tags

def pad_preds_if_necessary(pred, actual):

    if len(pred) < len(actual):
        pred += ["O"]*(len(actual)-len(pred))

    return pred

def proc_predictions(outputs, tokenizer, text_list, test_df, colname):

    preds_aligned = align_predictions_with_original_words(tokenizer, text_list, outputs)

#     print([len(x) for x in preds_aligned])

#     print(preds_aligned)

    test_df[colname] = preds_aligned

    test_df[colname + "2"] = test_df[colname].apply(lambda x: [selected_tag_names[y] for y in x if y != 11])

    test_df[colname + "_exp"] = test_df.apply(lambda x: pad_preds_if_necessary(x[colname + "2"], x["ner_expanded"]), axis=1)

    return test_df



def extract_tagged(toks, tags):

    ext = defaultdict(lambda :[])

    for tok, tag in zip(toks, tags):
        if tag!="O":
            ext[tag].append(tok)

    return ext

# aa


In [63]:
text = "The form and terms of the 3.650 % Senior Notes were established pursuant to an Officer ’ s Certificate , dated as of January 12 , 2017 , supplementing the Indenture"


text = [text, text + text]

# model_path = "HariLuru/finer_distillbert"

# model_path = "models/fine_distillbert.onnx"
mtype="pytorch"
# model = load_model(model_path, mtype=mtype)
import time

model.eval()

st = time.time()
aa = pred_pipeline(text, model, mtype=mtype,bs=10, device=device)
et = time.time()
print(et-st)

aa

0 61 torch.Size([2, 74])
latencies
0.50    0.018527
0.75    0.018527
0.90    0.018527
0.99    0.018527
dtype: float64
preds done
0.02532505989074707


[tensor([ 9, 10,  9,  0,  1,  0,  0, 11,  0,  9,  4,  9,  9,  1,  7,  1,  1,  1,
          9,  2,  2,  4,  3,  0,  1,  7,  7,  1,  2,  0,  2,  2,  1,  0,  3, 11,
          3, 10], device='cuda:0'),
 tensor([ 9, 10,  1,  0,  1,  1,  1, 11,  0,  9,  4,  0,  4,  2,  3,  1,  1,  1,
          9,  2,  1,  4,  3,  6,  1,  1,  0,  1,  2,  0,  3, 11,  1,  0,  3, 11,
          2,  0,  9,  0,  3,  0,  0, 11,  0,  9,  4,  0,  0,  2,  3,  1,  1,  1,
          2,  2,  1, 10,  3,  4,  1,  1,  7,  1,  2,  0,  3, 11,  1,  0,  3, 11,
          2,  2], device='cuda:0')]

In [64]:
selected_tag_names = tag_values

selected_tag_names

['B-DebtInstrumentBasisSpreadOnVariableRate1',
 'B-DebtInstrumentFaceAmount',
 'B-DebtInstrumentInterestRateStatedPercentage',
 'B-DebtInstrumentMaturityDate',
 'B-LineOfCreditFacilityMaximumBorrowingCapacity',
 'I-DebtInstrumentBasisSpreadOnVariableRate1',
 'I-DebtInstrumentFaceAmount',
 'I-DebtInstrumentInterestRateStatedPercentage',
 'I-DebtInstrumentMaturityDate',
 'I-LineOfCreditFacilityMaximumBorrowingCapacity',
 'O',
 'PAD']

In [65]:
val["ner_expanded"].explode().value_counts()

O                                                 465918
I-DebtInstrumentInterestRateStatedPercentage         324
I-LineOfCreditFacilityMaximumBorrowingCapacity       218
I-DebtInstrumentFaceAmount                           206
I-DebtInstrumentBasisSpreadOnVariableRate1           166
B-DebtInstrumentInterestRateStatedPercentage         162
B-DebtInstrumentFaceAmount                           155
B-LineOfCreditFacilityMaximumBorrowingCapacity       150
B-DebtInstrumentBasisSpreadOnVariableRate1            88
I-DebtInstrumentMaturityDate                          24
B-DebtInstrumentMaturityDate                           8
Name: ner_expanded, dtype: int64

In [66]:
# train_dataloader_orig = train_dataloader

In [72]:
train_dataloader = train_dataloader_orig

In [73]:
len(train_dataloader)*bs

159744

In [74]:
text = val["toks_exp"].apply(lambda x: " ".join(x)).tolist()

In [79]:
val_df["tag_exists"] = val_df["pyt_preds_exp"].apply(lambda x: len([y for y in x if y!= "O"])) + val_df["ner_expanded"].apply(lambda x: len([y for y in x if y!= "O"]))

val_df["ind_f1"] = val_df.apply(lambda x: f1_score([x["ner_expanded"]], [x["pyt_preds_exp"]]),axis=1).sort_values()

# val_df["ext_true"] = val_df.apply(lambda x: dict(extract_tagged(x["toks_exp"],x["ner_tags"])),axis=1)

val_df["ext_true_exp"] = val_df.apply(lambda x: dict(extract_tagged(x["toks_exp"],x["ner_expanded"])),axis=1)

val_df["ext_pred"] = val_df.apply(lambda x: dict(extract_tagged(x["toks_exp"],x["pyt_preds_exp"])),axis=1)

# val_df["ext_pred_onnx"] = val_df.apply(lambda x: dict(extract_tagged(x["toks_exp"],x["onnx_preds_exp"])),axis=1)

val_df.loc[val_df.tag_exists != 0,["id","ner_expanded", "ext_true_exp","ext_pred","ind_f1"]].sort_values("ind_f1").to_excel("error_analysis_v4.xlsx")

  _warn_prf(


In [81]:
# val_df.loc[val_df.tag_exists != 0,["id","ner_expanded", "ext_true_exp","ext_pred","ind_f1"]].sort_values("ind_f1")

### Train

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

max_f1 = 0

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(tqdm(train_dataloader, total = len(train_dataloader), position=0, leave=True)):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_labels = b_labels.type(torch.LongTensor).to(device)
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.


    print("val on val")
    mtype = "pytorch"

    pyt_output = pred_pipeline(text, model, mtype=mtype, bs=250, device=device)
    val_df = proc_predictions(pyt_output, tokenizer, text, val, "pyt_preds")


    print(classification_report(val_df["ner_expanded"].tolist(), val_df["pyt_preds_exp"].tolist()))
    f1_val = f1_score(val_df["ner_expanded"].tolist(), val_df["pyt_preds_exp"].tolist())
#     print("val on holdout")
#     f1 = get_f1_score_on_test_data(model, holdout)
    if (f1_val >= max_f1) & (f1_val > 0):

        #f1 = get_f1_score_on_test_data(model, holdout)
        print("saving model:", f1_val,max_f1, MODEL_PATH)
        max_f1 = f1_val
#         holdout_f1 = f1
        torch.save(model.state_dict(), MODEL_PATH)
#         f1_train = get_f1_score_on_test_data(model, train)
#         print("Train f1", f1_train)

#     print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
#     print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

100%|██████████| 312/312 [09:56<00:00,  1.91s/it]


Average train loss: 0.03767338626755354
val on val
0 200 torch.Size([250, 253])
250 200 torch.Size([250, 500])
500 200 torch.Size([250, 319])
750 126 torch.Size([250, 142])
1000 200 torch.Size([250, 274])
1250 200 torch.Size([250, 500])
1500 165 torch.Size([250, 183])
1750 200 torch.Size([250, 234])
2000 200 torch.Size([250, 268])
2250 200 torch.Size([250, 299])
2500 199 torch.Size([250, 223])
2750 182 torch.Size([250, 194])
3000 200 torch.Size([250, 234])
3250 200 torch.Size([250, 268])
3500 200 torch.Size([250, 220])
3750 200 torch.Size([250, 500])
4000 200 torch.Size([250, 277])
4250 200 torch.Size([250, 312])
4500 200 torch.Size([250, 370])
4750 200 torch.Size([250, 347])
5000 200 torch.Size([250, 394])
5250 200 torch.Size([250, 300])
5500 200 torch.Size([250, 280])
5750 200 torch.Size([250, 500])
6000 141 torch.Size([250, 155])
6250 200 torch.Size([250, 230])
6500 200 torch.Size([250, 256])
6750 200 torch.Size([250, 245])
7000 176 torch.Size([250, 190])
7250 200 torch.Size([250, 3

  _warn_prf(average, modifier, msg_start, len(result))


                                              precision    recall  f1-score   support

    DebtInstrumentBasisSpreadOnVariableRate1       0.31      0.94      0.47        88
                    DebtInstrumentFaceAmount       0.15      0.73      0.24       155
  DebtInstrumentInterestRateStatedPercentage       0.31      0.97      0.47       162
                  DebtInstrumentMaturityDate       0.00      0.00      0.00         8
LineOfCreditFacilityMaximumBorrowingCapacity       0.18      0.67      0.28       150

                                   micro avg       0.22      0.80      0.34       563
                                   macro avg       0.19      0.66      0.29       563
                                weighted avg       0.23      0.80      0.35       563

saving model: 0.3399624765478424 0 models/bert_subsampled_model_v8


Epoch:  20%|██        | 1/5 [12:02<48:09, 722.38s/it]




100%|██████████| 312/312 [09:54<00:00,  1.91s/it]


Average train loss: 0.011854782321442587
val on val
0 200 torch.Size([250, 253])
250 200 torch.Size([250, 500])
500 200 torch.Size([250, 319])
750 126 torch.Size([250, 142])
1000 200 torch.Size([250, 274])
1250 200 torch.Size([250, 500])
1500 165 torch.Size([250, 183])
1750 200 torch.Size([250, 234])
2000 200 torch.Size([250, 268])
2250 200 torch.Size([250, 299])
2500 199 torch.Size([250, 223])
2750 182 torch.Size([250, 194])
3000 200 torch.Size([250, 234])
3250 200 torch.Size([250, 268])
3500 200 torch.Size([250, 220])
3750 200 torch.Size([250, 500])
4000 200 torch.Size([250, 277])
4250 200 torch.Size([250, 312])
4500 200 torch.Size([250, 370])
4750 200 torch.Size([250, 347])
5000 200 torch.Size([250, 394])
5250 200 torch.Size([250, 300])
5500 200 torch.Size([250, 280])
5750 200 torch.Size([250, 500])
6000 141 torch.Size([250, 155])
6250 200 torch.Size([250, 230])
6500 200 torch.Size([250, 256])
6750 200 torch.Size([250, 245])
7000 176 torch.Size([250, 190])
7250 200 torch.Size([250, 

  _warn_prf(average, modifier, msg_start, len(result))


                                              precision    recall  f1-score   support

    DebtInstrumentBasisSpreadOnVariableRate1       0.42      0.97      0.59        88
                    DebtInstrumentFaceAmount       0.17      0.74      0.27       155
  DebtInstrumentInterestRateStatedPercentage       0.36      0.97      0.52       162
                  DebtInstrumentMaturityDate       0.00      0.00      0.00         8
LineOfCreditFacilityMaximumBorrowingCapacity       0.20      0.81      0.33       150

                                   micro avg       0.25      0.85      0.39       563
                                   macro avg       0.23      0.70      0.34       563
                                weighted avg       0.27      0.85      0.40       563

saving model: 0.3857949959644875 0.3399624765478424 models/bert_subsampled_model_v8


Epoch:  40%|████      | 2/5 [23:59<35:57, 719.12s/it]




100%|██████████| 312/312 [09:57<00:00,  1.91s/it]


Average train loss: 0.009644827005477289
val on val
0 200 torch.Size([250, 253])
250 200 torch.Size([250, 500])
500 200 torch.Size([250, 319])
750 126 torch.Size([250, 142])
1000 200 torch.Size([250, 274])
1250 200 torch.Size([250, 500])
1500 165 torch.Size([250, 183])
1750 200 torch.Size([250, 234])
2000 200 torch.Size([250, 268])
2250 200 torch.Size([250, 299])
2500 199 torch.Size([250, 223])
2750 182 torch.Size([250, 194])
3000 200 torch.Size([250, 234])
3250 200 torch.Size([250, 268])
3500 200 torch.Size([250, 220])
3750 200 torch.Size([250, 500])
4000 200 torch.Size([250, 277])
4250 200 torch.Size([250, 312])
4500 200 torch.Size([250, 370])
4750 200 torch.Size([250, 347])
5000 200 torch.Size([250, 394])
5250 200 torch.Size([250, 300])
5500 200 torch.Size([250, 280])
5750 200 torch.Size([250, 500])
6000 141 torch.Size([250, 155])
6250 200 torch.Size([250, 230])
6500 200 torch.Size([250, 256])
6750 200 torch.Size([250, 245])
7000 176 torch.Size([250, 190])
7250 200 torch.Size([250, 

Epoch:  60%|██████    | 3/5 [35:58<23:58, 719.17s/it]




100%|██████████| 312/312 [09:55<00:00,  1.91s/it]


Average train loss: 0.008237907764477989
val on val
0 200 torch.Size([250, 253])
250 200 torch.Size([250, 500])
500 200 torch.Size([250, 319])
750 126 torch.Size([250, 142])
1000 200 torch.Size([250, 274])
1250 200 torch.Size([250, 500])
1500 165 torch.Size([250, 183])
1750 200 torch.Size([250, 234])
2000 200 torch.Size([250, 268])
2250 200 torch.Size([250, 299])
2500 199 torch.Size([250, 223])
2750 182 torch.Size([250, 194])
3000 200 torch.Size([250, 234])
3250 200 torch.Size([250, 268])
3500 200 torch.Size([250, 220])
3750 200 torch.Size([250, 500])
4000 200 torch.Size([250, 277])
4250 200 torch.Size([250, 312])
4500 200 torch.Size([250, 370])
4750 200 torch.Size([250, 347])
5000 200 torch.Size([250, 394])
5250 200 torch.Size([250, 300])
5500 200 torch.Size([250, 280])
5750 200 torch.Size([250, 500])
6000 141 torch.Size([250, 155])
6250 200 torch.Size([250, 230])
6500 200 torch.Size([250, 256])
6750 200 torch.Size([250, 245])
7000 176 torch.Size([250, 190])
7250 200 torch.Size([250, 

Epoch:  80%|████████  | 4/5 [47:56<11:58, 718.75s/it]




  4%|▎         | 11/312 [00:21<09:51,  1.96s/it]

In [1]:
# val_df["ner_expanded"].explode().value_counts()

NameError: name 'val_df' is not defined

In [None]:
val_df["pyt_preds_exp"].explode().value_counts()

O                                                 464271
I-DebtInstrumentInterestRateStatedPercentage         681
I-LineOfCreditFacilityMaximumBorrowingCapacity       515
I-DebtInstrumentFaceAmount                           470
B-DebtInstrumentInterestRateStatedPercentage         370
B-DebtInstrumentFaceAmount                           338
B-LineOfCreditFacilityMaximumBorrowingCapacity       316
I-DebtInstrumentBasisSpreadOnVariableRate1           256
B-DebtInstrumentBasisSpreadOnVariableRate1           202
Name: pyt_preds_exp, dtype: int64

In [None]:
!mkdir models

### Push to hub

In [None]:
model.push_to_hub("HariLuru/finer_distillbert_v2", token="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")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HariLuru/finer_distillbert_v2/commit/a1f3f73bec14dab6b9b42c3fbb4cd10fff696eb2', commit_message='Upload DistilBertForTokenClassification', commit_description='', oid='a1f3f73bec14dab6b9b42c3fbb4cd10fff696eb2', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub("HariLuru/finer_distillbert_v2", token="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")

CommitInfo(commit_url='https://huggingface.co/HariLuru/finer_distillbert_v2/commit/6b4505961fa4855e073f08f6e29d885cc8b19251', commit_message='Upload tokenizer', commit_description='', oid='6b4505961fa4855e073f08f6e29d885cc8b19251', pr_url=None, pr_revision=None, pr_num=None)

### Save to ONNX

In [None]:
import torch
from transformers import AutoConfig

# Prepare model input
text = "The form and terms of the 3.650 % Senior Notes were established pursuant to an Officer ’ s."
inputs = tokenizer(text, return_tensors="pt")
model.to("cpu")
# Exporting
output_path = "models/fine_distillbert_v6.onnx"
torch.onnx.export(model,               # model being run
                 args=(inputs['input_ids'],),  # model input (or a tuple for multiple inputs)
                 f=output_path,        # where to save the model
                 opset_version=11,     # the ONNX version to export the model to
                 do_constant_folding=True,  # whether to execute constant folding for optimization
                 input_names=['input_ids'],   # the model's input names
                 output_names=['output'],    # the model's output names
                 dynamic_axes={'input_ids': {0: 'batch_size', 1: 'seq_len'},  # Dynamic axes for inputs
                                'output': {0: 'batch_size', 1: 'seq_len'}})
