In [1]:
import pandas as pd
import os
import random
import numpy as np
import torch

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"]="1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'


torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [2]:
df = pd.read_csv('../../data/BIOUL_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tokens    250 non-null    object
 1   ner_tags  250 non-null    object
dtypes: object(2)
memory usage: 4.0+ KB


In [4]:
#We have only 250 sentences annotated which is very less but fine for small POC

In [5]:
df.head()

Unnamed: 0,tokens,ner_tags
0,"['Summary', '\n\n', 'Summary', 'Companies', 'T...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,"['The', 'U.S.', 'stock', 'market', 'has', 'suf...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,"['""', 'The', 'U.S.', 'yield', 'markets', '(', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,"['Register', 'now', 'for', 'FREE', 'unlimited'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,"['All', '11', 'major', 'S&P', '500', '(', '.SP...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [6]:
type(df['ner_tags'][0])


str

In [7]:
type(df['tokens'][0])

str

In [8]:
#Convert str to list of str
import ast
df['ner_tags'] = df.ner_tags.apply(lambda s: list(ast.literal_eval(s)))
df['tokens'] = df.tokens.apply(lambda s: list(ast.literal_eval(s)))

In [9]:
import itertools
#Get list of NER tags from data collection notebook
ner_tags = set(itertools.chain.from_iterable(df.ner_tags))
ner_tags

{'B-CUSTOM_ORG',
 'B-CUSTOM_PERSON',
 'B-CUSTOM_PLACE',
 'B-CUSTOM_ROLE',
 'I-CUSTOM_ORG',
 'I-CUSTOM_PERSON',
 'I-CUSTOM_PLACE',
 'I-CUSTOM_ROLE',
 'L-CUSTOM_ORG',
 'L-CUSTOM_PERSON',
 'L-CUSTOM_PLACE',
 'L-CUSTOM_ROLE',
 'O',
 'U-CUSTOM_ORG',
 'U-CUSTOM_PERSON',
 'U-CUSTOM_PLACE',
 'U-CUSTOM_ROLE'}

In [10]:
'''
The string "-" is used where the entity offsets don’t align with the tokenization in the Doc object. 
The training algorithm will view these as missing values. 
O denotes a non-entity token. 
B denotes the beginning of a multi-token entity, 
I the inside of an entity of three or more tokens,
and L the end of an entity of two or more tokens. 
U denotes a single-token entity.

'''

'\nThe string "-" is used where the entity offsets don’t align with the tokenization in the Doc object. \nThe training algorithm will view these as missing values. \nO denotes a non-entity token. \nB denotes the beginning of a multi-token entity, \nI the inside of an entity of three or more tokens,\nand L the end of an entity of two or more tokens. \nU denotes a single-token entity.\n\n'

In [11]:
id_to_label = dict(enumerate(ner_tags))
label_to_id = {v:k for k, v in id_to_label.items()}

In [12]:
label_to_id

{'L-CUSTOM_ROLE': 0,
 'I-CUSTOM_ROLE': 1,
 'I-CUSTOM_ORG': 2,
 'U-CUSTOM_PERSON': 3,
 'L-CUSTOM_PERSON': 4,
 'L-CUSTOM_PLACE': 5,
 'U-CUSTOM_PLACE': 6,
 'I-CUSTOM_PLACE': 7,
 'U-CUSTOM_ROLE': 8,
 'B-CUSTOM_PERSON': 9,
 'L-CUSTOM_ORG': 10,
 'I-CUSTOM_PERSON': 11,
 'U-CUSTOM_ORG': 12,
 'B-CUSTOM_PLACE': 13,
 'B-CUSTOM_ROLE': 14,
 'B-CUSTOM_ORG': 15,
 'O': 16}

In [13]:
label_to_id = {'B-CUSTOM_PLACE': 0,
 'L-CUSTOM_ORG': 1,
 'B-CUSTOM_PERSON': 2,
 'B-CUSTOM_ROLE': 3,
 'U-CUSTOM_PERSON': 4,
 'I-CUSTOM_ROLE': 5,
 'I-CUSTOM_PLACE': 6,
 'U-CUSTOM_PLACE': 7,
 'L-CUSTOM_ROLE': 8,
 'I-CUSTOM_PERSON': 9,
 'U-CUSTOM_ROLE': 10,
 'L-CUSTOM_PERSON': 11,
 'I-CUSTOM_ORG': 12,
 'U-CUSTOM_ORG': 13,
 'L-CUSTOM_PLACE': 14,
 'B-CUSTOM_ORG': 15,
 'O': 16}

In [14]:
id_to_label

{0: 'L-CUSTOM_ROLE',
 1: 'I-CUSTOM_ROLE',
 2: 'I-CUSTOM_ORG',
 3: 'U-CUSTOM_PERSON',
 4: 'L-CUSTOM_PERSON',
 5: 'L-CUSTOM_PLACE',
 6: 'U-CUSTOM_PLACE',
 7: 'I-CUSTOM_PLACE',
 8: 'U-CUSTOM_ROLE',
 9: 'B-CUSTOM_PERSON',
 10: 'L-CUSTOM_ORG',
 11: 'I-CUSTOM_PERSON',
 12: 'U-CUSTOM_ORG',
 13: 'B-CUSTOM_PLACE',
 14: 'B-CUSTOM_ROLE',
 15: 'B-CUSTOM_ORG',
 16: 'O'}

In [15]:
id_to_label = {0: 'B-CUSTOM_PLACE',
 1: 'L-CUSTOM_ORG',
 2: 'B-CUSTOM_PERSON',
 3: 'B-CUSTOM_ROLE',
 4: 'U-CUSTOM_PERSON',
 5: 'I-CUSTOM_ROLE',
 6: 'I-CUSTOM_PLACE',
 7: 'U-CUSTOM_PLACE',
 8: 'L-CUSTOM_ROLE',
 9: 'I-CUSTOM_PERSON',
 10: 'U-CUSTOM_ROLE',
 11: 'L-CUSTOM_PERSON',
 12: 'I-CUSTOM_ORG',
 13: 'U-CUSTOM_ORG',
 14: 'L-CUSTOM_PLACE',
 15: 'B-CUSTOM_ORG',
 16: 'O'}

In [16]:
def replace_fn(ls):
    new_ls = [label_to_id[key] for key in ls]
    return new_ls

In [17]:
df['ner_tags'] = df['ner_tags'].apply(replace_fn)

In [18]:
df.iloc[0]['ner_tags']

[16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 15,
 1,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 13,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16]

In [19]:
#df.explode('ner_tags').value_counts()
class_counts = pd.Series(sum([item for item in df.ner_tags], [])).value_counts()

In [20]:
from collections import OrderedDict, defaultdict
class_counts_dict = class_counts.to_dict(OrderedDict)

In [21]:
class_counts_dict

OrderedDict([(16, 7305),
             (2, 89),
             (11, 89),
             (15, 52),
             (1, 52),
             (7, 48),
             (13, 46),
             (12, 32),
             (8, 17),
             (3, 17),
             (5, 10),
             (4, 8),
             (10, 8),
             (0, 7),
             (14, 7),
             (9, 7),
             (6, 3)])

In [22]:
total_count =  class_counts.sum()
total_count

7797

In [23]:
majority_sample = class_counts_dict[16]
majority_sample

7305

In [24]:
#labels are imbalanced,We can use weightsampler to oversample data but instead let's use different strategy.class_weights
# we'll create class_weights to give more weightage to weak class during training.
# we can use strategy for majority class , class_weight = 1-(class sample/total sample)
# for others, class_weight = 1-(class count/total samples- majority class sample)
class_weights = []
for i in range(len(class_counts_dict.keys())):
    if i==16:
        class_weight = 1-(class_counts_dict[i]/total_count)
    else:
        class_weight = 1-(class_counts_dict[i]/(total_count-majority_sample))
    class_weights.append(class_weight)
    


In [25]:
class_weights

[0.9857723577235772,
 0.8943089430894309,
 0.8191056910569106,
 0.9654471544715447,
 0.983739837398374,
 0.9796747967479675,
 0.9939024390243902,
 0.9024390243902439,
 0.9654471544715447,
 0.9857723577235772,
 0.983739837398374,
 0.8191056910569106,
 0.9349593495934959,
 0.9065040650406504,
 0.9857723577235772,
 0.8943089430894309,
 0.06310119276644865]

In [26]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#Since the input has already been split into words, set is_split_into_words=True
tokenized_input = tokenizer(df["tokens"][0], is_split_into_words=True)

In [27]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [28]:
tokens

['[CLS]',
 'summary',
 'summary',
 'companies',
 'tesla',
 'down',
 'as',
 'q',
 '##3',
 'deliveries',
 'miss',
 'market',
 'estimates',
 'u',
 '.',
 's',
 '.',
 'factory',
 'activity',
 'slow',
 '##est',
 'in',
 '~',
 '2',
 '.',
 '5',
 'years',
 'in',
 'sept',
 '-',
 'is',
 '##m',
 'credit',
 'sui',
 '##sse',
 ',',
 'ci',
 '##ti',
 'cut',
 '202',
 '##2',
 'year',
 '-',
 'end',
 'target',
 'for',
 's',
 '&',
 'p',
 '500',
 'index',
 '##es',
 'up',
 ':',
 'dow',
 '2',
 '.',
 '66',
 '%',
 ',',
 's',
 '&',
 'p',
 '500',
 '2',
 '.',
 '59',
 '%',
 ',',
 'nas',
 '##da',
 '##q',
 '2',
 '.',
 '27',
 '%',
 'oct',
 '3',
 '(',
 'reuters',
 ')',
 '-',
 'wall',
 'street',
 "'",
 's',
 'three',
 'major',
 'index',
 '##es',
 'rallied',
 'to',
 'close',
 'over',
 '2',
 '%',
 'on',
 'monday',
 'as',
 'u',
 '.',
 's',
 '.',
 'treasury',
 'yields',
 'tumbled',
 'on',
 'weaker',
 '-',
 'than',
 '-',
 'expected',
 'manufacturing',
 'data',
 ',',
 'increasing',
 'the',
 'appeal',
 'of',
 'stocks',
 'at',
 '

In [29]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#Since the input has already been split into words, set is_split_into_words=True
tokenized_input = tokenizer(['I','am','Elon'], is_split_into_words=True)
print("token_ids:",tokenized_input)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print("tokens:",tokens)
print("word_ids:",tokenized_input.word_ids())# index of orifginal word

token_ids: {'input_ids': [101, 1045, 2572, 3449, 2239, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}
tokens: ['[CLS]', 'i', 'am', 'el', '##on', '[SEP]']
word_ids: [None, 0, 1, 2, 2, None]


In [30]:
'''Adding the special tokens [CLS] and [SEP] and subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may be split into two subwords. You will need to realign the tokens and labels by:

1.Mapping all tokens to their corresponding word with the word_ids method.
2.Assigning the label -100 to the special tokens [CLS] and [SEP] so the PyTorch loss function ignores them.
3.Only labeling the first token of a given word. Assign -100 to other subtokens from the same word.
https://huggingface.co/docs/transformers/tasks/token_classification#preprocess
'''

'Adding the special tokens [CLS] and [SEP] and subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may be split into two subwords. You will need to realign the tokens and labels by:\n\n1.Mapping all tokens to their corresponding word with the word_ids method.\n2.Assigning the label -100 to the special tokens [CLS] and [SEP] so the PyTorch loss function ignores them.\n3.Only labeling the first token of a given word. Assign -100 to other subtokens from the same word.\nhttps://huggingface.co/docs/transformers/tasks/token_classification#preprocess\n'

In [31]:
X = list(df['tokens'])
y = list(df['ner_tags'])

In [32]:
def FindMaxLength(lst):
    maxList = max(lst, key = lambda i: len(i))
    maxLength = len(maxList)
     
    return maxLength

In [33]:
FindMaxLength(X)

103

In [34]:
#We have maximum tokens as 103 but Let's assume max_length as 200
max_length = 200

In [35]:
from transformers import AutoTokenizer

class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y,max_length,id_to_label):
        self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        self.X = X
        self.y = y
        self.id_to_label = id_to_label
    

    def __getitem__(self, idx):
        return self.tokenize_and_align_labels(self.X[idx],self.y[idx])

    def __len__(self):
        return len( self.X)
    
    def tokenize_and_align_labels(self,x_el,y_el):
        #print(row["tokens"])
        #print( row['ner_tags'])
        tokenized_inputs = self.tokenizer(x_el, truncation=True,padding="max_length", is_split_into_words=True,max_length=max_length)
        data = {key: torch.tensor(val) for key, val in tokenized_inputs.items()}
        #print(tokenized_inputs)
        ner_tags = y_el
        labels = []
        word_ids = tokenized_inputs.word_ids(batch_index=0)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(ner_tags[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

        data["labels"] = torch.tensor(labels).squeeze()
        return data

train_dataset = Dataset(X, y,max_length,id_to_label)

In [36]:
for data in train_dataset:
    print(data['input_ids'])
    print(data['labels'])
    break

tensor([  101, 12654, 12654,  3316, 26060,  2091,  2004,  1053,  2509, 23534,
         3335,  3006, 10035,  1057,  1012,  1055,  1012,  4713,  4023,  4030,
         4355,  1999,  1066,  1016,  1012,  1019,  2086,  1999, 17419,  1011,
         2003,  2213,  4923, 24086, 11393,  1010, 25022,  3775,  3013, 16798,
         2475,  2095,  1011,  2203,  4539,  2005,  1055,  1004,  1052,  3156,
         5950,  2229,  2039,  1024, 23268,  1016,  1012,  5764,  1003,  1010,
         1055,  1004,  1052,  3156,  1016,  1012,  5354,  1003,  1010, 17235,
         2850,  4160,  1016,  1012,  2676,  1003, 13323,  1017,  1006, 26665,
         1007,  1011,  2813,  2395,  1005,  1055,  2093,  2350,  5950,  2229,
        24356,  2000,  2485,  2058,  1016,  1003,  2006,  6928,  2004,  1057,
         1012,  1055,  1012,  9837, 16189, 18303,  2006, 15863,  1011,  2084,
         1011,  3517,  5814,  2951,  1010,  4852,  1996,  5574,  1997, 15768,
         2012,  1996,  2707,  1997,  1996,  2095,  1005,  1055, 

In [37]:
'''# Custom Model training
from torch.utils.data import Dataset, TensorDataset,DataLoader
train_loader = DataLoader(train_dataset,batch_size=16, shuffle=True)

NUM_EPOCHS = 1
LEARNING_RATE = 0.01
optimizer =torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE) 
for i in range(NUM_EPOCHS):
  model.train()
  for X_batch,y_batch in train_loader:
    output = model(X_batch,labels=y_batch)
    output.loss.backward()
    optimizer.step()
    optimizer.zero_grad()'''

'# Custom Model training\nfrom torch.utils.data import Dataset, TensorDataset,DataLoader\ntrain_loader = DataLoader(train_dataset,batch_size=16, shuffle=True)\n\nNUM_EPOCHS = 1\nLEARNING_RATE = 0.01\noptimizer =torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE) \nfor i in range(NUM_EPOCHS):\n  model.train()\n  for X_batch,y_batch in train_loader:\n    output = model(X_batch,labels=y_batch)\n    output.loss.backward()\n    optimizer.step()\n    optimizer.zero_grad()'

In [38]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=17)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [39]:
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    report_to =None

)

#We should override comput_loss to inform trainerr about class imbalance(we have majority as "O" i.e not an entity)
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 17 labels with different weights)
        #class_weights = torch.tensor([3.0, 3.0, 3.0,3.0, 3.0, 3.0,3.0, 3.0, 3.0,3.0, 3.0, 3.0,3.0, 3.0, 3.0,3.0, 0.2])
        class_weights = torch.tensor([0.9857723577235772,0.8943089430894309, 0.8191056910569106, 0.9654471544715447, 0.983739837398374,
                        0.9796747967479675, 0.9939024390243902, 0.9024390243902439, 0.9654471544715447, 0.9857723577235772, 0.983739837398374, 0.8191056910569106,
                        0.9349593495934959, 0.9065040650406504, 0.9857723577235772, 0.8943089430894309, 0.06310119276644865])
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights,reduction='mean')
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)






In [40]:
os.environ["WANDB_DISABLED"] = "true"

In [41]:
trainer.train()

***** Running training *****
  Num examples = 250
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 32
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


  0%|          | 0/32 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 120.7696, 'train_samples_per_second': 2.07, 'train_steps_per_second': 0.265, 'train_loss': 1.5432997941970825, 'epoch': 1.0}


TrainOutput(global_step=32, training_loss=1.5432997941970825, metrics={'train_runtime': 120.7696, 'train_samples_per_second': 2.07, 'train_steps_per_second': 0.265, 'train_loss': 1.5432997941970825, 'epoch': 1.0})

In [55]:
trainer.save_model("../models/custom_ner_transformers")

Saving model checkpoint to ../models/custom_ner_transformers
Configuration saved in ../models/custom_ner_transformers\config.json
Model weights saved in ../models/custom_ner_transformers\pytorch_model.bin


In [56]:
#Inference Code
from transformers import AutoModelForTokenClassification,AutoTokenizer
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
saved_model = AutoModelForTokenClassification.from_pretrained("../models/custom_ner_transformers")




loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\metes/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.3",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\metes/.cache\huggingface\transformers\0e1bbfda7f6

In [78]:
#Inference Code
import spacy
nlp = spacy.load("en_core_web_lg")
#text = 'The Tesla factory is seen in Fremont, California, U.S'
text = 'My name is Elon Musk and I am CEO of Tesla'
spacy_tokens = nlp(text)
text_tokens = [token.text for token in spacy_tokens]
tokenized_inputs = tokenizer(text_tokens,truncation=True,padding="max_length", is_split_into_words=True,max_length=200, return_tensors="pt")
outputs = saved_model(**tokenized_inputs)

In [79]:
outputs.logits.shape
#batch_size*seq_length*num_labels

torch.Size([1, 200, 17])

In [80]:
outputs.logits.squeeze()[0]

tensor([-0.3212,  0.0098,  0.1914,  0.0162, -0.3617, -0.2911, -0.3385,  0.2026,
        -0.3815, -0.8662, -0.7390,  0.2112, -0.6786, -0.2968, -0.7633,  0.1688,
         2.2246], grad_fn=<SelectBackward0>)

In [81]:
torch.argmax(outputs.logits.squeeze()[0])

tensor(16)

In [82]:
predictions = torch.argmax(outputs.logits.squeeze(), axis=1)
print(predictions.shape)
predictions = [id_to_label[int(i)] for i in predictions]
print(predictions)
words = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"][0])
print(words)
word_ids = tokenized_inputs.word_ids()
print("word_ids:",word_ids)# index of original word
print("words:",text_tokens)

torch.Size([200])
['O', 'O', 'O', 'O', 'B-CUSTOM_PERSON', 'B-CUSTOM_PERSON', 'L-CUSTOM_PERSON', 'L-CUSTOM_PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM_PERSON', 'O', 'O', 'O', 'O', 'B-CUSTOM_PERSON', 'L-CUSTOM_PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM_PERSON', 'L-CUSTOM_PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM_PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM_PERSON', 'O', 'O', 'B-CUSTOM_PERSON', 'B-CUSTOM_PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM_PERSON', 'L-CUSTOM_PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-CUSTOM_PERSON', 'O', 'O', 'B-CUST

In [83]:
token_entity = {}
for idx,word in enumerate(text_tokens):
    print(idx,word)
    for id,word_id in enumerate(word_ids):
        if idx==word_id:
            if word not in token_entity:
                token_entity[word] = predictions[id]
            

0 My
1 name
2 is
3 Elon
4 Musk
5 and
6 I
7 am
8 CEO
9 of
10 Tesla


In [84]:
token_entity

{'My': 'O',
 'name': 'O',
 'is': 'O',
 'Elon': 'B-CUSTOM_PERSON',
 'Musk': 'L-CUSTOM_PERSON',
 'and': 'O',
 'I': 'O',
 'am': 'O',
 'CEO': 'O',
 'of': 'O',
 'Tesla': 'O'}

In [85]:
for token in spacy_tokens:
  print(token.text, token.idx, token.idx + len(token.text))

My 0 2
name 3 7
is 8 10
Elon 11 15
Musk 16 20
and 21 24
I 25 26
am 27 29
CEO 30 33
of 34 36
Tesla 37 42


In [86]:
output = []
for token in spacy_tokens:
    if token_entity[token.text] !='O':
        output.append(
                {
                    "start": token.idx,
                    "end": token.idx + len(token.text),
                    "label":token_entity[token.text],
                }
            )


In [87]:
output

[{'start': 11, 'end': 15, 'label': 'B-CUSTOM_PERSON'},
 {'start': 16, 'end': 20, 'label': 'L-CUSTOM_PERSON'}]

In [88]:
ex = [{"text": text, "ents": output}]
spacy.displacy.render(
                ex, style="ent", manual=True
            )