In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch import mps
from transformers import BertTokenizer, BertForTokenClassification, BertConfig


device = 'mps' if mps.is_available() else 'cpu'


In [12]:
path = 'data/ner_data.csv'
data = pd.read_csv(path, encoding = 'unicode_escape')

In [17]:
data.head(5)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [13]:
data = data.fillna(method='ffill')

  data = data.fillna(method='ffill')


In [14]:
#convert iob tags to base tag
data['base_tag'] = data['Tag'].apply(lambda x: x.split('-')[-1])

In [15]:
freqs = data['Tag'].value_counts()
print("IOB tag count")
freqs

IOB tag count


Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [16]:
iob_tags = []
for t, f in zip(freqs.index, freqs):
    iob_tags.append(t)

unique_tag =  []
for tag in iob_tags:
    s = tag.split('-')
    unique_tag.append(s[-1])

unique_tag = list(set(unique_tag))
print(f'Unique base tag: {unique_tag}')

Unique base tag: ['O', 'gpe', 'org', 'tim', 'nat', 'geo', 'per', 'eve', 'art']


In [17]:
data['base_tag'].value_counts()

base_tag
O      887908
geo     45058
org     36927
per     34241
tim     26861
gpe     16068
art       699
eve       561
nat       252
Name: count, dtype: int64

In [18]:
# art eve nat is not defined properly, removing them
to_remove = ['art','nat','eve']

data = data[~data.base_tag.isin(to_remove)]

In [19]:
data.head(5)

Unnamed: 0,Sentence #,Word,POS,Tag,base_tag
0,Sentence: 1,Thousands,NNS,O,O
1,Sentence: 1,of,IN,O,O
2,Sentence: 1,demonstrators,NNS,O,O
3,Sentence: 1,have,VBP,O,O
4,Sentence: 1,marched,VBN,O,O


In [20]:
labels = data['Tag'].value_counts().index

label2id = {}
id2label = {}
for idx, label in enumerate(labels):
    label2id[label] = idx
    id2label[idx] = label

In [27]:
id2label

{0: 'O',
 1: 'B-geo',
 2: 'B-tim',
 3: 'B-org',
 4: 'I-per',
 5: 'B-per',
 6: 'I-org',
 7: 'B-gpe',
 8: 'I-geo',
 9: 'I-tim',
 10: 'I-gpe'}

In [26]:
label2id

{'O': 0,
 'B-geo': 1,
 'B-tim': 2,
 'B-org': 3,
 'I-per': 4,
 'B-per': 5,
 'I-org': 6,
 'B-gpe': 7,
 'I-geo': 8,
 'I-tim': 9,
 'I-gpe': 10}

In [21]:
data['Sentence'] = data[['Sentence #','Word', "Tag"]].groupby(['Sentence #'])["Word"].transform(lambda x: ' '.join(x))
data['Word_labels'] = data[['Sentence #','Word', "Tag"]].groupby(['Sentence #'])["Tag"].transform(lambda x: ' '.join(x))

In [22]:
data.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag,base_tag,Sentence,Word_labels
0,Sentence: 1,Thousands,NNS,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Sentence: 1,of,IN,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
2,Sentence: 1,demonstrators,NNS,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
3,Sentence: 1,have,VBP,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
4,Sentence: 1,marched,VBN,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
5,Sentence: 1,through,IN,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
6,Sentence: 1,London,NNP,B-geo,geo,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
7,Sentence: 1,to,TO,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
8,Sentence: 1,protest,VB,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
9,Sentence: 1,the,DT,O,O,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...


In [23]:
data = data[["Sentence", "Word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,Sentence,Word_labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Families of soldiers killed in the conflict jo...,O O O O O O O O O O O O O O O O O O B-per O O ...
2,They marched from the Houses of Parliament to ...,O O O O O O O O O O O B-geo I-geo O
3,"Police put the number of marchers at 10,000 wh...",O O O O O O O O O O O O O O O
4,The protest comes on the eve of the annual con...,O O O O O O O O O O O B-geo O O B-org I-org O ...


In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [73]:
item = {}
input_ids = []
attention_mask = []
labels = []

for sent in data["Sentence"]:
    encoded_dict =  tokenizer.encode_plus(
        sent,
        add_special_tokens = True,
        max_length = 128,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_mask.append(encoded_dict['attention_mask'])
    
for iob_labels in data["Word_labels"]:
    l = iob_labels.split(' ')
    temp = [label2id[x] for x in l]
    temp.insert(0,-100)
    temp.insert(len(t)+1, -100)
    pad_array = [-100] * (128 - len(temp))
    temp.extend(pad_array)
    temp = torch.tensor(temp)
    labels.append(temp)

for i in range(data.shape[0]):
    temp_dict = {}
    temp_dict['attention_mask'] = attention_mask[i]
    temp_dict['input_ids'] = input_ids[i]
    temp_dict['labels'] = labels[i]
    item[i] = temp_dict


In [76]:
len(input_ids),len(attention_mask), len(labels)

(47610, 47610, 47610)

In [115]:
"""
class MakeDataset(Dataset):
    
    def __init__(self, data, label2id, tokenizer) -> None:
        self.data = data
        self.label2id =label2id
        self.len = data.shape
        self.tokenizer = tokenizer
        

    def __getitem__(self, index):

        item = {}
        input_ids = []
        attention_mask = []
        labels = []

        for sent in self.data["Sentence"]:
            encoded_dict =  self.tokenizer.encode_plus(
                sent,
                add_special_tokens = True,
                max_length = 128,
                pad_to_max_length = True,
                return_attention_mask = True,
                return_tensors = 'pt'
            )

            input_ids.append(encoded_dict['input_ids'])
            attention_mask.append(encoded_dict['attention_mask'])
            
        for iob_labels in self.data["Word_labels"]:
            l = iob_labels.split(' ')
            temp = [label2id[x] for x in l]
            temp.insert(0,-100)
            temp.insert(len(t)+1, -100)
            pad_array = [-100] * (128 - len(temp))
            temp.extend(pad_array)
            temp = torch.tensor(temp)
            labels.append(temp)

        for i in range(self.len[0]):
            temp_dict = {}
            temp_dict['attention_mask'] = attention_mask[i]
            temp_dict['input_ids'] = input_ids[i]
            temp_dict['labels'] = labels[i]
            item[i] = temp_dict
        
        return item
    
    def __len__(self):
        return self.len

"""
from torch.utils.data import Dataset
import torch

class MakeDataset(Dataset):
    def __init__(self, data, label2id, tokenizer, max_length=128):
        self.data = data
        self.label2id = label2id
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Pre-tokenize and prepare data
        self.tokenized_data = self._prepare_data()

    def _prepare_data(self):
        tokenized_data = []

        for sentence, word_labels in zip(self.data["Sentence"], self.data["Word_labels"]):
            # Tokenize sentence
            encoded_dict = self.tokenizer.encode_plus(
                sentence,
                add_special_tokens = True,
                max_length = 128,
                pad_to_max_length = True,
                return_attention_mask = True,
                return_tensors = 'pt'
            )

            # Prepare labels
            labels = word_labels.split(' ')
            label_ids = [self.label2id[label] for label in labels]
            label_ids = [-100] + label_ids + [-100]  # Add special token labels
            label_ids += [-100] * (self.max_length - len(label_ids))  # Pad labels to max_length
            label_ids = label_ids[:self.max_length]  # Truncate if necessary

            # Add to tokenized data
            tokenized_data.append({
                "input_ids": encoded_dict['input_ids'].squeeze(0),
                "attention_mask": encoded_dict['attention_mask'].squeeze(0),
                "labels": torch.tensor(label_ids)
            })

        return tokenized_data

    def __getitem__(self, index):
        return self.tokenized_data[index]

    def __len__(self):
        return len(self.data)


In [116]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

In [117]:
train_dataset = MakeDataset(train_dataset, label2id, tokenizer)
test_dataset = MakeDataset(test_dataset, label2id, tokenizer)



In [118]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [119]:
train_params = {
    'batch_size' : TRAIN_BATCH_SIZE,
    'shuffle' : True,
    'num_workers' : 0
}

test_params = {
    'batch_size' : VALID_BATCH_SIZE,
    'shuffle' : True,
    'num_workers' : 0
}

training_loader = DataLoader(train_dataset, **train_params)
testing_loader = DataLoader(test_dataset, **test_params)

In [32]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label2id), id2label=id2label, label2id=label2id)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [102]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [105]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        # Forward pass
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits
        
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100  # shape (batch_size, seq_len)
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels.cpu().numpy())
        tr_preds.extend(predictions.cpu().numpy())

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")


In [106]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.427699089050293
Training loss per 100 training steps: 0.9587471392190102
Training loss per 100 training steps: 0.8004043713137878
Training loss per 100 training steps: 0.7306420120388962
Training loss per 100 training steps: 0.6839140895150249
Training loss per 100 training steps: 0.6417619716264531
Training loss per 100 training steps: 0.6146598805498313
Training loss per 100 training steps: 0.5940929532300856
Training loss per 100 training steps: 0.5744178736659137
Training loss per 100 training steps: 0.55676809682425
Training loss per 100 training steps: 0.5429105507591357
Training loss per 100 training steps: 0.5293225755527629
Training loss per 100 training steps: 0.5159109711343055
Training loss per 100 training steps: 0.5068017360120782
Training loss per 100 training steps: 0.4964483862145831
Training loss per 100 training steps: 0.4842327230006238
Training loss per 100 training steps: 0.47692175881284465
Training loss p

In [120]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            output = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = output.loss
            eval_logits = output.logits
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [121]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.2171991914510727
Validation loss per 100 evaluation steps: 0.16491883393408185
Validation loss per 100 evaluation steps: 0.17520182845599727
Validation loss per 100 evaluation steps: 0.18514091232899837
Validation loss per 100 evaluation steps: 0.18372580567785618
Validation loss per 100 evaluation steps: 0.1852459982550492
Validation loss per 100 evaluation steps: 0.18562130825999876
Validation loss per 100 evaluation steps: 0.18496346324346774
Validation loss per 100 evaluation steps: 0.1876130340192929
Validation loss per 100 evaluation steps: 0.1878167449637974
Validation loss per 100 evaluation steps: 0.18571304383778506
Validation loss per 100 evaluation steps: 0.18518013304820455
Validation loss per 100 evaluation steps: 0.18390244609012027
Validation loss per 100 evaluation steps: 0.18339966008756692
Validation loss per 100 evaluation steps: 0.18428015137563838
Validation loss per 100 evaluation steps: 0.1846353479253555
Validation lo

In [127]:
# save model & tokenizer

path = '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/'

model.save_pretrained(path+'bert_trained')
tokenizer.save_pretrained(path+'tokenizer')

('/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/tokenizer/tokenizer_config.json',
 '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/tokenizer/special_tokens_map.json',
 '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/tokenizer/vocab.txt',
 '/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/tokenizer/added_tokens.json')

In [128]:
from sklearn.metrics import classification_report

print(classification_report(labels, predictions))


              precision    recall  f1-score   support

       B-geo       0.76      0.67      0.71      7492
       B-gpe       0.83      0.80      0.82      3189
       B-org       0.80      0.46      0.59      4050
       B-per       0.77      0.68      0.72      3388
       B-tim       0.73      0.69      0.71      4114
       I-geo       0.66      0.54      0.60      1479
       I-gpe       0.37      0.16      0.22        45
       I-org       0.71      0.55      0.62      3379
       I-per       0.71      0.83      0.77      3483
       I-tim       0.62      0.61      0.61      1337
           O       0.96      0.98      0.97    176754

    accuracy                           0.93    208710
   macro avg       0.72      0.63      0.67    208710
weighted avg       0.93      0.93      0.93    208710



In [None]:
base_label = pd.DataFrame({
    'y_true' : labels,
    'y_pred' : predictions
})

base_label['y_true_base'] = base_label["y_true"].apply(lambda x : x.split('-')[-1])
base_label['y_pred_base'] = base_label["y_pred"].apply(lambda x : x.split('-')[-1])

In [135]:
from sklearn.metrics import classification_report

print(classification_report(base_label['y_true_base'], base_label['y_pred_base']))

              precision    recall  f1-score   support

           O       0.96      0.98      0.97    176754
         geo       0.77      0.67      0.72      8971
         gpe       0.83      0.79      0.81      3234
         org       0.81      0.54      0.65      7429
         per       0.82      0.85      0.83      6871
         tim       0.78      0.74      0.76      5451

    accuracy                           0.94    208710
   macro avg       0.83      0.76      0.79    208710
weighted avg       0.94      0.94      0.94    208710



In [63]:
sent  = data["Sentence"][23421]

print(' Original: ', sent)

print('Tokenized: ', tokenizer.tokenize(sent))

print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent)))

 Original:  But Mr. Nikiforov said the file still lacked such important information as the general 's promotion record during the mid-1990s in the midst of the conflict in Bosnia-Herzegovina .
Tokenized:  ['but', 'mr', '.', 'nik', '##if', '##oro', '##v', 'said', 'the', 'file', 'still', 'lacked', 'such', 'important', 'information', 'as', 'the', 'general', "'", 's', 'promotion', 'record', 'during', 'the', 'mid', '-', '1990s', 'in', 'the', 'midst', 'of', 'the', 'conflict', 'in', 'bosnia', '-', 'herzegovina', '.']
Token IDs:  [2021, 2720, 1012, 23205, 10128, 14604, 2615, 2056, 1996, 5371, 2145, 10858, 2107, 2590, 2592, 2004, 1996, 2236, 1005, 1055, 4712, 2501, 2076, 1996, 3054, 1011, 4134, 1999, 1996, 12930, 1997, 1996, 4736, 1999, 9562, 1011, 11453, 1012]


In [35]:
from transformers import AutoTokenizer, AutoModelForTokenClassification


model_path =  "/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/bert_trained/"
tokenizer_path = "/Users/shakibibnashameem/Documents/Practice/bert/bert-ner/artifacts/tokenizer/"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

text = "This is London, Sam loves to it. But he needs to go to the hospital."

inputs = tokenizer(text, return_tensors='pt')

with torch.no_grad():
    logits = model(**inputs).logits

predictions = torch.argmax(logits, dim=2)
predct_token_class = [model.config.id2label[t.item()] for t in predictions[0]]

In [55]:
from transformers import pipeline

text = "The Golden State Warriors are an American professional basketball team based in San Francisco."

classifire = pipeline("ner", model=model_path, tokenizer=tokenizer, device=device)
classifire(text)

[{'entity': 'B-org',
  'score': np.float32(0.8038998),
  'index': 2,
  'word': 'golden',
  'start': 4,
  'end': 10},
 {'entity': 'I-org',
  'score': np.float32(0.5654787),
  'index': 3,
  'word': 'state',
  'start': 11,
  'end': 16},
 {'entity': 'I-org',
  'score': np.float32(0.7688067),
  'index': 4,
  'word': 'warriors',
  'start': 17,
  'end': 25},
 {'entity': 'B-gpe',
  'score': np.float32(0.9857943),
  'index': 7,
  'word': 'american',
  'start': 33,
  'end': 41},
 {'entity': 'B-geo',
  'score': np.float32(0.9810952),
  'index': 13,
  'word': 'san',
  'start': 80,
  'end': 83},
 {'entity': 'I-geo',
  'score': np.float32(0.95863223),
  'index': 14,
  'word': 'francisco',
  'start': 84,
  'end': 93}]

## tokenizer deep dive start

In [173]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split()):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [170]:
sentence = data["Sentence"][2242]
labels= data["Word_labels"][2242]

In [137]:
print(sentence)
print(labels)

Mr. Viera took power in a 1980 coup and ruled 19 years until he was ousted during a civil war .
B-per I-per O O O O B-tim O O O B-tim O O O O O O O O O O


In [160]:
def v1_tokenize_and_preserve_labels(sentence, labels, tokenizer):

    tokenized_sentence = []
    preserved_labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), labels.split()):
        
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        ext_lbl = [label] * n_subwords

        preserved_labels.extend(ext_lbl)

    return tokenized_sentence, preserved_labels


In [174]:
s, l = tokenize_and_preserve_labels(sentence, labels, bt)

In [175]:
print(s)
print(l)

['mr', '.', 'vie', '##ra', 'took', 'power', 'in', 'a', '1980', 'coup', 'and', 'ruled', '19', 'years', 'until', 'he', 'was', 'ousted', 'during', 'a', 'civil', 'war', '.']
['B-per', 'B-per', 'I-per', 'I-per', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
tokenized_sentence = []
l = []

for word, label in zip(sentence.split(), labels.split()):
    
    tokenized_word = bt.tokenize(word)
    n_subwords = len(tokenized_word)

    tokenized_sentence.extend(tokenized_word)

    ext_lbl = [label] * n_subwords
    l.extend(ext_lbl)
    

In [127]:
print(tokenized_sentence)
print(l)

['mr', '.', 'vie', '##ra', 'took', 'power', 'in', 'a', '1980', 'coup', 'and', 'ruled', '19', 'years', 'until', 'he', 'was', 'ousted', 'during', 'a', 'civil', 'war', '.']
['B-per', 'B-per', 'I-per', 'I-per', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [77]:
tokenized_word = bt.tokenize("British")
tokenized_word

['british']

In [36]:
from transformers import BertTokenizer, BertTokenizerFast

In [106]:
text  = "This is Washington"


print(text)
bt = BertTokenizer.from_pretrained('bert-base-uncased')
b_t = bt.tokenize(text)
print(b_t)


bft = BertTokenizerFast.from_pretrained("bert-base-uncased")
b_ft = bft.tokenize(text)
print(b_ft)

This is Washington
['this', 'is', 'washington']
['this', 'is', 'washington']


In [132]:
 
tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, label, bt)
        

In [133]:
tokenized_sentence, label

(['mr', '.'], 'O')

## tokenizer deep dive end