adapted code from video : https://www.youtube.com/watch?v=Q1i4bIIFOFc

In [1]:
!nvidia-smi

Wed Feb 28 18:09:58 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              50W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
%%capture
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets

In [3]:
import pandas as pd
import os
from google.colab import drive
from datasets import load_dataset

drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
train_data_path = '/content/drive/MyDrive/kaggle competitions/pii_detection_nlp/data/pii-detection-removal-from-educational-data/train.json'
test_data_path = '/content/drive/MyDrive/kaggle competitions/pii_detection_nlp/data/pii-detection-removal-from-educational-data/test.json'

In [5]:
data = load_dataset('json', data_files=train_data_path, split='train')
data

Dataset({
    features: ['document', 'labels', 'full_text', 'trailing_whitespace', 'tokens'],
    num_rows: 6807
})

In [6]:
data.features
# the labels are a Sequence of Value classes
# need to turn Value into ClassLabel

{'document': Value(dtype='int64', id=None),
 'labels': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'full_text': Value(dtype='string', id=None),
 'trailing_whitespace': Sequence(feature=Value(dtype='bool', id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [7]:
pd.DataFrame(data[:])[['tokens', 'labels']].iloc[0]

tokens    [Design, Thinking, for, innovation, reflexion,...
labels    [O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...
Name: 0, dtype: object

In [8]:
example = data[0]
pd.DataFrame([example["tokens"][:20], example["labels"][:20]],  # only visualise the 20 first tokens
['Tokens', 'Labels'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Tokens,Design,Thinking,for,innovation,reflexion,-,Avril,2021,-,Nathalie,Sylla,\n\n,Challenge,&,selection,\n\n,The,tool,I,use
Labels,O,O,O,O,O,O,O,O,O,B-NAME_STUDENT,I-NAME_STUDENT,O,O,O,O,O,O,O,O,O


In [9]:
import datasets


data = data.cast_column("labels", datasets.Sequence(datasets.ClassLabel(names=['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-URL_PERSONAL',
 'B-ID_NUM',
 'B-EMAIL',
 'I-STREET_ADDRESS',
 'I-PHONE_NUM',
 'B-USERNAME',
 'B-PHONE_NUM',
 'B-STREET_ADDRESS',
 'I-URL_PERSONAL',
 'I-ID_NUM'])))
data.features

{'document': Value(dtype='int64', id=None),
 'labels': Sequence(feature=ClassLabel(names=['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'B-ID_NUM', 'B-EMAIL', 'I-STREET_ADDRESS', 'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-ID_NUM'], id=None), length=-1, id=None),
 'full_text': Value(dtype='string', id=None),
 'trailing_whitespace': Sequence(feature=Value(dtype='bool', id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [10]:
pd.DataFrame(data[:])[['tokens', 'labels']].iloc[0]

tokens    [Design, Thinking, for, innovation, reflexion,...
labels    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ...
Name: 0, dtype: object

In [11]:
example = data[0]
pd.DataFrame([example["tokens"][:20], example["labels"][:20]],  # only visualise the 20 first tokens
['Tokens', 'Labels'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Tokens,Design,Thinking,for,innovation,reflexion,-,Avril,2021,-,Nathalie,Sylla,\n\n,Challenge,&,selection,\n\n,The,tool,I,use
Labels,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0


In [12]:
# remove irrelevant features
data = data.remove_columns(["full_text", "trailing_whitespace", "document"])
data

Dataset({
    features: ['labels', 'tokens'],
    num_rows: 6807
})

In [13]:
# We only need the names for longformer finetunining -> rename all other labels as Other?
# all labels that are not names become 'O'
# we only need the tags 0 , 1, 2

def replace_non_names(example):
  example['labels']
  names = [1, 2]
  name_labels_lists = []
  for label in example['labels']:
    if label in names:
        name_labels_lists.append(label)
    else:
        name_labels_lists.append(0)
  example['ner_tags'] = name_labels_lists
  return example

In [14]:
data = data.map(replace_non_names)
data

Dataset({
    features: ['labels', 'tokens', 'ner_tags'],
    num_rows: 6807
})

In [15]:
data.features['ner_tags'].feature

Value(dtype='int64', id=None)

In [16]:
data = data.cast_column("ner_tags", datasets.Sequence(datasets.ClassLabel(names=['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT'])))
data.features

{'labels': Sequence(feature=ClassLabel(names=['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'B-ID_NUM', 'B-EMAIL', 'I-STREET_ADDRESS', 'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-ID_NUM'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT'], id=None), length=-1, id=None)}

In [17]:
# remove the labels
data = data.remove_columns(["labels"])
data

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 6807
})

### split train-validation (test)

In [18]:
data = data.train_test_split(test_size=0.2)
data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5445
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1362
    })
})

In [19]:
tags = data['train'].features['ner_tags'].feature
tags.names

['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT']

In [20]:
tags_dict = {tag: 0 for tag in tags.names}
print(tags_dict)

def count_tags(example):
    for tag in example['ner_tags']:
        tag = tags.int2str(tag)
        tags_dict[tag] += 1

# count tags in the train set
data['train'].map(count_tags)
print('Train dataset tags:', tags_dict)

# reset dict and count the tags in the test set
tags_dict = {tag: 0 for tag in tags.names}
data['test'].map(count_tags)
print('Validation dataset tags:', tags_dict)

# data is unbalanced - 'O' label has overwhlemingly more examples
# sutend name label is relatively balanced

{'O': 0, 'B-NAME_STUDENT': 0, 'I-NAME_STUDENT': 0}


Map:   0%|          | 0/5445 [00:00<?, ? examples/s]

Train dataset tags: {'O': 4009359, 'B-NAME_STUDENT': 1145, 'I-NAME_STUDENT': 881}


Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

Validation dataset tags: {'O': 980713, 'B-NAME_STUDENT': 220, 'I-NAME_STUDENT': 215}


In [21]:
pd.DataFrame(data['train'][:])[['tokens', 'ner_tags']].iloc[0]

tokens      [Thai, Tamang, \n\n, Reflection, -, Learning, ...
ner_tags    [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object

In [22]:
tags = data['train'].features['ner_tags'].feature

index2tag = {idx:tag for idx, tag in enumerate(tags.names)}
tag2index = {tag:idx for idx, tag in enumerate(tags.names)}
index2tag, tag2index

({0: 'O', 1: 'B-NAME_STUDENT', 2: 'I-NAME_STUDENT'},
 {'O': 0, 'B-NAME_STUDENT': 1, 'I-NAME_STUDENT': 2})

In [23]:
tags.int2str(1)

'B-NAME_STUDENT'

In [24]:
def create_tag_names(batch):
    tag_name = {'ner_tags_str': [tags.int2str(idx) for idx in batch['ner_tags']]}
    return tag_name

In [25]:
data = data.map(create_tag_names)
data

Map:   0%|          | 0/5445 [00:00<?, ? examples/s]

Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_str'],
        num_rows: 5445
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_str'],
        num_rows: 1362
    })
})

In [26]:
pd.DataFrame(data['train'][:])[['tokens', 'ner_tags', 'ner_tags_str']].iloc[0]

tokens          [Thai, Tamang, \n\n, Reflection, -, Learning, ...
ner_tags        [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
ner_tags_str    [B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...
Name: 0, dtype: object

### model building - tokenize

In [27]:
from transformers import AutoTokenizer

model_checkpoint = 'allenai/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [28]:
tokenizer.is_fast

True

In [29]:
inputs = data['train'][0]['tokens']  # these are 'pretokenized' so it needs to tokenize it further
inputs = tokenizer(inputs, is_split_into_words=True)  # we need to tell the tokenizer the data is split into words otherwize it will just consider it as one input
print(inputs.tokens())

['<s>', 'ĠThai', 'ĠTam', 'ang', 'Ġ', 'ĊĊ', 'ĠRef', 'lection', 'Ġ-', 'ĠLearning', 'ĠLaunch', 'Ġ', 'ĊĊ', 'ĠChallenge', 'Ġ', 'ĊĊ', 'ĠSo', 'Ġas', 'Ġa', 'Ġcollege', 'Ġstudent', 'Ġand', 'Ġa', 'Ġbusiness', 'Ġentrepreneur', 'Ġ,', 'ĠI', 'Ġwill', 'Ġapply', 'Ġall', 'Ġthe', 'Ġknowledge', 'Ġand', 'Ġinformation', 'Ġthat', 'ĠI', 'Ġ', 'Ġlearned', 'Ġin', 'Ġthis', 'Ġcourse', 'Ġ,', 'Ġlike', 'Ġthinking', 'Ġout', 'Ġof', 'Ġthe', 'Ġbox', 'Ġin', 'Ġorder', 'Ġto', 'Ġexplore', 'Ġmore', 'Ġthings', 'Ġthat', 'Ġwill', 'Ġimprove', 'Ġmy', 'Ġ', 'Ġquality', 'Ġof', 'Ġthe', 'Ġproduct', 'Ġthat', 'ĠI', 'Ġsell', 'Ġin', 'Ġorder', 'Ġto', 'Ġhave', 'Ġa', 'Ġhigher', 'Ġdemand', 'Ġand', 'Ġto', 'Ġexpand', 'Ġmy', 'Ġbusiness', 'Ġ.', 'ĠTo', 'Ġ', 'Ġexpand', 'Ġmy', 'Ġown', 'Ġbusiness', 'ĠI', 'Ġwill', 'Ġconsult', 'Ġmy', 'Ġfamily', 'Ġand', 'Ġfriends', 'Ġto', 'Ġhave', 'Ġa', 'Ġfeedback', 'Ġin', 'Ġorder', 'Ġto', 'Ġknow', 'Ġwhat', 'Ġ', 'Ġare', 'Ġthe', 'Ġpositive', 'Ġand', 'Ġnegative', 'Ġaspects', 'Ġof', 'Ġmy', 'Ġown', 'Ġbusiness', 'Ġ.', 'Ġ', '

In [30]:
inputs

{'input_ids': [0, 9130, 7736, 1097, 1437, 50140, 8526, 20576, 111, 13807, 22217, 1437, 50140, 10045, 1437, 50140, 407, 25, 10, 1564, 1294, 8, 10, 265, 11777, 2156, 38, 40, 3253, 70, 5, 2655, 8, 335, 14, 38, 1437, 2435, 11, 42, 768, 2156, 101, 2053, 66, 9, 5, 2233, 11, 645, 7, 5393, 55, 383, 14, 40, 1477, 127, 1437, 1318, 9, 5, 1152, 14, 38, 1331, 11, 645, 7, 33, 10, 723, 1077, 8, 7, 3003, 127, 265, 479, 598, 1437, 3003, 127, 308, 265, 38, 40, 12777, 127, 284, 8, 964, 7, 33, 10, 6456, 11, 645, 7, 216, 99, 1437, 32, 5, 1313, 8, 2430, 5894, 9, 127, 308, 265, 479, 1437, 50140, 30418, 1437, 50140, 1773, 38, 128, 119, 202, 10, 1564, 1294, 2156, 38, 202, 2239, 15, 141, 7, 1477, 8, 26818, 5, 1318, 8, 518, 1437, 15, 127, 308, 1152, 2156, 53, 38, 202, 745, 103, 30528, 8, 15491, 15, 141, 40, 5, 1152, 28, 1437, 2782, 8, 33, 10, 205, 1318, 2820, 479, 38, 202, 120, 5, 6456, 29, 31, 127, 916, 2156, 142, 13, 1437, 162, 25, 265, 1945, 6456, 29, 31, 5, 916, 16, 65, 9, 5, 144, 505, 2433, 15, 5, 1437, 265

In [31]:
inputs.word_ids() # 7,7 blow means thay the tokens belong to one single token/word in the original dataset

[None,
 0,
 1,
 1,
 2,
 2,
 3,
 3,
 4,
 5,
 6,
 7,
 7,
 8,
 9,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 107,
 108,
 109,
 109,
 110,
 111,
 112,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 17

In [32]:
len(inputs.tokens()), len(data['train'][0]['tokens'])

(461, 435)

In [33]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            new_labels.append(-100)  # used in Pytorch to tell it to ignore the tag

        else:
            label = labels[word_id]
            if label%2 == 1:
                label = label + 1
            new_labels.append(label)

    return new_labels

In [34]:
labels = data['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels)
print(word_ids)

[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [35]:
align_labels_with_tokens(labels, word_ids)

[-100,
 1,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [36]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    all_labels = examples['ner_tags']

    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs['labels'] = new_labels
    return tokenized_inputs

In [37]:
tokenized_dataset = data.map(tokenize_and_align_labels, batched=True, remove_columns=data['train'].column_names)
tokenized_dataset

Map:   0%|          | 0/5445 [00:00<?, ? examples/s]

Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5445
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1362
    })
})

### data collation
transform into tensors for use with pytorch/transformers lib


In [38]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [39]:
batch = data_collator([tokenized_dataset['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[    0,  9130,  7736,  ...,     1,     1,     1],
        [    0, 28859,    44,  ...,  1437, 50140,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[-100,    1,    2,  ..., -100, -100, -100],
        [-100,    0,    0,  ...,    0,    0, -100]])}

### metrics

In [40]:
%%capture
!pip install seqeval
!pip install evaluate

In [41]:
import evaluate

metric = evaluate.load('seqeval')

In [42]:
ner_fetature = data['train'].features['ner_tags']
ner_fetature

Sequence(feature=ClassLabel(names=['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT'], id=None), length=-1, id=None)

In [43]:
label_names = ner_fetature.feature.names
label_names

['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT']

In [44]:
labels = data['train'][0]['ner_tags']
labels = [label_names[i] for i in labels]
labels

['B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 '

In [45]:
preditctions = labels.copy()
preditctions[2] = 'O'

metric.compute(predictions=[preditctions], references=[labels])

{'NAME_STUDENT': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [46]:
import numpy as np

def compute_metrics(eval_preds):
  logits, labels = eval_preds

  predictions = np.argmax(logits, axis=-1)

  true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

  true_predictions = [[label_names[p] for p,l in zip(prediction, label) if l!=-100]
                      for prediction, label in zip(predictions, labels)]

  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  return {"precision": all_metrics['overall_precision'],
          "recall": all_metrics['overall_recall'],
          "f1": all_metrics['overall_f1'],
          "accuracy": all_metrics['overall_accuracy']}

### train model

In [47]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint,
                                                       num_labels=tags.num_classes,
                                                       gradient_checkpointing=False,
                                                       id2label=index2tag,
                                                       label2id=tag2index
                                                        )

  return self.fget.__get__(instance, owner)()
Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
model.config

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-NAME_STUDENT",
    "2": "I-NAME_STUDENT"
  },
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-NAME_STUDENT": 1,
    "I-NAME_STUDENT": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "sep_token_id": 2,
  "transformers_version": "4.38.1",
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [49]:
model.config.num_labels

3

In [50]:
from transformers import TrainingArguments
from tqdm import tqdm


num_epochs = 3
batch_size = 2
logging_steps = len(tokenized_dataset['train']) // batch_size

args = TrainingArguments('distilbert-finetuned-ner',
                         evaluation_strategy='epoch',
                         log_level="error",
                         num_train_epochs=num_epochs,
                         per_device_eval_batch_size=batch_size,
                         per_device_train_batch_size=batch_size,
                         save_steps=1e6,
                         disable_tqdm=False,
                         learning_rate=2e-5,
                         weight_decay=0.01)


In [51]:
%%time
from transformers import Trainer

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset=tokenized_dataset['test'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0007,0.000637,0.579235,0.968037,0.724786,0.999726
2,0.0003,0.000332,0.834646,0.968037,0.896406,0.999888
3,0.0001,0.000519,0.733788,0.981735,0.839844,0.99985


CPU times: user 41min 13s, sys: 9min 32s, total: 50min 46s
Wall time: 50min 30s


TrainOutput(global_step=8169, training_loss=0.0009327197641458954, metrics={'train_runtime': 3029.0532, 'train_samples_per_second': 5.393, 'train_steps_per_second': 2.697, 'total_flos': 1.0482129275810346e+16, 'train_loss': 0.0009327197641458954, 'epoch': 3.0})

In [52]:
from transformers import pipeline

model_checkpoint = ''