adapted code from video : https://www.youtube.com/watch?v=Q1i4bIIFOFc

In [1]:
!nvidia-smi

Thu Mar 14 13:31:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0              50W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
%%capture
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets

In [3]:
import pandas as pd
import os
from datasets import Dataset
from google.colab import drive
from datasets import load_dataset

drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data_path = '/content/drive/MyDrive/kaggle competitions/pii_detection_nlp/data/split_tokens_short/train_data_short.json'

In [5]:
data = load_dataset('json', data_files=data_path, split='train')
data

Dataset({
    features: ['doc_number', 'split', 'tokens', 'labels', 'number of tokens'],
    num_rows: 15889
})

In [6]:
data.features
# the labels are a Sequence of Value classes
# need to turn Value into ClassLabel

{'doc_number': Value(dtype='int64', id=None),
 'split': Value(dtype='int64', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'number of tokens': Value(dtype='int64', id=None)}

In [7]:
pd.DataFrame(data[:])[['tokens', 'labels']].iloc[0]

tokens    [Design, Thinking, for, innovation, reflexion,...
labels    [O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...
Name: 0, dtype: object

In [8]:
example = data[0]
pd.DataFrame([example["tokens"][:], example["labels"][:]],  # only visualise the 20 first tokens
['Tokens', 'Labels'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
Tokens,Design,Thinking,for,innovation,reflexion,-,Avril,2021,-,Nathalie,...,to,the,topic,to,be,addressed,.,In,the,type
Labels,O,O,O,O,O,O,O,O,O,B-NAME_STUDENT,...,O,O,O,O,O,O,O,O,O,O


In [9]:
print(len(example['tokens']))
print(type(example['tokens']))
# in datasets huggingface the lists are saved as strings

400
<class 'list'>


In [10]:
import datasets


data = data.cast_column("labels", datasets.Sequence(datasets.ClassLabel(names=['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-URL_PERSONAL',
 'B-ID_NUM',
 'B-EMAIL',
 'I-STREET_ADDRESS',
 'I-PHONE_NUM',
 'B-USERNAME',
 'B-PHONE_NUM',
 'B-STREET_ADDRESS',
 'I-URL_PERSONAL',
 'I-ID_NUM'])))
data.features

{'doc_number': Value(dtype='int64', id=None),
 'split': Value(dtype='int64', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'labels': Sequence(feature=ClassLabel(names=['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'B-ID_NUM', 'B-EMAIL', 'I-STREET_ADDRESS', 'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-ID_NUM'], id=None), length=-1, id=None),
 'number of tokens': Value(dtype='int64', id=None)}

In [11]:
pd.DataFrame(data[:])[['tokens', 'labels']].iloc[0]

tokens    [Design, Thinking, for, innovation, reflexion,...
labels    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ...
Name: 0, dtype: object

In [12]:
example = data[0]
pd.DataFrame([example["tokens"][:20], example["labels"][:20]],  # only visualise the 20 first tokens
['Tokens', 'Labels'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Tokens,Design,Thinking,for,innovation,reflexion,-,Avril,2021,-,Nathalie,Sylla,\n\n,Challenge,&,selection,\n\n,The,tool,I,use
Labels,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0


In [13]:
# remove irrelevant features
data = data.remove_columns(["doc_number", "split", "number of tokens"])
data

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 15889
})

In [14]:
# select only the rows with labeled data
# get the idexes of the rows with labeled data
labeled_rows_idx = []
for idx, ls in enumerate(data['labels']):
    for i in ls:
        if i != 0:
            labeled_rows_idx.append(idx)
            break


len(labeled_rows_idx)

1144

In [15]:
small_dataset = data.select(labeled_rows_idx)

In [16]:
small_dataset.features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'labels': Sequence(feature=ClassLabel(names=['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'B-ID_NUM', 'B-EMAIL', 'I-STREET_ADDRESS', 'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-ID_NUM'], id=None), length=-1, id=None)}

### split train-validation (test)

In [17]:
data = small_dataset.train_test_split(test_size=0.2)
data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 915
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 229
    })
})

In [18]:
tags = data['train'].features['labels'].feature
tags.names

['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-URL_PERSONAL',
 'B-ID_NUM',
 'B-EMAIL',
 'I-STREET_ADDRESS',
 'I-PHONE_NUM',
 'B-USERNAME',
 'B-PHONE_NUM',
 'B-STREET_ADDRESS',
 'I-URL_PERSONAL',
 'I-ID_NUM']

In [19]:
tags_dict = {tag: 0 for tag in tags.names}
print(tags_dict)

def count_tags(example):
    for tag in example['labels']:
        tag = tags.int2str(tag)
        tags_dict[tag] += 1

# count tags in the train set
data['train'].map(count_tags)
print('Train dataset tags:', tags_dict)

# reset dict and count the tags in the test set
tags_dict = {tag: 0 for tag in tags.names}
data['test'].map(count_tags)
print('Validation dataset tags:', tags_dict)

# data is unbalanced - 'O' label has overwhlemingly more examples
# sutend name label is relatively balanced

{'O': 0, 'B-NAME_STUDENT': 0, 'I-NAME_STUDENT': 0, 'B-URL_PERSONAL': 0, 'B-ID_NUM': 0, 'B-EMAIL': 0, 'I-STREET_ADDRESS': 0, 'I-PHONE_NUM': 0, 'B-USERNAME': 0, 'B-PHONE_NUM': 0, 'B-STREET_ADDRESS': 0, 'I-URL_PERSONAL': 0, 'I-ID_NUM': 0}


Map:   0%|          | 0/915 [00:00<?, ? examples/s]

Train dataset tags: {'O': 330672, 'B-NAME_STUDENT': 1098, 'I-NAME_STUDENT': 873, 'B-URL_PERSONAL': 94, 'B-ID_NUM': 65, 'B-EMAIL': 31, 'I-STREET_ADDRESS': 20, 'I-PHONE_NUM': 12, 'B-USERNAME': 4, 'B-PHONE_NUM': 5, 'B-STREET_ADDRESS': 2, 'I-URL_PERSONAL': 1, 'I-ID_NUM': 1}


Map:   0%|          | 0/229 [00:00<?, ? examples/s]

Validation dataset tags: {'O': 83340, 'B-NAME_STUDENT': 267, 'I-NAME_STUDENT': 223, 'B-URL_PERSONAL': 16, 'B-ID_NUM': 13, 'B-EMAIL': 8, 'I-STREET_ADDRESS': 0, 'I-PHONE_NUM': 3, 'B-USERNAME': 2, 'B-PHONE_NUM': 1, 'B-STREET_ADDRESS': 0, 'I-URL_PERSONAL': 0, 'I-ID_NUM': 0}


In [20]:
pd.DataFrame(data['train'][:])[['tokens', 'labels']].iloc[0]

tokens    [Mohamad, Afiq,            , 29, December, 202...
labels    [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object

In [21]:
tags = data['train'].features['labels'].feature

index2tag = {idx:tag for idx, tag in enumerate(tags.names)}
tag2index = {tag:idx for idx, tag in enumerate(tags.names)}
index2tag, tag2index

({0: 'O',
  1: 'B-NAME_STUDENT',
  2: 'I-NAME_STUDENT',
  3: 'B-URL_PERSONAL',
  4: 'B-ID_NUM',
  5: 'B-EMAIL',
  6: 'I-STREET_ADDRESS',
  7: 'I-PHONE_NUM',
  8: 'B-USERNAME',
  9: 'B-PHONE_NUM',
  10: 'B-STREET_ADDRESS',
  11: 'I-URL_PERSONAL',
  12: 'I-ID_NUM'},
 {'O': 0,
  'B-NAME_STUDENT': 1,
  'I-NAME_STUDENT': 2,
  'B-URL_PERSONAL': 3,
  'B-ID_NUM': 4,
  'B-EMAIL': 5,
  'I-STREET_ADDRESS': 6,
  'I-PHONE_NUM': 7,
  'B-USERNAME': 8,
  'B-PHONE_NUM': 9,
  'B-STREET_ADDRESS': 10,
  'I-URL_PERSONAL': 11,
  'I-ID_NUM': 12})

In [22]:
tags.int2str(1)

'B-NAME_STUDENT'

In [23]:
def create_tag_names(batch):
    tag_name = {'ner_tags_str': [tags.int2str(idx) for idx in batch['labels']]}
    return tag_name

In [24]:
data = data.map(create_tag_names)
data

Map:   0%|          | 0/915 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'ner_tags_str'],
        num_rows: 915
    })
    test: Dataset({
        features: ['tokens', 'labels', 'ner_tags_str'],
        num_rows: 229
    })
})

In [25]:
pd.DataFrame(data['train'][:])[['tokens', 'labels', 'ner_tags_str']].iloc[0]

tokens          [Mohamad, Afiq,            , 29, December, 202...
labels          [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
ner_tags_str    [B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...
Name: 0, dtype: object

### model building - tokenize

In [26]:
from transformers import AutoTokenizer

model_checkpoint = 'FacebookAI/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [27]:
tokenizer.is_fast

True

In [28]:
inputs = data['train'][0]['tokens']  # these are 'pretokenized' so it needs to tokenize it further
inputs = tokenizer(inputs, is_split_into_words=True)  # we need to tell the tokenizer the data is split into words otherwize it will just consider it as one input
print(inputs.tokens())

['<s>', 'ĠMoh', 'am', 'ad', 'ĠAf', 'iq', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ29', 'ĠDecember', 'Ġ2020', 'Ġ', 'ĊĊ', 'ĠM', 'IND', 'ĠM', 'APP', 'ING', 'Ġ', 'ĊĊ', 'Ġ1', 'Ġ.', 'ĠCH', 'ALL', 'EN', 'GE', 'Ġ', 'ĊĊ', 'ĠI', 'Ġwas', 'Ġasked', 'Ġto', 'Ġco', 'Ġ-', 'Ġfacilitate', 'Ġa', 'Ġdesign', 'Ġthinking', 'Ġworkshop', 'Ġwithin', 'Ġthe', 'ĠDigital', 'Ġ&', 'Ġ', 'ĠTechnology', 'ĠSector', 'Ġof', 'Ġour', 'Ġorganisation', 'Ġa', 'Ġfew', 'Ġmonths', 'Ġago', 'Ġ,', 'Ġduring', 'Ġthe', 'Ġsecond', 'Ġwave', 'Ġof', 'Ġ', 'Ġthe', 'Ġpand', 'emic', 'Ġin', 'Ġmy', 'Ġcountry', 'Ġ.', 'ĠOur', 'Ġorganisation', 'Ġrecommended', 'Ġall', 'Ġstaff', 'Ġto', 'Ġwork', 'Ġ', 'Ġremotely', 'Ġ,', 'Ġthus', 'Ġour', 'Ġworkshop', 'Ġhad', 'Ġto', 'Ġadapt', 'Ġto', 'Ġa', 'Ġvirtual', 'Ġenvironment', 'Ġwhich', 'Ġwas', 'Ġnew', 'Ġ', 'Ġto', 'Ġnot', 'Ġonly', 'Ġthe', 'Ġfacilit', 'ators', 'Ġbut', 'Ġparticipants', 'Ġ.', 'ĠThe', 'Ġintention', 'Ġof', 'Ġthe', 'Ġdesign', 'Ġthinking', 'Ġ', 'Ġworkshop', 'Ġwas', 'Ġto', 'Ġsolicit', 'Ġuser

In [29]:
inputs

{'input_ids': [0, 8097, 424, 625, 6710, 19582, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1132, 719, 2760, 1437, 50140, 256, 13796, 256, 17167, 1862, 1437, 50140, 112, 479, 3858, 7981, 2796, 8800, 1437, 50140, 38, 21, 553, 7, 1029, 111, 9666, 10, 1521, 2053, 9780, 624, 5, 6282, 359, 1437, 3777, 15816, 9, 84, 6010, 10, 367, 377, 536, 2156, 148, 5, 200, 4605, 9, 1437, 5, 23387, 14414, 11, 127, 247, 479, 1541, 6010, 5131, 70, 813, 7, 173, 1437, 18684, 2156, 4634, 84, 9780, 56, 7, 9037, 7, 10, 6229, 1737, 61, 21, 92, 1437, 7, 45, 129, 5, 30893, 3629, 53, 3597, 479, 20, 6589, 9, 5, 1521, 2053, 1437, 9780, 21, 7, 22706, 3018, 1652, 31, 70, 695, 4389, 8, 265, 1066, 1437, 11, 5, 1293, 15, 5, 2502, 9, 7350, 2316, 8, 3563, 2239, 11, 70, 1437, 911, 9, 5, 1651, 479, 1541, 1231, 3522, 2817, 2528, 14, 10, 25490, 1437, 1548, 74, 1157, 5838, 3077, 11, 5, 11, 111, 790, 709, 50, 13803, 1437, 9, 10, 2472, 14, 4758, 268, 13, 5, 6010, 44, 27, 29, 782, 479, 1541, 165, 9, 80, 30893, 36

In [30]:
inputs.word_ids() # 7,7 blow means thay the tokens belong to one single token/word in the original dataset

[None,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 4,
 5,
 6,
 6,
 7,
 7,
 8,
 8,
 8,
 9,
 9,
 10,
 11,
 12,
 12,
 12,
 12,
 13,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 152,
 153,
 154,
 155,
 156,
 156,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,

In [31]:
len(inputs.tokens()), len(data['train'][0]['tokens'])

(446, 400)

In [32]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            new_labels.append(-100)  # used in Pytorch to tell it to ignore the tag

        else:
            label = labels[word_id]
            if label%2 == 1:
                label = label + 1
            new_labels.append(label)

    return new_labels

In [33]:
labels = data['train'][0]['labels']
word_ids = inputs.word_ids()
print(labels)
print(word_ids)

[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [34]:
align_labels_with_tokens(labels, word_ids)

[-100,
 1,
 2,
 2,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [35]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'],
                                 truncation=True,
                                 padding=True,
                                 is_split_into_words=True,

                                 )

    all_labels = examples['labels']

    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs['labels'] = new_labels
    return tokenized_inputs

In [36]:
tokenized_dataset = data.map(tokenize_and_align_labels, batched=True, remove_columns=data['train'].column_names)
tokenized_dataset

Map:   0%|          | 0/915 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 915
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 229
    })
})

In [37]:
len(tokenized_dataset['train'][0]['input_ids'])

512

### data collation
transform into tensors for use with pytorch/transformers lib


In [38]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [39]:
batch = data_collator([tokenized_dataset['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[    0,  8097,   424,  ...,     1,     1,     1],
        [    0, 17487,  6834,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,    1,    2,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100]])}

In [40]:
len(batch['input_ids'][0])

512

### metrics

In [41]:
%%capture
!pip install seqeval
!pip install evaluate

In [42]:
import evaluate

metric = evaluate.load('seqeval')

In [43]:
ner_fetature = data['train'].features['labels']
ner_fetature

Sequence(feature=ClassLabel(names=['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'B-ID_NUM', 'B-EMAIL', 'I-STREET_ADDRESS', 'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-ID_NUM'], id=None), length=-1, id=None)

In [44]:
label_names = ner_fetature.feature.names
label_names

['O',
 'B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-URL_PERSONAL',
 'B-ID_NUM',
 'B-EMAIL',
 'I-STREET_ADDRESS',
 'I-PHONE_NUM',
 'B-USERNAME',
 'B-PHONE_NUM',
 'B-STREET_ADDRESS',
 'I-URL_PERSONAL',
 'I-ID_NUM']

In [45]:
labels = data['train'][0]['labels']
labels = [label_names[i] for i in labels]
labels

['B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 '

In [46]:
preditctions = labels.copy()
preditctions[2] = 'O'

metric.compute(predictions=[preditctions], references=[labels])

{'NAME_STUDENT': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [47]:
import numpy as np

def compute_metrics(eval_preds):
  logits, labels = eval_preds

  predictions = np.argmax(logits, axis=-1)

  true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

  true_predictions = [[label_names[p] for p,l in zip(prediction, label) if l!=-100]
                      for prediction, label in zip(predictions, labels)]

  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  return {"precision": all_metrics['overall_precision'],
          "recall": all_metrics['overall_recall'],
          "f1": all_metrics['overall_f1'],
          "accuracy": all_metrics['overall_accuracy']}

### train model

In [48]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint,
                                                       num_labels=tags.num_classes,
                                                       # gradient_checkpointing=False,
                                                       id2label=index2tag,
                                                       label2id=tag2index,
                                                        )

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
model.config

RobertaConfig {
  "_name_or_path": "FacebookAI/roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-NAME_STUDENT",
    "2": "I-NAME_STUDENT",
    "3": "B-URL_PERSONAL",
    "4": "B-ID_NUM",
    "5": "B-EMAIL",
    "6": "I-STREET_ADDRESS",
    "7": "I-PHONE_NUM",
    "8": "B-USERNAME",
    "9": "B-PHONE_NUM",
    "10": "B-STREET_ADDRESS",
    "11": "I-URL_PERSONAL",
    "12": "I-ID_NUM"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-EMAIL": 5,
    "B-ID_NUM": 4,
    "B-NAME_STUDENT": 1,
    "B-PHONE_NUM": 9,
    "B-STREET_ADDRESS": 10,
    "B-URL_PERSONAL": 3,
    "B-USERNAME": 8,
    "I-ID_NUM": 12,
    "I-NAME_STUDENT": 2,
    "I-PHONE_NUM": 7,
    "I-STREET_ADDRESS": 6,
    "I-URL_PERSONAL": 11,
    "O": 0
 

In [50]:
model.config.num_labels, model.config.hidden_size

(13, 768)

In [51]:
# import torch
# import torch.nn as nn

# class CustomTokenClassificationModel(nn.Module):
#     def __init__(self, model):
#         super().__init__()
#         self.model = model
#         self.token_classification_head = nn.Linear(model.config.hidden_size,  model.config._num_labels)
#         self.loss_fn = nn.CrossEntropyLoss()

#     def forward(self, input_ids, attention_mask, labels=None):
#         outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         logits = outputs.logits
#         loss = None
#         if labels is not None:
#             loss = self.loss_fn(logits.view(-1, model.config._num_labels), labels.view(-1))
#         return loss, logits

# # Instantiate your model
# model = CustomTokenClassificationModel(model)


In [52]:
from transformers import TrainingArguments
from tqdm import tqdm


num_epochs = 3
batch_size = 5
logging_steps = len(tokenized_dataset['train']) // batch_size

args = TrainingArguments('roberta-for-token-classification-finetuned',
                         evaluation_strategy='epoch',
                         save_strategy="epoch",
                         num_train_epochs=num_epochs,
                         per_device_eval_batch_size=batch_size,
                         per_device_train_batch_size=batch_size,
                         save_steps=1e6,
                         disable_tqdm=False,
                         learning_rate=2e-5,
                         weight_decay=0.01)


In [53]:
from transformers import Trainer

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset=tokenized_dataset['test'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

trainer.compute_loss

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [54]:
%%time
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.009565,0.756131,0.865835,0.807273,0.99706
2,No log,0.007882,0.802521,0.893916,0.845756,0.997943
3,0.034300,0.006253,0.842105,0.923557,0.880952,0.998158


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


CPU times: user 1min 47s, sys: 5.69 s, total: 1min 53s
Wall time: 2min 2s


TrainOutput(global_step=549, training_loss=0.03161682934926073, metrics={'train_runtime': 121.6822, 'train_samples_per_second': 22.559, 'train_steps_per_second': 4.512, 'total_flos': 717330928942080.0, 'train_loss': 0.03161682934926073, 'epoch': 3.0})

In [55]:
from transformers import pipeline

model_checkpoint = ''