# Token classification (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate -U
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# !apt install git-lfs



In [None]:
import warnings
# Install a global "ignore" filter: no warning category or module is excluded,
# so every warning raised from here on is silenced for the whole session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below — consider narrowing the filter to specific categories once the
# notebook is stable.
warnings.filterwarnings('ignore')

In [None]:
!unzip -nq archive_pkl.zip

In [None]:
import pandas as pd
# Load the pre-tokenized dataframe extracted from archive_pkl.zip above.
# NOTE(review): read_pickle deserializes with pickle, which can execute
# arbitrary code while loading — only run this on an archive from a trusted source.
df = pd.read_pickle('./df_token_with_classes.pkl')


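Optionally subsample the data: uncomment the line below to keep a random 33% of the rows for faster experiments. As written, the full dataset is used.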

In [None]:
#df = df.sample(frac=0.33, random_state=42).reset_index(drop=True)

In [None]:
df.sample(random_state=42)

Unnamed: 0,tokens,classes
197580,"[Область, Вологодская, ,, Грязовецкий, Район, ...","[2, 1, 11, 3, 4, 11, 7, 8]"


In [None]:
from datasets import Dataset, DatasetDict

# One seed for both splits so that train/validation/test membership is
# identical on every "Restart & Run All".
SPLIT_SEED = 42

dataset = Dataset.from_pandas(df)
# First split: 67% train, 33% held out.
train_testvalid = dataset.train_test_split(test_size=0.33, seed=SPLIT_SEED)
# Second split: halve the held-out part into validation and test.
# This call was previously unseeded, so validation/test rows changed between runs.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
raw_datasets = DatasetDict({
    'train': train_testvalid['train'],
    'validation': test_valid['train'],
    'test': test_valid['test']})


In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'classes'],
        num_rows: 1017943
    })
    validation: Dataset({
        features: ['tokens', 'classes'],
        num_rows: 250688
    })
    test: Dataset({
        features: ['tokens', 'classes'],
        num_rows: 250688
    })
})

In [None]:
raw_datasets["train"][0]["tokens"]

['обл',
 'Псковская',
 ',',
 'Район',
 'Велико-ий',
 'Территория',
 'СНТ',
 'Рябинушка']

In [None]:
raw_datasets["train"][0]["classes"]

[2, 1, 11, 4, 3, 6, 6, 5]

In [None]:
ner_feature = raw_datasets["train"].features["classes"]
ner_feature

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [None]:
# The `classes` feature is a plain int64 sequence without attached names, so the
# id -> name mapping is spelled out here: list position == class id.
label_names = [
    "OTHER",           # 0
    "REGION",          # 1
    "REGION_TYPE",     # 2
    "AREA",            # 3
    "AREA_TYPE",       # 4
    "TERRITORY",       # 5
    "TERRITORY_TYPE",  # 6
    "CITY",            # 7
    "CITY_TYPE",       # 8
    "STREET",          # 9
    "STREET_TYPE",     # 10
    "DELIMITER",       # 11
]
label_names

['OTHER',
 'REGION',
 'REGION_TYPE',
 'AREA',
 'AREA_TYPE',
 'TERRITORY',
 'TERRITORY_TYPE',
 'CITY',
 'CITY_TYPE',
 'STREET',
 'STREET_TYPE',
 'DELIMITER']

In [None]:
# Show the first training example as two aligned rows:
# tokens on top, the matching label names underneath.
example = raw_datasets["train"][0]
words = example["tokens"]
labels = example["classes"]

token_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    # Each column is as wide as the longer of the two strings, plus one
    # trailing space that separates it from the next column.
    column_width = max(len(word), len(full_label)) + 1
    token_cells.append(word.ljust(column_width))
    label_cells.append(full_label.ljust(column_width))

line1 = "".join(token_cells)
line2 = "".join(label_cells)

print(line1)
print(line2)

обл         Псковская ,         Район     Велико-ий Территория     СНТ            Рябинушка 
REGION_TYPE REGION    DELIMITER AREA_TYPE AREA      TERRITORY_TYPE TERRITORY_TYPE TERRITORY 


In [None]:
from transformers import AutoTokenizer

# model_checkpoint = "bert-base-cased"
model_checkpoint = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
tokenizer.is_fast

True

In [None]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'обл',
 'Псковская',
 ',',
 'Район',
 'Велико',
 '-',
 'и',
 '##й',
 'Территория',
 'СНТ',
 'Ря',
 '##бин',
 '##ушка',
 '[SEP]']

In [None]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 4, 4, 4, 5, 6, 7, 7, 7, None]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer output position.

    Args:
        labels: sequence of class ids, one per original word.
        word_ids: for every produced token, the index of the word it came
            from, or ``None`` for tokens that belong to no word
            (special tokens such as [CLS] / [SEP]).

    Returns:
        A list as long as ``word_ids``: ``-100`` where the word id is ``None``,
        otherwise the label of the source word. Every sub-token of a word gets
        that word's label (the label set has no B-/I- distinction).
    """
    return [
        -100 if word_id is None else labels[word_id]
        for word_id in word_ids
    ]

In [None]:
labels = raw_datasets["train"][0]["classes"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[2, 1, 11, 4, 3, 6, 6, 5]
[-100, 2, 1, 11, 4, 3, 3, 3, 3, 6, 6, 5, 5, 5, -100]


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: batch mapping with a ``"tokens"`` entry (one word list per
            example) and a ``"classes"`` entry (one class-id list per example).

    Returns:
        The tokenizer output for the batch, extended with a ``"labels"`` entry
        holding, per example, the word labels expanded to token positions by
        ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )
    # word_ids(row) maps each token of example `row` back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["classes"])
    ]
    return encoded

In [None]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/1017943 [00:00<?, ? examples/s]

Map:   0%|          | 0/250688 [00:00<?, ? examples/s]

Map:   0%|          | 0/250688 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(3)])
batch["labels"]

tensor([[-100,    2,    1,   11,    4,    3,    3,    3,    3,    6,    6,    5,
            5,    5, -100, -100, -100, -100, -100, -100, -100],
        [-100,    1,    1,    2,   11,    8,    7,    7,   11,   10,    9,    9,
         -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,    2,    1,    1,    4,    4,    4,    3,    3,    3,   11,    8,
            7,    7,    7,   11,    9,    9,    9,   10, -100]])

In [None]:
for i in range(3):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 2, 1, 11, 4, 3, 3, 3, 3, 6, 6, 5, 5, 5, -100]
[-100, 1, 1, 2, 11, 8, 7, 7, 11, 10, 9, 9, -100]
[-100, 2, 1, 1, 4, 4, 4, 3, 3, 3, 11, 8, 7, 7, 7, 11, 9, 9, 9, 10, -100]


In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=c88c3235d56e351ac552f2689f7b0bfedfb3090c4036cc4814a5875405fee38a
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
labels = raw_datasets["train"][0]["classes"]
labels = [label_names[i] for i in labels]
labels

['REGION_TYPE',
 'REGION',
 'DELIMITER',
 'AREA_TYPE',
 'AREA',
 'TERRITORY_TYPE',
 'TERRITORY_TYPE',
 'TERRITORY']

In [None]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'EGION': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'EGION_TYPE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ELIMITER': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'ERRITORY': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ERRITORY_TYPE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'REA': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'REA_TYPE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.8571428571428571,
 'overall_f1': 0.923076923076923,
 'overall_accuracy': 0.875}

In [None]:
import numpy as np



def _to_iob2(tags, outside="OTHER"):
    """Convert a flat per-token tag sequence into IOB2 tags for seqeval.

    A run of identical consecutive tags becomes one entity: the first token
    gets ``B-<tag>``, the following ones ``I-<tag>``. Tokens tagged ``outside``
    map to ``"O"`` and never form an entity.
    """
    iob2_tags = []
    previous = None
    for tag in tags:
        if tag == outside:
            iob2_tags.append("O")
        elif tag == previous:
            iob2_tags.append("I-" + tag)
        else:
            iob2_tags.append("B-" + tag)
        previous = tag
    return iob2_tags


def compute_metrics(eval_preds):
    """Compute entity-level precision/recall/F1 and token accuracy with seqeval.

    Args:
        eval_preds: ``(logits, labels)`` pair; ``logits`` has the class scores
            on its last axis and ``labels`` holds class ids, with ``-100`` at
            positions that must be ignored.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from
        seqeval's overall scores.

    seqeval reads the first character of each tag as the chunk marker
    (B/I/O/E/S). Fed with the flat names from ``label_names`` it therefore
    reports ``REGION`` as ``EGION`` and treats every ``STREET*`` token as a
    separate single-token entity. Converting to IOB2 first keeps the entity
    names intact and scores contiguous runs of a label as one entity.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = []
    true_predictions = []
    for prediction, label in zip(predictions, labels):
        # Drop ignored positions (-100) from both sequences so they stay aligned.
        kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
        true_labels.append(_to_iob2([label_names[l] for _, l in kept]))
        true_predictions.append(_to_iob2([label_names[p] for p, _ in kept]))

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [None]:
# Two-way lookup between class ids and label names; both dicts are handed to
# the model when it is loaded so predictions carry readable names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.num_labels

12

In [None]:
#from huggingface_hub import notebook_login

#notebook_login()

In [None]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run below.
args = TrainingArguments(
    "rubert-address-elements",  # output directory for checkpoints
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; the install cell is unpinned, so confirm against the
    # installed version.
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,  # no upload during training; the push is a separate cell at the end
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0191,0.01341,0.99561,0.994378,0.994993,0.996212


TrainOutput(global_step=127243, training_loss=0.029688896072007073, metrics={'train_runtime': 3855.6677, 'train_samples_per_second': 264.012, 'train_steps_per_second': 33.002, 'total_flos': 386404398026496.0, 'train_loss': 0.029688896072007073, 'epoch': 1.0})

In [None]:
model.save_pretrained("rubert-address-elements")

In [None]:
mymodel = AutoModelForTokenClassification.from_pretrained("rubert-address-elements")

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
# model_checkpoint = "rubert-address-elements"
# `aggregation_strategy="first"` is the current spelling of the deprecated
# `grouped_entities=True, ignore_subwords=True` pair: sub-tokens are merged
# into words, each word takes the label of its first sub-token, and adjacent
# words with the same label are grouped into one entity. It requires a fast
# tokenizer.
token_classifier = pipeline(
    task="token-classification",
    model=mymodel,
    tokenizer=tokenizer,
    aggregation_strategy="first",
)
# Keep each example under its own name; previously the first result was
# assigned to `res` and immediately overwritten by the second call.
res_first_example = token_classifier("Калужская Область Грайвороновский Район, Село Пищево Улица Земельная")
res = token_classifier("Новосибирская Кировская, Область, Ново Лукский город")
res

[{'entity_group': 'REGION',
  'score': 0.9997916,
  'word': 'Новосибирская Кировская',
  'start': 0,
  'end': 23},
 {'entity_group': 'DELIMITER',
  'score': 0.99999857,
  'word': ',',
  'start': 23,
  'end': 24},
 {'entity_group': 'REGION_TYPE',
  'score': 0.98216313,
  'word': 'Область',
  'start': 25,
  'end': 32},
 {'entity_group': 'DELIMITER',
  'score': 0.99999905,
  'word': ',',
  'start': 32,
  'end': 33},
 {'entity_group': 'CITY',
  'score': 0.9023154,
  'word': 'Ново Лукский',
  'start': 34,
  'end': 46},
 {'entity_group': 'CITY_TYPE',
  'score': 0.85998344,
  'word': 'город',
  'start': 47,
  'end': 52}]

In [None]:
token_classifier("Ставропольский край г Лермонтов территория садоводческого некоммерческого товарищества имени И.В. Мичурина, ул массив 3 линия 3")

[{'entity_group': 'REGION',
  'score': 0.9999989,
  'word': 'Ставропольский',
  'start': 0,
  'end': 14},
 {'entity_group': 'REGION_TYPE',
  'score': 0.99999535,
  'word': 'край',
  'start': 15,
  'end': 19},
 {'entity_group': 'CITY_TYPE',
  'score': 0.9999908,
  'word': 'г',
  'start': 20,
  'end': 21},
 {'entity_group': 'CITY',
  'score': 0.9999931,
  'word': 'Лермонтов',
  'start': 22,
  'end': 31},
 {'entity_group': 'TERRITORY_TYPE',
  'score': 0.99661547,
  'word': 'территория',
  'start': 32,
  'end': 42},
 {'entity_group': 'TERRITORY',
  'score': 0.9995291,
  'word': 'садоводческого некоммерческого товарищества имени И. В. Мичурина',
  'start': 43,
  'end': 106},
 {'entity_group': 'DELIMITER',
  'score': 0.99997985,
  'word': ',',
  'start': 106,
  'end': 107},
 {'entity_group': 'STREET_TYPE',
  'score': 0.9998938,
  'word': 'ул',
  'start': 108,
  'end': 110},
 {'entity_group': 'STREET',
  'score': 0.9999504,
  'word': 'массив 3 линия 3',
  'start': 111,
  'end': 127}]

In [None]:
token_classifier("Респ Северная Осетия - Алания, р-н Пригородный, тер. Кавказ автомобильная дорога М-4 Дон-Владикавказ-Грозный-Махачкала-граница с Азербайджанской Республикой, км 564-ый")

[{'entity_group': 'REGION_TYPE',
  'score': 0.9999993,
  'word': 'Респ',
  'start': 0,
  'end': 4},
 {'entity_group': 'REGION',
  'score': 0.9999993,
  'word': 'Северная Осетия - Алания',
  'start': 5,
  'end': 29},
 {'entity_group': 'DELIMITER',
  'score': 0.9999989,
  'word': ',',
  'start': 29,
  'end': 30},
 {'entity_group': 'AREA_TYPE',
  'score': 0.99999756,
  'word': 'р - н',
  'start': 31,
  'end': 34},
 {'entity_group': 'AREA',
  'score': 0.999998,
  'word': 'Пригородный',
  'start': 35,
  'end': 46},
 {'entity_group': 'DELIMITER',
  'score': 0.99999344,
  'word': ',',
  'start': 46,
  'end': 47},
 {'entity_group': 'TERRITORY_TYPE',
  'score': 0.99971807,
  'word': 'тер.',
  'start': 48,
  'end': 52},
 {'entity_group': 'TERRITORY',
  'score': 0.99955285,
  'word': 'Кавказ автомобильная дорога М - 4 Дон - Владикавказ - Грозный - Махачкала - граница с Азербайджанской Республикой',
  'start': 53,
  'end': 156},
 {'entity_group': 'DELIMITER',
  'score': 0.9995184,
  'word': ',',
 

In [None]:
token_classifier("Респ. ВЫмышленная Неизвестный улус тер. урочище Ыт атага")

[{'entity_group': 'REGION_TYPE',
  'score': 0.999998,
  'word': 'Респ.',
  'start': 0,
  'end': 5},
 {'entity_group': 'REGION',
  'score': 0.9999924,
  'word': 'ВЫмышленная',
  'start': 6,
  'end': 17},
 {'entity_group': 'AREA',
  'score': 0.83924913,
  'word': 'Неизвестный',
  'start': 18,
  'end': 29},
 {'entity_group': 'STREET_TYPE',
  'score': 0.94468063,
  'word': 'улус',
  'start': 30,
  'end': 34},
 {'entity_group': 'TERRITORY_TYPE',
  'score': 0.99970555,
  'word': 'тер.',
  'start': 35,
  'end': 39},
 {'entity_group': 'TERRITORY',
  'score': 0.9978629,
  'word': 'урочище Ыт атага',
  'start': 40,
  'end': 56}]

## Push to HF hub

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
!git config --global user.email "resheto@gmail.com"
!git config --global user.name "qwazer"

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
trainer.push_to_hub(commit_message="Training complete")