In [1]:
import os
import numpy as np
import pandas as pd
import itertools
import torch
from tqdm import tqdm
from importlib import reload
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

if torch.cuda.is_available():
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [2]:
from datasets import load_dataset, Dataset

# data = load_dataset('AmazonScience/massive')
#data = Dataset.from_pandas(dict(tuple(data['train'].to_pandas().groupby('locale')))['ru-RU'])
#data = load_dataset('AmazonScience/massive', split='train[345420:345520]')
# dataset = dataset.train_test_split(test_size=0.1)
# data = data.remove_columns('__index_level_0__')
# data = data.select(range(100))

train_dataset = load_dataset('AmazonScience/massive', "ru-RU",split="train[:10]")
test_dataset = load_dataset('AmazonScience/massive', "ru-RU",split="test[:10]")

Downloading builder script:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Downloading and preparing dataset massive/ru-RU to /root/.cache/huggingface/datasets/AmazonScience___massive/ru-RU/1.0.0/71d360eb7d7a18565ff8c10609cebf714fce3cc390e173ba5b02ffd48543cdc1...


Downloading data:   0%|          | 0.00/40.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset massive downloaded and prepared to /root/.cache/huggingface/datasets/AmazonScience___massive/ru-RU/1.0.0/71d360eb7d7a18565ff8c10609cebf714fce3cc390e173ba5b02ffd48543cdc1. Subsequent calls will reuse this data.


In [3]:
print(len(train_dataset),len(test_dataset))

10 10


In [4]:
train_dataset['utt'][:10] 

['разбуди меня в девять утра в пятницу',
 'поставь будильник на два часа вперед',
 'олли тихо',
 'отстановись',
 'олли остановись на десять секунд',
 'остановись на десять секунд',
 'сделай освещение здесь чуть более тёплым',
 'пожалуйста сделай свет подходящий для чтения',
 'время идти спать',
 'олли время спать']

In [5]:
train_dataset['annot_utt'][:10]

['разбуди меня в [time : девять утра] в [date : пятницу]',
 'поставь будильник [time : на два часа вперед]',
 'олли тихо',
 'отстановись',
 'олли остановись на [time : десять секунд]',
 'остановись на [time : десять секунд]',
 'сделай освещение здесь чуть более [color_type : тёплым]',
 'пожалуйста сделай свет [color_type : подходящий для чтения]',
 'время идти спать',
 'олли время спать']

In [6]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
MODEL = AutoModel.from_pretrained("ai-forever/ruBert-base")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")                         
# model = AutoModel.from_pretrained("bert-base-multilingual-cased")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")

Downloading (…)lve/main/config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
from typing import List
import regex as re

'''

PARSER FOR THE DATASET NER TAG FORMAT

'''

class Parser:
    
    # RE patterns for tag extraction
    LABEL_PATTERN = r"\[(.*?)\]"
    PUNCTUATION_PATTERN = r"([.,\/#!$%\^&\*;:{}=\-_`~()'\"’¿])"
    
    # initialise, first word/id tag is O (outside)
    def __init__(self):
        self.tag_to_id = {
            "O": 0
        }
        self.id_to_tag = {
            0: "O"
        }
        
    '''
    
    CREATE TAGS
    
    '''
    # input : sentence, tagged sentence
    
    def __call__(self, sentence: str, annotated: str) -> List[str]:
    
        ''' Create Dictionary of Identified Tags'''
    
        # 1. set label B or I    
        
        matches = re.findall(self.LABEL_PATTERN, annotated)
        word_to_tag = {}
        for match in matches:
            tag, phrase = match.split(" : ")
            words = phrase.split(" ") 
            word_to_tag[words[0]] = f"B-{tag.upper()}"
            for w in words[1:]:
                word_to_tag[w] = f"I-{tag.upper()}"
                
        ''' Tokenise Sentence & add tags to not tagged words (O)'''
        
        # 2. add token tag to main tag dictionary

        tags = []
        sentence = re.sub(self.PUNCTUATION_PATTERN, r" \1 ", sentence)
        for w in sentence.split():
            if w not in word_to_tag:
                tags.append("O")
            else:
                tags.append(word_to_tag[w])
                self.__add_tag(word_to_tag[w])
        
        return tags
    
    '''
    
    TAG CONVERSION
    
    '''
    # to word2id (tag_to_id)
    # to id2word (id_to_tag)

    def __add_tag(self, tag: str):
        if tag in self.tag_to_id:
            return
        id_ = len(self.tag_to_id)
        self.tag_to_id[tag] = id_
        self.id_to_tag[id_] = tag
        
    ''' Get Tag Number ID '''
    # or just number id for token
        
    def get_id(self, tag: str):
        return self.tag_to_id[tag]
    
    ''' Get Tag Token from Number ID'''
    # given id get its token
    
    def get_label(self, id_: int):
        return self.get_tag_label(id_)
    
parser = Parser()
parser(train_dataset["utt"][0], train_dataset["annot_utt"][0])

['O', 'O', 'O', 'B-TIME', 'I-TIME', 'O', 'B-DATE']

In [8]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruBert-base")
MODEL = AutoModel.from_pretrained("ai-forever/ruBert-base")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")                         
# model = AutoModel.from_pretrained("bert-base-multilingual-cased")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.tokenizer = tokenizer
        self.processed_data = self.__preprocess(dataset)
    
    def __len__(self):
        return len(self.processed_data)
    
    def __getitem__(self, idx):
        return self.processed_data[idx]
    
    def __preprocess(self, dataset):
        
        tmp = {}
        for idx in tqdm(range(len(dataset))):
            item = dataset[idx]
            tags = parser(item["utt"], item["annot_utt"])     # get list of tags
            tokenizer_output = self.tokenizer(item["utt"],    # tokenise document (incl. <bos>,<eos>)
                                              padding=True, 
                                              truncation=True, 
                                              return_tensors='pt')
            
            # token word identifier (each word can have multiple tokens)
            word_ids = tokenizer_output.word_ids() 
            
            # for each word, how many subtokens are there (starts with 1 - first word)
            subword_group = [
                (key + 1, len(list(group))) 
                for key, group in itertools.groupby(word_ids) 
                    if key is not None
            ] # index to aggregate tokens

            # define bio tags for each word in numerical format using parser
            target = [parser.get_id(t) for t in tags] 
            
            # group all relevant data that will be used in forward pass
            tmp[idx] = {
                **tokenizer_output,
                "subword_group": torch.tensor(subword_group),
                "target": torch.tensor(target)
            }
            
            # check consistency
            try:
                assert (len(subword_group) == len(target))
            except:
                print(item["annot_utt"], subword_group, target)
                
        return tmp

train = NERDataset(train_dataset, tokenizer)
test = NERDataset(test_dataset, tokenizer)

  0%|          | 0/10 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 10/10 [00:00<00:00, 445.84it/s]
100%|██████████| 10/10 [00:00<00:00, 1332.20it/s]


In [10]:
import torch.nn as nn
from transformers import PreTrainedModel
from transformers import PretrainedConfig
from transformers import AutoModel, AutoConfig

class MyConfig(PretrainedConfig):
    model_type = 'mymodel'
    def __init__(self, important_param=42, **kwargs):
        super().__init__(**kwargs)
        self.important_param = important_param

# PreTrainedModel has nn.Module

class NERClassifier(PreTrainedModel):
    
    config_class = MyConfig
    def __init__(self,config):
        super().__init__(config)
        self.bert = MODEL
        self.seq = nn.Sequential(
            nn.Linear(768, 256), 
            nn.ReLU(),
            nn.Linear(256, CLASSES),
        )
        
    def forward(self, inputs): # returns list of targets
        
        # standard inputs for BERT
        bert_output = self.bert(
            inputs["input_ids"],
            inputs["attention_mask"]
        )
        
        # output of transformer encoder will be our hidden state for 
        # each input_ids 
        last_hidden_state = bert_output["last_hidden_state"]
        
        # tokens correspond tokenizer divisions 
        # each word can be split into multiple tokens
        # ie. get the mean word embedding 
        
        target = []
        for group in inputs["subword_group"]:
            b, e = group
            word_embedding = last_hidden_state[:, b:b+e]       # get the token embeddings
            agg_embedding = torch.mean(word_embedding, dim=1)  # mean word embeddings for tokens
            
            # input mean word embedding (1,768) pass into nn.Sequential linear tail end
            proba = self.__forward_one(agg_embedding)    # logits data 
            target.append(proba)
        
        word_logits = torch.stack(target).squeeze(1)
        
        return word_logits
        
    def __forward_one(self, x):
        logits = self.seq(x)
        return logits

In [11]:
import torch.optim as optim

# define data loaders
train_loader = DataLoader(train)
test_loader = DataLoader(test)
CLASSES = len(parser.tag_to_id); print(f'{CLASSES} labels')
config = MyConfig(4)

# define classifier model, loss fucntion & optimiser
clf = NERClassifier(config).to(device)
criterion = nn.CrossEntropyLoss()  # for multiclass classification 
optimizer = optim.Adam(clf.parameters(), lr=1e-5) 

9 labels


In [49]:
from sklearn.metrics import accuracy_score

for epoch in range(10):
    
    '''
    
    (1) TRAINING LOOP
    
    '''
    
    loss_count, loss_sum = 0, 0
    y_true, y_pred = [], []
    
    # switch to training mode, ie backpropagation on
    clf.train()
    for data in tqdm(train_loader):
        
        # move data to device
        inputs = {
            key: val.squeeze(0).to(device)
            for key, val in data.items()
        }
        
        # logits of belonging to each of the tag class 
        # for all words in document
        outputs = clf(inputs)
        
        # predicted word tag 
        word_tag = torch.argmax(outputs, dim=1).tolist() 
        
        y_true.extend(inputs["target"].tolist())
        y_pred.extend(word_tag)        
        
        # calcualate loss
        loss = criterion(outputs, inputs["target"])
        loss_count += 1
        loss_sum += loss.item()
        
#         nn.utils.clip_grad_norm_(
#             parameters=clf.parameters(), max_norm=20
#         )
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f"Epoch-{epoch + 1}: loss: {loss_sum / loss_count}; acc: {accuracy_score(y_true, y_pred)}")
        
    '''
    
    (2) VALIDATION LOOP
    
    '''
        
    test_loss_sum, test_loss_count = 0, 0
    test_true, test_pred = [], []
    
    # switch to inference mode
    with torch.no_grad():
        for test_rows in tqdm(test_loader):
            
            # move data to device
            test_inputs = {
                key: val.squeeze(0).to(device)
                for key, val in test_rows.items()
            }
            test_outputs = clf(test_inputs)

            # add metric data
            test_true.extend(test_inputs["target"].tolist())
            test_pred.extend(torch.argmax(test_outputs, dim=1).tolist())

            test_loss = criterion(test_outputs, test_inputs["target"])
            test_loss_count += 1
            test_loss_sum += test_loss.item()
        
        
    print(f"Epoch-{epoch + 1}: loss: {loss_sum / loss_count}; acc: {accuracy_score(y_true, y_pred)},\
          val_loss: {test_loss_sum / test_loss_count}, val_acc: {accuracy_score(test_true, test_pred)}")



100%|██████████| 10/10 [00:10<00:00,  1.04s/it]


Epoch-1: loss: 0.6805069640278816; acc: 0.6976744186046512


100%|██████████| 10/10 [00:00<00:00, 10.48it/s]


Epoch-1: loss: 0.6805069640278816; acc: 0.6976744186046512,          val_loss: 0.8717933624982834, val_acc: 0.813953488372093


100%|██████████| 10/10 [00:10<00:00,  1.09s/it]


Epoch-2: loss: 0.635441517829895; acc: 0.7906976744186046


100%|██████████| 10/10 [00:01<00:00,  9.74it/s]


Epoch-2: loss: 0.635441517829895; acc: 0.7906976744186046,          val_loss: 0.8502975344657898, val_acc: 0.813953488372093


100%|██████████| 10/10 [00:10<00:00,  1.06s/it]


Epoch-3: loss: 0.591285602748394; acc: 0.7674418604651163


100%|██████████| 10/10 [00:00<00:00, 10.45it/s]


Epoch-3: loss: 0.591285602748394; acc: 0.7674418604651163,          val_loss: 0.8234651923179627, val_acc: 0.8372093023255814


100%|██████████| 10/10 [00:10<00:00,  1.05s/it]


Epoch-4: loss: 0.5393777281045914; acc: 0.8372093023255814


100%|██████████| 10/10 [00:00<00:00, 11.22it/s]


Epoch-4: loss: 0.5393777281045914; acc: 0.8372093023255814,          val_loss: 0.7809332594275474, val_acc: 0.8372093023255814


100%|██████████| 10/10 [00:10<00:00,  1.01s/it]


Epoch-5: loss: 0.5065406486392021; acc: 0.8372093023255814


100%|██████████| 10/10 [00:00<00:00, 11.58it/s]


Epoch-5: loss: 0.5065406486392021; acc: 0.8372093023255814,          val_loss: 0.7536620497703552, val_acc: 0.8372093023255814


100%|██████████| 10/10 [00:10<00:00,  1.03s/it]


Epoch-6: loss: 0.4675091072916985; acc: 0.9069767441860465


100%|██████████| 10/10 [00:00<00:00, 10.80it/s]


Epoch-6: loss: 0.4675091072916985; acc: 0.9069767441860465,          val_loss: 0.7254292577505111, val_acc: 0.8604651162790697


100%|██████████| 10/10 [00:10<00:00,  1.03s/it]


Epoch-7: loss: 0.4321024067699909; acc: 0.9302325581395349


100%|██████████| 10/10 [00:00<00:00, 11.65it/s]


Epoch-7: loss: 0.4321024067699909; acc: 0.9302325581395349,          val_loss: 0.7173458933830261, val_acc: 0.8604651162790697


100%|██████████| 10/10 [00:10<00:00,  1.03s/it]


Epoch-8: loss: 0.4029550924897194; acc: 0.9302325581395349


100%|██████████| 10/10 [00:00<00:00, 13.94it/s]


Epoch-8: loss: 0.4029550924897194; acc: 0.9302325581395349,          val_loss: 0.692103661596775, val_acc: 0.8604651162790697


100%|██████████| 10/10 [00:10<00:00,  1.01s/it]


Epoch-9: loss: 0.3754826165735722; acc: 0.9534883720930233


100%|██████████| 10/10 [00:00<00:00, 10.94it/s]


Epoch-9: loss: 0.3754826165735722; acc: 0.9534883720930233,          val_loss: 0.6928185507655144, val_acc: 0.8604651162790697


100%|██████████| 10/10 [00:10<00:00,  1.04s/it]


Epoch-10: loss: 0.3559957958757877; acc: 0.9534883720930233


100%|██████████| 10/10 [00:00<00:00, 11.35it/s]

Epoch-10: loss: 0.3559957958757877; acc: 0.9534883720930233,          val_loss: 0.6674651488661766, val_acc: 0.8604651162790697





In [52]:
clf.save_pretrained('./my_model_dir')
tokenizer.save_pretrained('./my_model_dir')

('./my_model_dir/tokenizer_config.json',
 './my_model_dir/special_tokens_map.json',
 './my_model_dir/vocab.txt',
 './my_model_dir/added_tokens.json',
 './my_model_dir/tokenizer.json')

In [55]:
new_model = NERClassifier.from_pretrained('./my_model_dir')
new_model

NERClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [56]:
def ner_inference(text,model):

    # Tokenise input
    tokenizer_output = tokenizer(text,    
                                 padding=True, 
                                 truncation=True, 
                                 return_tensors='pt')

    # token word identifier (each word can have multiple tokens)
    word_ids = tokenizer_output.word_ids() 

    # for each word, how many subtokens are there (starts with 1 - first word)
    subword_group = [
        (key + 1, len(list(group))) 
        for key, group in itertools.groupby(word_ids) 
            if key is not None
    ] # index to aggregate tokens

    # group all relevant data that will be used in forward pass 
    output = {
        **tokenizer_output,
        "subword_group": torch.tensor(subword_group),
    }
    
    # get the highest logits value
    tag_pred = torch.argmax(model(output),axis=1).tolist()
    tag_pred 
    
    return tag_pred

ner_inference('В девять утра я улетаю в Зимбабве',new_model)

[0, 2, 0, 0, 0, 0, 0]