In [27]:
import pandas as pd
import re 
import string
from datasets import Dataset, DatasetDict
from transformers import AutoModelForMaskedLM, AutoTokenizer

https://huggingface.co/Babelscape/wikineural-multilingual-ner

https://huggingface.co/IlyaGusev/rubert_ext_sum_gazeta

 tgpt "Let's assume that I trained BERT for masked language modeling. Now I want to fine-tune it for named entity recognition. How should I change my model architecture to do so?"
Let's assume that I trained BERT for masked language modeling. Now I want to fine-tune it for named entity recognition. How should I change my model architecture to do so?

To fine-tune a pre-trained BERT model for Named Entity Recognition (NER), you would need to make
 some modifications to the model architecture. Here are the steps you can follow:

1. Load the pre-trained BERT model and add an additional layer on top of it for NER.
2. Replace the final layer of the pre-trained BERT model with a new classification layer that
 predicts the entity label for each token in the input sequence.
3. Modify the loss function to calculate the cross-entropy loss between the predicted entity
 labels and the true entity labels for each token.
4. Fine-tune the entire model on your NER dataset by updating the weights of all layers, including
 the pre-trained BERT layers and the new classification layer.

Here's some sample code to help you get started:

```python
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertForMaskedLM, BertForTokenClassification, BertTokenizer

# Load pre-trained BERT model for masked language modeling
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Add a new classification layer for NER
num_labels = 4  # Number of entity labels in your dataset
ner_model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels
)

# Reuse the MLM-trained encoder and put a fresh token-classification head on top of it.
# The head's input width is read from the model config rather than hard-coded, so the
# snippet keeps working for checkpoints whose hidden size is not 768.
ner_model.bert = model.bert
ner_model.classifier = nn.Linear(ner_model.config.hidden_size, num_labels)

# Cross-entropy loss between predicted and true entity labels, computed per token
loss_fn = nn.CrossEntropyLoss()

# Fine-tune the entire model on your NER dataset
num_epochs = 3  # Placeholder value; tune for your dataset
optim = AdamW(ner_model.parameters(), lr=5e-5)
# `train_loader` is a placeholder for your own DataLoader yielding (inputs, labels) batches
for epoch in range(num_epochs):
    for batch in train_loader:
        inputs, labels = batch
        outputs = ner_model(**inputs)
        # Flatten (batch, sequence) so every token is one classification example
        loss = loss_fn(outputs.logits.view(-1, num_labels), labels.view(-1))
        loss.backward()
        optim.step()
        optim.zero_grad()
```

In this example code, num_labels represents the number of entity labels in your dataset. You
 would need to replace it with the actual number of entity labels in your NER dataset.

I hope this helps! Let me know if you have any further questions or concerns.


# Preparing data

In [3]:
# Load the unlabeled training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/train_unsupervised_dataset.csv")
# Preview the first rows.
df.head()

Unnamed: 0,id,name
0,0,Зубная щетка Орал Би Три эффект Деликатное отб...
1,1,салфетки VISTER влажные для ко
2,2,Платье женское DR8517K 7Л8999 Светло-серый 449...
3,3,"ЛАКОМСТВО ""ДЕРЕВЕНСКИЕ ЛАКОМСТВА"" д/собак мини..."
4,4,Суппорт гитарный Ergo Play Troster


In [4]:
# Number of rows in the loaded table.
len(df)

1000000

In [8]:
# Preprocessing helpers.
# A reverse ru_to_eng conversion could be added here too (yes, such odd examples do occur).

# Lowercase Russian letters. "ё" is added explicitly because it lies outside the contiguous
# а..я code-point range.
_RU_LETTERS = frozenset(map(chr, range(ord("а"), ord("я") + 1))) | {"ё"}

# Latin characters mapped to the Cyrillic letter they are used in place of.
# NOTE: the "3" entry is never looked up, because eng_to_ru only considers ASCII lowercase
# letters as candidates for conversion.
_LATIN_TO_CYRILLIC = {
    "a": "а",
    "e": "е",
    "o": "о",
    "k": "к",
    "3": "з",
    "p": "р",
    "c": "с",
    "m": "м",
    "x": "х",
    "t": "т",
    "y": "у",
    "z": "з",
}


def eng_to_ru(word: str) -> str:
    """Replace look-alike Latin letters that touch a Russian letter with Cyrillic ones.

    A lowercase ASCII letter is converted when the character immediately before or after
    it (in the original, unconverted string) is a lowercase Russian letter and the letter
    has an entry in the substitution table. Every other character is copied unchanged.

    Args:
        word: Text to fix; may contain several space-separated words.

    Returns:
        The text with mixed-script look-alikes converted. Strings of length 0 or 1 are
        returned as is.
    """
    if len(word) <= 1:
        return word

    last = len(word) - 1
    converted = []
    for i, curr in enumerate(word):
        if curr in string.ascii_lowercase:
            # Neighbours are read from the input string, so a letter converted earlier in
            # this pass does not influence the decision for the next one.
            touches_russian = (
                (i > 0 and word[i - 1] in _RU_LETTERS)
                or (i < last and word[i + 1] in _RU_LETTERS)
            )
            if touches_russian:
                curr = _LATIN_TO_CYRILLIC.get(curr, curr)
        converted.append(curr)

    return "".join(converted)

def only_ones(word: str) -> str:
    """Collapse every run of digits in ``word`` into a single ``"1"``.

    A string that is one number as a whole -- all digits, or an optionally signed
    integer/decimal such as ``"-5"`` or ``"3.14"`` -- is replaced by ``"1"`` entirely.
    Otherwise each maximal run of digit characters becomes one ``"1"`` and all other
    characters are kept.

    Args:
        word: Text to normalise.

    Returns:
        The text with numbers replaced by ``"1"``.
    """
    # The fractional part is optional so that signed integers ("-5") are treated the
    # same way as signed decimals ("-5.5").
    if word.isdigit() or re.match(r'^-?\d+(?:\.\d+)?$', word):
        return "1"

    pieces = []
    in_digit_run = False
    for ch in word:
        if ch.isdigit():
            # Emit "1" only for the first digit of a run.
            if not in_digit_run:
                pieces.append("1")
            in_digit_run = True
        else:
            pieces.append(ch)
            in_digit_run = False

    return "".join(pieces)

def remove_punct(word: str) -> str:
    """Replace punctuation with spaces and normalise whitespace.

    Every run of characters that are neither word characters nor whitespace becomes a
    single space; runs of whitespace are then squeezed to one space and the ends trimmed.
    """
    without_punct = re.sub(r'[^\w\s]+', ' ', word)
    return re.sub(r'\s+', ' ', without_punct).strip()


def preprocess_string(word: str) -> str:
    """Normalise one product name for language-model training.

    Steps, in order: strip punctuation, collapse digit runs to "1", lowercase, and convert
    look-alike Latin letters to Cyrillic. The placeholder "товара нет" yields an empty
    string instead.
    """
    cleaned = remove_punct(word)

    # The placeholder is compared case-sensitively, before any lowercasing happens.
    if cleaned == "товара нет":
        return ""

    digits_collapsed = only_ones(cleaned)
    return eng_to_ru(digits_collapsed.lower())


In [11]:
# Missing values become empty strings so that every name handed to preprocess_string is a str.
df = df.fillna("")
# Normalise every product name.
df["name"] = df["name"].apply(preprocess_string)
df.head()

Unnamed: 0,id,name
0,0,зубная щетка орал би три эффект деликатное отб...
1,1,салфетки vister влажные для ко
2,2,платье женское dr1k 1л1 светло серый 1 1 1 1
3,3,лакомство деревенские лакомства д собак мини п...
4,4,суппорт гитарный ergo play troster


In [20]:
# Sequential 80/20 split: the first 80% of rows are used for training, the rest for validation.
n = len(df)
split_at = int(n * 0.8)
train = df[:split_at]
val = df[split_at:]

# from_pandas is the documented way to build a Dataset from a DataFrame.
# preserve_index=False keeps the pandas index from being stored as an extra column.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
validation_dataset = Dataset.from_pandas(val, preserve_index=False)

In [21]:
# Display the training Dataset summary (features and row count).
train_dataset

Dataset({
    features: ['id', 'name'],
    num_rows: 800000
})

In [22]:
# Bundle both splits under the "train"/"validation" keys so they can be processed together.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset
})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'name'],
        num_rows: 800000
    })
    validation: Dataset({
        features: ['id', 'name'],
        num_rows: 200000
    })
})

# model

In [28]:
# Hub identifier of the pretrained checkpoint; both the model and the tokenizer are loaded from it.
model_checkpoint = "cointegrated/rubert-tiny"
# Load the checkpoint with a masked-language-modeling head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# model.cuda()
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json: 100%|███████████████████████| 632/632 [00:00<00:00, 1.51MB/s]
Downloading model.safetensors: 100%|█████████████████████████| 47.7M/47.7M [00:04<00:00, 10.1MB/s]
Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading (…)okenizer_config.json: 100%|████████████████████████| 341/341 [00:00<00:00, 779kB/s]
Downloading (…)solve/main/vocab.txt: 241kB [00:00, 1.13MB/s]
Do

In [31]:
def tokenize_function(examples):
    """Tokenize the "name" column of a batch.

    When the tokenizer is a fast one, a "word_ids" entry is added that holds, for each
    row of the batch, the word ids reported by the encoding.
    """
    encoded = tokenizer(examples["name"])
    if not tokenizer.is_fast:
        return encoded

    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded


# Use batched=True so tokenize_function receives whole batches of rows at once.
# The raw "name" and "id" columns are dropped; only the tokenizer outputs are kept.
tokenized_datasets = dataset_dict.map(
    tokenize_function, batched=True, remove_columns=["name", "id"]
)
tokenized_datasets

                                                                                                  

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 800000
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 200000
    })
})

In [32]:
# Maximum sequence length reported by the tokenizer.
tokenizer.model_max_length

512

In [36]:
# Number of tokens per chunk when the tokenized texts are concatenated and re-split below.
chunk_size = 128

In [37]:
# Take the first three tokenized training rows (returned as a dict of column -> list).
tokenized_samples = tokenized_datasets["train"][:3]

# Print the token count of each sampled row.
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 26'
'>>> Review 1 length: 13'
'>>> Review 2 length: 21'


In [38]:
# Flatten every column of the sample into one long list (sum(..., []) concatenates the lists).
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
# Total number of tokens across the sampled rows.
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 60'


In [45]:
# Slice every flattened column into consecutive pieces of chunk_size tokens.
# The last piece is shorter when total_length is not a multiple of chunk_size.
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

# Print the length of every resulting chunk.
for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 60'


In [48]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size chunks.

    Args:
        examples: Mapping of column name to a list of per-row lists (a batch as passed by
            ``Dataset.map(..., batched=True)``). Must contain an "input_ids" column.
        block_size: Number of items per chunk. Defaults to the module-level ``chunk_size``.

    Returns:
        A dict with the same columns, each a list of chunks of exactly ``block_size``
        items, plus a "labels" column holding a copy of the "input_ids" chunks. Trailing
        items that do not fill a whole chunk are dropped.
    """
    size = chunk_size if block_size is None else block_size

    # Flatten each column in a single linear pass (sum(lists, []) re-copies the
    # accumulated list on every addition).
    concatenated = {
        key: [item for row in examples[key] for item in row]
        for key in examples.keys()
    }

    # All columns have the same flattened length, so the first one is representative.
    first_column = next(iter(concatenated.values()))
    # We drop the last chunk if it's smaller than the chunk size.
    usable_length = (len(first_column) // size) * size

    result = {
        key: [column[start : start + size] for start in range(0, usable_length, size)]
        for key, column in concatenated.items()
    }

    # Copy each chunk so that "labels" never shares list objects with "input_ids".
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [49]:
# Re-chunk both splits; batched=True is required because group_texts changes the number of rows.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

                                                                                                  

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 107221
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 26795
    })
})

In [50]:
# Decode the second training chunk to inspect how texts were concatenated.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'##окколи [SEP] [CLS] малавиткрем гель д лица пит увлаж 1мл [SEP] [CLS] 1 1 майка женс dfy dws1 [SEP] [CLS] 1 sko корсет майка джинсово синий l [SEP] [CLS] 1 сумка подарочная 1 1мм bc [SEP] [CLS] салфетки фортуна 1 1 целлюлозные 1шт [SEP] [CLS] 1 1 1 ринонорм спрей наз для взрослых 1 1 1мл [SEP] [CLS] а диски replay mercedes mr1 1 1 r1 pcd 1 1 et 1'

In [51]:
# Decode the labels of the same chunk; at this stage they are a copy of the input ids.
tokenizer.decode(lm_datasets["train"][1]["labels"])

'##окколи [SEP] [CLS] малавиткрем гель д лица пит увлаж 1мл [SEP] [CLS] 1 1 майка женс dfy dws1 [SEP] [CLS] 1 sko корсет майка джинсово синий l [SEP] [CLS] 1 сумка подарочная 1 1мм bc [SEP] [CLS] салфетки фортуна 1 1 целлюлозные 1шт [SEP] [CLS] 1 1 1 ринонорм спрей наз для взрослых 1 1 1мл [SEP] [CLS] а диски replay mercedes mr1 1 1 r1 pcd 1 1 et 1'

In [52]:
from transformers import DataCollatorForLanguageModeling

# Collator used for masked-LM batches; mlm_probability=0.15 is the masking rate passed to it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [54]:
# Take the first two training chunks as a tiny demo batch.
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    # Drop "word_ids" before collating -- presumably the collator cannot handle this
    # extra column; verify.
    _ = sample.pop("word_ids")

# Collate the demo batch and print each decoded row to show where [MASK] tokens landed.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] зубная щетка орал би три эффект деликатное отбе [MASK]ние 1 средняя 1штап 1 [SEP] [CLS] салфетки vis [MASK] влажные для ко [SEP] [CLS] платье [MASK]нское dr1k 1л1 светло серый 1 1водят 1 [SEP] [CLS] лакомство деревенские лакомrrow [MASK] [MASK]бак мини пород косточки из полковникдейки 1 [MASK] [SEP] [CLS] су [MASK]портболтарный [MASK] [MASK] play [MASK]ster [SEP] [CLS] котлет [MASK] из индейки мираторг 1г [SEP] [CLS] 1 салат из крабов [MASK] пало [MASK] и бр'

'>>> ##окколи [SEP] [CLS] малавитк [MASK] гель д лица пит увлаж [MASK] [MASK]л [SEP] [CLS] 1 1 майка женс dfy dws1 [SEP] [CLS] 1 sko кор [MASK] [MASK] джинсово синий [MASK] [SEP] [CLS] 1 сумка подарочная 1 1мм bc [SEP] [CLS] салфетки фортуна 1 1 целлюлозные 1шт [SEP] [CLS] 1 1 1 ринонорм спрей наз для взрос [MASK] 1 1 1мл [SEP] [CLS] а диски replay mercedes mr1 1 1 r1 pc [MASK] 1 1 et 1'


In [55]:
import collections
import numpy as np

from transformers import default_data_collator

# Probability with which each whole word is selected for masking by the collator below.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate ``features`` into a batch, masking whole words rather than single tokens.

    Each word is selected independently with probability ``wwm_probability``. All tokens
    of a selected word are replaced by the tokenizer's mask token in "input_ids" and keep
    their original id in "labels"; every other label position is set to -100.

    Note:
        The feature dicts are modified in place: "word_ids" is removed, "input_ids" is
        overwritten at the masked positions and "labels" is replaced.

    Args:
        features: List of dicts with "input_ids", "labels" and "word_ids" entries.

    Returns:
        The result of ``default_data_collator`` applied to the modified features.
    """
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # Special tokens carry no word id and always end the current word.
                # Resetting matters because a chunk can hold several texts whose word ids
                # are numbered independently: without it, a text ending on word id k
                # followed by a text starting on the same id would be merged into one word.
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        # Start with every position marked -100; only masked tokens get a real label.
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [56]:
# Fetch fresh copies of the first two chunks (the collator removes "word_ids" in place).
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Print each decoded row to show which whole words were masked.
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] зубная щетка [MASK] [MASK] би три [MASK] [MASK] деликатное отбеливание 1 [MASK] 1шт 1 1 [SEP] [CLS] салфетки [MASK] [MASK] влажные для ко [SEP] [CLS] платье женское dr1k [MASK] [MASK] [MASK] [MASK] [MASK] серый 1 [MASK] 1 1 [SEP] [CLS] лакомство [MASK] [MASK] [MASK] лакомства д собак мини пород косточки [MASK] индейки 1г [SEP] [CLS] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] ergo play troster [SEP] [CLS] котлеты из индейки мираторг 1г [SEP] [CLS] 1 [MASK] [MASK] из крабовых палочек и бр'

'>>> ##окколи [SEP] [CLS] малавиткрем гель д лица [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] 1мл [SEP] [CLS] 1 1 майка [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [SEP] [CLS] 1 sko корсет [MASK] джинсово синий l [SEP] [CLS] 1 сумка подарочная 1 1мм bc [SEP] [CLS] салфетки [MASK] [MASK] [MASK] 1 1 целлюлозные 1шт [SEP] [CLS] [MASK] 1 1 ринонорм спрей [MASK] [MASK] для взрослых [MASK] 1 1мл [SEP] [CLS] а диски replay mercedes [MASK] [MASK] [MASK] [MASK] 1 r1 pcd 1 1 [MASK] 1'


# Downsample dataset

In [58]:
# Number of chunks to train on; the held-out test split is 10% of that size.
train_size = 50_000
test_size = int(0.1 * train_size)

# Both subsets are drawn from the training chunks only; the fixed seed makes the draw repeatable.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=1337
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 5000
    })
})

# Training

In [59]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch: this is the number of full batches in one pass
# over the training subset (assumes one device and no gradient accumulation -- TODO confirm).
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    # NOTE(review): the "-finetuned-imdb" suffix does not match the product-name data used
    # here; confirm before renaming, since this string is the output path.
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): mixed precision presumably requires a GPU -- confirm the target machine has one.
    fp16=True,
    logging_steps=logging_steps,
)

In [None]:
from transformers import Trainer

# Train and evaluate on the downsampled subsets.
# Note that this passes data_collator (the DataCollatorForLanguageModeling instance), not
# whole_word_masking_data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
import math

# Baseline before fine-tuning: perplexity is the exponential of the evaluation loss.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Perplexity after fine-tuning, for comparison with the baseline computed before training.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")