In [1]:
from utils.russian_superglue_models import SpanClassificationModel
from transformers import AutoModel, PreTrainedModel

In [18]:
from utils.dataset_configs import (
    TASK_NUM_CLASSES,
    TASK_TO_CONFIG,
    TASK_TO_NAME,
    TASK_TYPES,
)
from utils.russian_superglue_models import (
    BertForEntityChoice,
    RobertaForEntityChoice,
    SpanClassificationModel,
    EntityChoiceModel
)


In [14]:
from dataclasses import dataclass
from typing import Dict

from transformers import (
    AutoModelForSequenceClassification,
    BertConfig,
    BertForSequenceClassification,
    BertTokenizer,
    RobertaConfig,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)

@dataclass
class ModelData:
    """Bundle of the classes associated with one model family.

    Instances hold classes rather than instances: the registry below stores
    e.g. ``BertConfig`` and ``BertTokenizer`` themselves.
    """

    # Configuration class of the family (e.g. BertConfig).
    config: type
    # Tokenizer class of the family (e.g. BertTokenizer).
    tokenizer: type
    # Maps a task type string ("classification", "entity_choice",
    # "span_classification") to the model class registered for it.
    task_types: Dict[str, type]


# Registry of supported model families, keyed by the `model_name` string that
# `get_model` looks up. Each entry lists the config class, the tokenizer class
# and, per task type, the model class registered for that family.
#
# The two `*ForSequenceClassification` heads come from `transformers` and must
# be imported before this statement runs; otherwise it fails with NameError on
# a fresh kernel.
MODEL_CLASSES: Dict[str, ModelData] = {
    "bert": ModelData(
        config=BertConfig,
        tokenizer=BertTokenizer,
        task_types={
            "classification": BertForSequenceClassification,
            "entity_choice": BertForEntityChoice,
            # Span classification uses the same wrapper for every family.
            "span_classification": SpanClassificationModel,
        },
    ),
    "roberta": ModelData(
        config=RobertaConfig,
        tokenizer=RobertaTokenizer,
        task_types={
            "classification": RobertaForSequenceClassification,
            "entity_choice": RobertaForEntityChoice,
            # Span classification uses the same wrapper for every family.
            "span_classification": SpanClassificationModel,
        },
    ),
}

In [15]:
def get_model(args) -> PreTrainedModel:
    """
    Builds the pre-trained model that matches the type of a given task.

    Args:
        args: An object that contains the following fields:
            - model_name: Key into ``MODEL_CLASSES`` (e.g. "bert", "roberta").
            - task_name: Key into ``TASK_TYPES`` (and, optionally,
              ``TASK_NUM_CLASSES``).
            - model_name_or_path: Checkpoint name or path forwarded to
              ``from_pretrained``.

    Returns:
        A ``SpanClassificationModel`` for span-classification tasks, an
        ``EntityChoiceModel`` for entity-choice tasks, and otherwise the result
        of ``AutoModelForSequenceClassification.from_pretrained``.

    Raises:
        ValueError: If the model name is not in ``MODEL_CLASSES``, the task
            name is not in ``TASK_TYPES``, or the model family has no entry
            for the task's type.
    """
    model_data = MODEL_CLASSES.get(args.model_name)
    if model_data is None:
        raise ValueError(f"Unknown model name: {args.model_name}")

    # Resolve the task type once. A missing task must surface as the
    # documented ValueError rather than a bare KeyError from the subscript.
    try:
        task_type = TASK_TYPES[args.task_name]
    except KeyError:
        raise ValueError(f"Unknown task name: {args.task_name}") from None

    # The registry entry is only used to validate that this model family
    # supports the task type; the model itself is built below.
    if not model_data.task_types.get(task_type):
        raise ValueError(f"Unknown task name: {args.task_name}")

    # Tasks without an explicit entry fall back to two classes.
    num_classes = TASK_NUM_CLASSES.get(args.task_name, 2)

    if task_type == 'span_classification':
        return SpanClassificationModel(
            backbone=AutoModel.from_pretrained(args.model_name_or_path),
            num_labels=num_classes,
        )
    if task_type == 'entity_choice':
        # EntityChoiceModel is constructed from the backbone alone
        # (no num_labels is passed).
        return EntityChoiceModel(
            backbone=AutoModel.from_pretrained(args.model_name_or_path)
        )
    return AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_labels=num_classes
    )

In [16]:
class Args:
    """Minimal stand-in for the argument object that `get_model` reads.

    Attributes:
        task_name: Name of the task.
        model_name: Name of the model family.
        model_name_or_path: Checkpoint name or path.
    """

    def __init__(self, task_name, model_name, model_name_or_path):
        # Store all three values unchanged under attributes of the same name.
        (
            self.task_name,
            self.model_name,
            self.model_name_or_path,
        ) = (task_name, model_name, model_name_or_path)

In [19]:
# Smoke test: build a model for every registered task to check that
# get_model can construct each task type.
for task in TASK_TO_NAME:
    args = Args(
        task_name=task,
        model_name='bert',
        model_name_or_path='bert-base-uncased',
    )
    get_model(args)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

TypeError: EntityChoiceModel.__init__() got an unexpected keyword argument 'num_labels'

In [7]:
# Args requires all three fields. The checkpoint is passed explicitly; it is
# the one named in this cell's recorded output.
args = Args('russe', 'bert', 'bert-base-uncased')

In [8]:
# Bare expression on the cell's last line, so the notebook displays the
# repr of the returned model.
get_model(args)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SpanClassificationModel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [1]:
import os
from datasets import Dataset, DatasetDict
from utils.dataset_configs import TASK_TO_NAME

def load_data(task_name: str, data_path: str = "data/combined/") -> DatasetDict:
    """
    Loads the train and validation splits of a task from JSON-lines files.

    The files are expected at ``<data_path>/<TASK_TO_NAME[task_name]>/train.jsonl``
    and ``.../val.jsonl``. Only these two splits are loaded; ``test.jsonl`` is
    not read.

    Args:
        task_name (str): The name of the task for which to load data.
        data_path (str, optional): The root directory that contains one
            sub-directory per task. Defaults to "data/combined/".

    Returns:
        A `DatasetDict` with the keys ``"train"`` and ``"validation"``.

    Raises:
        FileNotFoundError: If either required file is missing; the message
            lists the missing path(s).
        KeyError: Propagated from the ``TASK_TO_NAME`` lookup when
            ``task_name`` is not registered.

    """
    task_path = os.path.join(data_path, TASK_TO_NAME[task_name])
    # Split name in the returned DatasetDict -> file it is read from.
    split_files = {
        "train": os.path.join(task_path, "train.jsonl"),
        "validation": os.path.join(task_path, "val.jsonl"),
    }

    missing = [path for path in split_files.values() if not os.path.isfile(path)]
    if missing:
        # Report the directory that was actually searched and the exact
        # files that are absent.
        raise FileNotFoundError(
            f"Could not find required files for task '{task_name}' "
            f"in directory '{task_path}': missing {', '.join(missing)}"
        )

    return DatasetDict(
        {split: Dataset.from_json(path) for split, path in split_files.items()}
    )

  MCC = load_metric("matthews_correlation")


In [2]:
# Load the splits for the 'muserc' task from the default data directory.
dataset = load_data(task_name='muserc')

Using custom data configuration default-fcf9dfb403e77a59
Found cached dataset json (/home/moskovskiy/.cache/huggingface/datasets/json/default-fcf9dfb403e77a59/0.0.0)
Using custom data configuration default-74a5d76912bd0130
Found cached dataset json (/home/moskovskiy/.cache/huggingface/datasets/json/default-74a5d76912bd0130/0.0.0)


In [3]:
from utils.dataset_configs import TASK_TO_CONFIG
# Look up the config class registered for 'muserc', then instantiate it
# with the loaded dataset.
muserc_config_cls = TASK_TO_CONFIG['muserc']
config = muserc_config_cls(dataset)

In [4]:
from transformers import BertTokenizer
from functools import partial
tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny2')

# Bind the tokenizer and the maximum sequence length up front, so that
# `map` only has to supply the batch of examples.
process_batch = partial(config.process_data, tokenizer=tokenizer, max_length=512)

processed_dataset = dataset.map(
    process_batch,
    batched=True,
    num_proc=32,
    keep_in_memory=True,
)

                                 


#0:   0%|          | 0/1 [00:00<?, ?ba/s]

{'idx': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'passage': [{'text': '(1) Но люди не могут существовать без природы, поэтому в парке стояли железобетонные скамейки — деревянные моментально ломали. (2) В парке бегали ребятишки, водилась шпана, которая развлекалась игрой в карты, пьянкой, драками, «иногда насмерть». (3) «Имали они тут и девок...» (4) Верховодил шпаной Артемка-мыло, с вспененной белой головой. (5) Людочка сколько ни пыталась усмирить лохмотья на буйной голове Артемки, ничего у неё не получалось. (6) Его «кудри, издали напоминавшие мыльную пену, изблизя оказались что липкие рожки из вокзальной столовой — сварили их, бросили комком в пустую тарелку, так они, слипшиеся, неподъёмно и лежали. (7) Да и не ради причёски приходил парень к Людочке. (8) Как только её руки становились занятыми ножницами и расчёской, Артемка начинал хватать её за разные места. (9) Людочка сначала увёртывалась от хватких рук Артемки, а когда не помогло, стукнула его машинкой по голове 

ValueError: something wrong with examples