In [None]:
! pip install torch
! pip install transformers
! pip install scikit-learn
! pip install tqdm
! pip install numpy
! pip install datasets
! pip install nltk
import nltk
nltk.download('stopwords')
! pip install scipy
! pip install transformers[torch] accelerate


Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.6 MB/s[0m eta [36m0:00:0

In [None]:
from torch import nn
from transformers import Trainer


class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss





In [None]:
import logging
import os
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

import tqdm
import re

from filelock import FileLock
from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
import datasets

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class InputFeatures:
    """
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a model.
    """

    input_ids: List[List[int]]
    attention_mask: Optional[List[List[int]]]
    token_type_ids: Optional[List[List[int]]]
    label: Optional[int]


class Split(Enum):
    train = "train"
    dev = "dev"
    test = "test"


if is_torch_available():
    import torch
    from torch.utils.data.dataset import Dataset

    class MultipleChoiceDataset(Dataset):
        """
        PyTorch multiple choice dataset class
        """

        features: List[InputFeatures]

        def __init__(
            self,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            dataset = datasets.load_dataset('lex_glue', task)
            tokenizer_name = re.sub('[^a-z]+', ' ', tokenizer.name_or_path).title().replace(' ', '')
            cached_features_file = os.path.join(
                '.cache',
                task,
                "cached_{}_{}_{}_{}".format(
                    mode.value,
                    tokenizer_name,
                    str(max_seq_length),
                    task,
                ),
            )

            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"
            if not os.path.exists(os.path.join('.cache', task)):
                if not os.path.exists('.cache'):
                    os.mkdir('.cache')
                os.mkdir(os.path.join('.cache', task))
            with FileLock(lock_path):

                if os.path.exists(cached_features_file) and not overwrite_cache:
                    logger.info(f"Loading features from cached file {cached_features_file}")
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(f"Creating features from dataset file at {task}")
                    if mode == Split.dev:
                        examples = dataset['validation']
                    elif mode == Split.test:
                        examples = dataset['test']
                    elif mode == Split.train:
                        examples = dataset['train']
                    logger.info("Training examples: %s", len(examples))
                    self.features = convert_examples_to_features(
                        examples,
                        max_seq_length,
                        tokenizer,
                    )
                    logger.info("Saving features into cached file %s", cached_features_file)
                    torch.save(self.features, cached_features_file)

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]


if is_tf_available():
    import tensorflow as tf

    class TFMultipleChoiceDataset:
        """
        TensorFlow multiple choice dataset class
        """

        features: List[InputFeatures]

        def __init__(
            self,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = 256,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            dataset = datasets.load_dataset('lex_glue')

            logger.info(f"Creating features from dataset file at {task}")
            if mode == Split.dev:
                examples = dataset['validation']
            elif mode == Split.test:
                examples = dataset['test']
            else:
                examples = dataset['train']
            logger.info(f"{mode.name.title()} examples: %s", len(examples))

            self.features = convert_examples_to_features(
                examples,
                max_seq_length,
                tokenizer,
            )

            def gen():
                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
                    if ex_index % 10000 == 0:
                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))

                    yield (
                        {
                            "input_ids": ex.input_ids,
                            "attention_mask": ex.attention_mask,
                            "token_type_ids": ex.token_type_ids,
                        },
                        ex.label,
                    )

            self.dataset = tf.data.Dataset.from_generator(
                gen,
                (
                    {
                        "input_ids": tf.int32,
                        "attention_mask": tf.int32,
                        "token_type_ids": tf.int32,
                    },
                    tf.int64,
                ),
                (
                    {
                        "input_ids": tf.TensorShape([None, None]),
                        "attention_mask": tf.TensorShape([None, None]),
                        "token_type_ids": tf.TensorShape([None, None]),
                    },
                    tf.TensorShape([]),
                ),
            )

        def get_dataset(self):
            self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))

            return self.dataset

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]


def convert_examples_to_features(
    examples: datasets.Dataset,
    max_length: int,
    tokenizer: PreTrainedTokenizer,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`
    """
    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        choices_inputs = []
        for ending_idx, ending in enumerate(example['endings']):
            context = example['context']
            inputs = tokenizer(
                context,
                ending,
                add_special_tokens=True,
                max_length=max_length,
                padding="max_length",
                truncation=True,
            )

            choices_inputs.append(inputs)

        label = example['label']

        input_ids = [x["input_ids"] for x in choices_inputs]
        attention_mask = (
            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
        )
        token_type_ids = (
            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
        )

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            )
        )

    for f in features[:2]:
        logger.info("*** Example ***")
        logger.info("feature: %s" % f)

    return features


In [None]:
#!/usr/bin/env python
# coding=utf-8
""" Finetuning models on CaseHOLD (e.g. Bert, RoBERTa, LEGAL-BERT)."""

import logging
import os
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
import random
import shutil
import glob

import transformers
from transformers import (
	AutoConfig,
	AutoModelForMultipleChoice,
	AutoTokenizer,
	EvalPrediction,
	HfArgumentParser,
	Trainer,
	TrainingArguments,
	set_seed,
)
from transformers.trainer_utils import is_main_process
from transformers import EarlyStoppingCallback
#from casehold_helpers import MultipleChoiceDataset, Split
from sklearn.metrics import f1_score
#from models.deberta import DebertaForMultipleChoice


logger = logging.getLogger(__name__)
output_dir = os.getcwd()

@dataclass
class ModelArguments:
	"""
	Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
	"""

	model_name_or_path: str = field(
		metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
	)
	config_name: Optional[str] = field(
		default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
	)
	tokenizer_name: Optional[str] = field(
		default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
	)
	cache_dir: Optional[str] = field(
		default=None,
		metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
	)


@dataclass
class DataTrainingArguments:
	"""
	Arguments pertaining to what data we are going to input our model for training and eval.
	"""

	task_name: str = field(default="case_hold", metadata={"help": "The name of the task to train on"})
	max_seq_length: int = field(
		default=256,
		metadata={
			"help": "The maximum total input sequence length after tokenization. Sequences longer "
			"than this will be truncated, sequences shorter will be padded."
		},
	)
	pad_to_max_length: bool = field(
		default=True,
		metadata={
			"help": "Whether to pad all samples to `max_seq_length`. "
			"If False, will pad the samples dynamically when batching to the maximum length in the batch."
		},
	)
	max_train_samples: Optional[int] = field(
		default=None,
		metadata={
			"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
			"value if set."
		},
	)
	max_eval_samples: Optional[int] = field(
		default=None,
		metadata={
			"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
			"value if set."
		},
	)
	max_predict_samples: Optional[int] = field(
		default=None,
		metadata={
			"help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
			"value if set."
		},
	)
	overwrite_cache: bool = field(
		default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
	)


def main():
	# See all possible arguments in src/transformers/training_args.py
	# or by passing the --help flag to this script.
	# We now keep distinct sets of args, for a cleaner separation of concerns.

	parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
	# Add custom arguments for computing pre-train loss
	parser.add_argument("--ptl", type=bool, default=False)
	model_args, data_args, training_args, custom_args = parser.parse_args_into_dataclasses()

	if (
		os.path.exists(training_args.output_dir)
		and os.listdir(training_args.output_dir)
		and training_args.do_train
		and not training_args.overwrite_output_dir
	):
		raise ValueError(
			f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
		)

	# Setup logging
	logging.basicConfig(
		format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
		datefmt="%m/%d/%Y %H:%M:%S",
		level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
	)
	logger.warning(
		"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
		training_args.local_rank,
		training_args.device,
		training_args.n_gpu,
		bool(training_args.local_rank != -1),
		training_args.fp16,
	)
	# Set the verbosity to info of the Transformers logger (on main process only):
	if is_main_process(training_args.local_rank):
		transformers.utils.logging.set_verbosity_info()
		transformers.utils.logging.enable_default_handler()
		transformers.utils.logging.enable_explicit_format()
	logger.info("Training/evaluation parameters %s", training_args)

	# Set seed
	set_seed(training_args.seed)

	# Load pretrained model and tokenizer
	config = AutoConfig.from_pretrained(
		model_args.config_name if model_args.config_name else model_args.model_name_or_path,
		num_labels=5,
		finetuning_task=data_args.task_name,
		cache_dir=model_args.cache_dir,
	)

	if config.model_type == 'big_bird':
		config.attention_type = 'original_full'
	elif config.model_type == 'longformer':
		config.attention_window = [data_args.max_seq_length] * config.num_hidden_layers

	tokenizer = AutoTokenizer.from_pretrained(
		model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
		cache_dir=model_args.cache_dir,
		# Default fast tokenizer is buggy on CaseHOLD task, switch to legacy tokenizer
		use_fast=True,
	)

	if config.model_type != 'deberta':
		model = AutoModelForMultipleChoice.from_pretrained(
			model_args.model_name_or_path,
			from_tf=bool(".ckpt" in model_args.model_name_or_path),
			config=config,
			cache_dir=model_args.cache_dir,
		)
	else:
		model = DebertaForMultipleChoice.from_pretrained(
			model_args.model_name_or_path,
			from_tf=bool(".ckpt" in model_args.model_name_or_path),
			config=config,
			cache_dir=model_args.cache_dir,
		)

	train_dataset = None
	eval_dataset = None

	# If do_train passed, train_dataset by default loads train split from file named train.csv in data directory
	if training_args.do_train:
		train_dataset = \
			MultipleChoiceDataset(
				tokenizer=tokenizer,
				task=data_args.task_name,
				max_seq_length=data_args.max_seq_length,
				overwrite_cache=data_args.overwrite_cache,
				mode=Split.train,
			)

	# If do_eval or do_predict passed, eval_dataset by default loads dev split from file named dev.csv in data directory
	if training_args.do_eval:
		eval_dataset = \
			MultipleChoiceDataset(
				tokenizer=tokenizer,
				task=data_args.task_name,
				max_seq_length=data_args.max_seq_length,
				overwrite_cache=data_args.overwrite_cache,
				mode=Split.dev,
			)

	if training_args.do_predict:
		predict_dataset = \
			MultipleChoiceDataset(
				tokenizer=tokenizer,
				task=data_args.task_name,
				max_seq_length=data_args.max_seq_length,
				overwrite_cache=data_args.overwrite_cache,
				mode=Split.test,
			)

	if training_args.do_train:
		if data_args.max_train_samples is not None:
			train_dataset = train_dataset[:data_args.max_train_samples]
		# Log a few random samples from the training set:
		for index in random.sample(range(len(train_dataset)), 3):
			logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

	if training_args.do_eval:
		if data_args.max_eval_samples is not None:
			eval_dataset = eval_dataset[:data_args.max_eval_samples]

	if training_args.do_predict:
		if data_args.max_predict_samples is not None:
			predict_dataset = predict_dataset[:data_args.max_predict_samples]

	# Define custom compute_metrics function, returns macro F1 metric for CaseHOLD task
	def compute_metrics(p: EvalPrediction):
		preds = np.argmax(p.predictions, axis=1)
		# Compute macro and micro F1 for 5-class CaseHOLD task
		macro_f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='macro', zero_division=0)
		micro_f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='micro', zero_division=0)
		return {'macro-f1': macro_f1, 'micro-f1': micro_f1}

	# Initialize our Trainer
	trainer = Trainer(
		model=model,
		args=training_args,
		train_dataset=train_dataset,
		eval_dataset=eval_dataset,
		compute_metrics=compute_metrics,
		callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
	)

	# Training
	if training_args.do_train:
		trainer.train(
			model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
		)
		trainer.save_model()
		# Re-save the tokenizer for model sharing
		if trainer.is_world_process_zero():
			tokenizer.save_pretrained(training_args.output_dir)

	# Evaluation on eval_dataset
	if training_args.do_eval:
		logger.info("*** Evaluate ***")
		metrics = trainer.evaluate(eval_dataset=eval_dataset)

		max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
		metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

		trainer.log_metrics("eval", metrics)
		trainer.save_metrics("eval", metrics)

	# Predict on eval_dataset
	if training_args.do_predict:
		logger.info("*** Predict ***")

		predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")

		max_predict_samples = (
			data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
		)
		metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

		trainer.log_metrics("predict", metrics)
		trainer.save_metrics("predict", metrics)

		output_predict_file = os.path.join(training_args.output_dir, "test_predictions.csv")
		if trainer.is_world_process_zero():
			with open(output_predict_file, "w") as writer:
				for index, pred_list in enumerate(predictions):
					pred_line = '\t'.join([f'{pred:.5f}' for pred in pred_list])
					writer.write(f"{index}\t{pred_line}\n")

	# Clean up checkpoints
	checkpoints = [filepath for filepath in glob.glob(f'{training_args.output_dir}/*/') if '/checkpoint' in filepath]
	for checkpoint in checkpoints:
		shutil.rmtree(checkpoint)


def _mp_fn(index):
	# For xla_spawn (TPUs)
	main()


if __name__ == "__main__":
	main()


usage: ipykernel_launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH
                             [--config_name CONFIG_NAME]
                             [--tokenizer_name TOKENIZER_NAME]
                             [--cache_dir CACHE_DIR] [--task_name TASK_NAME]
                             [--max_seq_length MAX_SEQ_LENGTH]
                             [--pad_to_max_length [PAD_TO_MAX_LENGTH]]
                             [--no_pad_to_max_length]
                             [--max_train_samples MAX_TRAIN_SAMPLES]
                             [--max_eval_samples MAX_EVAL_SAMPLES]
                             [--max_predict_samples MAX_PREDICT_SAMPLES]
                             [--overwrite_cache [OVERWRITE_CACHE]]
                             --output_dir OUTPUT_DIR
                             [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
                             [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]
                             [--do_predict [DO_PREDICT]]
   

SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import json
import random
import tqdm
from collections import Counter

# NOTE: The dataset has been first enriched with metadata from SEC-EDGAR
# to figure out the year of submission for the original filings. This
# part is missing from the script.

# Parse original (augmented) dataset
categories = []
with open('ledgar.jsonl') as file:
    for line in tqdm.tqdm(file.readlines()):
        data = json.loads(line)
        categories.extend(data['labels'])

# Find the top-100 labels.
categories = set([label for label, count in Counter(categories).most_common()[:100]])


# Subsample examples labeled with one of the top-100 labels.
with open('ledgar_small.jsonl', 'w') as out_file:
    with open('ledgar.jsonl') as file:
        for line in tqdm.tqdm(file.readlines()):
            data = json.loads(line)
            if set(data['labels']).intersection(categories):
                labels = set(data['labels']).intersection(categories)
                if len(labels) == 1:
                    data['labels'] = sorted(list(labels))
                    data.pop('clause_types', None)
                    out_file.write(json.dumps(data)+'\n')


# Organize examples in clusters by year
years = []
samples = {year: [] for year in ['2016', '2017', '2018', '2019']}
with open('ledgar_small.jsonl') as file:
    for line in tqdm.tqdm(file.readlines()):
        data = json.loads(line)
        years.append(data['year'])
        data.pop('filer_cik', None)
        data.pop('filer_name', None)
        data.pop('filer_state', None)
        data.pop('filer_industry', None)
        samples[data['year']].append(data)


# Write final dataset 60k/10k/10k
random.seed(1)
with open('ledgar.jsonl', 'w') as file:
    final_samples = random.sample(samples['2016'], 30000)
    final_samples += random.sample(samples['2017'], 30000)
    for sample in final_samples:
        sample['data_type'] = 'train'
        file.write(json.dumps(sample) + '\n')
    final_samples = random.sample(samples['2018'], 10000)
    for sample in final_samples:
        sample['data_type'] = 'dev'
        file.write(json.dumps(sample) + '\n')
    final_samples = random.sample(samples['2019'], 10000)
    for sample in final_samples:
        sample['data_type'] = 'test'
        file.write(json.dumps(sample) + '\n')


In [None]:

from datasets import load_dataset
dataset_dict = load_dataset("lex_glue",'ecthr_a')
#print(dataset)
#Divide into train,dev,test

from sklearn.model_selection import train_test_split

#data_list = list(dataset_dict.items())

train_set_dict = dataset_dict['train'].data
test_set_dict = dataset_dict['test'].data
validation_set_dict = dataset_dict['validation'].data
print(train_set_dict[0])

In [None]:
!pip install nlpaug
import random
import nlpaug.augmenter.word as naw
from datasets import load_dataset

# Initialize the augmentation object
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
dataset = load_dataset('lex_glue', 'unfair_tos')

# Get the training data
train_data = dataset['train']

# Augment the data
augmented_texts = []
augmented_labels = []

for example in train_data:
    text = example['text']
    label = example['labels']

    augmented_text = aug.augment(text)
    augmented_texts.append(augmented_text)
    augmented_labels.append(label)

# Combine original and augmented data
combined_data = list(zip(augmented_texts, augmented_labels))
random.shuffle(combined_data)
augmented_texts, augmented_labels = zip(*combined_data)

# Print some augmented examples
for text, label in zip(augmented_texts[:10], augmented_labels[:10]):
    print("Augmented Text:", text)
    print("Label:", label)
    print("=" * 50)


Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/410.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m337.9/410.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/32.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/511k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5532 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1607 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2275 [00:00<?, ? examples/s]

Augmented Text: ["18. 10 avc / h. 264 notice : if the software is used to make video calls ( i ) between a personal smartphone and a device that is not a personal computer with ( ii ) between devices that are not family computers, the avc / h. 264 technology may be used to facilitate video functionality in which case the following notice applies : the avc video functionality in this product remains reserved under any avc patent portfolio license for the personal and non - commercial use of a consumer to ( i ) encode video through compliance with the avc standard ( ` ` avc video'' ) and / or ( ii ) decode avc video that was encoded by a consumer engaged in a personal nor non - commercial activity and / or was obtained from a television provider licensed to provide avc video."]
Label: []
Augmented Text: ["don't post content that contains anything that, in under armour's original determination, appears objectionable or ban any other person from officially actively modifying the site, or t

In [None]:
for text, label in zip(augmented_texts[:10], augmented_labels[:10]):
    print("Augmented Text:", text)
    print("Label:", label)
    print("=" * 50)

NameError: ignored

In [None]:
import nltk
nltk.download('stopwords')
import pandas
import torch
from sklearn.utils import parallel_backend
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from datasets import load_dataset
import logging
import os
import argparse
import random
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np

dataset_n_classes = {'ecthr_a': 10, 'ecthr_b': 10, 'scotus': 14, 'eurlex': 100, 'ledgar': 100, 'unfair_tos': 8, 'case_hold': 5}

def load_augmented_dataset(dataset, percentage):
    split = 'train'
    split_data = dataset[split]['text']
    sample_size = int(len(split_data) * percentage)
    sampled_data = random.sample(split_data, sample_size)
    return sampled_data

def main():
    config = {
        'dataset': 'unfair_tos',
        'task_type': 'multi_label',
        'text_limit': -1
    }
    n_classes = 100
    cwd = os.getcwd()
    print(cwd)

    if not os.path.exists(f"logs/{config.get('dataset')}"):
        if not os.path.exists(f'logs'):
            os.mkdir(f'logs')
        os.mkdir(f"logs/{config.get('dataset')}")
    handlers = [logging.FileHandler(f"logs/{config.get('dataset')}_svm.txt"), logging.StreamHandler()]
    logging.basicConfig(handlers=handlers, level=logging.INFO)

    def get_text(dataset):
        if 'ecthr' in config.get('dataset'):
            texts = [' '.join(text) for text in dataset['text']]
            return [' '.join(text.split()[:config.get('text_limit')]) for text in texts]
        elif config.get('dataset') == 'case_hold':
            data = [[context] + endings for context, endings in zip(dataset['context'], dataset['endings'])]
            return pd.DataFrame(data=data,
                                columns=['context', 'option_1', 'option_2', 'option_3', 'options_4', 'option_5']
                                )
        else:
            return [' '.join(text.split()[:config.get('text_limit')]) for text in dataset['text']]

    def get_labels(dataset, mlb=None):
        if config.get('task_type') == 'multi_class':
            return dataset['label']
        else:
            return mlb.transform(dataset['labels']).tolist()

    def add_zero_class(labels):
        augmented_labels = np.zeros((len(labels), len(labels[0]) + 1), dtype=np.int32)
        augmented_labels[:, :-1] = labels
        augmented_labels[:, -1] = (np.sum(labels, axis=1) == 0).astype('int32')
        return augmented_labels

    scores = {'micro-f1': [], 'macro-f1': []}
    dataset = load_dataset('lex_glue', config.get('dataset'))
    from sklearn.model_selection import train_test_split

    dataset = load_dataset('lex_glue', config.get('dataset'))

    for seed in range(1, 6):
        if config.get('task_type') == 'multi_label':
            classifier = OneVsRestClassifier(LinearSVC(random_state=seed, max_iter=50000))
            parameters = {
                'vect__max_features': [10000, 20000, 40000],
                'clf__estimator__C': [0.1, 1, 10],
                'clf__estimator__loss': ('hinge', 'squared_hinge')
            }
        elif config.get('dataset') == 'case_hold':
            classifier = LinearSVC(random_state=seed, max_iter=50000)
            parameters = {
                'clf__C': [0.1, 1, 10],
                'clf__loss': ('hinge', 'squared_hinge')
            }
        else:
            classifier = LinearSVC(random_state=seed, max_iter=50000)
            parameters = {
                'vect__max_features': [10000, 20000, 40000],
                'clf__C': [0.1, 1, 10],
                'clf__loss': ('hinge', 'squared_hinge')
            }

        if config.get('dataset') == 'case_hold':
            text_clf = Pipeline([
                ('union', FeatureUnion([('context_tfidf',
                                         Pipeline([('extract_field',
                                                    FunctionTransformer(lambda x: x['context'], validate=False)),
                                                   ('vect', CountVectorizer(stop_words=stopwords.words('english'),
                                                                            ngram_range=(1, 3), min_df=5,
                                                                            max_features=40000)),
                                                   ('tfidf', TfidfTransformer())]))] +
                                       [(f'option_{idx}_tfidf',
                                         Pipeline([('extract_field',
                                                    FunctionTransformer(lambda x: x[f'option_{idx}'], validate=False)),
                                                   ('vect', CountVectorizer(stop_words=stopwords.words('english'),
                                                                            ngram_range=(1, 3), min_df=5,
                                                                            max_features=40000)),
                                                   ('tfidf', TfidfTransformer())]))
                                        for idx in range(1, 6)]
                                       )),
                ('clf', classifier)
            ])
        else:
            text_clf = Pipeline([('vect', CountVectorizer(stop_words=stopwords.words('english'),
                                                          ngram_range=(1, 3), min_df=5)),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf', classifier),
                                 ])

        split_index = [-1] * len(dataset['train']) + [0] * len(dataset['validation'])
        val_split = PredefinedSplit(test_fold=split_index)
        gs_clf = GridSearchCV(text_clf, parameters, cv=val_split, n_jobs=32, verbose=4, refit=False)
        x_train = get_text(dataset['train'])
        x_train_series = pd.Series(x_train)
        x_val = get_text(dataset['validation'])
        x_val_series = pd.Series(x_val)
        x_train_val = pd.concat([x_train_series, x_val_series])
        if config.get('task_type') == 'multi_label':
                mlb = MultiLabelBinarizer(classes=range(n_classes))
                mlb.fit(dataset['train']['labels'])
        else:
                mlb = None
                y_train = get_labels(dataset['train'], mlb)
                y_val = get_labels(dataset['validation'], mlb)
                y_train_val = y_train + y_val


        if config.get('dataset') == 'eurlex':
            svm_clf = SVC()
            param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.1, 1, 10]}
            batch_size = 1000
            total_samples = len(x_train_val)
            for i in range(0, total_samples, batch_size):
                x_batch = x_train_val[i:i+batch_size]
                y_batch = y_train_val[i:i+batch_size]
                gs_clf = GridSearchCV(svm_clf, param_grid, cv=5)
                gs_clf.fit(x_batch, y_batch)
            best_estimator = gs_clf.best_estimator_
        else:
            x_train = get_text(dataset['train'])
            x_train_series = pd.Series(x_train)
            x_val = get_text(dataset['validation'])
            x_val_series = pd.Series(x_val)
            x_train_val = pd.concat([x_train_series, x_val_series])
            if config.get('task_type') == 'multi_label':
                mlb = MultiLabelBinarizer(classes=range(n_classes))
                mlb.fit(dataset['train']['labels'])
            else:
                mlb = None
                y_train = get_labels(dataset['train'], mlb)
                y_val = get_labels(dataset['validation'], mlb)
                y_train_val = y_train + y_val

            gs_clf = gs_clf.fit(x_train_val, y_train_val)

        print('Best Parameters:')
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

        text_clf.set_params(**gs_clf.best_params_)
        gs_clf = text_clf.fit(x_train, y_train)

        print(config.get('dataset'))
        print('VALIDATION RESULTS:')
        y_pred = gs_clf.predict(get_text(dataset['validation']))
        y_true = get_labels(dataset["validation"], mlb)
        if config.get('task_type') == 'multi_label' and config.get('dataset') != 'eurlex':
            y_true = add_zero_class(y_true)
            y_pred = add_zero_class(y_pred)
        print(f'Accuracy: {metrics.accuracy_score(y_true, y_pred):.1%}')
        print(f'Micro-F1: {metrics.f1_score(y_true, y_pred, average="micro") * 100:.1f}')
        print(f'Macro-F1: {metrics.f1_score(y_true, y_pred, average="macro") * 100:.1f}')

        print('TEST RESULTS:')
        y_pred = gs_clf.predict(get_text(dataset['test']))
        y_true = get_labels(dataset["test"], mlb)
        if config.get('task_type') == 'multi_label' and config.get('dataset') != 'eurlex':
            y_true = add_zero_class(y_true)
            y_pred = add_zero_class(y_pred)
        print(f'Accuracy: {metrics.accuracy_score(y_true, y_pred):.1%}')
        print(f'Micro-F1: {metrics.f1_score(y_true, y_pred, average="micro") * 100:.1f}')
        print(f'Macro-F1: {metrics.f1_score(y_true, y_pred, average="macro") * 100:.1f}')
        scores['micro-f1'].append(metrics.f1_score(y_true, y_pred, average="micro"))
        scores['macro-f1'].append(metrics.f1_score(y_true, y_pred, average="macro"))

    print('-' * 100)
    print(
        f'Micro-F1: {np.mean(scores["micro-f1"]) * 100:.1f} +/- {np.std(scores["micro-f1"]) * 100:.1f}\t'
        f'Macro-F1: {np.mean(scores["macro-f1"]) * 100:.1f} +/- {np.std(scores["macro-f1"]) * 100:.1f}\t'
    )


if __name__ == '__main__':
    main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


/content


Downloading builder script:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/32.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/511k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5532 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1607 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2275 [00:00<?, ? examples/s]

UnboundLocalError: ignored

In [None]:
import os
contents = os.listdir()
print(contents)
import os

# Get the current working directory
current_dir = os.getcwd()

# Define the path to the logs directory
logs_dir = os.path.join(current_dir, 'logs')

# Check if the logs directory exists
if os.path.exists(logs_dir):
    # Get the list of files and directories inside the logs directory
    contents = os.listdir(logs_dir)

    # Display the contents
    print(contents)
else:
    print("Logs directory does not exist.")


    # Define the path to the case_hold_svm.txt file
file_path = os.path.join(logs_dir, 'case_hold_svm.txt')

# Check if the file exists
if os.path.isfile(file_path):
    # Open the file and read its contents
    with open(file_path, 'r') as file:
        file_contents = file.read()

    # Print the contents to the console
    print(file_contents)
else:
    print("case_hold_svm.txt does not exist.")

['.config', 'logs', 'sample_data']
['case_hold_svm.txt', 'case_hold']



In [None]:
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import numpy as np
from torch import nn
from transformers.file_utils import ModelOutput


@dataclass
class SimpleOutput(ModelOutput):
    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


def sinusoidal_init(num_embeddings: int, embedding_dim: int):
    # keep dim 0 for padding token position encoding zero vector
    position_enc = np.array([
        [pos / np.power(10000, 2 * i / embedding_dim) for i in range(embedding_dim)]
        if pos != 0 else np.zeros(embedding_dim) for pos in range(num_embeddings)])

    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return torch.from_numpy(position_enc).type(torch.FloatTensor)


class HierarchicalBert(nn.Module):

    def __init__(self, encoder, max_segments=64, max_segment_length=128):
        super(HierarchicalBert, self).__init__()
        supported_models = ['bert', 'roberta', 'deberta']
        assert encoder.config.model_type in supported_models  # other model types are not supported so far
        # Pre-trained segment (token-wise) encoder, e.g., BERT
        self.encoder = encoder
        # Specs for the segment-wise encoder
        self.hidden_size = encoder.config.hidden_size
        self.max_segments = max_segments
        self.max_segment_length = max_segment_length
        # Init sinusoidal positional embeddings
        self.seg_pos_embeddings = nn.Embedding(max_segments + 1, encoder.config.hidden_size,
                                               padding_idx=0,
                                               _weight=sinusoidal_init(max_segments + 1, encoder.config.hidden_size))
        # Init segment-wise transformer-based encoder
        self.seg_encoder = nn.Transformer(d_model=encoder.config.hidden_size,
                                          nhead=encoder.config.num_attention_heads,
                                          batch_first=True, dim_feedforward=encoder.config.intermediate_size,
                                          activation=encoder.config.hidden_act,
                                          dropout=encoder.config.hidden_dropout_prob,
                                          layer_norm_eps=encoder.config.layer_norm_eps,
                                          num_encoder_layers=2, num_decoder_layers=0).encoder

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,
                ):
        # Hypothetical Example
        # Batch of 4 documents: (batch_size, n_segments, max_segment_length) --> (4, 64, 128)
        # BERT-BASE encoder: 768 hidden units

        # Squash samples and segments into a single axis (batch_size * n_segments, max_segment_length) --> (256, 128)
        input_ids_reshape = input_ids.contiguous().view(-1, input_ids.size(-1))
        attention_mask_reshape = attention_mask.contiguous().view(-1, attention_mask.size(-1))
        if token_type_ids is not None:
            token_type_ids_reshape = token_type_ids.contiguous().view(-1, token_type_ids.size(-1))
        else:
            token_type_ids_reshape = None

        # Encode segments with BERT --> (256, 128, 768)
        encoder_outputs = self.encoder(input_ids=input_ids_reshape,
                                       attention_mask=attention_mask_reshape,
                                       token_type_ids=token_type_ids_reshape)[0]

        # Reshape back to (batch_size, n_segments, max_segment_length, output_size) --> (4, 64, 128, 768)
        encoder_outputs = encoder_outputs.contiguous().view(input_ids.size(0), self.max_segments,
                                                            self.max_segment_length,
                                                            self.hidden_size)

        # Gather CLS outputs per segment --> (4, 64, 768)
        encoder_outputs = encoder_outputs[:, :, 0]

        # Infer real segments, i.e., mask paddings
        seg_mask = (torch.sum(input_ids, 2) != 0).to(input_ids.dtype)
        # Infer and collect segment positional embeddings
        seg_positions = torch.arange(1, self.max_segments + 1).to(input_ids.device) * seg_mask
        # Add segment positional embeddings to segment inputs
        encoder_outputs += self.seg_pos_embeddings(seg_positions)

        # Encode segments with segment-wise transformer
        seg_encoder_outputs = self.seg_encoder(encoder_outputs)

        # Collect document representation
        outputs, _ = torch.max(seg_encoder_outputs, 1)

        return SimpleOutput(last_hidden_state=outputs, hidden_states=outputs)


if __name__ == "__main__":
    from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Use as a stand-alone encoder
    bert = AutoModel.from_pretrained('bert-base-uncased')
    model = HierarchicalBert(encoder=bert, max_segments=64, max_segment_length=128)

    fake_inputs = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
    for i in range(4):
        # Tokenize segment
        temp_inputs = tokenizer(['dog ' * 126] * 64)
        fake_inputs['input_ids'].append(temp_inputs['input_ids'])
        fake_inputs['attention_mask'].append(temp_inputs['attention_mask'])
        fake_inputs['token_type_ids'].append(temp_inputs['token_type_ids'])

    fake_inputs['input_ids'] = torch.as_tensor(fake_inputs['input_ids'])
    fake_inputs['attention_mask'] = torch.as_tensor(fake_inputs['attention_mask'])
    fake_inputs['token_type_ids'] = torch.as_tensor(fake_inputs['token_type_ids'])

    output = model(fake_inputs['input_ids'], fake_inputs['attention_mask'], fake_inputs['token_type_ids'])

    # 4 document representations of 768 features are expected
    assert output[0].shape == torch.Size([4, 768])

    # Use with HuggingFace AutoModelForSequenceClassification and Trainer API

    # Init Classifier
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=10)
    # Replace flat BERT encoder with hierarchical BERT encoder
    model.bert = HierarchicalBert(encoder=model.bert, max_segments=64, max_segment_length=128)
    output = model(fake_inputs['input_ids'], fake_inputs['attention_mask'], fake_inputs['token_type_ids'])

    # 4 document outputs with 10 (num_labels) logits are expected
    assert output.logits.shape == torch.Size([4, 10])



Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

# New section

In [None]:
import torch
from torch import nn
from transformers import DebertaPreTrainedModel, DebertaModel
from transformers.modeling_outputs import SequenceClassifierOutput, MultipleChoiceModelOutput
from transformers.activations import ACT2FN


class ContextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
        self.dropout = StableDropout(config.pooler_dropout)
        self.config = config

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.

        context_token = hidden_states[:, 0]
        context_token = self.dropout(context_token)
        pooled_output = self.dense(context_token)
        pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
        return pooled_output

    @property
    def output_dim(self):
        return self.config.hidden_size


class DropoutContext(object):
    def __init__(self):
        self.dropout = 0
        self.mask = None
        self.scale = 1
        self.reuse_mask = True


def get_mask(input, local_context):
    if not isinstance(local_context, DropoutContext):
        dropout = local_context
        mask = None
    else:
        dropout = local_context.dropout
        dropout *= local_context.scale
        mask = local_context.mask if local_context.reuse_mask else None

    if dropout > 0 and mask is None:
        mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool()

    if isinstance(local_context, DropoutContext):
        if local_context.mask is None:
            local_context.mask = mask

    return mask, dropout


class XDropout(torch.autograd.Function):
    """Optimized dropout function to save computation and memory by using mask operation instead of multiplication."""

    @staticmethod
    def forward(ctx, input, local_ctx):
        mask, dropout = get_mask(input, local_ctx)
        ctx.scale = 1.0 / (1 - dropout)
        if dropout > 0:
            ctx.save_for_backward(mask)
            return input.masked_fill(mask, 0) * ctx.scale
        else:
            return input

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.scale > 1:
            (mask,) = ctx.saved_tensors
            return grad_output.masked_fill(mask, 0) * ctx.scale, None
        else:
            return grad_output, None


class StableDropout(nn.Module):
    """
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    """

    def __init__(self, drop_prob):
        super().__init__()
        self.drop_prob = drop_prob
        self.count = 0
        self.context_stack = None

    def forward(self, x):
        """
        Call the module

        Args:
            x (:obj:`torch.tensor`): The input tensor to apply dropout
        """
        if self.training and self.drop_prob > 0:
            return XDropout.apply(x, self.get_context())
        return x

    def clear_context(self):
        self.count = 0
        self.context_stack = None

    def init_context(self, reuse_mask=True, scale=1):
        if self.context_stack is None:
            self.context_stack = []
        self.count = 0
        for c in self.context_stack:
            c.reuse_mask = reuse_mask
            c.scale = scale

    def get_context(self):
        if self.context_stack is not None:
            if self.count >= len(self.context_stack):
                self.context_stack.append(DropoutContext())
            ctx = self.context_stack[self.count]
            ctx.dropout = self.drop_prob
            self.count += 1
            return ctx
        else:
            return self.drop_prob


class DebertaForSequenceClassification(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        num_labels = getattr(config, "num_labels", 2)
        self.num_labels = num_labels

        self.deberta = DebertaModel(config)

        self.classifier = nn.Linear(config.hidden_size, num_labels)
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = nn.Dropout(drop_out)

        self.init_weights()

    def get_input_embeddings(self):
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.deberta.set_input_embeddings(new_embeddings)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # regression task
                loss_fn = nn.MSELoss()
                logits = logits.view(-1).to(labels.dtype)
                loss = loss_fn(logits, labels.view(-1))
            elif labels.dim() == 1 or labels.size(-1) == 1:
                label_index = (labels >= 0).nonzero()
                labels = labels.long()
                if label_index.size(0) > 0:
                    labeled_logits = torch.gather(logits, 0, label_index.expand(label_index.size(0), logits.size(1)))
                    labels = torch.gather(labels, 0, label_index.view(-1))
                    loss_fct = nn.CrossEntropyLoss()
                    loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1))
                else:
                    loss = torch.tensor(0).to(logits)
            else:
                log_softmax = nn.LogSoftmax(-1)
                loss = -((log_softmax(logits) * labels).sum(-1)).mean()
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output
        else:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )


class DebertaForMultipleChoice(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.deberta = DebertaModel(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)
        self.classifier = nn.Linear(output_dim, 1)

        self.init_weights()

    def forward(
            self,
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            inputs_embeds=None,
            labels=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
            :obj:`input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



In [None]:
!pip install transformers[torch] accelerate

In [None]:
from datasets import load_dataset
dataset_dict = load_dataset("lex_glue", runtime_args('scotus','bert'))
#print(dataset)
#Divide into train,dev,test

from sklearn.model_selection import train_test_split

#data_list = list(dataset_dict.items())

train_set_dict = dataset_dict['train'].data
test_set_dict = dataset_dict['test'].data
validation_set_dict = dataset_dict['validation'].data
print(train_set_dict[0])

FileNotFoundError: ignored

# New section

In [None]:
# coding=utf-8
""" Finetuning models on SCOTUS (e.g. Bert, RoBERTa, LEGAL-BERT)."""

import logging
import os
import random
import re
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
from datasets import load_dataset
from sklearn.metrics import f1_score

import numpy as np
from torch import nn
import glob
import shutil
import torch
import transformers
from transformers import (
    Trainer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
#from models.deberta import DebertaForSequenceClassification

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.9.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

logger = logging.getLogger(__name__)


#desired_max_split_size_mb = 100

from transformers import AutoModel, AutoTokenizer

# First, load the tokenizer and pre-trained BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')

# Then, create an instance of HierarchicalBert
max_segments = 64
max_segment_length = 128
HierarchicalBertObj = HierarchicalBert(encoder=bert_model, max_segments=max_segments, max_segment_length=max_segment_length)
# Set the environment variable PYTORCH_CUDA_ALLOC_CONF with the desired value
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb=256,512,1024"
torch.cuda.empty_cache()
print(torch.cuda.memory_allocated())  # Memory allocated on GPU 0
print(torch.cuda.max_memory_allocated())  # Peak memory allocated on GPU 0
#max_split_size_mb = torch.cuda.memory._get_max_memory_allocated() / (1024.0 * 1024.0)

#print("max_split_size_mb:", max_split_size_mb)


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_segments: Optional[int] = field(
        default=64,
        metadata={
            "help": "The maximum number of segments (paragraphs) to be considered. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_seg_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    server_ip: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})
    server_port: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    hierarchical: bool = field(
        default=True, metadata={"help": "Whether to use a hierarchical variant or not"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    do_lower_case: Optional[bool] = field(
        default=True,
        metadata={"help": "arg to indicate if tokenizer should do lower case in AutoTokenizer.from_pretrained()"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )


def main(training_args):
    # Set default values for arguments
    model_args = ModelArguments(
        model_name_or_path="microsoft/deberta-base",
        hierarchical=True,
        do_lower_case=True,
        use_fast_tokenizer=True,
    )
    data_args = DataTrainingArguments(
        max_seq_length=128,
        max_segments=64,
        max_seg_length=128,
        overwrite_cache=False,
        pad_to_max_length=True,
    )


    # Fix boolean parameter
    if model_args.do_lower_case == 'False' or not model_args.do_lower_case:
        model_args.do_lower_case = False
    else:
        model_args.do_lower_case = True

    if model_args.hierarchical == 'False' or not model_args.hierarchical:
        model_args.hierarchical = False
    else:
        model_args.hierarchical = True

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # Downloading and loading eurlex dataset from the hub.
    if training_args.do_train:
        train_dataset = load_dataset("lex_glue", "scotus", split="train", cache_dir=model_args.cache_dir)

    if training_args.do_eval:
        eval_dataset = load_dataset("lex_glue", "scotus", split="validation", cache_dir=model_args.cache_dir)

    if training_args.do_predict:
        predict_dataset = load_dataset("lex_glue", "scotus", split="test", cache_dir=model_args.cache_dir)

    # Labels
    label_list = list(range(14))
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task="scotus",
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        do_lower_case=model_args.do_lower_case,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    if config.model_type == 'deberta' and model_args.hierarchical:
        model = DebertaForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    if model_args.hierarchical:
        # Hack the classifier encoder to use hierarchical BERT
        if config.model_type in ['bert', 'deberta']:
            if config.model_type == 'bert':
                segment_encoder = model.bert
            else:
                segment_encoder = model.deberta
            model_encoder = HierarchicalBert(encoder=segment_encoder,
                                             max_segments=data_args.max_segments,
                                             max_segment_length=data_args.max_seg_length)
            if config.model_type == 'bert':
                model.bert = model_encoder
            elif config.model_type == 'deberta':
                model.deberta = model_encoder
            else:
                raise NotImplementedError(f"{config.model_type} is no supported yet!")
        elif config.model_type == 'roberta':
            model_encoder = HierarchicalBert(encoder=model.roberta, max_segments=data_args.max_segments,
                                             max_segment_length=data_args.max_seg_length)
            model.roberta = model_encoder
            # Build a new classification layer, as well
            dense = nn.Linear(config.hidden_size, config.hidden_size)
            dense.load_state_dict(model.classifier.dense.state_dict())  # load weights
            dropout = nn.Dropout(config.hidden_dropout_prob).to(model.device)
            out_proj = nn.Linear(config.hidden_size, config.num_labels).to(model.device)
            out_proj.load_state_dict(model.classifier.out_proj.state_dict())  # load weights
            model.classifier = nn.Sequential(dense, dropout, out_proj).to(model.device)
        elif config.model_type in ['longformer', 'big_bird']:
            pass
        else:
            raise NotImplementedError(f"{config.model_type} is no supported yet!")

    # Preprocessing the datasets
    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    def preprocess_function(examples):
        # Tokenize the texts
        if model_args.hierarchical:
            case_template = [[0] * data_args.max_seq_length]
            if config.model_type == 'roberta':
                batch = {'input_ids': [], 'attention_mask': []}
                for doc in examples['text']:
                    doc = re.split('\n{2,}', doc)
                    doc_encodings = tokenizer(doc[:data_args.max_segments], padding=padding,
                                              max_length=data_args.max_seg_length, truncation=True)
                    batch['input_ids'].append(doc_encodings['input_ids'] + case_template * (
                            data_args.max_segments - len(doc_encodings['input_ids'])))
                    batch['attention_mask'].append(doc_encodings['attention_mask'] + case_template * (
                            data_args.max_segments - len(doc_encodings['attention_mask'])))
            else:
                batch = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
                for doc in examples['text']:
                    doc = re.split('\n{2,}', doc)
                    doc_encodings = tokenizer(doc[:data_args.max_segments], padding=padding,
                                              max_length=data_args.max_seg_length, truncation=True)
                    batch['input_ids'].append(doc_encodings['input_ids'] + case_template * (
                                data_args.max_segments - len(doc_encodings['input_ids'])))
                    batch['attention_mask'].append(doc_encodings['attention_mask'] + case_template * (
                                data_args.max_segments - len(doc_encodings['attention_mask'])))
                    batch['token_type_ids'].append(doc_encodings['token_type_ids'] + case_template * (
                                data_args.max_segments - len(doc_encodings['token_type_ids'])))
        elif config.model_type in ['longformer', 'big_bird']:
            cases = []
            max_position_embeddings = config.max_position_embeddings - 2 if config.model_type == 'longformer' \
                else config.max_position_embeddings
            for doc in examples['text']:
                doc = re.split('\n{2,}', doc)
                cases.append(f' {tokenizer.sep_token} '.join([' '.join(paragraph.split()[:data_args.max_seg_length])
                                                              for paragraph in doc[:data_args.max_segments]]))
            batch = tokenizer(cases, padding=padding, max_length=max_position_embeddings, truncation=True)
            if config.model_type == 'longformer':
                global_attention_mask = np.zeros((len(cases), max_position_embeddings), dtype=np.int32)
                # global attention on cls token
                global_attention_mask[:, 0] = 1
                batch['global_attention_mask'] = list(global_attention_mask)
        else:
            batch = tokenizer(examples['text'], padding=padding, max_length=512, truncation=True)

        batch["label"] = [label_list.index(labels) for labels in examples["label"]]

        return batch

    if training_args.do_train:
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    if training_args.do_eval:
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )

    if training_args.do_predict:
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
            predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(logits, axis=1)
        macro_f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='macro', zero_division=0)
        micro_f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='micro', zero_division=0)
        return {'macro-f1': macro_f1, 'micro-f1': micro_f1}

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")

        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        output_predict_file = os.path.join(training_args.output_dir, "test_predictions.csv")
        if trainer.is_world_process_zero():
            with open(output_predict_file, "w") as writer:
                for index, pred_list in enumerate(predictions[0]):
                    pred_line = '\t'.join([f'{pred:.5f}' for pred in pred_list])
                    writer.write(f"{index}\t{pred_line}\n")

    # Clean up checkpoints
    checkpoints = [filepath for filepath in glob.glob(f'{training_args.output_dir}/*/') if '/checkpoint' in filepath]
    for checkpoint in checkpoints:
        shutil.rmtree(checkpoint)


if __name__ == "__main__":
    #main()

#For training

    training_args = TrainingArguments(
        do_train = True,
        do_eval = False,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end = True,
        metric_for_best_model="macro-f1",
    )
    #main(training_args)

# For Validation
    training_args = TrainingArguments(
        do_train = False,
        do_eval = True,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end = True,
        metric_for_best_model="macro-f1",
    )
    #main(training_args)

    # For Evaluation
    training_args = TrainingArguments(
        do_train = True,
        do_eval = True,
        do_predict = True,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end = True,
        metric_for_best_model="macro-f1",
    )
    main(training_args)



40683995648
40683995648


You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on prediction dataset:   0%|          | 0/1400 [00:00<?, ? examples/s]

OutOfMemoryError: ignored

In [None]:
#!/usr/bin/env python
# coding=utf-8
""" Finetuning models on the ECtHR dataset (e.g. Bert, RoBERTa, LEGAL-BERT)."""

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
from datasets import load_dataset
from sklearn.metrics import f1_score
#from trainer import MultilabelTrainer
from scipy.special import expit
from torch import nn
import glob
import shutil
import torch
torch.cuda.empty_cache()
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
#from models.hierbert import HierarchicalBert
#from models.deberta import DebertaForSequenceClassification


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.9.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

logger = logging.getLogger(__name__)

from transformers import AutoModel, AutoTokenizer

# First, load the tokenizer and pre-trained BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')

# Then, create an instance of HierarchicalBert
max_segments = 64
max_segment_length = 128
HierarchicalBertObj = HierarchicalBert(encoder=bert_model, max_segments=max_segments, max_segment_length=max_segment_length)
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb=256,512,1024"

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    max_seq_length: Optional[int] = field(
        default=4096,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_segments: Optional[int] = field(
        default=64,
        metadata={
            "help": "The maximum number of segments (paragraphs) to be considered. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_seg_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum segment (paragraph) length to be considered. Segments longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    task: Optional[str] = field(
        default='ecthr_b',
        metadata={
            "help": "Define downstream task"
        },
    )
    server_ip: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})
    server_port: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    hierarchical: bool = field(
        default=True, metadata={"help": "Whether to use a hierarchical variant or not"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    do_lower_case: Optional[bool] = field(
        default=True,
        metadata={"help": "arg to indicate if tokenizer should do lower case in AutoTokenizer.from_pretrained()"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )


def main(training_args):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    model_args = ModelArguments(
        model_name_or_path="nlpaueb/legal-bert-base-uncased",
        hierarchical=True,
        do_lower_case=True,
        use_fast_tokenizer=True,
    )
    data_args = DataTrainingArguments(
        max_seq_length=128,
        max_segments=64,
        max_seg_length=128,
        overwrite_cache=False,
        pad_to_max_length=True,
    )


    # Fix boolean parameter
    if model_args.do_lower_case == 'False' or not model_args.do_lower_case:
        model_args.do_lower_case = False
    else:
        model_args.do_lower_case = True

    if model_args.hierarchical == 'False' or not model_args.hierarchical:
        model_args.hierarchical = False
    else:
        model_args.hierarchical = True

    # Setup distant debugging if needed
    if data_args.server_ip and data_args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(data_args.server_ip, data_args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # Downloading and loading eurlex dataset from the hub.
    if training_args.do_train:
        train_dataset = load_dataset("lex_glue", name=data_args.task, split="train", data_dir='data', cache_dir=model_args.cache_dir)

    if training_args.do_eval:
        eval_dataset = load_dataset("lex_glue", name=data_args.task, split="validation", data_dir='data', cache_dir=model_args.cache_dir)

    if training_args.do_predict:
        predict_dataset = load_dataset("lex_glue", name=data_args.task, split="test", data_dir='data', cache_dir=model_args.cache_dir)

    # Labels
    label_list = list(range(10))
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=f"{data_args.task}",
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        do_lower_case=model_args.do_lower_case,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    if config.model_type == 'deberta' and model_args.hierarchical:
        model = DebertaForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )

    if model_args.hierarchical:
        # Hack the classifier encoder to use hierarchical BERT
        if config.model_type in ['bert', 'deberta']:
            if config.model_type == 'bert':
                segment_encoder = model.bert
            else:
                segment_encoder = model.deberta
            model_encoder = HierarchicalBert(encoder=segment_encoder,
                                             max_segments=data_args.max_segments,
                                             max_segment_length=data_args.max_seg_length)
            if config.model_type == 'bert':
                model.bert = model_encoder
            elif config.model_type == 'deberta':
                model.deberta = model_encoder
            else:
                raise NotImplementedError(f"{config.model_type} is no supported yet!")
        elif config.model_type == 'roberta':
            model_encoder = HierarchicalBert(encoder=model.roberta, max_segments=data_args.max_segments,
                                             max_segment_length=data_args.max_seg_length)
            model.roberta = model_encoder
            # Build a new classification layer, as well
            dense = nn.Linear(config.hidden_size, config.hidden_size)
            dense.load_state_dict(model.classifier.dense.state_dict())  # load weights
            dropout = nn.Dropout(config.hidden_dropout_prob).to(model.device)
            out_proj = nn.Linear(config.hidden_size, config.num_labels).to(model.device)
            out_proj.load_state_dict(model.classifier.out_proj.state_dict())  # load weights
            model.classifier = nn.Sequential(dense, dropout, out_proj).to(model.device)
        elif config.model_type in ['longformer', 'big_bird']:
            pass
        else:
            raise NotImplementedError(f"{config.model_type} is no supported yet!")

    # Preprocessing the datasets
    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    def preprocess_function(examples):
        # Tokenize the texts
        if model_args.hierarchical:
            case_template = [[0] * data_args.max_seg_length]
            if config.model_type == 'roberta':
                batch = {'input_ids': [], 'attention_mask': []}
                for case in examples['text']:
                    case_encodings = tokenizer(case[:data_args.max_segments], padding=padding,
                                               max_length=data_args.max_seg_length, truncation=True)
                    batch['input_ids'].append(case_encodings['input_ids'] + case_template * (
                                data_args.max_segments - len(case_encodings['input_ids'])))
                    batch['attention_mask'].append(case_encodings['attention_mask'] + case_template * (
                                data_args.max_segments - len(case_encodings['attention_mask'])))
            else:
                batch = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
                for case in examples['text']:
                    case_encodings = tokenizer(case[:data_args.max_segments], padding=padding,
                                               max_length=data_args.max_seg_length, truncation=True)
                    batch['input_ids'].append(case_encodings['input_ids'] + case_template * (
                            data_args.max_segments - len(case_encodings['input_ids'])))
                    batch['attention_mask'].append(case_encodings['attention_mask'] + case_template * (
                            data_args.max_segments - len(case_encodings['attention_mask'])))
                    batch['token_type_ids'].append(case_encodings['token_type_ids'] + case_template * (
                            data_args.max_segments - len(case_encodings['token_type_ids'])))
        elif config.model_type in ['longformer', 'big_bird']:
            cases = []
            max_position_embeddings = config.max_position_embeddings - 2 if config.model_type == 'longformer' \
                else config.max_position_embeddings
            for case in examples['text']:
                cases.append(f' {tokenizer.sep_token} '.join(
                    [' '.join(fact.split()[:data_args.max_seg_length]) for fact in case[:data_args.max_segments]]))
            batch = tokenizer(cases, padding=padding, max_length=max_position_embeddings, truncation=True)
            if config.model_type == 'longformer':
                global_attention_mask = np.zeros((len(cases), max_position_embeddings), dtype=np.int32)
                # global attention on cls token
                global_attention_mask[:, 0] = 1
                batch['global_attention_mask'] = list(global_attention_mask)
        else:
            cases = []
            for case in examples['text']:
                cases.append(f'\n'.join(case))
            batch = tokenizer(cases, padding=padding, max_length=512, truncation=True)

        batch["labels"] = [[1 if label in labels else 0 for label in label_list] for labels in examples["labels"]]

        return batch

    if training_args.do_train:
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    if training_args.do_eval:
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )

    if training_args.do_predict:
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
            predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        # Fix gold labels
        y_true = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
        y_true[:, :-1] = p.label_ids
        y_true[:, -1] = (np.sum(p.label_ids, axis=1) == 0).astype('int32')
        # Fix predictions
        logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = (expit(logits) > 0.5).astype('int32')
        y_pred = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
        y_pred[:, :-1] = preds
        y_pred[:, -1] = (np.sum(preds, axis=1) == 0).astype('int32')
        # Compute scores
        macro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division=0)
        micro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
        return {'macro-f1': macro_f1, 'micro-f1': micro_f1}

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = MultilabelTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")

        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        output_predict_file = os.path.join(training_args.output_dir, "test_predictions.csv")
        if trainer.is_world_process_zero():
            with open(output_predict_file, "w") as writer:
                for index, pred_list in enumerate(predictions[0]):
                    pred_line = '\t'.join([f'{pred:.5f}' for pred in pred_list])
                    writer.write(f"{index}\t{pred_line}\n")

    # Clean up checkpoints
    checkpoints = [filepath for filepath in glob.glob(f'{training_args.output_dir}/*/') if '/checkpoint' in filepath]
    for checkpoint in checkpoints:
        shutil.rmtree(checkpoint)


if __name__ == "__main__":
    #For training

    training_args = TrainingArguments(
        do_train = True,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=8,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end = True,
        metric_for_best_model="macro-f1",
    )
    #main(training_args)

# For Validation
    training_args = TrainingArguments(
        do_train = False,
        do_eval = True,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=8,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end = True,
        metric_for_best_model="macro-f1",
    )
    #main(training_args)

    # For Evaluation
    training_args = TrainingArguments(
        do_train = True,
        do_eval = True,
        do_predict = True,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end = True,
        metric_for_best_model="micro-f1",
    )
    main(training_args)




Downloading builder script:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/32.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/32.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on train dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Running tokenizer on validation dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Running tokenizer on prediction dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,0.2405,0.270479,0.135584,0.313827


In [None]:

!pip install sentencepiece
! pip install torch
! pip install transformers
! pip install scikit-learn
! pip install tqdm
! pip install numpy
! pip install datasets
! pip install nltk
import nltk
nltk.download('stopwords')
! pip install scipy
! pip install transformers[torch] accelerate
#! pip install transformers-cli


Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0
[31mERROR: Could not find a version that satisfies the requirement transformers-cli (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for transformers-cli[0m[31m
[0m

In [None]:
!pip install nlpaug
import random
import nlpaug.augmenter.word as naw
from datasets import load_dataset

# Initialize the augmentation object
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
dataset = load_dataset('lex_glue', 'unfair_tos')

# Get the training data
train_data = dataset['train']

# Define batch size
batch_size = 32

# Augment the data
augmented_texts = []
augmented_labels = []

for i in range(0, len(train_data), batch_size):
    batch = train_data[i:i+batch_size]
    batch_texts = batch['text']  # Access the 'text' key of each batch element
    batch_labels = batch['labels']  # Access the 'labels' key of each batch element

    augmented_batch = aug.augment(batch_texts)
    augmented_texts.extend(augmented_batch)
    augmented_labels.extend(batch_labels)

# Combine original and augmented data
combined_data = list(zip(augmented_texts, augmented_labels))
random.shuffle(combined_data)
augmented_texts, augmented_labels = zip(*combined_data)

# Print some augmented examples
for text, label in zip(augmented_texts[:10], augmented_labels[:10]):
    print("Augmented Text:", text)
    print("Label:", label)
    print("=" * 50)




In [None]:
#!/usr/bin/env python
# coding=utf-8
""" Finetuning models on UNFAIR-ToC (e.g. Bert, RoBERTa, LEGAL-BERT)."""

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
from datasets import load_dataset
from sklearn.metrics import f1_score
#from trainer import MultilabelTrainer
from scipy.special import expit
import glob
import shutil
import numpy as np

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.9.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

logger = logging.getLogger(__name__)


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    server_ip: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})
    server_port: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    do_lower_case: Optional[bool] = field(
        default=True,
        metadata={"help": "arg to indicate if tokenizer should do lower case in AutoTokenizer.from_pretrained()"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )


def main(training_args):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    model_args = ModelArguments(
        model_name_or_path="nlpaueb/legal-bert-base-uncased",
        #hierarchical=True,
        do_lower_case=True,
        use_fast_tokenizer=True,
    )
    data_args = DataTrainingArguments(
        max_seq_length=128,
        #max_segments=64,
        #max_seg_length=128,
        overwrite_cache=False,
        pad_to_max_length=True,
    )


    # Setup distant debugging if needed
    if data_args.server_ip and data_args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(data_args.server_ip, data_args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Fix boolean parameter
    if model_args.do_lower_case == 'False' or not model_args.do_lower_case:
        model_args.do_lower_case = False
        'Tokenizer do_lower_case False'
    else:
        model_args.do_lower_case = True

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # Downloading and loading eurlex dataset from the hub.
    if training_args.do_train:
        train_dataset = load_dataset("lex_glue", "unfair_tos", split="train", data_dir='data', cache_dir=model_args.cache_dir)

    if training_args.do_eval:
        eval_dataset = load_dataset("lex_glue", "unfair_tos", split="validation", data_dir='data', cache_dir=model_args.cache_dir)

    if training_args.do_predict:
        predict_dataset = load_dataset("lex_glue", "unfair_tos", split="test", data_dir='data', cache_dir=model_args.cache_dir)

    # Labels
    label_list = list(range(8))
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task="unfair_toc",
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    if config.model_type == 'big_bird':
        config.attention_type = 'original_full'

    if config.model_type == 'longformer':
        config.attention_window = [128] * config.num_hidden_layers

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        do_lower_case=model_args.do_lower_case,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Preprocessing the datasets
    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    def preprocess_function(examples):
        # Tokenize the texts
        batch = tokenizer(
            examples["text"],
            padding=padding,
            max_length=data_args.max_seq_length,
            truncation=True,
        )
        batch["labels"] = [[1 if label in labels else 0 for label in label_list] for labels in
                              examples["labels"]]

        return batch

    if training_args.do_train:
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    if training_args.do_eval:
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )

    if training_args.do_predict:
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
            predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        # Fix gold labels
        y_true = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
        y_true[:, :-1] = p.label_ids
        y_true[:, -1] = (np.sum(p.label_ids, axis=1) == 0).astype('int32')
        # Fix predictions
        logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = (expit(logits) > 0.5).astype('int32')
        y_pred = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
        y_pred[:, :-1] = preds
        y_pred[:, -1] = (np.sum(preds, axis=1) == 0).astype('int32')
        # Compute scores
        macro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division=0)
        micro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
        return {'macro-f1': macro_f1, 'micro-f1': micro_f1}

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = MultilabelTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")

        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        output_predict_file = os.path.join(training_args.output_dir, "test_predictions.csv")
        if trainer.is_world_process_zero():
            with open(output_predict_file, "w") as writer:
                for index, pred_list in enumerate(predictions[0]):
                    pred_line = '\t'.join([f'{pred:.5f}' for pred in pred_list.tolist()])
                    writer.write(f"{index}\t{pred_line}\n")

    # Clean up checkpoints
    checkpoints = [filepath for filepath in glob.glob(f'{training_args.output_dir}/*/') if '/checkpoint' in filepath]
    for checkpoint in checkpoints:
        shutil.rmtree(checkpoint)


if __name__ == "__main__":
 #    For Evaluation
    training_args = TrainingArguments(
        do_train = True,
        do_eval = True,
        do_predict = True,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=200,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end = True,
        metric_for_best_model="macro-f1",
    )
    main(training_args)


You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on validation dataset:   0%|          | 0/2275 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,0.0764,0.070431,0.105195,0.895163
1000,0.0674,0.054441,0.105195,0.895163


***** train metrics *****
  epoch                    =        2.0
  total_flos               =   677820GF
  train_loss               =     0.0779
  train_runtime            = 0:01:44.13
  train_samples            =       5532
  train_samples_per_second =    106.249
  train_steps_per_second   =     13.291


***** eval metrics *****
  epoch                   =        2.0
  eval_loss               =     0.0704
  eval_macro-f1           =     0.1052
  eval_micro-f1           =     0.8952
  eval_runtime            = 0:00:05.10
  eval_samples            =       2275
  eval_samples_per_second =    445.227
  eval_steps_per_second   =     55.776
***** predict metrics *****
  predict_loss               =     0.0734
  predict_macro-f1           =     0.1048
  predict_micro-f1           =     0.8891
  predict_runtime            = 0:00:03.91
  predict_samples            =       1607
  predict_samples_per_second =    410.974
  predict_steps_per_second   =     51.404


TypeError: ignored

In [None]:
pip install --upgrade transformers



In [None]:
#!/usr/bin/env python
# coding=utf-8
""" Finetuning models on LEDGAR (e.g. Bert, RoBERTa, LEGAL-BERT)."""

import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
from datasets import load_dataset
from sklearn.metrics import f1_score
import numpy as np
import glob
import shutil

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
    Trainer
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.9.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

logger = logging.getLogger(__name__)


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    max_seq_length: Optional[int] = field(
        default=512,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    server_ip: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})
    server_port: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    do_lower_case: Optional[bool] = field(
        default=True,
        metadata={"help": "arg to indicate if tokenizer should do lower case in AutoTokenizer.from_pretrained()"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )


def main(training_args):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    model_args = ModelArguments(
        model_name_or_path="nlpaueb/legal-bert-base-uncased",
        #hierarchical=True,
        do_lower_case=True,
        use_fast_tokenizer=True,
    )
    data_args = DataTrainingArguments(
        max_seq_length=128,
        #max_segments=64,
        #max_seg_length=128,
        overwrite_cache=False,
        pad_to_max_length=True,
    )

    # Setup distant debugging if needed
    if data_args.server_ip and data_args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(data_args.server_ip, data_args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Fix boolean parameter
    if model_args.do_lower_case == 'False' or not model_args.do_lower_case:
        model_args.do_lower_case = False
        'Tokenizer do_lower_case False'
    else:
        model_args.do_lower_case = True

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # Downloading and loading eurlex dataset from the hub.
    if training_args.do_train:
        train_dataset = load_dataset("lex_glue", "ledgar", split="train", cache_dir=model_args.cache_dir)

    if training_args.do_eval:
        eval_dataset = load_dataset("lex_glue", "ledgar", split="validation", cache_dir=model_args.cache_dir)

    if training_args.do_predict:
        predict_dataset = load_dataset("lex_glue", "ledgar", split="test", cache_dir=model_args.cache_dir)

    # Labels
    label_list = list(range(100))
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task="eurlex",
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    if config.model_type == 'big_bird':
        config.attention_type = 'original_full'

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        do_lower_case=model_args.do_lower_case,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Preprocessing the datasets
    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    def preprocess_function(examples):
        # Tokenize the texts
        batch = tokenizer(
            examples["text"],
            padding=padding,
            max_length=data_args.max_seq_length,
            truncation=True,
        )
        batch["label"] = [label_list.index(label) for label in examples["label"]]

        return batch

    if training_args.do_train:
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    if training_args.do_eval:
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )

    if training_args.do_predict:
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
            predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(logits, axis=1)
        macro_f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='macro', zero_division=0)
        micro_f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='micro', zero_division=0)
        return {'macro-f1': macro_f1, 'micro-f1': micro_f1}

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        #train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        #eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")

        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        output_predict_file = os.path.join(training_args.output_dir, "test_predictions.csv")
        if trainer.is_world_process_zero():
            with open(output_predict_file, "w") as writer:
                for index, pred_list in enumerate(predictions):
                    pred_line = '\t'.join([f'{pred:.5f}' for pred in pred_list])
                    writer.write(f"{index}\t{pred_line}\n")


    # Clean up checkpoints
    checkpoints = [filepath for filepath in glob.glob(f'{training_args.output_dir}/*/') if '/checkpoint' in filepath]
    for checkpoint in checkpoints:
        shutil.rmtree(checkpoint)


if __name__ == "__main__":
 #    For Evaluation
    training_args = TrainingArguments(
        do_train = True,
        do_eval = True,
        do_predict = True,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=200,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end = True,
        metric_for_best_model="macro-f1",
    )
    main(training_args)



Downloading data:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on train dataset:   0%|          | 0/60000 [00:00<?, ? examples/s]

Running tokenizer on validation dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Running tokenizer on prediction dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,2.5363,1.86124,0.437121,0.6727
1000,1.3987,1.268317,0.488014,0.7166
1500,1.2781,1.08862,0.547837,0.7516
2000,1.0925,1.016976,0.579902,0.7653
2500,1.1488,1.033687,0.594855,0.7585
3000,1.0204,0.973116,0.601465,0.7679
3500,0.9504,0.906729,0.60536,0.769
4000,0.9496,0.931866,0.625732,0.7828
4500,0.8774,0.912036,0.61399,0.7801
5000,0.9717,0.894923,0.63327,0.7855


***** train metrics *****
  epoch                    =       0.73
  total_flos               =  2697825GF
  train_loss               =     1.0322
  train_runtime            = 0:23:47.29
  train_samples            =      60000
  train_samples_per_second =     84.075
  train_steps_per_second   =     21.019


***** eval metrics *****
  epoch                   =       0.73
  eval_loss               =     0.7573
  eval_macro-f1           =     0.7118
  eval_micro-f1           =     0.8237
  eval_runtime            = 0:00:35.94
  eval_samples            =      10000
  eval_samples_per_second =    278.173
  eval_steps_per_second   =     69.543
***** predict metrics *****
  predict_loss               =      0.743
  predict_macro-f1           =     0.7082
  predict_micro-f1           =     0.8263
  predict_runtime            = 0:00:39.53
  predict_samples            =      10000
  predict_samples_per_second =    252.926
  predict_steps_per_second   =     63.231


In [None]:
import logging
import os
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

import tqdm
import re

from filelock import FileLock
from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
import datasets

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class InputFeatures:
    """
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a model.
    """

    input_ids: List[List[int]]
    attention_mask: Optional[List[List[int]]]
    token_type_ids: Optional[List[List[int]]]
    label: Optional[int]


class Split(Enum):
    train = "train"
    dev = "dev"
    test = "test"


if is_torch_available():
    import torch
    from torch.utils.data.dataset import Dataset

    class MultipleChoiceDataset(Dataset):
        """
        PyTorch multiple choice dataset class
        """

        features: List[InputFeatures]

        def __init__(
            self,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            dataset = datasets.load_dataset('lex_glue', task)
            tokenizer_name = re.sub('[^a-z]+', ' ', tokenizer.name_or_path).title().replace(' ', '')
            cached_features_file = os.path.join(
                '.cache',
                task,
                "cached_{}_{}_{}_{}".format(
                    mode.value,
                    tokenizer_name,
                    str(max_seq_length),
                    task,
                ),
            )

            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"
            if not os.path.exists(os.path.join('.cache', task)):
                if not os.path.exists('.cache'):
                    os.mkdir('.cache')
                os.mkdir(os.path.join('.cache', task))
            with FileLock(lock_path):

                if os.path.exists(cached_features_file) and not overwrite_cache:
                    logger.info(f"Loading features from cached file {cached_features_file}")
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(f"Creating features from dataset file at {task}")
                    if mode == Split.dev:
                        examples = dataset['validation']
                    elif mode == Split.test:
                        examples = dataset['test']
                    elif mode == Split.train:
                        examples = dataset['train']
                    logger.info("Training examples: %s", len(examples))
                    self.features = convert_examples_to_features(
                        examples,
                        max_seq_length,
                        tokenizer,
                    )
                    logger.info("Saving features into cached file %s", cached_features_file)
                    torch.save(self.features, cached_features_file)

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]


if is_tf_available():
    import tensorflow as tf

    class TFMultipleChoiceDataset:
        """
        TensorFlow multiple choice dataset class
        """

        features: List[InputFeatures]

        def __init__(
            self,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = 256,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            dataset = datasets.load_dataset('lex_glue')

            logger.info(f"Creating features from dataset file at {task}")
            if mode == Split.dev:
                examples = dataset['validation']
            elif mode == Split.test:
                examples = dataset['test']
            else:
                examples = dataset['train']
            logger.info(f"{mode.name.title()} examples: %s", len(examples))

            self.features = convert_examples_to_features(
                examples,
                max_seq_length,
                tokenizer,
            )

            def gen():
                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
                    if ex_index % 10000 == 0:
                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))

                    yield (
                        {
                            "input_ids": ex.input_ids,
                            "attention_mask": ex.attention_mask,
                            "token_type_ids": ex.token_type_ids,
                        },
                        ex.label,
                    )

            self.dataset = tf.data.Dataset.from_generator(
                gen,
                (
                    {
                        "input_ids": tf.int32,
                        "attention_mask": tf.int32,
                        "token_type_ids": tf.int32,
                    },
                    tf.int64,
                ),
                (
                    {
                        "input_ids": tf.TensorShape([None, None]),
                        "attention_mask": tf.TensorShape([None, None]),
                        "token_type_ids": tf.TensorShape([None, None]),
                    },
                    tf.TensorShape([]),
                ),
            )

        def get_dataset(self):
            self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))

            return self.dataset

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]


def convert_examples_to_features(
    examples: datasets.Dataset,
    max_length: int,
    tokenizer: PreTrainedTokenizer,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`
    """
    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        choices_inputs = []
        for ending_idx, ending in enumerate(example['endings']):
            context = example['context']
            inputs = tokenizer(
                context,
                ending,
                add_special_tokens=True,
                max_length=max_length,
                padding="max_length",
                truncation=True,
            )

            choices_inputs.append(inputs)

        label = example['label']

        input_ids = [x["input_ids"] for x in choices_inputs]
        attention_mask = (
            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
        )
        token_type_ids = (
            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
        )

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            )
        )

    for f in features[:2]:
        logger.info("*** Example ***")
        logger.info("feature: %s" % f)

    return features


In [None]:
#!/usr/bin/env python
# coding=utf-8
""" Finetuning models on CaseHOLD (e.g. Bert, RoBERTa, LEGAL-BERT)."""

import logging
import os
from dataclasses import dataclass, field
from typing import Optional
from sklearn.model_selection import ParameterGrid
import numpy as np
import random
import shutil
import glob
import os

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
		AutoModelForMultipleChoice,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback,
    Trainer
)
from transformers.trainer_utils import is_main_process
from transformers import EarlyStoppingCallback
# from casehold_helpers import MultipleChoiceDataset, Split
from sklearn.metrics import f1_score
# from models.deberta import DebertaForMultipleChoice

logger = logging.getLogger(__name__)

param_grid = {
    'learning_rate': [1e-5, 2e-5],  # Learning rates to try
    'num_train_epochs': [1, 2],        # Number of training epochs to try
    'per_device_train_batch_size': [2, 4],  # Batch sizes for training
    'per_device_eval_batch_size': [2, 4],   # Batch sizes for evaluation
}


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    task_name: str = field(default="case_hold", metadata={"help": "The name of the task to train on"})
    max_seq_length: int = field(
        default=256,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )


def main(training_args,model_args):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    data_args = DataTrainingArguments(
        max_seq_length=128,
        # max_segments=64,
        # max_seg_length=128,
        overwrite_cache=False,
        pad_to_max_length=True,
    )
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=5,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )

    if config.model_type == 'big_bird':
        config.attention_type = 'original_full'
    elif config.model_type == 'longformer':
        config.attention_window = [data_args.max_seq_length] * config.num_hidden_layers

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        # Default fast tokenizer is buggy on CaseHOLD task, switch to legacy tokenizer
        use_fast=True,
    )

    if config.model_type != 'deberta':
        model = AutoModelForMultipleChoice.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        model = DebertaForMultipleChoice.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    train_dataset = None
    eval_dataset = None

    # If do_train passed, train_dataset by default loads train split from file named train.csv in data directory
    if training_args.do_train:
        train_dataset = \
            MultipleChoiceDataset(
                tokenizer=tokenizer,
                task=data_args.task_name,
                max_seq_length=data_args.max_seq_length,
                overwrite_cache=data_args.overwrite_cache,
                mode=Split.train,
            )

    # If do_eval or do_predict passed, eval_dataset by default loads dev split from file named dev.csv in data directory
    if training_args.do_eval:
        eval_dataset = \
            MultipleChoiceDataset(
                tokenizer=tokenizer,
                task=data_args.task_name,
                max_seq_length=data_args.max_seq_length,
                overwrite_cache=data_args.overwrite_cache,
                mode=Split.dev,
            )

    if training_args.do_predict:
        predict_dataset = \
            MultipleChoiceDataset(
                tokenizer=tokenizer,
                task=data_args.task_name,
                max_seq_length=data_args.max_seq_length,
                overwrite_cache=data_args.overwrite_cache,
                mode=Split.test,
            )

    if training_args.do_train:
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset[:data_args.max_train_samples]
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    if training_args.do_eval:
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset[:data_args.max_eval_samples]

    if training_args.do_predict:
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset[:data_args.max_predict_samples]

    # Define custom compute_metrics function, returns macro F1 metric for CaseHOLD task
    def compute_metrics(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        # Compute macro and micro F1 for 5-class CaseHOLD task
        macro_f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='macro', zero_division=0)
        micro_f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='micro', zero_division=0)
        return {'macro-f1': macro_f1, 'micro-f1': micro_f1}

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # Re-save the tokenizer for model sharing
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation on eval_dataset
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
        print("*** Evaluate ***")
        #metrics = trainer.evaluate(eval_dataset=eval_dataset)
        macro_f1 = metrics['eval_macro-f1']
        best_accuracy = 0.0
        best_params = {}
        if macro_f1 > best_accuracy:
           best_accuracy = macro_f1
           best_params = params
        print("Best Hyperparameters:", best_params)
        print("Best Validation Macro-F1:", best_accuracy)

    # Predict on eval_dataset
    if training_args.do_predict:
        logger.info("*** Predict ***")

        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")

        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        output_predict_file = os.path.join(training_args.output_dir, "test_predictions.csv")
        if trainer.is_world_process_zero():
            with open(output_predict_file, "w") as writer:
                for index, pred_list in enumerate(predictions):
                    pred_line = '\t'.join([f'{pred:.5f}' for pred in pred_list])
                    writer.write(f"{index}\t{pred_line}\n")


# Print the best hyperparameters and its corresponding validation macro-f1 score

    # Clean up checkpoints
    checkpoints = [filepath for filepath in glob.glob(f'{training_args.output_dir}/*/') if '/checkpoint' in filepath]
    for checkpoint in checkpoints:
        shutil.rmtree(checkpoint)


# def _mp_fn(index):
# For xla_spawn (TPUs)
# main()


if __name__ == "__main__":
    # main()

    training_args = TrainingArguments(
        do_train=True,
        do_eval=True,
        do_predict=True,
        output_dir=os.getcwd(),
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        logging_steps=200,
        evaluation_strategy="steps",
        eval_steps=500,
        logging_first_step=False,
        load_best_model_at_end=True,
        metric_for_best_model="macro-f1",
    )
    model_names_or_paths = [
        "casehold/custom-legalbert",
        "bert-base-uncased",
        "bert-base-cased",
        # Add more model names or paths as needed
    ]
    for model_name_or_path in model_names_or_paths:
        # Set the model_name_or_path in the ModelArguments
        model_args = ModelArguments(
            model_name_or_path=model_name_or_path,
            # ... Rest of the model arguments ...
        )
    for params in ParameterGrid(param_grid):
    # Set the current hyperparameters in the TrainingArguments object
        training_args.learning_rate = params['learning_rate']
        training_args.num_train_epochs = params['num_train_epochs']
        training_args.per_device_train_batch_size = params['per_device_train_batch_size']
        training_args.per_device_eval_batch_size = params['per_device_eval_batch_size']
        main(training_args,model_args)
		 # Train the model
    #train_result = trainer.train()
    #metrics = train_result.metrics

    # Evaluate the model




[INFO|training_args.py:1299] 2023-07-28 17:35:42,149 >> Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
[INFO|training_args.py:1713] 2023-07-28 17:35:42,151 >> PyTorch: setting up devices
[INFO|training_args.py:1439] 2023-07-28 17:35:42,155 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
[INFO|configuration_utils.py:712] 2023-07-28 17:35:42,212 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_u

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.249,1.175207,0.537423,0.537436
1000,1.1828,1.127146,0.555552,0.555641
1500,1.0674,1.097465,0.571301,0.571282
2000,1.0742,1.060058,0.569259,0.569487
2500,1.0749,1.102692,0.579966,0.58
3000,1.1362,1.097008,0.584679,0.584615
3500,1.123,1.120891,0.583792,0.583846
4000,1.1327,1.059442,0.596483,0.596667
4500,1.0618,1.065231,0.598838,0.598974
5000,1.1381,1.048116,0.599958,0.6


[INFO|trainer.py:3081] 2023-07-28 17:43:26,719 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 17:43:26,721 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 17:43:26,722 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 17:44:12,644 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 17:44:12,647 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 17:44:15,484 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:2894] 2023-07-28 17:44:24,225 >> Deleting older checkpoint [/content/checkpoint-3000] due to args.save_total_limit
[INFO|trainer.py:3081] 2023-07-28 17:45:10,041 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 17:45:10,042 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 17:45:10,044 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 17:45:56,121 >> Saving model checkpoint to /content

[INFO|trainer.py:3081] 2023-07-28 18:05:19,086 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 18:05:19,087 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 18:05:19,088 >>   Batch size = 2


***** eval metrics *****
  epoch                   =       0.29
  eval_loss               =     1.0481
  eval_macro-f1           =        0.6
  eval_micro-f1           =        0.6
  eval_runtime            = 0:00:45.81
  eval_samples            =       3900
  eval_samples_per_second =     85.125
  eval_steps_per_second   =     42.563
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 1, 'per_device_eval_batch_size': 2, 'per_device_train_batch_size': 2}
Best Validation Macro-F1: 0.5999575421636008
***** predict metrics *****
  predict_loss               =     1.0839
  predict_macro-f1           =     0.5741
  predict_micro-f1           =     0.5742
  predict_runtime            = 0:00:46.28
  predict_samples            =       3600
  predict_samples_per_second =     77.782
  predict_steps_per_second   =     38.891


[INFO|configuration_utils.py:712] 2023-07-28 18:06:06,247 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 18:06:06,249 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.2314,1.122095,0.556678,0.556667
1000,1.1128,1.076632,0.576361,0.57641
1500,1.097,1.029327,0.589759,0.589744
2000,1.047,1.013927,0.597693,0.597692
2500,1.0585,0.99723,0.602222,0.602308
3000,1.0105,1.015857,0.599233,0.599231
3500,1.0488,0.99287,0.60363,0.60359
4000,1.0374,1.004387,0.608152,0.608205
4500,1.0611,0.992145,0.611446,0.611538
5000,1.0211,0.976924,0.613862,0.613846


[INFO|trainer.py:3081] 2023-07-28 18:07:41,225 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 18:07:41,226 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 18:07:41,228 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 18:08:27,247 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 18:08:27,250 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 18:08:33,257 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 18:09:54,259 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 18:09:54,260 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 18:09:54,263 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 18:10:40,312 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 18:10:40,315 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-28 18:35:00,261 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 18:35:00,262 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 18:35:00,263 >>   Batch size = 2


***** eval metrics *****
  epoch                   =       0.58
  eval_loss               =     0.9769
  eval_macro-f1           =     0.6139
  eval_micro-f1           =     0.6138
  eval_runtime            = 0:00:45.94
  eval_samples            =       3900
  eval_samples_per_second =     84.893
  eval_steps_per_second   =     42.446
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 1, 'per_device_eval_batch_size': 2, 'per_device_train_batch_size': 4}
Best Validation Macro-F1: 0.6138615413816193
***** predict metrics *****
  predict_loss               =     1.0152
  predict_macro-f1           =     0.5833
  predict_micro-f1           =     0.5833
  predict_runtime            = 0:00:46.27
  predict_samples            =       3600
  predict_samples_per_second =       77.8
  predict_steps_per_second   =       38.9


[INFO|configuration_utils.py:712] 2023-07-28 18:35:47,315 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 18:35:47,319 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.249,1.175207,0.537423,0.537436
1000,1.1828,1.127146,0.555552,0.555641
1500,1.0674,1.097464,0.571301,0.571282
2000,1.0742,1.060058,0.569259,0.569487
2500,1.0749,1.102692,0.579966,0.58
3000,1.1362,1.097008,0.584679,0.584615
3500,1.123,1.120891,0.583792,0.583846
4000,1.1327,1.059442,0.596483,0.596667
4500,1.0618,1.065231,0.598838,0.598974
5000,1.1381,1.048116,0.599958,0.6


[INFO|trainer.py:3081] 2023-07-28 18:36:53,856 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 18:36:53,857 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 18:36:53,858 >>   Batch size = 4
[INFO|trainer.py:2807] 2023-07-28 18:37:36,138 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 18:37:36,143 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 18:37:42,158 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 18:38:31,367 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 18:38:31,368 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 18:38:31,371 >>   Batch size = 4
[INFO|trainer.py:2807] 2023-07-28 18:39:13,728 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 18:39:13,732 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-28 18:57:54,694 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 18:57:54,695 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 18:57:54,696 >>   Batch size = 4


***** eval metrics *****
  epoch                   =       0.29
  eval_loss               =     1.0481
  eval_macro-f1           =        0.6
  eval_micro-f1           =        0.6
  eval_runtime            = 0:00:42.57
  eval_samples            =       3900
  eval_samples_per_second =     91.595
  eval_steps_per_second   =     22.899
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 1, 'per_device_eval_batch_size': 4, 'per_device_train_batch_size': 2}
Best Validation Macro-F1: 0.5999575421636008
***** predict metrics *****
  predict_loss               =     1.0839
  predict_macro-f1           =     0.5741
  predict_micro-f1           =     0.5742
  predict_runtime            = 0:00:41.55
  predict_samples            =       3600
  predict_samples_per_second =     86.625
  predict_steps_per_second   =     21.656


[INFO|configuration_utils.py:712] 2023-07-28 18:58:36,941 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 18:58:36,944 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.2314,1.122095,0.556678,0.556667
1000,1.1128,1.076632,0.576361,0.57641
1500,1.097,1.029327,0.589759,0.589744
2000,1.047,1.013927,0.597693,0.597692
2500,1.0585,0.99723,0.602222,0.602308
3000,1.0105,1.015857,0.599233,0.599231
3500,1.0488,0.99287,0.60363,0.60359
4000,1.0374,1.004387,0.608152,0.608205
4500,1.0611,0.992145,0.611446,0.611538
5000,1.0211,0.976924,0.613862,0.613846


[INFO|trainer.py:3081] 2023-07-28 19:00:14,943 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 19:00:14,944 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 19:00:14,947 >>   Batch size = 4
[INFO|trainer.py:2807] 2023-07-28 19:00:57,771 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 19:00:57,774 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 19:01:03,889 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 19:02:21,899 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 19:02:21,901 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 19:02:21,903 >>   Batch size = 4
[INFO|trainer.py:2807] 2023-07-28 19:03:04,759 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 19:03:04,763 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-28 19:27:18,590 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 19:27:18,591 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 19:27:18,596 >>   Batch size = 4


***** eval metrics *****
  epoch                   =       0.58
  eval_loss               =     0.9769
  eval_macro-f1           =     0.6139
  eval_micro-f1           =     0.6138
  eval_runtime            = 0:00:42.79
  eval_samples            =       3900
  eval_samples_per_second =     91.141
  eval_steps_per_second   =     22.785
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 1, 'per_device_eval_batch_size': 4, 'per_device_train_batch_size': 4}
Best Validation Macro-F1: 0.6138615413816193
***** predict metrics *****
  predict_loss               =     1.0152
  predict_macro-f1           =     0.5833
  predict_micro-f1           =     0.5833
  predict_runtime            = 0:00:41.86
  predict_samples            =       3600
  predict_samples_per_second =     85.981
  predict_steps_per_second   =     21.495


[INFO|configuration_utils.py:712] 2023-07-28 19:28:01,120 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 19:28:01,123 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.2501,1.176715,0.537964,0.537949
1000,1.1829,1.108024,0.557875,0.557949
1500,1.0612,1.089585,0.569759,0.569744
2000,1.0766,1.072198,0.57084,0.571026
2500,1.0805,1.086128,0.57513,0.575128
3000,1.1456,1.101332,0.588264,0.588205
3500,1.129,1.124533,0.583514,0.58359
4000,1.124,1.057125,0.592438,0.592564
4500,1.0725,1.070488,0.598559,0.598718
5000,1.1469,1.032918,0.601713,0.601795


[INFO|trainer.py:3081] 2023-07-28 19:29:12,578 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 19:29:12,579 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 19:29:12,581 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 19:29:59,844 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 19:29:59,848 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 19:30:05,941 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 19:30:56,790 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 19:30:56,792 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 19:30:56,793 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 19:31:44,076 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 19:31:44,081 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-28 19:51:47,764 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 19:51:47,765 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 19:51:47,767 >>   Batch size = 2


***** eval metrics *****
  epoch                   =       0.29
  eval_loss               =     1.0329
  eval_macro-f1           =     0.6017
  eval_micro-f1           =     0.6018
  eval_runtime            = 0:00:47.41
  eval_samples            =       3900
  eval_samples_per_second =     82.261
  eval_steps_per_second   =      41.13
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 2, 'per_device_eval_batch_size': 2, 'per_device_train_batch_size': 2}
Best Validation Macro-F1: 0.6017131697028711
***** predict metrics *****
  predict_loss               =     1.0749
  predict_macro-f1           =     0.5753
  predict_micro-f1           =     0.5753
  predict_runtime            = 0:00:48.42
  predict_samples            =       3600
  predict_samples_per_second =     74.335
  predict_steps_per_second   =     37.167


[INFO|configuration_utils.py:712] 2023-07-28 19:52:36,892 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 19:52:36,895 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.2308,1.121775,0.556928,0.556923
1000,1.1129,1.07465,0.576387,0.57641
1500,1.1003,1.029728,0.590315,0.590256
2000,1.0506,1.012806,0.601811,0.601795
2500,1.0573,0.998161,0.599428,0.599487
3000,1.0155,1.017272,0.601285,0.601282
3500,1.0466,0.998847,0.601803,0.601795


[INFO|trainer.py:3081] 2023-07-28 19:54:18,649 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 19:54:18,653 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 19:54:18,655 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 19:55:06,361 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 19:55:06,364 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 19:55:12,513 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 19:56:32,390 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 19:56:32,392 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 19:56:32,395 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 19:57:20,071 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 19:57:20,075 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-28 20:09:24,336 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 20:09:24,337 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 20:09:24,342 >>   Batch size = 2


***** eval metrics *****
  epoch                   =       0.31
  eval_loss               =     1.0128
  eval_macro-f1           =     0.6018
  eval_micro-f1           =     0.6018
  eval_runtime            = 0:00:47.64
  eval_samples            =       3900
  eval_samples_per_second =     81.862
  eval_steps_per_second   =     40.931
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 2, 'per_device_eval_batch_size': 2, 'per_device_train_batch_size': 4}
Best Validation Macro-F1: 0.6018114496654995
***** predict metrics *****
  predict_loss               =     1.0325
  predict_macro-f1           =     0.5769
  predict_micro-f1           =     0.5769
  predict_runtime            = 0:00:48.94
  predict_samples            =       3600
  predict_samples_per_second =      73.55
  predict_steps_per_second   =     36.775


[INFO|configuration_utils.py:712] 2023-07-28 20:10:13,898 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 20:10:13,900 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.2501,1.176715,0.537964,0.537949
1000,1.1829,1.108024,0.557875,0.557949
1500,1.0612,1.089585,0.569759,0.569744
2000,1.0766,1.072198,0.57084,0.571026
2500,1.0805,1.086128,0.57513,0.575128
3000,1.1456,1.101332,0.588264,0.588205
3500,1.129,1.124533,0.583514,0.58359
4000,1.124,1.057124,0.592438,0.592564
4500,1.0725,1.070488,0.598559,0.598718
5000,1.1469,1.032918,0.601713,0.601795


[INFO|trainer.py:3081] 2023-07-28 20:11:27,366 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 20:11:27,367 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 20:11:27,369 >>   Batch size = 4
[INFO|trainer.py:2807] 2023-07-28 20:12:10,465 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 20:12:10,468 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 20:12:16,422 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 20:13:07,121 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 20:13:07,123 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 20:13:07,125 >>   Batch size = 4
[INFO|trainer.py:2807] 2023-07-28 20:13:50,157 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 20:13:50,161 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-28 20:34:36,481 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 20:34:36,488 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 20:34:36,490 >>   Batch size = 4


***** eval metrics *****
  epoch                   =       0.29
  eval_loss               =     1.0329
  eval_macro-f1           =     0.6017
  eval_micro-f1           =     0.6018
  eval_runtime            = 0:00:42.94
  eval_samples            =       3900
  eval_samples_per_second =     90.807
  eval_steps_per_second   =     22.702
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 2, 'per_device_eval_batch_size': 4, 'per_device_train_batch_size': 2}
Best Validation Macro-F1: 0.6017131697028711
***** predict metrics *****
  predict_loss               =     1.0749
  predict_macro-f1           =     0.5753
  predict_micro-f1           =     0.5753
  predict_runtime            = 0:00:42.21
  predict_samples            =       3600
  predict_samples_per_second =     85.282
  predict_steps_per_second   =     21.321


[INFO|configuration_utils.py:712] 2023-07-28 20:35:19,295 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 20:35:19,298 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.2308,1.121775,0.556928,0.556923
1000,1.1129,1.074649,0.576387,0.57641
1500,1.1003,1.029728,0.590315,0.590256
2000,1.0506,1.012806,0.601811,0.601795
2500,1.0573,0.998161,0.599428,0.599487
3000,1.0155,1.017272,0.601285,0.601282
3500,1.0466,0.998847,0.601803,0.601795


[INFO|trainer.py:3081] 2023-07-28 20:37:06,156 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 20:37:06,157 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 20:37:06,158 >>   Batch size = 4
[INFO|trainer.py:2807] 2023-07-28 20:37:49,355 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 20:37:49,358 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 20:38:04,675 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 20:39:45,712 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 20:39:45,714 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 20:39:45,717 >>   Batch size = 4
[INFO|trainer.py:2807] 2023-07-28 20:40:28,854 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 20:40:28,858 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-28 20:55:41,751 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 20:55:41,753 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 20:55:41,754 >>   Batch size = 4


***** eval metrics *****
  epoch                   =       0.31
  eval_loss               =     1.0128
  eval_macro-f1           =     0.6018
  eval_micro-f1           =     0.6018
  eval_runtime            = 0:00:42.74
  eval_samples            =       3900
  eval_samples_per_second =     91.245
  eval_steps_per_second   =     22.811
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 1e-05, 'num_train_epochs': 2, 'per_device_eval_batch_size': 4, 'per_device_train_batch_size': 4}
Best Validation Macro-F1: 0.6018114496654995
***** predict metrics *****
  predict_loss               =     1.0325
  predict_macro-f1           =     0.5769
  predict_micro-f1           =     0.5769
  predict_runtime            = 0:00:41.87
  predict_samples            =       3600
  predict_samples_per_second =      85.96
  predict_steps_per_second   =      21.49


[INFO|configuration_utils.py:712] 2023-07-28 20:56:24,191 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 20:56:24,193 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.3123,1.269219,0.487191,0.487179
1000,1.2626,1.145242,0.538143,0.538205
1500,1.1312,1.16353,0.547455,0.547436
2000,1.1557,1.135434,0.545203,0.545128
2500,1.1372,1.188412,0.559467,0.559487
3000,1.2239,1.165895,0.570251,0.570256
3500,1.2012,1.175446,0.569712,0.569744
4000,1.1529,1.088874,0.576779,0.576923
4500,1.1584,1.150989,0.576416,0.57641
5000,1.1529,1.082193,0.584826,0.584872


[INFO|trainer.py:3081] 2023-07-28 20:57:41,129 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 20:57:41,130 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 20:57:41,131 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 20:58:28,071 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 20:58:28,074 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 20:59:07,505 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 21:01:08,722 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 21:01:08,726 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 21:01:08,727 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 21:01:55,796 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 21:01:55,799 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-28 21:47:20,638 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-28 21:47:20,639 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-28 21:47:20,642 >>   Batch size = 2


***** eval metrics *****
  epoch                   =       0.29
  eval_loss               =     1.0822
  eval_macro-f1           =     0.5848
  eval_micro-f1           =     0.5849
  eval_runtime            = 0:00:46.44
  eval_samples            =       3900
  eval_samples_per_second =     83.968
  eval_steps_per_second   =     41.984
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 2e-05, 'num_train_epochs': 1, 'per_device_eval_batch_size': 2, 'per_device_train_batch_size': 2}
Best Validation Macro-F1: 0.5848256237608144
***** predict metrics *****
  predict_loss               =     1.1055
  predict_macro-f1           =     0.5674
  predict_micro-f1           =     0.5675
  predict_runtime            = 0:00:47.19
  predict_samples            =       3600
  predict_samples_per_second =     76.276
  predict_steps_per_second   =     38.138


[INFO|configuration_utils.py:712] 2023-07-28 21:48:08,266 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-28 21:48:08,269 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

Step,Training Loss,Validation Loss,Macro-f1,Micro-f1
500,1.2219,1.120786,0.554679,0.554615
1000,1.1159,1.061108,0.571839,0.571795
1500,1.1092,1.074857,0.579729,0.579744
2000,1.0689,1.020033,0.590998,0.591026
2500,1.0698,1.00496,0.596109,0.596154
3000,1.0251,0.998305,0.600506,0.600513
3500,1.0494,1.005086,0.601465,0.601538
4000,1.0515,1.066649,0.600906,0.601026
4500,1.0553,1.011513,0.604806,0.604872
5000,1.0107,0.989541,0.605101,0.605128


[INFO|trainer.py:3081] 2023-07-28 21:49:57,371 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 21:49:57,372 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 21:49:57,373 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 21:50:44,205 >> Saving model checkpoint to /content/checkpoint-500
[INFO|configuration_utils.py:458] 2023-07-28 21:50:44,208 >> Configuration saved in /content/checkpoint-500/config.json
[INFO|modeling_utils.py:1851] 2023-07-28 21:51:49,456 >> Model weights saved in /content/checkpoint-500/pytorch_model.bin
[INFO|trainer.py:3081] 2023-07-28 21:55:49,951 >> ***** Running Evaluation *****
[INFO|trainer.py:3083] 2023-07-28 21:55:49,953 >>   Num examples = 3900
[INFO|trainer.py:3086] 2023-07-28 21:55:49,954 >>   Batch size = 2
[INFO|trainer.py:2807] 2023-07-28 21:56:36,697 >> Saving model checkpoint to /content/checkpoint-1000
[INFO|configuration_utils.py:458] 2023-07-28 21:56:36,700 >> Configuration saved in /content/checkpoint-1000/config

[INFO|trainer.py:3081] 2023-07-29 00:21:16,817 >> ***** Running Prediction *****
[INFO|trainer.py:3083] 2023-07-29 00:21:16,819 >>   Num examples = 3600
[INFO|trainer.py:3086] 2023-07-29 00:21:16,820 >>   Batch size = 2


***** eval metrics *****
  epoch                   =       0.84
  eval_loss               =     0.9597
  eval_macro-f1           =     0.6229
  eval_micro-f1           =     0.6231
  eval_runtime            = 0:00:46.47
  eval_samples            =       3900
  eval_samples_per_second =     83.919
  eval_steps_per_second   =     41.959
*** Evaluate ***
Best Hyperparameters: {'learning_rate': 2e-05, 'num_train_epochs': 1, 'per_device_eval_batch_size': 2, 'per_device_train_batch_size': 4}
Best Validation Macro-F1: 0.6229474549327568
***** predict metrics *****
  predict_loss               =     0.9997
  predict_macro-f1           =      0.595
  predict_micro-f1           =      0.595
  predict_runtime            = 0:00:47.12
  predict_samples            =       3600
  predict_samples_per_second =     76.391
  predict_steps_per_second   =     38.196


[INFO|configuration_utils.py:712] 2023-07-29 00:22:04,406 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
[INFO|configuration_utils.py:768] 2023-07-29 00:22:04,408 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "case_hold",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_att

In [None]:
import re

def extract_legal_notices(text):
    # Define regular expressions for common legal notice patterns
    arbitration_pattern = r'\barbitration\b'
    acknowledgment_pattern = r'\b(?:acknowledge|agree|understand|bound by)\b'

    # Find all occurrences of legal notices and acknowledgments in the text
    arbitration_matches = re.findall(arbitration_pattern, text, re.IGNORECASE)
    acknowledgment_matches = re.findall(acknowledgment_pattern, text, re.IGNORECASE)

    # Return a dictionary containing the extracted patterns
    extracted_patterns = {
        'arbitration_notices': arbitration_matches,
        'acknowledgments': acknowledgment_matches
    }
    return extracted_patterns

def main():
    # Read the text file containing incorrect predictions
    with open('/content/sample_data/incorrect_predictions.txt', 'r') as file:
        text = file.read()

    # Extract patterns from the text
    patterns = extract_legal_notices(text)

    # Print the extracted patterns
    print("Arbitration Notices:", patterns['arbitration_notices'])
    print("Acknowledgments:", patterns['acknowledgments'])

if __name__ == "__main__":
    main()


Arbitration Notices: ['arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration', 'arbitration']
Acknowledgments: ['agree', 'agree', 'acknowledge', 'agree', 'understand', 'agree', 'bound by', 'agree', 'bound by', 'acknowledge', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'bound by', 'understand', 'agree', 'bound by', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'bound by', 'understand', 'agree', 'agree', 'agree', 'bound by', 'bound by', 'agree', 'acknowledge', 'agree', 'agree', 'bound by', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree', 'agree']


In [None]:
!pip install nltk
!pip install transformers
!pip install collections

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.8 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import GPT2Tokenizer

def main():
    input_text = "except where our dispute is being resolved pursuant to an arbitration ( as provided below ) , if you are a resident of the united states or canada , you agree that any claim or dispute you may have against evernote must be resolved exclusively by a state or federal court located in san mateo county , california ."
    input_ids = [0, 26837, 147, 84, 4464, 16, 145, 8179, 22918, 7, 41, 16211, 36, 25, 1286, 874, 4839, 2156, 114, 47, 32, 10, 3313, 9, 5, 10409, 982, 50, 64, 2095, 2156, 47, 2854, 14, 143, 2026, 50, 4464, 47, 189, 33, 136, 364, 12170, 6457, 531, 28, 8179, 8992, 30, 10, 194, 50, 752, 461, 2034, 11, 15610, 12563, 139, 2109, 2156, 13011, 1594, 43052, 479, 1437, 50118, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

    # Initialize the GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # Decode the input ID 2156 to get the corresponding word
    corresponding_word = tokenizer.decode(2156)

    print(f"Corresponding word for Input ID 2156: {corresponding_word}")

if __name__ == "__main__":
    main()


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Corresponding word for Input ID 2156:  house


In [None]:
from collections import Counter
from transformers import BertTokenizer





import re
import nltk
from collections import Counter

nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(input_text):
    # Convert the input text to lowercase
    input_text = input_text.lower()

    # Remove non-alphanumeric characters and split into words
    words = re.findall(r'\b\w+\b', input_text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Join the words back into a sentence
    processed_text = ' '.join(words)
    return processed_text


def find_most_occurring_ids_and_words(text):
    # Initialize the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Split the text into lines
    lines = text.strip().split('\n')

    # Initialize a Counter to store the occurrences of each input ID
    id_counter = Counter()

    # Process each line in the text
    for line in lines:
        # Find the index of "Input IDs: " to extract the input IDs
        id_index = line.find("Input IDs: ")
        if id_index != -1:
            input_ids_str = line[id_index + len("Input IDs: "):].strip()

            # Convert the input IDs from a list of integers
            # Handle cases where input_ids_str is enclosed in brackets
            input_ids_str = input_ids_str.strip("[]")
            input_ids = [int(id_str) for id_str in input_ids_str.split(', ')]

            # Count occurrences of each input ID
            id_counter.update(input_ids)

    # Find the most occurring input IDs and their counts
    most_occurring_ids = id_counter.most_common()

    # Get the vocabulary of the BERT tokenizer
    vocab = tokenizer.get_vocab()

    # Get the corresponding words for the most occurring input IDs
    id_to_words = {id: tokenizer.convert_ids_to_tokens(id) for id, _ in most_occurring_ids}

    # Return the most occurring input IDs and their corresponding words
    return most_occurring_ids, id_to_words

from collections import Counter
import nltk
from nltk.corpus import stopwords
from transformers import BertTokenizer

nltk.download("stopwords")

def main():
    input_file = "/content/sample_data/incorrect_predictions.txt"  # Replace this with the path to your input file

    # Initialize the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Load the input file
    with open(input_file, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Process the lines to remove stop words and count occurrences of input IDs
    input_ids_count = Counter()

    for line in lines:
        if "Input IDs: " in line:
            input_ids_str = line.split("Input IDs: ")[1].strip()
            input_ids = [int(id_str) for id_str in input_ids_str[1:-1].split(", ")]

            # Remove stop words from the input IDs
            input_ids = [id for id in input_ids if id not in stopwords.words("english")]

            # Count occurrences of each input ID
            input_ids_count.update(input_ids)

    # Get the most occurring input IDs
    most_occurring_input_ids = [(input_id, count) for input_id, count in input_ids_count.items() if count > 100]

    # Sort the input IDs by count in descending order
    most_occurring_input_ids.sort(key=lambda x: x[1], reverse=True)

    # Display the most occurring input IDs and their counts
    print("Most occurring input IDs and their counts:")
    for input_id, count in most_occurring_input_ids:
        print(f"Input ID: {input_id}, Count: {count}")

    # Process the input texts to get the corresponding words for the most occurring input IDs
    print("\nCorresponding words for the most occurring input IDs:")
    for input_id, count in most_occurring_input_ids:
        input_text = tokenizer.decode(input_id)
        print(f"Input ID: {input_id}, Words: {input_text}")

if __name__ == "__main__":
    main()
#With these modifications, the code should now decode the input IDs using the BERT tokenizer to get the corresponding words. The output should display the most occurring input IDs and their counts, along with the corresponding words for those input IDs. Please replace "your_input_file.txt" with the actual path to your input file and run the code again.















[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Most occurring input IDs and their counts:
Input ID: 1, Count: 13164
Input ID: 2156, Count: 352
Input ID: 5, Count: 275
Input ID: 50, Count: 255
Input ID: 7, Count: 222
Input ID: 9, Count: 196
Input ID: 8, Count: 167
Input ID: 0, Count: 157
Input ID: 2, Count: 157
Input ID: 479, Count: 156
Input ID: 1437, Count: 156
Input ID: 50118, Count: 156
Input ID: 143, Count: 123
Input ID: 47, Count: 122

Corresponding words for the most occurring input IDs:
Input ID: 1, Words: [ u n u s e d 0 ]
Input ID: 2156, Words: s e e
Input ID: 5, Words: [ u n u s e d 4 ]
Input ID: 50, Words: [ u n u s e d 4 9 ]
Input ID: 7, Words: [ u n u s e d 6 ]
Input ID: 9, Words: [ u n u s e d 8 ]
Input ID: 8, Words: [ u n u s e d 7 ]
Input ID: 0, Words: [ P A D ]
Input ID: 2, Words: [ u n u s e d 1 ]
Input ID: 479, Words: [ u n u s e d 4 7 4 ]
Input ID: 1437, Words: မ
Input ID: 50118, Words: [ U N K ]
Input ID: 143, Words: [ u n u s e d 1 3 8 ]
Input ID: 47, Words: [ u n u s e d 4 6 ]
