In [None]:
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE."""
# You can also adapt this script_v0 on your own text classification task. Pointers for this are left as comments.
import json
import os
os.environ['MKL_THREADING_LAYER'] = 'GNU'

import dataclasses
import inspect
import logging
import random
import sys

import fitlog
from dataclasses import dataclass, field
from sklearn.metrics import matthews_corrcoef
from typing import Optional

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict

import transformers
from transformers import (
    AutoConfig,
    AutoTokenizer,
    HfArgumentParser,
    PretrainedConfig,
    EvalPrediction,
    set_seed,
)
#from trainer_knn import SimpleTrainer
from trainer import SimpleTrainer
from evaluate import Evaluation

from training_args import TrainingArguments
#from model import (
#   ContrastiveOrigin,
#    ContrastiveMoCoKnnBert
#)
from model import (
    ContrastiveOrigin,
    ContrastiveMoCoKnnBert
)
from transformers.trainer_utils import is_main_process

from transformers import AutoModelForSequenceClassification
import torch

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

logger = logging.getLogger(__name__)


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
                    "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )

    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    valid_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(
        default=None,
    )

    # def __post_init__(self):
    #     if self.task_name is not None:
    #         self.task_name = self.task_name.lower()
    #         if self.task_name not in task_to_keys.keys():
    #             raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
    #     elif self.train_file is None or self.valid_file is None:
    #         raise ValueError("Need either a GLUE task or a training/validation file.")
    #     else:
    #         extension = self.train_file.split(".")[-1]
    #         assert extension in ["csv", "json", "tsv"], "`train_file` should be a csv or a json file."
    #         extension = self.valid_file.split(".")[-1]
    #         assert extension in ["csv", "json", "tsv"], "`validation_file` should be a csv or a json file."


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )

    train_pattern: str = field(default="further_pretrain")

@dataclass
class FitLogArguments:
    #task: str = field(default='mrpc')
    negative_num: int = field(default=96)
    positive_num: int = field(default=3)
    queue_size: int = field(default=32000)
    top_k: int = field(default=20)
    end_k: int = field(default=1)
    m: float = field(default=0.999)
    contrastive_rate_in_training: float = field(default=0.1)
    contrastive_rate_in_inference: float = field(default=0.1)


def data_collator(features):
    """
    Very simple data collator that simply collates batches of dict-like objects and performs special handling for
    potential keys named:

        - ``label``: handles a single value (int or float) per object
        - ``label_ids``: handles a list of values per object

    Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
    to the model. See glue and ner for example of how it's useful.
    """

    first = features[0]
    batch = {}
    if "original_text" in first:
        batch["original_text"] = [f["original_text"] for f in features]
    # Special handling for labels.
    # Ensure that tensor is created with the correct type
    # (it should be automatically the case, but let's make sure of it.)
    if "label" in first and first["label"] is not None:
        label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
        dtype = torch.long if isinstance(label, int) else torch.float
        batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
    elif "label_ids" in first and first["label_ids"] is not None:
        if isinstance(first["label_ids"], torch.Tensor):
            batch["labels"] = torch.stack([f["label_ids"] for f in features])
        else:
            dtype = torch.long if type(first["label_ids"][0]) is int else torch.float
            batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype)

    # Handling of all other possible keys.
    # Again, we will use the first element to figure out which key/values are not None for this model.
    for k, v in first.items():
        if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
            if isinstance(v, torch.Tensor):
                batch[k] = torch.stack([f[k] for f in features])
            else:
                batch[k] = torch.tensor([f[k] for f in features])

    return batch


# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script_v0.
# We now keep distinct sets of args, for a cleaner separation of concerns.

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, FitLogArguments))


model_args, data_args, training_args, fitlog_args = parser.parse_json_file(json_file=os.path.abspath('json/snips/0.25/0.json'))


# model_args, data_args, training_args, fitlog_args = parser.parse_json_file(json_file=os.path.abspath('json/snips/0.25/0.json'))

results_path = './model_output/' + '_'.join([training_args.data, str(training_args.known_ratio), str(training_args.seed)]) + '.csv'
if os.path.exists(results_path):
    exit()

# model_args, data_args, training_args, fitlog_args = parser.parse_json_file(json_file='json/demo.json')

data_args.train_file = './data/' + training_args.data + '/train.tsv'
data_args.valid_file = './data/' + training_args.data + '/valid.tsv'
data_args.test_file = './data/' + training_args.data + '/test.tsv'
training_args.sample_file = data_args.train_file
training_args.max_length = data_args.max_seq_length
fitlog.set_log_dir(training_args.fitlog_dir)
fitlog_args_dict = {"seed": training_args.seed,
                    "warmup_steps": training_args.warmup_steps}

fitlog_args_name = [i for i in dir(fitlog_args) if i[0] != "_"]
for args_name in fitlog_args_name:
    args_value = getattr(fitlog_args, args_name)
    training_args.__dict__[args_name] = args_value
    if args_value is not None:
        fitlog_args_dict[args_name] = args_value
fitlog.add_hyper(fitlog_args_dict)

if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
):
    raise ValueError(
        f"Output directory ({training_args.output_dir}) already exists and is not empty. "
        "Use --overwrite_output_dir to overcome."
    )

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)

# Log on each process the small summary:
logger.warning(
    f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
    + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
if is_main_process(training_args.local_rank):
    transformers.utils.logging.set_verbosity_info()
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}")

# Set seed before initializing model.
set_seed(training_args.seed)

# See more about loading any type of standard or custom dataset at
# https://huggingface.co/docs/datasets/loading_datasets.html.
df_train = pd.read_csv(data_args.train_file, sep='\t', dtype=str)
df_valid = pd.read_csv(data_args.valid_file, sep='\t', dtype=str)
df_test = pd.read_csv(data_args.test_file, sep='\t', dtype=str)

unique_labels = np.array(list(set(df_test.label.unique()) & set(df_train.label.unique())))
seen_labels = np.random.choice(unique_labels, int(len(unique_labels)*training_args.known_ratio), replace=False)

01/19/2025 16:06:13 - INFO - training_args -   PyTorch: setting up devices
01/19/2025 16:06:13 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='./model_output/', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, model_parallel=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=16, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=32, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=1e-05, weight_decay=0.0001, adam_beta1=0.9, adam_beta2=0.98, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Jan19_16-06-13_gakki', logging_first_step=False, logging_steps=1000000, save_steps=1000000, save_total_limit=None, no_cuda=False, seed=0, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last

In [None]:
df_train_seen = df_train[df_train.label.isin(seen_labels)]
df_valid_seen = df_valid[df_valid.label.isin(seen_labels)]
df_valid_oos = df_valid[~df_valid.label.isin(seen_labels)]
df_valid_oos.loc[:, "label"] = 'oos'
df_test.loc[~df_test.label.isin(seen_labels), "label"] = 'oos'

data = dict()
data["train"] = Dataset.from_pandas(df_train_seen, preserve_index=False)
data["valid_seen"] = Dataset.from_pandas(df_valid_seen, preserve_index=False)
data["valid_oos"] = Dataset.from_pandas(df_valid_oos, preserve_index=False)
data["test"] = Dataset.from_pandas(df_test, preserve_index=False)
datasets = DatasetDict(data)

In [3]:
valid_loader = evaler.get_eval_dataloader()



In [21]:
df_train.label.unique()

array(['PlayMusic', 'AddToPlaylist', 'RateBook', 'SearchScreeningEvent',
       'BookRestaurant', 'GetWeather', 'SearchCreativeWork'], dtype=object)

In [22]:
df_test.label.unique()

array(['oos', 'SearchScreeningEvent'], dtype=object)

In [24]:
data_args.test_file

'./data/snips/test.tsv'

In [25]:
unique_labels

array(['AddToPlaylist', 'SearchCreativeWork', 'PlayMusic', 'RateBook',
       'BookRestaurant', 'GetWeather', 'SearchScreeningEvent'],
      dtype='<U20')