# learner

In [None]:
#| default_exp learner

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from tqdm.auto import tqdm
from packaging import version
import torch, re, math, numpy as np, os, time, datasets
from typing import Any, Tuple, Optional, Sequence, Union, Dict, List, NamedTuple
from transformers import AutoTokenizer, BatchEncoding, Seq2SeqTrainer, Seq2SeqTrainingArguments

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler

from torch.nn.parallel import DataParallel
from torch.nn.parallel._functions import Scatter
from torch.nn.parallel.scatter_gather import _is_namedtuple

from xcai.core import *
from xcai.data import *
from xcai.representation.index import *
from xcai.generation.trie import *
from xcai.generation.generate import *

from fastcore.utils import *
from fastcore.meta import *
from fastcore.dispatch import *

In [None]:
#| export
from transformers.trainer_pt_utils import (
    find_batch_size, 
    nested_concat, nested_numpify, 
    IterableDatasetShard, 
    get_dataloader_sampler, 
    get_model_param_count,
    LengthGroupedSampler
)
from transformers.trainer_utils import has_length, denumpify_detensorize, speed_metrics, TrainOutput, HPSearchBackend, seed_worker
from transformers.trainer_callback import TrainerState
from transformers.trainer import _is_peft_model
from transformers.modeling_utils import unwrap_model
from transformers.utils import is_sagemaker_mp_enabled, is_accelerate_available, is_torch_tpu_available, logging, is_datasets_available
from transformers.debug_utils import DebugOption, DebugUnderflowOverflow

from transformers.integrations import hp_params
from transformers.integrations.tpu import tpu_spmd_dataloader
from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available

if is_accelerate_available():
    from accelerate import Accelerator, skip_first_batches
    from accelerate import __version__ as accelerate_version
    from accelerate.utils import (
        DistributedDataParallelKwargs,
        DistributedType,
        GradientAccumulationPlugin,
        load_fsdp_model,
        load_fsdp_optimizer,
        save_fsdp_model,
        save_fsdp_optimizer,
    )

    DATA_SAMPLERS = [RandomSampler]
    if version.parse(accelerate_version) > version.parse("0.23.0"):
        from accelerate.data_loader import SeedableRandomSampler

        DATA_SAMPLERS += [SeedableRandomSampler]

    if is_deepspeed_available():
        from accelerate.utils import DeepSpeedSchedulerWrapper

if is_accelerate_available("0.28.0"):
    from accelerate.utils import DataLoaderConfiguration

# File names for checkpoint artifacts (training args, trainer state, optimizer,
# scheduler, grad scaler, FSDP model).
# NOTE(review): these presumably mirror the constants of the same name in
# `transformers.trainer`; confirm they stay in sync with the installed version.
TRAINING_ARGS_NAME = "training_args.bin"
TRAINER_STATE_NAME = "trainer_state.json"
OPTIMIZER_NAME = "optimizer.pt"
OPTIMIZER_NAME_BIN = "optimizer.bin"
SCHEDULER_NAME = "scheduler.pt"
SCALER_NAME = "scaler.pt"
FSDP_MODEL_NAME = "pytorch_model_fsdp"

# Module-level logger obtained through the `transformers.utils.logging` helper.
logger = logging.get_logger(__name__)

In [None]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [None]:
#| hide
from xcai.test_utils import *
from xcai.models.BT000X import *
from xcai.metrics import *

## Setup

In [None]:
#| hide
block = XCBlock.from_cfg('train')

  self._set_arrayXarray(i, j, x)


In [None]:
#| hide
batch = block.train.one_batch(11)

In [None]:
#| hide
m = BT0002.from_pretrained('bert-base-uncased', tn_targ=10_000, ig_tok=0)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BT0002 were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['loss_fn.o']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#| hide
batch.keys()

dict_keys(['lbl2data_idx', 'lbl2data_identifier', 'lbl2data_input_text', 'lbl2data_input_ids', 'lbl2data_token_type_ids', 'lbl2data_attention_mask', 'lbl2data_data2ptr', 'data_identifier', 'data_input_text', 'data_input_ids', 'data_token_type_ids', 'data_attention_mask'])

In [None]:
#| hide
b = prepare_batch(m, batch, m_args='lbl2data_idx')

In [None]:
#| hide
b.keys()

dict_keys(['lbl2data_idx', 'lbl2data_input_ids', 'lbl2data_token_type_ids', 'lbl2data_attention_mask', 'lbl2data_data2ptr', 'data_input_ids', 'data_token_type_ids', 'data_attention_mask'])

In [None]:
#| hide
# NOTE(review): the device is hard-coded to 'cuda', so this demo cell needs a GPU.
# Both names are rebound in place, so the CPU copies are no longer reachable.
m = m.to('cuda')
b = b.to('cuda')

In [None]:
#| hide
o = m(**b)

In [None]:
#| hide
o.loss

tensor(13.8700, grad_fn=<SumBackward0>)

## DataParallel

In [None]:
#| export
def scatter(inputs, target_gpus, chunk_sizes=None, dim=0):
    """Recursively split `inputs` into one piece per entry of `target_gpus`.

    Tensors are split with `Scatter.apply(target_gpus, chunk_sizes, dim, tensor)`.
    Non-empty namedtuples, tuples, lists and dicts are traversed and rebuilt per
    target. Anything else (including empty containers) is replicated by reference,
    once per target.

    Returns a list whose i-th element is the piece for the i-th target.
    """
    def _split(item):
        if isinstance(item, torch.Tensor):
            return Scatter.apply(target_gpus, chunk_sizes, dim, item)
        if _is_namedtuple(item):
            return [type(item)(*fields) for fields in zip(*[_split(f) for f in item])]
        if isinstance(item, tuple) and len(item) > 0:
            return list(zip(*[_split(e) for e in item]))
        if isinstance(item, list) and len(item) > 0:
            return [list(part) for part in zip(*[_split(e) for e in item])]
        if isinstance(item, dict) and len(item) > 0:
            return [type(item)(pairs) for pairs in zip(*[_split(kv) for kv in item.items()])]
        return [item for _ in target_gpus]

    # `_split` refers to itself through its closure cell; clearing the name once
    # the work is done breaks that reference cycle.
    try:
        return _split(inputs)
    finally:
        _split = None
    
def scatter_kwargs(
    inputs: Tuple[Any, ...],
    kwargs: Optional[Dict[str, Any]],
    target_gpus: Sequence[Union[int, torch.device]],
    chunk_sizes: Optional[Sequence[int]]=None,
    dim: int = 0,
) -> Tuple[List[Any], List[Dict[str, Any]]]:
    """Scatter positional and keyword arguments and pad both to the same length.

    Args:
        inputs: positional arguments; skipped (treated as no pieces) when empty.
        kwargs: keyword arguments; skipped when empty or `None`.
        target_gpus: devices to scatter to.
        chunk_sizes: optional explicit chunk sizes forwarded to `scatter`.
        dim: dimension along which tensors are split.

    Returns:
        `(scattered_inputs, scattered_kwargs)` as two lists of equal length.
        The shorter one is padded with `()` or `{}` respectively.
    """
    scattered_inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else []
    scattered_kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else []
    # Compare the *scattered* lengths. Comparing against `len(inputs)` (the number
    # of positional arguments) can skip the padding of `scattered_kwargs`.
    n_inputs, n_kwargs = len(scattered_inputs), len(scattered_kwargs)
    if n_inputs < n_kwargs:
        scattered_inputs.extend(() for _ in range(n_kwargs - n_inputs))
    elif n_kwargs < n_inputs:
        scattered_kwargs.extend({} for _ in range(n_inputs - n_kwargs))
    return scattered_inputs, scattered_kwargs
    

In [None]:
#| export
class XCDataParallel(DataParallel):
    """`DataParallel` whose `scatter` understands `<prefix>_data2ptr` keyword batches.

    Features whose key starts with `data` are split evenly along `self.dim`.
    For `lbl2data` and every other prefix found in the batch, the features are
    split with explicit chunk sizes computed by summing the matching
    `<prefix>_data2ptr` tensor over each device's slice (the pointer presumably
    holds the number of rows per data point — confirm against the collator).
    """

    @delegates(DataParallel.__init__)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _get_meta_name(self, x:Optional[Dict[str, Any]]):
        """Return the key prefixes (text before the first `_`) other than `data`/`lbl2data`.

        Always returns a list (empty for an empty or `None` batch) because
        `scatter` concatenates the result with `['lbl2data']`.
        """
        if not x: return []
        return list({k.split('_')[0] for k in x}.difference(['data', 'lbl2data']))

    def _extract_feat(self, x:Optional[Dict[str, Any]], prefix:str):
        """Select entries whose key starts with `prefix`, excluding keys ending in `2ptr`."""
        return {k:v for k,v in x.items() if re.match(f'^{prefix}', k) and not re.match(r'.*2ptr$', k)}

    def scatter(
        self,
        inputs: Tuple[Any, ...],
        kwargs: Optional[Dict[str, Any]],
        device_ids: Sequence[Union[int, torch.device]],
    ) -> Any:
        """Split a keyword-only batch into one kwargs dict per device.

        Raises:
            ValueError: if positional `inputs` are supplied.

        Returns:
            `(scattered_inputs, scattered_kwargs)` as tuples.
        """
        if len(inputs): raise ValueError('`inputs` should be empty.')
        meta_name = self._get_meta_name(kwargs)+['lbl2data']

        # `data_*` features define the per-device dicts that everything else is merged into.
        data_feat = self._extract_feat(kwargs, 'data')
        scattered_inputs, scattered_kwargs = scatter_kwargs(inputs, data_feat, device_ids, None, dim=self.dim)

        for o in meta_name:
            pn, chunk_sizes = f'{o}_data2ptr', None
            if pn in kwargs:
                ptr = kwargs[pn]
                # csz: pointer entries per device; psz is rounded up to a multiple of csz
                # so the paired ranges below also cover a trailing partial chunk.
                psz, csz = ptr.shape[0], math.ceil(ptr.shape[0]/len(device_ids))
                psz = (psz//csz+1)*csz if psz%csz else psz # Use torch.chunk
                # Rows owned by each device = sum of the pointer entries in its slice.
                chunk_sizes = [ptr[p:q].sum() for p,q in zip(range(0, psz+1, csz), range(csz, psz+1, csz))]
                # The pointer itself is split evenly, like the `data_*` features.
                _, sc_ptr = scatter_kwargs(inputs, {pn:ptr}, device_ids, None, dim=self.dim)
                for p,q in zip(scattered_kwargs, sc_ptr): p.update(q)

            feat = self._extract_feat(kwargs, o)
            _, sc_kwargs = scatter_kwargs(inputs, feat, device_ids, chunk_sizes, dim=self.dim)
            for p,q in zip(scattered_kwargs, sc_kwargs): p.update(q)

        return tuple(scattered_inputs), tuple(scattered_kwargs)
        

### Example

In [None]:
#| hide
class MyModel(nn.Module):
    """Debug module: prints every keyword argument with its device and returns nothing."""

    def forward(self, **kwargs):
        for name in kwargs:
            value = kwargs[name]
            print(name, ': ', value, ', ', value.device)
        return None
        

In [None]:
#| hide
m = XCDataParallel(module=MyModel())

In [None]:
#| hide
o = m(**b)

data_input_ids :  data_input_ids :  tensor([[  101,   153,  4490,  7170, 22918,  2105,  1818,   102,     0,     0],
        [  101, 11341,  8032,  1548,   102,     0,     0,     0,     0,     0],
        [  101, 19166,  2779,   102,     0,     0,     0,     0,     0,     0],
        [  101,  1652,  1535,   112,   188,  1569,  4896,  3779,  1264,   102],
        [  101, 12556,  4616,  2328,   102,     0,     0,     0,     0,     0]],
       device='cuda:1') ,  cuda:1
data_token_type_ids :  tensor([[  101, 18958, 11752,  1186, 11907,  2354,  4559,   102,     0,     0],
        [  101, 12886,  2161,   102,     0,     0,     0,     0,     0,     0],
        [  101,  3132,   161,  4538,  2162,   102,     0,     0,     0,     0],
        [  101,  5755,   174, 26623,  4724,   102,     0,     0,     0,     0],
        [  101,  5479,  1948,   102,     0,     0,     0,     0,     0,     0],
        [  101,   153, 17384,  1161,  1595,   113, 15019,   114,   102,     0]],
       device='cuda:0') ,

In [None]:
#| hide
batch['lbl2data_idx']

tensor([268888,  69066,  51848, 127494,  14400,  14402,  21360,  37008,  37172,
         37259,  66128, 117048, 134455, 184458, 201618, 223919,   1160,  14855,
        125870, 141101, 184009, 200719,  51524, 180191, 187988, 267293, 206451,
        134916, 134917, 134918, 134973,  11661,  14798,  17605,  19869])

In [None]:
#| hide
batch['data_input_ids']

tensor([[  101, 18958, 11752,  1186, 11907,  2354,  4559,   102,     0,     0],
        [  101, 12886,  2161,   102,     0,     0,     0,     0,     0,     0],
        [  101,  3132,   161,  4538,  2162,   102,     0,     0,     0,     0],
        [  101,  5755,   174, 26623,  4724,   102,     0,     0,     0,     0],
        [  101,  5479,  1948,   102,     0,     0,     0,     0,     0,     0],
        [  101,   153, 17384,  1161,  1595,   113, 15019,   114,   102,     0],
        [  101,   153,  4490,  7170, 22918,  2105,  1818,   102,     0,     0],
        [  101, 11341,  8032,  1548,   102,     0,     0,     0,     0,     0],
        [  101, 19166,  2779,   102,     0,     0,     0,     0,     0,     0],
        [  101,  1652,  1535,   112,   188,  1569,  4896,  3779,  1264,   102],
        [  101, 12556,  4616,  2328,   102,     0,     0,     0,     0,     0]])

In [None]:
#| hide
batch['lbl2data_data2ptr']

tensor([ 1,  1,  2, 12,  5,  1,  1,  3,  1,  4,  4])

In [None]:
#| hide
batch['lbl2data_input_ids']

tensor([[  101,  9018,  1116,  6430,  1104,  1709,  1104,   102,     0,     0,
             0,     0,     0],
        [  101,  5619,  1104,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  5619,  1104,  1764,  2163,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101,   144,  5697,  1399,   147,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  3180,  9672,  1116,  7107,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 16890, 12562,  2818,  1104,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 12118,  8767,  4571,  1769,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  5755,  1769, 24768,   102,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 14271,  2749,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 

## Learner

In [None]:
#| export
class XCEvalLoopOutput(NamedTuple):
    """Result bundle returned by `XCLearner.evaluation_loop`."""
    # NOTE(review): fields are annotated as numpy arrays, but `evaluation_loop` does not
    # numpify its accumulators, so they may be torch tensors at runtime — confirm.
    # Predicted label indices accumulated over all batches.
    pred_idx: Union[np.ndarray, Tuple[np.ndarray]]
    # Pointer companion of `pred_idx`; presumably the number of predictions per data point — TODO confirm.
    pred_ptr: Union[np.ndarray, Tuple[np.ndarray]]
    # Scores aligned with `pred_idx`.
    pred_score: Union[np.ndarray, Tuple[np.ndarray]]
    # Ground-truth label indices, taken from the batch key `lbl2data_idx`.
    targ_idx: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
    # Pointer companion of `targ_idx`, taken from the batch key `lbl2data_data2ptr`.
    targ_ptr: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
    # Metric name -> value, keys prefixed with the loop's `metric_key_prefix`.
    metrics: Optional[Dict[str, float]]
    # Number of evaluated examples.
    num_samples: Optional[int]

class XCPredictionOutput(NamedTuple):
    """Result bundle returned by `XCLearner.predict` (no target fields)."""
    # Predicted label indices accumulated over all batches.
    pred_idx: Union[np.ndarray, Tuple[np.ndarray]]
    # Pointer companion of `pred_idx`; presumably the number of predictions per data point — TODO confirm.
    pred_ptr: Union[np.ndarray, Tuple[np.ndarray]]
    # Scores aligned with `pred_idx`.
    pred_score: Optional[Union[np.ndarray, Tuple[np.ndarray]]]
    # Metric name -> value, including the speed metrics added by `predict`.
    metrics: Optional[Dict[str, float]]
    # Number of predicted examples.
    num_samples: Optional[int]
    

In [None]:
#| export
class XCLearningArguments(Seq2SeqTrainingArguments):
    """`Seq2SeqTrainingArguments` extended with the options `XCLearner` reads.

    Option groups:
      * `generation_*`: forwarded to `TrieBeamSearch` (beam count, beam cap, length penalty, max info).
      * `representation_*`: output attribute holding the embedding, beams used for index
        search, and how often accumulated representations are moved off the host buffer.
      * `index_*`: forwarded to `IndexSearch` (space, efc, m, efs, thread count).
      * `predict_with_generation` / `predict_with_representation`: default prediction paths
        when the model does not declare `use_generation` / `use_representation`.
      * `output_concatenation_weight`: multiplier applied to generation scores before merging.
      * `group_by_cluster`, `minimum_clusters`, `maximum_clusters`, `num_cluster_update_epochs`:
        cluster-grouped sampling options.
    """

    @delegates(Seq2SeqTrainingArguments.__init__)
    def __init__(self, 
                 generation_length_penalty:Optional[float]=1.0, 
                 generation_num_beams:Optional[int]=5,
                 generation_max_beams:Optional[int]=10,
                 generation_max_info:Optional[int]=None,
                 representation_accumulation_steps:Optional[int]=None,
                 representation_attribute:Optional[str]='data_repr',
                 representation_num_beams:Optional[int]=5,
                 index_space:Optional[str]='cosine', 
                 index_efc:Optional[int]=200, 
                 index_m:Optional[int]=16, 
                 index_efs:Optional[int]=50,
                 index_num_threads:Optional[int]=84,
                 predict_with_generation:Optional[bool]=False,
                 predict_with_representation:Optional[bool]=False,
                 output_concatenation_weight:Optional[float]=1.0,
                 group_by_cluster:Optional[bool]=False,
                 minimum_clusters:Optional[int]=3,
                 maximum_clusters:Optional[int]=None,
                 num_cluster_update_epochs:Optional[int]=1,
                 **kwargs):
        super().__init__(**kwargs)
        store_attr('generation_num_beams,generation_max_beams,generation_length_penalty,generation_max_info')
        store_attr('representation_accumulation_steps,representation_attribute,representation_num_beams')
        store_attr('index_space,index_efc,index_m,index_efs,index_num_threads')
        store_attr('predict_with_generation,predict_with_representation,output_concatenation_weight')
        store_attr('group_by_cluster,num_cluster_update_epochs')
        # Clamp the lower bound first and derive the upper bound from the clamped value,
        # so `maximum_clusters` can never fall below `minimum_clusters`.
        self.minimum_clusters = max(1, minimum_clusters)
        self.maximum_clusters = self.minimum_clusters if maximum_clusters is None else max(self.minimum_clusters, maximum_clusters)
        

### `XCLearner`

In [None]:
#| export
class XCLearner(Seq2SeqTrainer):
    """`Seq2SeqTrainer` that predicts labels via trie-constrained beam search and/or index search.

    `self.tbs` (`TrieBeamSearch`) serves the generation path and `self.idxs`
    (`IndexSearch`) serves the representation path; both are configured from `self.args`.
    """

    @delegates(Seq2SeqTrainer.__init__)
    def __init__(self, 
                 trie:Optional[Trie]=None, 
                 **kwargs):
        super().__init__(**kwargs)
        self.tbs = TrieBeamSearch(trie, n_bm=self.args.generation_num_beams, max_bm=self.args.generation_max_beams,
                                  len_penalty=self.args.generation_length_penalty, max_info=self.args.generation_max_info)
        self.idxs = IndexSearch(space=self.args.index_space, efc=self.args.index_efc, m=self.args.index_m, efs=self.args.index_efs, 
                                n_bm=self.args.representation_num_beams, n_threads=self.args.index_num_threads)

    def _wrap_model(self, model, training=True, dataloader=None):
        """Return `model` as-is if it is already wrapped; otherwise wrap it in `XCDataParallel` when `n_gpu > 1`."""
        if unwrap_model(model) is not model:
            return model

        if self.args.n_gpu > 1:
            model = XCDataParallel(module=model)
        return model

    def _fill_generation_defaults(self, gen_kwargs:Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of `gen_kwargs` with missing/`None` entries filled from `self.args`.

        Handles `length_penalty`, `gen_num_beams` and `repr_num_beams`. A key is only
        filled when the corresponding argument value is not `None`.
        """
        resolved = gen_kwargs.copy()
        defaults = {
            "length_penalty": self.args.generation_length_penalty,
            "gen_num_beams": self.args.generation_num_beams,
            "repr_num_beams": self.args.representation_num_beams,
        }
        for key, value in defaults.items():
            if resolved.get(key) is None and value is not None: resolved[key] = value
        return resolved

    def evaluate(self, eval_dataset:Optional[Dataset]=None, ignore_keys:Optional[List[str]]=None, metric_key_prefix:str="eval",
                 **gen_kwargs):
        """Evaluate via the parent `evaluate`, building the label index first when representations are used."""
        gen_kwargs = self._fill_generation_defaults(gen_kwargs)
        self.gather_function, self._gen_kwargs  = self.accelerator.gather, gen_kwargs

        if self._perform_representation(unwrap_model(self.model)): self._build_lbl_index(eval_dataset)

        # Forward the resolved kwargs: `Seq2SeqTrainer.evaluate` re-assigns `self._gen_kwargs`
        # from what it receives, so dropping them here would discard the caller's settings.
        return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix, **gen_kwargs)
    
    def predict(self, test_dataset: Dataset, ignore_keys:Optional[List[str]]=None, metric_key_prefix: str = "test",**gen_kwargs):
        """Run the evaluation loop on `test_dataset` and return an `XCPredictionOutput` with speed metrics added."""
        self.gather_function, self._gen_kwargs = self.accelerator.gather, self._fill_generation_defaults(gen_kwargs)
        self._memory_tracker.start()
        
        if self._perform_representation(unwrap_model(self.model)): self._build_lbl_index(test_dataset)
            
        test_dataloader = self.get_test_dataloader(test_dataset)
        start_time = time.time()
        
        output = self.evaluation_loop(test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
        total_batch_size = self.args.eval_batch_size * self.args.world_size
        # Shift the start time by the JIT compilation time so it is excluded from the speed metrics.
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(metric_key_prefix,start_time,num_samples=output.num_samples,num_steps=math.ceil(output.num_samples / total_batch_size),)
        )
        self.control = self.callback_handler.on_predict(self.args, self.state, self.control, output.metrics)
        self._memory_tracker.stop_and_update_metrics(output.metrics)
        return XCPredictionOutput(pred_idx=output.pred_idx, pred_ptr=output.pred_ptr, pred_score=output.pred_score, metrics=output.metrics, num_samples=output.num_samples)
    
    def _gather_host_output(self, output, host_output):
        """Pad and gather `output` across processes and append it to `host_output`.

        Returns `host_output` unchanged when `output` is `None`.
        """
        if output is None: return host_output
        output = self.accelerator.pad_across_processes(output, dim=1, pad_index=-100)
        output = self.gather_function((output))
        return output if host_output is None else nested_concat(host_output, output, padding_index=-100)

    def _gather_all_output(self, host_output, all_output):
        """Move `host_output` to CPU (when it is a tensor) and append it to `all_output`.

        Returns `all_output` unchanged when `host_output` is `None`.
        """
        if host_output is None: return all_output
        if isinstance(host_output, torch.Tensor): host_output = host_output.cpu()
        return host_output if all_output is None else nested_concat(all_output, host_output, padding_index=-100)
            
            

In [None]:
#| export
@patch
def generation_output(
    self:XCLearner,
    model:nn.Module,
    inputs:Dict[str, Union[torch.Tensor, Any]],
    **kwargs
):
    """Predict labels with trie-constrained beam search (`self.tbs`).

    `gen_num_beams` and `length_penalty` may be supplied through `kwargs`; missing or
    `None` values fall back to `self.args`. Note that the search is run on
    `self.model`, not on the `model` argument.

    Returns a dict with `pred_idx`, `pred_score` and `pred_ptr`.
    """
    inputs = self._prepare_inputs(inputs)

    beams = kwargs.pop("gen_num_beams", None)
    if beams is None: beams = self.args.generation_num_beams
    penalty = kwargs.pop("length_penalty", None)
    if penalty is None: penalty = self.args.generation_length_penalty

    with torch.no_grad():
        found = self.tbs.proc(self.model, inputs.copy(), n_bm=beams, len_penalty=penalty)

    return {
        'pred_idx': found['info2seq2data_idx'],
        'pred_score': found['info2seq2data_score'],
        'pred_ptr': found['info2seq2data_data2ptr'],
    }

@patch
def representation_output(
    self:XCLearner,
    model:nn.Module,
    inputs:Dict[str, Union[torch.Tensor, Any]],
    **kwargs
):
    """Predict labels by searching `self.idxs` with the model's representation.

    The representation is read from the model output attribute named by
    `self.args.representation_attribute` and moved to CPU before the search.
    `repr_num_beams` may be supplied through `kwargs`; a missing or `None` value
    falls back to `self.args.representation_num_beams`.

    Returns a dict with `pred_idx`, `pred_score` and `pred_ptr`.
    """
    inputs = self._prepare_inputs(inputs)

    beams = kwargs.pop("repr_num_beams", None)
    if beams is None: beams = self.args.representation_num_beams

    with torch.no_grad():
        embedding = getattr(model(**inputs), self.args.representation_attribute)
    found = self.idxs.proc(embedding.cpu(), n_bm=beams)

    return {
        'pred_idx': found['info2data_idx'],
        'pred_score': found['info2data_score'],
        'pred_ptr': found['info2data_data2ptr'],
    }
    

In [None]:
#| export
@patch
def _perform_generation(self:XCLearner, model:nn.Module, predict_with_generation:Optional[bool]=None):
    """Decide whether the generation path runs.

    The unwrapped model's `use_generation` attribute wins when present; otherwise the
    explicit argument is used, falling back to `self.args.predict_with_generation`.
    """
    if predict_with_generation is None: predict_with_generation = self.args.predict_with_generation
    return getattr(unwrap_model(model), 'use_generation', predict_with_generation)

@patch
def _perform_representation(self:XCLearner, model:nn.Module, predict_with_representation:Optional[bool]=None):
    """Decide whether the representation path runs.

    The unwrapped model's `use_representation` attribute wins when present; otherwise the
    explicit argument is used, falling back to `self.args.predict_with_representation`.
    """
    if predict_with_representation is None: predict_with_representation = self.args.predict_with_representation
    return getattr(unwrap_model(model), 'use_representation', predict_with_representation)

@patch
def resize_pred(cls:XCLearner, t, n_t):
    """Reshape flat per-row values `t` into a `(len(n_t), n_t.max())` matrix.

    `n_t[i]` is the number of entries of `t` that belong to row `i`. Shorter rows are
    padded by repeating their last entry.
    NOTE(review): a row with zero entries has no last element to repeat; the code
    appears to assume every count is >= 1 — confirm.
    """
    n_rows, widest = len(n_t), n_t.max()
    last_pos = n_t.cumsum(dim=0) - 1      # flat position of each row's last entry
    last_reps = widest - n_t + 1          # how often that last entry must appear
    reps = torch.ones((len(t),), dtype=last_reps.dtype).scatter(0, last_pos, last_reps)
    return t.repeat_interleave(reps).view(n_rows, -1)

@patch
def output_mask(cls:XCLearner, n_t, l):
    """Build a `(len(n_t), n_t.max())` mask: 1 for real entries, 0 for padding.

    `n_t[i]` is the number of real entries in row `i` and `l` is their total count.
    A separator slot is inserted after every row in a flat buffer of length
    `l + len(n_t)`; real entries are kept once, and each separator is expanded into
    the zeros that pad its row.
    """
    n_rows = len(n_t)
    pad_len = n_t.max() - n_t                                  # zeros needed per row
    sep_pos = n_t.cumsum(dim=0) + torch.arange(n_rows)         # separator slot of each row
    flat = torch.ones((l + n_rows,), dtype=sep_pos.dtype).scatter(0, sep_pos, 0)
    reps = torch.ones((l + n_rows,), dtype=sep_pos.dtype).scatter(0, sep_pos, pad_len)
    return flat.repeat_interleave(reps).view(n_rows, -1)

@patch
def resize_output(cls:XCLearner, pred_idx, pred_score, pred_ptr):
    """Return `(padded idx, padded score, validity mask, pred_ptr)` for one prediction set."""
    idx_mat = cls.resize_pred(pred_idx, pred_ptr)
    score_mat = cls.resize_pred(pred_score, pred_ptr)
    mask = cls.output_mask(pred_ptr, len(pred_idx))
    return idx_mat, score_mat, mask, pred_ptr

@patch
def concatenate_output(cls:XCLearner, gen_o:Dict, repr_o:Dict):
    """Merge generation and representation predictions row by row.

    Generation scores are exponentiated and multiplied by
    `cls.args.output_concatenation_weight` before merging. Both inputs must contain
    exactly the keys `pred_idx`, `pred_score` and `pred_ptr`, because they are
    unpacked into `resize_output`. Neither input dict is modified.

    Returns a dict with flat `pred_idx` / `pred_score` (padding removed) and a
    `pred_ptr` that is the element-wise sum of both pointers.
    """
    # Build a new dict instead of overwriting the caller's `pred_score` in place.
    gen_o = {**gen_o, 'pred_score': torch.exp(gen_o['pred_score'])*cls.args.output_concatenation_weight}
    gen_r, repr_r = cls.resize_output(**gen_o), cls.resize_output(**repr_o)
    # Place both padded matrices side by side, flatten, then drop the padded slots.
    pred_idx, pred_score, mask = [torch.hstack([gen_r[i], repr_r[i]]).flatten() for i in range(3)]
    keep = torch.where(mask)[0]
    return {
        'pred_idx': pred_idx[keep],
        'pred_score': pred_score[keep],
        'pred_ptr': gen_r[3]+repr_r[3],
    }
    
@patch
def prediction_step(
    self:XCLearner,
    model: nn.Module,
    inputs: Dict[str, Union[torch.Tensor, Any]],
    prediction_loss_only: bool,
    predict_with_generation: bool,
    predict_with_representation: bool,
    ignore_keys: Optional[List[str]] = None,
    **kwargs,
) -> Tuple[Optional[torch.Tensor], Optional[Dict[str, Any]]]:
    """Run one evaluation step.

    Args:
        model: model (possibly wrapped) used for the forward pass.
        inputs: batch passed to the model as keyword arguments.
        prediction_loss_only: return only the loss; `None` falls back to `self.args`.
        predict_with_generation / predict_with_representation: defaults for the two
            prediction paths (see `_perform_generation` / `_perform_representation`).
        ignore_keys: accepted for interface compatibility; not used here.
        **kwargs: forwarded to `generation_output` and `representation_output`.

    Returns:
        `(loss, output)`. `loss` is the detached mean loss, or `None` when the model
        returned none. `output` is `None` in loss-only mode or when there is nothing
        to report; otherwise a dict with the prediction keys (if a prediction path
        ran) and `targ_idx` / `targ_ptr` (if the batch has `lbl2data_idx`).
    """
    with torch.no_grad():
        with self.compute_loss_context_manager(): outputs = model(**inputs)
        # A batch without labels may yield no loss; report `None` instead of failing.
        raw_loss = outputs.get("loss") if isinstance(outputs, dict) else outputs[0]
        loss = None if raw_loss is None else raw_loss.mean().detach()
    prediction_loss_only = self.args.prediction_loss_only if prediction_loss_only is None else prediction_loss_only
    if prediction_loss_only: return loss, None

    output, repr_o = None, None
    if self._perform_generation(model, predict_with_generation): output = self.generation_output(model, inputs, **kwargs)
    if self._perform_representation(model, predict_with_representation): repr_o = self.representation_output(model, inputs, **kwargs)
    if output is None: output = repr_o
    elif repr_o is not None: output = self.concatenate_output(output, repr_o)

    if 'lbl2data_idx' in inputs:
        # No prediction path may have run; still return the targets rather than crash on `None`.
        if output is None: output = {}
        output.update({'targ_idx':inputs['lbl2data_idx'], 'targ_ptr':inputs['lbl2data_data2ptr']})

    return loss, output


In [None]:
#| export
@patch
def _build_lbl_index(self:XCLearner, dataset:Optional[Dataset]=None):
    """Build `self.idxs` from the label representations of a dataset.

    The dataset is chosen in this order: the explicit `dataset` argument, then
    `self.eval_dataset`, then `self.train_dataset`. Its `lbl_dset` attribute is
    embedded with `get_representation` and handed to `self.idxs.build`.

    Raises:
        ValueError: if no dataset is available.
    """
    # NOTE(review): previously an explicit `dataset` was overridden by the trainer's
    # eval/train datasets whenever those were set; the argument now takes precedence.
    if dataset is None: dataset = self.eval_dataset
    if dataset is None: dataset = self.train_dataset
    if dataset is None: raise ValueError('Failed to build `self.idxs`')

    lbl_dl = self.get_test_dataloader(dataset.lbl_dset)
    lbl_repr = self.get_representation(lbl_dl)
    self.idxs.build(lbl_repr)
    
@patch
def evaluation_loop(
    self:XCLearner,
    dataloader:DataLoader,
    description:str,
    prediction_loss_only:Optional[bool] = None,
    predict_with_generation:Optional[bool]=None,
    predict_with_representation:Optional[bool]=None,
    ignore_keys:Optional[List[str]] = None,
    metric_key_prefix:str="eval",
) -> XCEvalLoopOutput:
    """Iterate over `dataloader`, collect losses/predictions/targets and compute metrics.

    Args:
        dataloader: batches to evaluate.
        description: accepted for interface compatibility; not used here.
        prediction_loss_only: only compute the loss; `None` falls back to `self.args`.
        predict_with_generation / predict_with_representation: forwarded to `prediction_step`.
        ignore_keys: forwarded to `prediction_step`.
        metric_key_prefix: prefix applied to every metric name.

    Returns:
        `XCEvalLoopOutput`; prediction/target fields are `None` when they were not collected.
    """
    args = self.args
    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only

    model = self._wrap_model(self.model, training=False, dataloader=dataloader)

    # Prepare the model through accelerate only if nothing has been prepared yet.
    if len(self.accelerator._models) == 0 and model is self.model:
        model = self.accelerator.prepare(model) if self.is_deepspeed_enabled else self.accelerator.prepare_model(model, evaluation_mode=True)
        if self.is_fsdp_enabled: self.model = model
        if model is not self.model: self.model_wrapped = model
        if self.is_deepspeed_enabled: self.deepspeed = self.model_wrapped

    batch_size = self.args.eval_batch_size
    model.eval()
    self.callback_handler.eval_dataloader = dataloader
    eval_dataset = getattr(dataloader, "dataset", None)
    
    if args.past_index >= 0: self._past = None

    # Beam/penalty overrides stored by `evaluate`/`predict`; absent when called directly.
    gen_kwargs = getattr(self, "_gen_kwargs", None) or {}

    losses_host, all_losses = None, None
    host_output, all_output = {}, {}
    
    observed_num_examples = 0
    for step, inputs in enumerate(dataloader):
        observed_batch_size = find_batch_size(inputs)
        if observed_batch_size is not None:
            observed_num_examples += observed_batch_size
            if batch_size is None: batch_size = observed_batch_size
                
        loss, output = self.prediction_step(model, inputs, prediction_loss_only, predict_with_generation, predict_with_representation,
                                            ignore_keys=ignore_keys, **gen_kwargs)
        
        if loss is not None:
            losses = self.gather_function((loss.repeat(batch_size)))
            losses_host = losses if losses_host is None else nested_concat(losses_host, losses, padding_index=-100)
        # `output` is None in loss-only mode or when the step produced nothing.
        if output is not None:
            for k in output: host_output[k] = self._gather_host_output(output[k], host_output.get(k, None))
            
        self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
        
        if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
            # Flush the whole host buffer (not just the last batch) and reset it,
            # so no loss is dropped or counted twice.
            all_losses, losses_host = self._gather_all_output(losses_host, all_losses), None
            for k in host_output: all_output[k], host_output[k] = self._gather_all_output(host_output[k], all_output.get(k, None)), None
    
    self.gather_function = self.accelerator.gather_for_metrics
    if args.past_index and hasattr(self, "_past"): delattr(self, "_past")

    # Flush whatever is left in the host buffers.
    all_losses = self._gather_all_output(losses_host, all_losses)
    for k in host_output: all_output[k], host_output[k] = self._gather_all_output(host_output[k], all_output.get(k, None)), None
        
    if has_length(eval_dataset): num_samples = len(eval_dataset)
    elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0:
        num_samples = eval_dataset.num_examples
    else:
        if has_length(dataloader): num_samples = self.num_examples(dataloader)
        else: num_samples = observed_num_examples
    if num_samples == 0 and observed_num_examples > 0: num_samples = observed_num_examples

    # Metrics need both targets and predictions.
    if (self.compute_metrics is not None and 
        all_output.get('targ_idx') is not None and 
        all_output.get('pred_idx') is not None): 
        metrics = self.compute_metrics(**all_output)
    else: metrics = {}
        
    metrics = denumpify_detensorize(metrics)

    if all_losses is not None: metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
    if hasattr(self, "jit_compilation_time"): metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time
        
    for key in list(metrics.keys()):
        if not key.startswith(f"{metric_key_prefix}_"): metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
    
    # `.get` keeps loss-only and label-free runs from failing on missing keys.
    return XCEvalLoopOutput(pred_idx=all_output.get('pred_idx'), pred_ptr=all_output.get('pred_ptr'), pred_score=all_output.get('pred_score'),
                            targ_idx=all_output.get('targ_idx'), targ_ptr=all_output.get('targ_ptr'), metrics=metrics, num_samples=num_samples)
    

In [None]:
#| export
@patch
def get_representation(self:XCLearner, dataloader: DataLoader):
    """Embed every batch of `dataloader` with `self.model` and return the stacked result.

    The representation is read from the model output attribute named by
    `self.args.representation_attribute`. Every
    `self.args.representation_accumulation_steps` steps (when set) the accumulated
    batches are handed to `_gather_all_output`, which moves tensors to CPU.

    The model is switched to eval mode for the duration of the call and its previous
    train/eval mode is restored afterwards, so train-only layers such as dropout do
    not perturb the representations when this runs in the middle of training.
    """
    data_host, all_data = None, None
    was_training = self.model.training
    self.model.eval()
    try:
        for step, inputs in tqdm(enumerate(dataloader), total=len(dataloader)):
            inputs = inputs.to(self.model.device)
            with torch.no_grad(): data = getattr(self.model(**inputs), self.args.representation_attribute)
            data_host = self._gather_host_output(data, data_host)
            if self.args.representation_accumulation_steps is not None and (step + 1) % self.args.representation_accumulation_steps == 0:
                all_data, data_host = self._gather_all_output(data_host, all_data), None
    finally:
        self.model.train(was_training)
    return self._gather_all_output(data_host, all_data)
    

### Training loop

In [None]:
#| export
@patch
def _get_train_sampler(self:XCLearner):
    """Pick the sampler for the training dataloader.

    Returns `None` when there is no sized training dataset. Otherwise returns, in
    order of precedence: a `LengthGroupedSampler` (`group_by_length`), a
    `ClusterGroupedSampler` (`group_by_cluster`), or a `RandomSampler`.
    """
    dset = self.train_dataset
    if dset is None or not has_length(dset):
        return None

    if self.args.group_by_length:
        # Precomputed lengths are only read from a `datasets.Dataset` that has the length column.
        lengths = None
        if is_datasets_available() and isinstance(dset, datasets.Dataset):
            if self.args.length_column_name in dset.column_names:
                lengths = dset[self.args.length_column_name]
        input_name = None if self.tokenizer is None else self.tokenizer.model_input_names[0]
        return LengthGroupedSampler(
            self.args.train_batch_size * self.args.gradient_accumulation_steps,
            dataset=dset,
            lengths=lengths,
            model_input_name=input_name,
        )

    if self.args.group_by_cluster:
        return ClusterGroupedSampler(n=len(dset))
    return RandomSampler(dset)
        

In [None]:
#| export
@patch
def get_train_dataloader(self:XCLearner):
    """Build the training `DataLoader` from `self.train_dataset`.

    Unused columns are removed from a `datasets.Dataset`; for any other dataset the
    collator is wrapped to drop them instead. Sampler, `drop_last`, worker seeding
    and prefetching are only configured for non-iterable datasets. The loader is
    returned as constructed, without being passed through `self.accelerator`.

    Raises:
        ValueError: if there is no training dataset.
    """
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")

    train_dataset, data_collator = self.train_dataset, self.data_collator
    if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
        train_dataset = self._remove_unused_columns(train_dataset, description="training")
    else:
        data_collator = self._get_collator_with_removed_columns(data_collator, description="training")

    loader_kwargs = dict(
        batch_size=self._train_batch_size,
        collate_fn=data_collator,
        num_workers=self.args.dataloader_num_workers,
        pin_memory=self.args.dataloader_pin_memory,
        persistent_workers=self.args.dataloader_persistent_workers,
    )

    if not isinstance(train_dataset, torch.utils.data.IterableDataset):
        loader_kwargs.update(
            sampler=self._get_train_sampler(),
            drop_last=self.args.dataloader_drop_last,
            worker_init_fn=seed_worker,
            prefetch_factor=self.args.dataloader_prefetch_factor,
        )

    return DataLoader(train_dataset, **loader_kwargs)
    

In [None]:
#| export
@patch
def _get_n_cluster(self:XCLearner, epochs_trained:int, num_train_epochs:int):
    """Return the number of clusters to use after `epochs_trained` epochs.

    When `args.maximum_clusters` is None the count is fixed at
    `args.minimum_clusters`. Otherwise it is interpolated linearly from the
    minimum towards the maximum over `num_train_epochs` and truncated to an int.
    """
    lo, hi = self.args.minimum_clusters, self.args.maximum_clusters
    if hi is None: return lo
    growth = (hi - lo) / num_train_epochs * epochs_trained
    return int(lo + growth)

@patch
def _get_train_data_cluster(self:XCLearner, epochs_trained:int, num_train_epochs:int):
    """Cluster the training data points using their current representations.

    Args:
        epochs_trained: number of epochs completed so far; forwarded to `_get_n_cluster`.
        num_train_epochs: total number of training epochs; forwarded to `_get_n_cluster`.

    Returns:
        The first element of the tuple returned by `BalancedClusters.proc`.
    """
    dataset = self.train_dataset.data_dset
    dataloader = self.get_test_dataloader(dataset)
    # Use this learner's own model rather than a notebook-level `learn` global,
    # which does not exist when the exported module is imported.
    data_repr = self.get_representation(dataloader)
    n_cluster = self._get_n_cluster(epochs_trained, num_train_epochs)
    cluster, _ = BalancedClusters.proc(data_repr, n_cluster=n_cluster)
    return cluster

@patch
def update_dataloader_sampler(self:XCLearner, dataloader:DataLoader, epochs_trained:int, num_train_epochs:int):
    """Recompute the training-data clusters and hand them to the dataloader's sampler.

    Does nothing unless the dataloader's sampler is a `ClusterGroupedSampler`.
    """
    sampler = dataloader.sampler
    if not isinstance(sampler, ClusterGroupedSampler): return
    sampler.set_cluster(self._get_train_data_cluster(epochs_trained, num_train_epochs))
    

In [None]:
#| export
@patch
def _validate_group_by_cluster(self:XCLearner):
    """Check that `group_by_cluster` is only enabled for models that expose representations.

    The check is made on the unwrapped `self.model`, so a wrapper around the model
    does not hide the `use_representation` attribute.

    Raises:
        ValueError: if `args.group_by_cluster` is set but the model has no truthy
            `use_representation` attribute.
    """
    if not self.args.group_by_cluster: return
    # A missing attribute is treated the same as a falsy one.
    if not getattr(unwrap_model(self.model), 'use_representation', False):
        raise ValueError('Cannot use `group_by_cluster` for models without `use_representation`.')

@patch
def _inner_training_loop(
    self:XCLearner, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
):
    """Run the training loop, optionally re-clustering the training data between epochs.

    Args:
        batch_size: training batch size; stored as `self._train_batch_size`.
        args: training arguments used for step/epoch bookkeeping.
        resume_from_checkpoint: checkpoint directory to resume from, or None.
        trial: hyper-parameter search trial, or None.
        ignore_keys_for_eval: forwarded to `_maybe_log_save_evaluate`.

    Returns:
        `TrainOutput(global_step, train_loss, metrics)`.

    Raises:
        ValueError: if the dataloader has no length and `args.max_steps` is not positive,
            if underflow/overflow debugging is requested with more than one GPU, or if
            `_validate_group_by_cluster` rejects the configuration.
    """
    self.accelerator.free_memory()
    self._train_batch_size = batch_size
    if self.args.auto_find_batch_size:
        if self.state.train_batch_size != self._train_batch_size:
            from accelerate.utils import release_memory

            (self.model_wrapped,) = release_memory(self.model_wrapped)
            self.model_wrapped = self.model

            # Check for DeepSpeed *after* the initial pass and modify the config
            if self.is_deepspeed_enabled:
                # Temporarily unset `self.args.train_batch_size`
                original_bs = self.args.per_device_train_batch_size
                self.args.per_device_train_batch_size = self._train_batch_size // max(1, self.args.n_gpu)
                self.propagate_args_to_deepspeed(True)
                self.args.per_device_train_batch_size = original_bs
        self.state.train_batch_size = self._train_batch_size
    logger.debug(f"Currently training with a batch size of: {self._train_batch_size}")

    # Data loader and number of training steps
    self._validate_group_by_cluster()
    train_dataloader = self.get_train_dataloader()

    if self.is_fsdp_xla_v2_enabled:
        train_dataloader = tpu_spmd_dataloader(train_dataloader)

    # Setting up training control variables:
    # number of training epochs: num_train_epochs
    # number of training steps per epoch: num_update_steps_per_epoch
    # total number of training steps to execute: max_steps
    total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size

    len_dataloader = None
    num_train_tokens = None
    if has_length(train_dataloader):
        len_dataloader = len(train_dataloader)
        num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps
        num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)
        num_examples = self.num_examples(train_dataloader)
        if args.max_steps > 0:
            max_steps = args.max_steps
            num_train_epochs = args.max_steps // num_update_steps_per_epoch + int(
                args.max_steps % num_update_steps_per_epoch > 0
            )
            # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's
            # the best we can do.
            num_train_samples = args.max_steps * total_train_batch_size
            if args.include_tokens_per_second:
                num_train_tokens = (
                    self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps
                )
        else:
            max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)
            num_train_epochs = math.ceil(args.num_train_epochs)
            num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs
            if args.include_tokens_per_second:
                num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs
    elif args.max_steps > 0:  # Rely on max_steps when dataloader does not have a working size
        max_steps = args.max_steps
        # Setting a very large number of epochs so we go as many times as necessary over the iterator.
        num_train_epochs = sys.maxsize
        num_update_steps_per_epoch = max_steps
        num_examples = total_train_batch_size * args.max_steps
        num_train_samples = args.max_steps * total_train_batch_size
        if args.include_tokens_per_second:
            num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps
    else:
        raise ValueError(
            "args.max_steps must be set to a positive value if dataloader does not have a length, was"
            f" {args.max_steps}"
        )

    if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
        if self.args.n_gpu > 1:
            # nn.DataParallel(model) replicates the model, creating new variables and module
            # references registered here no longer work on other gpus, breaking the module
            raise ValueError(
                "Currently --debug underflow_overflow is not supported under DP. Please use DDP"
                " (torchrun or torch.distributed.launch (deprecated))."
            )
        else:
            debug_overflow = DebugUnderflowOverflow(self.model)  # noqa

    delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled

    # We need to reset the scheduler, as its parameters may be different on subsequent calls
    if self._created_lr_scheduler:
        self.lr_scheduler = None
        self._created_lr_scheduler = False

    if self.is_deepspeed_enabled:
        self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)

    if not delay_optimizer_creation:
        self.create_optimizer_and_scheduler(num_training_steps=max_steps)

    self.state = TrainerState()
    self.state.is_hyper_param_search = trial is not None
    self.state.train_batch_size = self._train_batch_size

    # Compute absolute values for logging, eval, and save if given as ratio
    if args.logging_steps is not None:
        if args.logging_steps < 1:
            self.state.logging_steps = math.ceil(max_steps * args.logging_steps)
        else:
            self.state.logging_steps = args.logging_steps
    if args.eval_steps is not None:
        if args.eval_steps < 1:
            self.state.eval_steps = math.ceil(max_steps * args.eval_steps)
        else:
            self.state.eval_steps = args.eval_steps
    if args.save_steps is not None:
        if args.save_steps < 1:
            self.state.save_steps = math.ceil(max_steps * args.save_steps)
        else:
            self.state.save_steps = args.save_steps

    # Activate gradient checkpointing if needed
    if args.gradient_checkpointing:
        if args.gradient_checkpointing_kwargs is None:
            gradient_checkpointing_kwargs = {}
        else:
            gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs

        self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)

    model = self._wrap_model(self.model_wrapped)

    # as the model is wrapped, don't use `accelerator.prepare`
    # this is for unhandled cases such as
    # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX
    use_accelerator_prepare = True if model is self.model else False

    if delay_optimizer_creation:
        if use_accelerator_prepare:
            self.model = self.accelerator.prepare(self.model)
        self.create_optimizer_and_scheduler(num_training_steps=max_steps)

    # prepare using `accelerator` prepare
    if use_accelerator_prepare:
        self.model.train()
        if hasattr(self.lr_scheduler, "step"):
            if self.use_apex:
                model = self.accelerator.prepare(self.model)
            else:
                model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
        else:
            # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
            model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
                self.model, self.optimizer, self.lr_scheduler
            )

    if self.is_fsdp_enabled:
        self.model = self.model_wrapped = model

    # for the rest of this function `model` is the outside model, whether it was wrapped or not
    if model is not self.model:
        self.model_wrapped = model

    # backward compatibility
    if self.is_deepspeed_enabled:
        self.deepspeed = self.model_wrapped

    # ckpt loading
    if resume_from_checkpoint is not None:
        if self.is_deepspeed_enabled:
            deepspeed_load_checkpoint(
                self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model)
            )
        elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled:
            self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped)

    # Check if saved optimizer or scheduler states exist
    self._load_optimizer_and_scheduler(resume_from_checkpoint)

    # important: at this point:
    # self.model         is the Transformers Model
    # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model),
    # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc.

    # Train!
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {num_examples:,}")
    logger.info(f"  Num Epochs = {num_train_epochs:,}")
    logger.info(f"  Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
    if self.args.per_device_train_batch_size != self._train_batch_size:
        logger.info(f"  Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {max_steps:,}")
    logger.info(f"  Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}")

    self.state.epoch = 0
    start_time = time.time()
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    steps_trained_progress_bar = None

    # Check if continuing training from a checkpoint
    if resume_from_checkpoint is not None and os.path.isfile(
        os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
    ):
        self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
        epochs_trained = self.state.global_step // num_update_steps_per_epoch
        if not args.ignore_data_skip:
            steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
            steps_trained_in_current_epoch *= args.gradient_accumulation_steps
        else:
            steps_trained_in_current_epoch = 0

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info(f"  Continuing training from epoch {epochs_trained}")
        logger.info(f"  Continuing training from global step {self.state.global_step}")
        if not args.ignore_data_skip:
            logger.info(
                f"  Will skip the first {epochs_trained} epochs then the first"
                f" {steps_trained_in_current_epoch} batches in the first epoch."
            )

    # Update the references
    self.callback_handler.model = self.model
    self.callback_handler.optimizer = self.optimizer
    self.callback_handler.lr_scheduler = self.lr_scheduler
    self.callback_handler.train_dataloader = train_dataloader
    if self.hp_name is not None and self._trial is not None:
        # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial
        # parameter to Train when using DDP.
        self.state.trial_name = self.hp_name(self._trial)
    if trial is not None:
        assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial
        self.state.trial_params = hp_params(assignments)
    else:
        self.state.trial_params = None
    # This should be the same if the state has been saved but in case the training arguments changed, it's safer
    # to set this after the load.
    self.state.max_steps = max_steps
    self.state.num_train_epochs = num_train_epochs
    self.state.is_local_process_zero = self.is_local_process_zero()
    self.state.is_world_process_zero = self.is_world_process_zero()

    # tr_loss is a tensor to avoid synchronization of TPUs through .item()
    tr_loss = torch.tensor(0.0).to(args.device)
    # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses
    self._total_loss_scalar = 0.0
    self._globalstep_last_logged = self.state.global_step
    model.zero_grad()
    grad_norm: Optional[float] = None

    self.control = self.callback_handler.on_train_begin(args, self.state, self.control)

    # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
    if not args.ignore_data_skip:
        for epoch in range(epochs_trained):
            sampler = get_dataloader_sampler(train_dataloader)
            sampler_kinds = [RandomSampler]
            if version.parse(accelerate_version) > version.parse("0.23.0"):
                sampler_kinds.append(SeedableRandomSampler)
            is_random_sampler = isinstance(sampler, tuple(sampler_kinds))
            if not is_random_sampler:
                # We just need to begin an iteration to create the randomization of the sampler.
                for _ in train_dataloader:
                    break
            else:
                # Otherwise we need to call the whooooole sampler cause there is some random operation added
                # AT THE VERY END!
                sampler = sampler if sampler is not None else []
                _ = list(sampler)

    total_batched_samples = 0
    for epoch in range(epochs_trained, num_train_epochs):
        # Refresh the clusters based on the *current* epoch. `epochs_trained` is fixed for the
        # whole loop, so using it here would freeze both the refresh decision and the
        # cluster count computed by `_get_n_cluster` at their initial values.
        if self.args.group_by_cluster and epoch % self.args.num_cluster_update_epochs == 0:
            self.update_dataloader_sampler(train_dataloader, epoch, num_train_epochs)

        epoch_iterator = train_dataloader
        if hasattr(epoch_iterator, "set_epoch"):
            epoch_iterator.set_epoch(epoch)

        # Reset the past mems state at the beginning of each epoch if necessary.
        if args.past_index >= 0:
            self._past = None

        steps_in_epoch = (
            len(epoch_iterator)
            if len_dataloader is not None
            else args.max_steps * args.gradient_accumulation_steps
        )
        self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)

        if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0:
            self._load_rng_state(resume_from_checkpoint)

        rng_to_sync = False
        steps_skipped = 0
        if steps_trained_in_current_epoch > 0:
            epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch)
            steps_skipped = steps_trained_in_current_epoch
            steps_trained_in_current_epoch = 0
            rng_to_sync = True

        step = -1
        for step, inputs in enumerate(epoch_iterator):
            total_batched_samples += 1

            if self.args.include_num_input_tokens_seen:
                main_input_name = getattr(self.model, "main_input_name", "input_ids")
                if main_input_name not in inputs:
                    logger.warning(
                        "Tried to track the number of tokens seen, however the current model is "
                        "not configured properly to know what item is the input. To fix this, add "
                        "a `main_input_name` attribute to the model class you are using."
                    )
                else:
                    self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()
            if rng_to_sync:
                self._load_rng_state(resume_from_checkpoint)
                rng_to_sync = False

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                if steps_trained_progress_bar is not None:
                    steps_trained_progress_bar.update(1)
                if steps_trained_in_current_epoch == 0:
                    self._load_rng_state(resume_from_checkpoint)
                continue
            elif steps_trained_progress_bar is not None:
                steps_trained_progress_bar.close()
                steps_trained_progress_bar = None

            if step % args.gradient_accumulation_steps == 0:
                self.control = self.callback_handler.on_step_begin(args, self.state, self.control)

            with self.accelerator.accumulate(model):
                tr_loss_step = self.training_step(model, inputs)

            if (
                args.logging_nan_inf_filter
                and not is_torch_tpu_available()
                and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
            ):
                # if loss is nan or inf simply add the average of previous logged losses
                tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
            else:
                tr_loss += tr_loss_step

            self.current_flos += float(self.floating_point_ops(inputs))

            is_last_step_and_steps_less_than_grad_acc = (
                steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch
            )

            if (
                total_batched_samples % args.gradient_accumulation_steps == 0
                or
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                is_last_step_and_steps_less_than_grad_acc
            ):
                # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered
                # in accelerate. So, explicitly enable sync gradients to True in that case.
                if is_last_step_and_steps_less_than_grad_acc:
                    self.accelerator.gradient_state._set_sync_gradients(True)

                # Gradient clipping
                if args.max_grad_norm is not None and args.max_grad_norm > 0:
                    # deepspeed does its own clipping

                    if is_sagemaker_mp_enabled() and args.fp16:
                        _grad_norm = self.optimizer.clip_master_grads(args.max_grad_norm)
                    elif self.use_apex:
                        # Revert to normal clipping otherwise, handling Apex or full precision
                        _grad_norm = nn.utils.clip_grad_norm_(
                            amp.master_params(self.optimizer),
                            args.max_grad_norm,
                        )
                    else:
                        _grad_norm = self.accelerator.clip_grad_norm_(
                            model.parameters(),
                            args.max_grad_norm,
                        )

                    if (
                        is_accelerate_available()
                        and self.accelerator.distributed_type == DistributedType.DEEPSPEED
                    ):
                        grad_norm = model.get_global_grad_norm()
                    else:
                        grad_norm = _grad_norm.item() if _grad_norm is not None else None

                # Optimizer step
                self.optimizer.step()
                optimizer_was_run = not self.accelerator.optimizer_step_was_skipped
                if optimizer_was_run:
                    # Delay optimizer scheduling until metrics are generated
                    if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                        self.lr_scheduler.step()

                model.zero_grad()
                self.state.global_step += 1
                self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
                self.control = self.callback_handler.on_step_end(args, self.state, self.control)

                self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
            else:
                self.control = self.callback_handler.on_substep_end(args, self.state, self.control)

            if self.control.should_epoch_stop or self.control.should_training_stop:
                # PyTorch/XLA relies on the data loader to insert the mark_step for
                # each step. Since we are breaking the loop early, we need to manually
                # insert the mark_step here.
                if is_torch_tpu_available():
                    xm.mark_step()
                break
        if step < 0:
            logger.warning(
                "There seems to be not a single sample in your epoch_iterator, stopping training at step"
                f" {self.state.global_step}! This is expected if you're using an IterableDataset and set"
                f" num_steps ({max_steps}) higher than the number of available samples."
            )
            self.control.should_training_stop = True

        self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
        self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)

        if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
            if is_torch_tpu_available():
                # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
                xm.master_print(met.metrics_report())
            else:
                logger.warning(
                    "You enabled PyTorch/XLA debug metrics but you don't have a TPU "
                    "configured. Check your training configuration if this is unexpected."
                )
        if self.control.should_training_stop:
            break

    if args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of training
        delattr(self, "_past")

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
        # Wait for everyone to get here so we are sure the model has been saved by process 0.
        if is_torch_tpu_available():
            xm.rendezvous("load_best_model_at_end")
        elif args.parallel_mode == ParallelMode.DISTRIBUTED:
            dist.barrier()
        elif is_sagemaker_mp_enabled():
            smp.barrier()

        self._load_best_model()

    # add remaining tr_loss
    self._total_loss_scalar += tr_loss.item()
    # Guard against ZeroDivisionError when no optimizer step was taken (e.g. an empty epoch iterator).
    effective_global_step = max(self.state.global_step, 0.001)
    train_loss = self._total_loss_scalar / effective_global_step

    metrics = speed_metrics(
        "train",
        start_time,
        num_samples=num_train_samples,
        num_steps=self.state.max_steps,
        num_tokens=num_train_tokens,
    )
    self.store_flos()
    metrics["total_flos"] = self.state.total_flos
    metrics["train_loss"] = train_loss

    self.is_in_train = False

    self._memory_tracker.stop_and_update_metrics(metrics)

    self.log(metrics)

    run_dir = self._get_output_dir(trial)
    checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)

    # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
    if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
        for checkpoint in checkpoints_sorted:
            if not os.path.samefile(checkpoint, self.state.best_model_checkpoint):
                logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
                shutil.rmtree(checkpoint)

    self.control = self.callback_handler.on_train_end(args, self.state, self.control)

    # Wait for the checkpoint to be uploaded.
    self._finish_current_push()

    # After training we make sure to retrieve back the original forward pass method
    # for the embedding layer by removing the forward post hook.
    if self.neftune_noise_alpha is not None:
        self._deactivate_neftune(self.model)

    return TrainOutput(self.state.global_step, train_loss, metrics)


### Example

In [None]:
#| hide
# Set WANDB_MODE to 'disabled' for this example run (presumably turns off Weights & Biases logging).
os.environ['WANDB_MODE'] = 'disabled'

In [None]:
#| hide
# Build the data block from the 'data' configuration; the later cells read its
# collator, train/valid datasets and label count.
block = XCBlock.from_cfg('data', valid_pct=0.001, tokz='bert-base-uncased')

  self._set_arrayXarray(i, j, x)


In [None]:
#| hide
# Training arguments for the example run, with cluster-grouped sampling enabled.
# `minimum_clusters` / `maximum_clusters` are the bounds read by `_get_n_cluster`.
# NOTE(review): `output_dir` is an absolute, user-specific path — change it before running elsewhere.
args = XCLearningArguments(
    output_dir='/home/scai/phd/aiz218323/scratch/garbage/T1/',
    per_device_train_batch_size=10,
    per_device_eval_batch_size=64,
    eval_steps=10,
    representation_accumulation_steps=10,
    representation_attribute='data_repr',
    evaluation_strategy='steps',
    label_names=['lbl2data_idx'],
    group_by_cluster=True,
    minimum_clusters=5,
    maximum_clusters=15,
    num_cluster_update_epochs=1,
)

In [None]:
#| hide
# Largest per-device batch size times the number of devices. `torch.cuda.device_count()`
# is 0 on a CPU-only host, so clamp to 1 to avoid passing `bsz=0` to the model.
bsz = max(args.per_device_train_batch_size, args.per_device_eval_batch_size)*max(1, torch.cuda.device_count())
model = BT0004.from_pretrained('bert-base-uncased', lw=0.5, bsz=bsz, tn_targ=10_000, ig_tok=0)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BT0004 were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['dr_loss_fn.t', 'lm_loss_fn.o']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#| hide
# Build the trie from the data block; it is passed to `XCLearner` below.
trie = XCTrie.from_block(block)

  0%|          | 0/312330 [00:00<?, ?it/s]

In [None]:
#| hide
# Metric object handed to the learner as `compute_metrics`.
# NOTE(review): pk/rk look like precision/recall cut-offs — confirm against `PrecRecl`.
metric = PrecRecl(block.n_lbl, block.valid.data_lbl_filterer, prop=block.train.dset.data.data_lbl, pk=5, rk=5, rep_pk=[1, 3, 5], rep_rk=[5])

In [None]:
#| hide
# Assemble the learner from the model, arguments, trie, collator, datasets and metric built above.
learn = XCLearner(
    model=model, 
    args=args,
    trie=trie,
    data_collator=block.collator, 
    train_dataset=block.train.dset, 
    eval_dataset=block.valid.dset,
    compute_metrics=metric,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
#| hide
# Run training. The recorded output below ends in a KeyboardInterrupt, i.e. the run was stopped manually.
learn.train()

  0%|          | 0/5410 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,P@1,P@3,P@5,N@1,N@3,N@5,Psp@1,Psp@3,Psp@5,Psn@1,Psn@3,Psn@5,R@5
10,No log,6.70607,0.016012,0.01213,0.013683,0.016012,0.018282,0.024867,0.011378,0.019889,0.040578,0.011378,0.016335,0.024213,0.03341


  0%|          | 0/2441 [00:00<?, ?it/s]

  0%|          | 0/2441 [00:00<?, ?it/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object>>
Traceback (most recent call last):
  File "/scratch/scai/phd/aiz218323/anaconda3/envs/xc_nlg/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
#| hide
# Run prediction on the learner's evaluation dataset and keep the result for inspection.
o = learn.predict(learn.eval_dataset)