In [1]:
import nlp
import torch
import operator
from torch.utils.data import TensorDataset
import transformers
from transformers import T5Tokenizer
import os

In [2]:
cd ..

/home/saareliad/workspace/async_pipeline


In [3]:
import models

In [4]:
partitioned_model_path = "models/partitioned/t5_small_tied_lmhead_4p_bw12_async_squad1.py"

In [5]:
def get_internal_transformer_config(partitioned_model_path):
    basename = os.path.basename(partitioned_model_path)
    if basename.endswith(".py"):
        basename = basename[:-3]
    return basename

In [6]:
internal_transformer_config = get_internal_transformer_config(partitioned_model_path)

In [7]:
model, tokenizer, config = models.transformers_utils.get_model_tokenizer_and_config_by_name(internal_transformer_config)

{'missing_keys': ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'], 'error_msgs': []}


In [8]:
######################
# T5 preprocessing
######################
# moving the preprocessing from the model outside.
# (its problematic for pipeline to do preprocessing inside the model)

def is_None(a):
    return operator.is_(a, None)


def is_not_None(a):
    return operator.is_not(a, None)


# Used to be a method. changed to just take it from config.
def _shift_right(config, input_ids):
    decoder_start_token_id = config.decoder_start_token_id
    pad_token_id = config.pad_token_id

    assert (
        # NOTE is not None
        # decoder_start_token_id is not None
        is_not_None(decoder_start_token_id)
    ), "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. See T5 docs for more information"

    # shift inputs to the right
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
    shifted_input_ids[..., 0] = decoder_start_token_id

    #NOTE is not None
    # assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
    assert is_not_None(pad_token_id),"self.model.config.pad_token_id has to be defined."
    # replace possible -100 values in lm_labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    assert torch.all(shifted_input_ids >= 0).item(), "Verify that `lm_labels` has only positive values and -100"

    return shifted_input_ids



################################################
# mask methods extracted from transformers.PreTrainedModel
# in order to enable precomputing of masks
################################################


def get_attention_mask(input_shape,attention_mask,device,is_decoder=False,dtype=torch.float32):
    # attention_mask is the original decoder/encoder attention mask given to the model
    # for encoder we will pass input_ids.size() and attention_mask
    # for decoder we will pass decoder_input_ids.size() and decoder_attention_mask
    if is_None(attention_mask):
        attention_mask = torch.ones(input_shape,device=device)

    # ourselves in which case we just need to make it broadcastable to all heads.    
    return get_extended_attention_mask(attention_mask, input_shape,is_decoder=is_decoder,dtype=dtype)

def get_inverted_encoder_attention_mask(mask_shape,encoder_attention_mask,device,dtype=torch.float32):
    # mask_shape is batch_size,encoder_seq_length
    # encoder_attention_mask is the original attention_mask given to the model
    if is_None(encoder_attention_mask):
        encoder_attention_mask = torch.ones(mask_shape,device=device)

    if is_not_None(encoder_attention_mask):
        inverted_encoder_attention_mask = invert_attention_mask(encoder_attention_mask,dtype=dtype)
    else:
        inverted_encoder_attention_mask = None
    
    return inverted_encoder_attention_mask



def invert_attention_mask(encoder_attention_mask,dtype=torch.float32):
    """type: torch.Tensor -> torch.Tensor"""
    if encoder_attention_mask.dim() == 3:
        encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
    if encoder_attention_mask.dim() == 2:
        encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
    # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
    # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
    # /transformer/transformer_layers.py#L270
    # encoder_extended_attention_mask = (encoder_extended_attention_mask ==
    # encoder_extended_attention_mask.transpose(-1, -2))
    encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=dtype)  # fp16 compatibility

    if dtype == torch.float16:
        encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4
    elif dtype == torch.float32:
        encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
    else:
        raise ValueError(
            "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format(
                dtype
            )
        )

    return encoder_extended_attention_mask


def get_extended_attention_mask(attention_mask, input_shape,is_decoder=False,dtype=torch.float32):
    """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored.

    Arguments:
        attention_mask: torch.Tensor with 1 indicating tokens to ATTEND to
        input_shape: tuple, shape of input_ids
        device: torch.Device, usually self.device

    Returns:
        torch.Tensor with dtype of attention_mask.dtype
    """
    # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
    # ourselves in which case we just need to make it broadcastable to all heads.
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:
        # Provided a padding mask of dimensions [batch_size, seq_length]
        # - if the model is a decoder, apply a causal mask in addition to the padding mask
        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if is_decoder:
            batch_size, seq_length = input_shape
            seq_ids = torch.arange(seq_length,device=attention_mask.device)
            causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
            # causal and attention masks must have same type with pytorch version < 1.3
            causal_mask = causal_mask.to(attention_mask.dtype)
            extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
        else:
            extended_attention_mask = attention_mask[:, None, None, :]
    else:
        raise ValueError(
            "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                input_shape, attention_mask.shape
            )
        )
    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    return extended_attention_mask


In [9]:
t = torch.randn(4,3)

list(t)

[tensor([-0.8355,  0.5072, -1.0405]),
 tensor([-1.1658, -1.1889, -1.3526]),
 tensor([-2.0628, -0.9773, -0.6298]),
 tensor([-1.1769, -0.2107,  0.5137])]

In [10]:
max_length = 512

In [57]:
just = ['attention_mask', 'decoder_input_ids', 'input_ids']

just = ['attention_mask']

In [58]:
# def get_just_x_or_y_train_dev_dataset(just, DATA_DIR, **kw):
#     """ get x or y datset. """
#     max_length = kw['max_seq_length']
#     tokenizer = kw['tokenizer']
#     config = kw['config']
# pyarrow.lib.ArrowInvalid: Tried reading schema message, was null or length 0

if just == 'x':
    subset_of_inputs = {
            "input_ids",
            "attention_mask",
            "decoder_input_ids",
            "decoder_attention_mask",
            # "lm_labels"
            }
elif just == 'y':
    subset_of_inputs = {
            "lm_labels",
            }
elif isinstance(just, list):
    subset_of_inputs = set(just)
else:
    raise NotImplementedError()

# Define all preprocessing here

# process the examples in input and target text format and the eos token at the end
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (
        example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples
# NOTE: they use global tokenizer

def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(
        example_batch['input_text'],
        pad_to_max_length=True,
        truncation=True,
        max_length=max_length
    )  # NOTE: I think this could be changed to 384 like bert to save memory.
    target_encodings = tokenizer.batch_encode_plus(
        example_batch['target_text'],
        pad_to_max_length=True,
        truncation=True,
        max_length=16)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }
    return encodings

def preproc(ds):
    input_ids = ds['input_ids']
    lm_labels = ds['target_ids']
    attention_mask = ds['attention_mask']
    decoder_attention_mask = ds['target_attention_mask']

    input_ids = torch.tensor(input_ids) # .unsqueeze_(0)
    lm_labels = torch.tensor(lm_labels) # .unsqueeze_(0)
    attention_mask = torch.tensor(attention_mask) # .unsqueeze_(0)
    decoder_attention_mask = torch.tensor(decoder_attention_mask) # .unsqueeze_(0)

    lm_labels[lm_labels[:, :] == 0] = -100

    decoder_input_ids = _shift_right(config, lm_labels)

    precompute_masks = getattr(config, "precomputed_masks", False)
    if precompute_masks:
        # print("-I- precomputing t5 masks on CPU", end ="...")
        inverted_encoder_attention_mask = get_inverted_encoder_attention_mask(input_ids.size(),attention_mask,attention_mask.device)
        attention_mask = get_attention_mask(input_ids.size(),attention_mask,attention_mask.device,is_decoder=False)    
        decoder_attention_mask = get_attention_mask(decoder_input_ids.size(),decoder_attention_mask,decoder_attention_mask.device,is_decoder=True)
        # print("-I- done")
    else:
        # print("-W- preprocessing will happen inside the model...")
        inverted_encoder_attention_mask = None
        decoder_attention_mask = None

    # Now, we order according to signature
    # input_ids,
    # attention_mask=None,
    # decoder_input_ids=None,
    # decoder_attention_mask=None,
    # inverted_encoder_attention_mask=None,
    # lm_labels=None

    d = {}
    d['input_ids'] = input_ids
    d['attention_mask'] = attention_mask
    d['decoder_input_ids'] = decoder_input_ids
    d['decoder_attention_mask'] = decoder_attention_mask
    d['inverted_encoder_attention_mask'] = inverted_encoder_attention_mask
    d['lm_labels'] = lm_labels

    # too lazy to do it selectivly...
    keys = tuple(d.keys())
    for k in keys:
        if k not in subset_of_inputs:
            del d[k]

    keys = tuple(d.keys())
    for k in keys:
        if d[k] is None:
            del d[k]

    keys = tuple(d.keys())
    for k in keys:
    #     # d[k] = list(d[k])
    #    d[k].squeeze_(0)
        d[k] = d[k].tolist()

    return d


# TODO: allow squad2
train_dataset = nlp.load_dataset('squad', split="train[:1%]")
# train_dataset.cleanup_cache_files()  # Returns the number of removed cache files
train_dataset = train_dataset.map(add_eos_to_examples, load_from_cache_file=False)
train_dataset = train_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)
# train_dataset.set_format(type='torch', columns=None)

# for i in range(len(train_dataset)):
#     train_dataset[i] = preproc(train_dataset[i])

train_dataset = train_dataset.map(preproc, batched=True, load_from_cache_file=False)
train_dataset.set_format(type='torch', columns=just)
 

dev_dataset = nlp.load_dataset('squad', split=nlp.Split.VALIDATION)
dev_dataset = dev_dataset.map(add_eos_to_examples, load_from_cache_file=False)
dev_dataset = dev_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)
# dev_dataset.set_format(type='torch', columns=None)
dev_dataset = dev_dataset.map(preproc, batched=True, load_from_cache_file=False)
dev_dataset.set_format(type='torch', columns=just)

# TODO: evaluation (see squad.py)

# def set_eval(trainer):
#     pass
#     # trainer.features = features
#     # trainer.statistics.evaluate_squad = types.MethodType(
#     #    evaluate_squad, trainer.statistics)

# return train_dataset, dev_dataset, set_eval

HBox(children=(FloatProgress(value=0.0, max=876.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10570.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [59]:
to_drop = [i for i in dev_dataset.column_names if i not in subset_of_inputs]
dev_dataset.drop(to_drop)

In [60]:
dev_dataset.column_names

['attention_mask']

In [21]:
torch.save(dev_dataset, "dev_dataset.pt")
torch.load("dev_dataset.pt")

Dataset(features: {'answers': {'answer_start': [Value(dtype='int64', id=None)], 'text': [Value(dtype='string', id=None)]}, 'context': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'input_text': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'target_text': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'input_ids': [Value(dtype='int64', id=None)], 'attention_mask': [[[Value(dtype='float64', id=None)]]], 'target_ids': [Value(dtype='int64', id=None)], 'target_attention_mask': [Value(dtype='int64', id=None)], 'decoder_input_ids': [Value(dtype='int64', id=None)]}, num_rows: 10570)

In [22]:
to_drop = [i for i in train_dataset.column_names if i not in subset_of_inputs]
train_dataset.drop(to_drop)

In [23]:
torch.save(dev_dataset, "train_dataset.pt")
torch.load("train_dataset.pt")

Dataset(features: {'answers': {'answer_start': [Value(dtype='int64', id=None)], 'text': [Value(dtype='string', id=None)]}, 'context': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'input_text': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'target_text': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'input_ids': [Value(dtype='int64', id=None)], 'attention_mask': [[[Value(dtype='float64', id=None)]]], 'target_ids': [Value(dtype='int64', id=None)], 'target_attention_mask': [Value(dtype='int64', id=None)], 'decoder_input_ids': [Value(dtype='int64', id=None)]}, num_rows: 10570)

In [24]:
nlp.Dataset._get_cache_file_path

[0;31mSignature:[0m [0mnlp[0m[0;34m.[0m[0mDataset[0m[0;34m.[0m[0m_get_cache_file_path[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mfunction[0m[0;34m,[0m [0mcache_kwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Find a unique name from the filenames, kwargs and the function 
[0;31mFile:[0m      /home_local/saareliad/miniconda3/envs/py38/lib/python3.8/site-packages/nlp/arrow_dataset.py
[0;31mType:[0m      function


## Combine with partitioning

In [51]:
pipe_config , partition = models.cfg_to_model.get_partitioning(internal_transformer_config, 2, 16, model)

In [52]:
sum(p.numel() for p in partition.parameters())

6293504

In [55]:
just = list(pipe_config.get_dataset_inputs_for_stage(2).keys())

In [56]:
just

['attention_mask']

In [42]:
import t5_squad

In [None]:
a,b,c = t5_squad.get_just_x_or_y_train_dev_dataset(just, None, **dict(tokenizer=tokenizer, config=config, max_seq_length=max_length))

# Loading from cache

In [61]:
tmp = torch.load("/home_local/saareliad/data/cache_val.t5_squad_just_FULL.pt")

In [62]:
tmp

Dataset(features: {'answers': {'answer_start': [Value(dtype='int64', id=None)], 'text': [Value(dtype='string', id=None)]}, 'context': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'input_text': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'target_text': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'input_ids': [Value(dtype='int64', id=None)], 'attention_mask': [[[Value(dtype='float64', id=None)]]], 'target_ids': [Value(dtype='int64', id=None)], 'target_attention_mask': [Value(dtype='int64', id=None)], 'decoder_input_ids': [Value(dtype='int64', id=None)]}, num_rows: 10570)

In [64]:
tmp.set_format(type='torch')

In [66]:
tmp.set_format()

In [74]:
torch.tensor(tmp[0]['attention_mask'])

torch.float32

In [68]:
tmp.set_format(type='torch')

In [70]:
tmp.set_format()

['answers',
 'context',
 'id',
 'input_text',
 'question',
 'target_text',
 'title',
 'input_ids',
 'attention_mask',
 'target_ids',
 'target_attention_mask',
 'decoder_input_ids']

In [78]:
tmp.drop()



In [86]:
tmp = torch.load("/home_local/saareliad/data/cache_val.t5_squad_just_FULL.pt")

In [88]:
tmp[0].keys()

dict_keys(['answers', 'context', 'id', 'input_text', 'question', 'target_text', 'title', 'input_ids', 'attention_mask', 'target_ids', 'target_attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'inverted_encoder_attention_mask', 'lm_labels'])

In [90]:
to_drop = [i for i in tmp.column_names if i not in ['decoder_attention_mask', 'inverted_encoder_attention_mask', 'lm_labels']]

In [91]:
tmp.drop(to_drop)

In [92]:
tmp[0].keys()

dict_keys(['decoder_attention_mask', 'inverted_encoder_attention_mask', 'lm_labels'])

In [95]:
d = [torch.tensor(tmp[v]) for v in tmp.column_names]

In [111]:
tds = torch.utils.data.TensorDataset(torch.tensor(tmp[v]) for v in tmp.column_names)

AttributeError: 'generator' object has no attribute 'size'

In [94]:
t = torch.tensor(tmp['decoder_attention_mask'])

In [None]:
t1 = 

In [85]:
tmp.reset_format()
tmp[0].keys()

dict_keys(['answers', 'context', 'id', 'input_text', 'question', 'target_text', 'title', 'input_ids', 'attention_mask', 'target_ids', 'target_attention_mask', 'decoder_input_ids'])

In [104]:
len(tds.tensors)

3

In [105]:
a = (1,2,3)

In [110]:
def foo(*a):
    for i in a:
        print(i)
        
foo(*range(3))

0
1
2


In [145]:
class TorchCache:
    def __init__(self, cache_name, overwrite=False):
        self.cache_name = cache_name
        self.exists = os.path.exists(cache_name)
        self.overwrite = overwrite
        self.v = None
        
    def __enter__(self):
        if self.exists:
            print(f"loading from cache: {self.cache_name}")
            self.v = torch.load(self.cache_name)
        else:
            print(f"computing value for {self.cache_name}")
        return self
    
    def __exit__(self, type, value, traceback):
        if not self.exists or self.overwrite:
            print(f"saving to cache: {self.cache_name}")            
            assert self.v is not None, "You should enter a value"
            torch.save(self.v, self.cache_name)

def compute_and_cache(compute_function, cache_name, overwrite=False, *args, **kw):
    """
    Compute or load from cache, optionaly save results to cache.
    Return computed value
    Examples:
        # compute big
        # compute_and_cache(lambda: torch.ones(10), "big")
        
        # compute big, then small
        # compute_and_cache(lambda: torch.randn(10) * compute_and_cache(lambda: torch.ones(10), "big"), "small")
    """

    with TorchCache(cache_name) as big:
        if not big.exists:
            big.v = compute_function(*args, **kw)
    return big.v



loading from cache: small


tensor([-0.2397,  0.8943,  1.0996,  0.8660, -0.3662,  1.7107, -0.9624, -0.1194,
         0.8584,  0.6684])

In [146]:
!ls -r /home_local/saareliad/data/cache_*


/home_local/saareliad/data/cache_val.t5_squad_just_lm_labels_inverted_encoder_attention_mask_decoder_attention_mask.pt
/home_local/saareliad/data/cache_val.t5_squad_just_inverted_encoder_attention_mask_lm_labels_decoder_attention_mask.pt
/home_local/saareliad/data/cache_val.t5_squad_just_input_ids_decoder_input_ids_attention_mask.pt
/home_local/saareliad/data/cache_val.t5_squad_just_input_ids_attention_mask_decoder_input_ids.pt
/home_local/saareliad/data/cache_val.t5_squad_just_FULL.pt
/home_local/saareliad/data/cache_val.t5_squad_just_decoder_input_ids_input_ids_attention_mask.pt
/home_local/saareliad/data/cache_val.t5_squad_just_decoder_attention_mask_inverted_encoder_attention_mask_lm_labels.pt
/home_local/saareliad/data/cache_val.t5_squad_just_attention_mask.pt
/home_local/saareliad/data/cache_train.t5_squad_just_lm_labels_inverted_encoder_attention_mask_decoder_attention_mask.pt
/home_local/saareliad/data/cache_train.t5_squad_just_inverted_encoder_attention_mask_lm_labels_decoder_

In [None]:
t5_squad_just_decoder_attention_mask_inverted_encoder_attention_mask_lm_labels
t5_squad_just_inverted_encoder_attention_mask_lm_labels_decoder_attention_mask
t5_squad_just_lm_labels_inverted_encoder_attention_mask_decoder_attention_mask




In [150]:
torch.dtype??

[0;31mInit signature:[0m [0mtorch[0m[0;34m.[0m[0mdtype[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      <no docstring>
[0;31mFile:[0m           /home_local/saareliad/miniconda3/envs/py38/lib/python3.8/site-packages/torch/__init__.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     
