In [11]:
from typing import Dict, Iterator, Tuple
import numpy as np
import pandas as pd

from tokenizers import Tokenizer 
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit

In [5]:
def random_spans_noise_mask(length, noise_density, mean_noise_span_length):
    """This function is copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2682>`__ .
    Noise mask consisting of random spans of noise tokens.
    The number of noise tokens and the number of noise spans and non-noise spans
    are determined deterministically as follows:
    num_noise_tokens = round(length * noise_density)
    num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length)
    Spans alternate between non-noise and noise, beginning with non-noise.
    Subject to the above restrictions, all masks are equally likely.
    Args:
        length: an int32 scalar (length of the incoming token sequence)
        noise_density: a float - approximate density of output mask
        mean_noise_span_length: a number
    Returns:
        a boolean tensor with shape [length]
    """
    orig_length = length

    num_noise_tokens = int(np.round(length * noise_density))
    # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
    num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
    num_noise_spans = int(np.round(num_noise_tokens / mean_noise_span_length))

    # avoid degeneracy by ensuring positive number of noise spans
    num_noise_spans = max(num_noise_spans, 1)
    num_nonnoise_tokens = length - num_noise_tokens

    # pick the lengths of the noise spans and the non-noise spans
    def _random_segmentation(num_items, num_segments):
        """Partition a sequence of items randomly into non-empty segments.
        Args:
            num_items: an integer scalar > 0
            num_segments: an integer scalar in [1, num_items]
        Returns:
            a Tensor with shape [num_segments] containing positive integers that add
            up to num_items
        """
        mask_indices = np.arange(num_items - 1) < (num_segments - 1)
        np.random.shuffle(mask_indices)
        first_in_segment = np.pad(mask_indices, [[1, 0]])
        segment_id = np.cumsum(first_in_segment)
        # count length of sub segments assuming that list is sorted
        _, segment_length = np.unique(segment_id, return_counts=True)
        return segment_length

    noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
    nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens, num_noise_spans)

    interleaved_span_lengths = np.reshape(
        np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2]
    )
    span_starts = np.cumsum(interleaved_span_lengths)[:-1]
    span_start_indicator = np.zeros((length,), dtype=np.int8)
    span_start_indicator[span_starts] = True
    span_num = np.cumsum(span_start_indicator)
    is_noise = np.equal(span_num % 2, 1)

    return is_noise[:orig_length]

In [6]:
def create_sentinel_ids( mask_indices, vocab_size):
    """
    Sentinel ids creation given the indices that should be masked.
    The start indices of each mask are replaced by the sentinel ids in increasing
    order. Consecutive mask indices to be deleted are replaced with `-1`.
    """
    start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
    start_indices[:, 0] = mask_indices[:, 0]

    sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
    sentinel_ids = np.where(sentinel_ids != 0, vocab_size - sentinel_ids, 0)
    sentinel_ids -= mask_indices - start_indices
    return sentinel_ids

In [7]:
def filter_input_ids( eos_token_id, input_ids, sentinel_ids):
    """
    Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting.
    This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
    """
    batch_size = input_ids.shape[0]

    input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
    # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are
    # masked tokens coming after sentinel tokens and should be removed
    input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1))
    input_ids = np.concatenate(
        [input_ids, np.full((batch_size, 1), eos_token_id, dtype=np.int32)], axis=-1
    )
    return input_ids

In [8]:
def mask_sentence(sentence: str, tokenizer: Tokenizer) -> Tuple[str, str]:
    # TODO: ask Moatez if the input sentence needs EOS markers.
        # NOTE: not sure if we need the ids for the Transformer encoding. 
    token_ids = np.array([tokenizer.encode(sentence).ids]) 
    print(token_ids)
    # TODO: should we have a + 1 with len(token_ids[0])?
    mask_indices = np.array([random_spans_noise_mask(len(token_ids[0]), 0.15, 2)]) # following from my 2022-03-12 report.
    print(mask_indices)
    labels_mask = ~mask_indices
    # input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), 7)

    # NOTE: we can call len only on the Transformer style tokenizer 
    input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), tokenizer.get_vocab_size())
    input_ids = filter_input_ids(0, token_ids, input_ids_sentinel)

    labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), tokenizer.get_vocab_size())
    label_ids = filter_input_ids(0, token_ids, labels_sentinel)

    print(input_ids)
    # TODO: replace the zeros later.
    input_sentence = " ".join([tokenizer.id_to_token(input_id) for input_id in input_ids[0]])
    label_sentence = " ".join([tokenizer.id_to_token(label_id) for label_id in label_ids[0]])

    return (input_sentence, label_sentence)  

In [9]:
def build_tokenizer(vocab: Dict[str, int]) -> Tokenizer:
    tokenizer = Tokenizer(WordLevel(vocab))
    tokenizer.pre_tokenizer = WhitespaceSplit()
    return tokenizer

def get_vocab(sent_iterator: Iterator[str]) -> Dict[str, int]:
    vocab = {}
    vocab["</s>"] = 0
    curr_id = 1
    for sent in sent_iterator:
        tokens = sent.split(' ')
        for token in tokens:
            if token not in vocab:
                vocab[token] = curr_id
                curr_id += 1
            else: 
                continue 
    
    # TODO: fix the ordering.
    increment = curr_id + 100
    for i in range(100):
        # Possibly off-by-one error here? But unlikely
        vocab[f"<extra_id_{i}>"] = increment - i - 1
    return vocab


In [14]:
# sent = "للاسف الشديد ان يصل الامر لهذه"
frame = pd.read_csv(f"data/spreadsheets/ARLU_Binary_test_data_id_light.tsv", sep="\t")
frame = frame[frame["label"] == "MSA"]
iter_sentences = iter(frame['content'])

vocab = get_vocab(iter_sentences)
tokenizer = build_tokenizer(vocab)

In [15]:
print(tokenizer.get_vocab_size())

19494


In [21]:
iter_sentences = iter(frame['content'])
input_sentences = []
label_sentences = []
sent_count = 0
for sentence in iter_sentences:
    if len(sentence.split(' ')) > 3:
        input_sentence, label_sentence = mask_sentence(sentence, tokenizer)
        input_sentences.append(input_sentence)
        label_sentences.append(label_sentence)
        print(input_sentence)
        print(label_sentence)


[[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]]
[[False False False False False  True  True False False False False False
  False False False False  True]]
[[    1     2     3     4     5 19493     8     9    10    11    12    13
     14    15    16 19492     0]]
''الناتو سيجتمعون من اجل مراقبة <extra_id_0> مؤخرا وتقاربها مع روسيا في حال تتطور سيتم اقصائها <extra_id_1> </s>
<extra_id_0> تحركات تركيا <extra_id_1> مرحليا'' </s>
[[18 19 19 20 21 22 23 24 25 26 27 28 22 29]]
[[False False False False False False False False False False False False
   True  True]]
[[   18    19    19    20    21    22    23    24    25    26    27    28
  19493     0]]
'' USER USER يااريت الكل مثل جبهة النصرة الجيش الحر احرار الشام <extra_id_0> </s>
<extra_id_0> مثل داعش'' </s>
[[30 31 32 33 34 35]]
[[False False False False False  True]]
[[   30    31    32    33    34 19493     0]]
للاسف الشديد ان يصل الامر <extra_id_0> </s>
<extra_id_0> لهذه </s>
[[12 36 37 38 39 40 41]]
[[False False False False 

In [22]:
t5_training_frame = pd.DataFrame(
    data={
        "input_sentence": input_sentences,
        "label_sentence": label_sentences
    }
)

In [26]:
# print(t5_training_frame
print(t5_training_frame.iloc[0]['input_sentence'])


''الناتو سيجتمعون من اجل مراقبة <extra_id_0> مؤخرا وتقاربها مع روسيا في حال تتطور سيتم اقصائها <extra_id_1> </s>


In [24]:
t5_training_frame.to_csv("data/spreadsheets/msa_masked_data.csv", index=False)