In [1]:
!pip install datasets transformers sentencepiece tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
# load llama tokenizer 
from transformers import LlamaTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase
from typing import List, Dict, Any
from dataclasses import dataclass
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import itertools


tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")

In [2]:
def preprocess_dataset(
    ds,
    tokenizer: PreTrainedTokenizerBase,
    method="direct",
    include_choices=False,
    num_procs=8,
):
    def preprocess_function(examples):
        """
        * tokenizes dataset
        * token_type_ids are 1 where there are label tokens and 0 otherwise
        """
        if include_choices:
            inputs = [
                f"{inp}\nchoice: " + "\nchoice: ".join(choices) + " \n\n"
                for inp, choices in zip(
                    examples["inputs"], examples["multiple_choice_targets"]
                )
            ]
        else:
            inputs = [f"{inp} \n\n" for inp in examples["inputs"]]

        targets = [" ".join(targets) + " \n\n\n" for targets in examples["targets"]]

        # swap inputs and targets if method is "channel"
        if method == "channel":
            inputs, targets = targets, inputs

        # tokenize inputs and targets, and prepare outputs dictionary
        input_tokenized = tokenizer(inputs, add_special_tokens=False)
        target_tokenized = tokenizer(targets, add_special_tokens=False)
        outputs = {
            "input_ids": [],
            "attention_mask": [],
            "token_type_ids": [],
        }

        # merge input and target tokens and prepare outputs
        for i in range(len(input_tokenized["input_ids"])):
            input_ids = input_tokenized["input_ids"][i]
            target_ids = target_tokenized["input_ids"][i]
            outputs["input_ids"].append(input_ids + target_ids)

            input_attention = input_tokenized["attention_mask"][i]
            target_attention = target_tokenized["attention_mask"][i]
            outputs["attention_mask"].append(input_attention + target_attention)

            input_token_type = [0] * len(input_ids)
            target_token_type = [1] * (len(target_ids) - 2) + [0, 0]
            outputs["token_type_ids"].append(input_token_type + target_token_type)

        return outputs

    ds = ds.map(
        preprocess_function,
        batched=True,
        num_proc=num_procs,
    )
    return ds


In [3]:
@dataclass
class ICLCollator:
    tokenizer: PreTrainedTokenizerBase
    k_examples: int = 16
    max_length: int = 2048
    return_tensors: str = "pt"
    for_eval: bool = False

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        * creates batches for in context/few shot learning
        * length of [features] should be (k_examples * batch_size)
        * if for_eval create a labels field
        """
        batch = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

        if self.for_eval:
            # if collation for evaluation, features is a List[List[Dict[str, Any]]]
            # where the inner list contains our k_examples, so flatten it
            features = list(itertools.chain.from_iterable(features))

        for i in range(0, len(features), self.k_examples):
            batch["input_ids"].append(
                list(
                    itertools.chain.from_iterable(
                        example["input_ids"]
                        for example in features[i : i + self.k_examples]
                    )
                )#[: self.max_length]
            )
            batch["attention_mask"].append(
                list(
                    itertools.chain.from_iterable(
                        example["attention_mask"]
                        for example in features[i : i + self.k_examples]
                    )
                )#[: self.max_length]
            )
            batch["token_type_ids"].append(
                list(
                    itertools.chain.from_iterable(
                        example["token_type_ids"]
                        for example in features[i : i + self.k_examples]
                    )
                )#[: self.max_length]
            )

        batch = self.tokenizer.pad(
            batch,
            padding="longest",
            # max_length=self.max_length,
            pad_to_multiple_of=None,
            return_tensors=self.return_tensors,
        )

        if self.for_eval:
            batch["labels"] = batch["input_ids"].clone()
            batch["labels"] *= batch["token_type_ids"]

        return batch

In [4]:
import sys
sys.path.append('..')
from config import TRAIN_TASKS, TEST_TASKS

In [9]:
%%capture

import os
import numpy as np
from datasets import load_dataset


k=1
batch_size=1
collate_fn = ICLCollator(tokenizer, k_examples=k, max_length=2048)
tokenizer.pad_token = tokenizer.eos_token 

meta_stats = [] 
for task in tqdm(os.listdir('../data/augment_train_v2/')):
  # if not task.endswith(".json") or task in ["which_wiki_edit.json", "real_or_fake_text.json", "authorship_verification.json"]:
  #   continue
  print(task)
  
  ds = load_dataset("json", data_files=f"../data/augment_train_v2/{task}")['train']
  pds = preprocess_dataset(ds, tokenizer)
  dataloader = DataLoader(pds, batch_size=k*batch_size, collate_fn=collate_fn)
  if len(dataloader) == 0:
    continue

  max_length = 0
  avg_length = 0
  for batch in dataloader:
    batch_len = (batch['input_ids'].shape[1])
    avg_length += batch_len
    max_length = max(max_length, batch_len)
  
  avg_length /= len(dataloader)
  task_stats = {"task": task[:-5], "avg_length": avg_length, "max_length": max_length}
  meta_stats.append(task_stats)

for task in tqdm(TEST_TASKS):
  print(task)
  try:
    ds = load_dataset(f"tasksource/bigbench", task)['train']
  except:
    continue
  pds = preprocess_dataset(ds, tokenizer)
  dataloader = DataLoader(pds, batch_size=k*batch_size, collate_fn=collate_fn)
  if len(dataloader) == 0:
    continue

  max_length = 0
  avg_length = 0
  for batch in dataloader:
    batch_len = (batch['input_ids'].shape[1])
    avg_length += batch_len
    max_length = max(max_length, batch_len)
  
  avg_length /= len(dataloader)
  task_stats = {"task": task[:-5], "avg_length": avg_length, "max_length": max_length}
  meta_stats.append(task_stats)


In [10]:
import pandas as pd

df = pd.DataFrame(meta_stats).sort_values('max_length', ascending=False)

In [11]:
df.head()

Unnamed: 0,task,avg_length,max_length
91,language_identific,208.79675,3438
101,movie_dialog_same_or_diff,138.994,1741
95,undo_permut,162.754167,848
1,real_or_fake_text,451.942529,643
27,which_wiki_edit,345.2,609


In [8]:
df.to_csv("task_token_lengths.csv")