In [1]:
import os
import transformers
import haystack
import torch
import datasets
import importlib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = transformers.AutoTokenizer.from_pretrained(haystack.EMBED_MODEL)

json_config = {
  "attention_bias": False,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "rope_scaling": None,
  "rope_theta": 10000.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.0.dev0",
  "use_cache": True,
  "vocab_size": tokenizer.vocab_size
}

config = haystack.HaystackConfig.from_dict(json_config)
model = haystack.HaystackForCausalLM(config)
model.num_parameters() / 1e9

0.774184083

In [3]:
block_size = 200
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


def load_dataset(split):
  dataset = datasets.load_dataset(haystack.DATASET, split=split, cache_dir=haystack.CACHE_DIR, streaming=True, trust_remote_code=True)
  tokenized_dataset = dataset.map(
      lambda examples: tokenizer(examples["text"], padding="max_length", truncation=True, max_length=block_size),
      batched=True,
      remove_columns=dataset.features.keys(),
  )
  return tokenized_dataset.map(
      group_texts,
      batched=True,
      batch_size=32,
  )

lm_train, lm_test = load_dataset("train"), load_dataset("validation")
lm_test = lm_test.take(100)

In [5]:
from torch.profiler import profile, record_function, ProfilerActivity

inputs = next(iter(lm_train))
del inputs['token_type_ids']

import numpy as np
inputs['input_ids'] = torch.tensor(np.array(inputs['input_ids']))
inputs['attention_mask'] = torch.tensor(np.array(inputs['attention_mask']))

# add batch dimension to all inputs
inputs = {k: torch.tensor(v).unsqueeze(0) for k, v in inputs.items()}

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("model_inference"):
        model(**inputs)

  inputs = {k: torch.tensor(v).unsqueeze(0) for k, v in inputs.items()}
STAGE:2024-03-12 19:31:43 23908:23908 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-12 19:31:43 23908:23908 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-03-12 19:31:43 23908:23908 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        30.74%     435.307ms       100.00%        1.416s        1.416s       0.000us         0.00%      59.643ms      59.643ms             1  
                                             aten::item         3.16%      44.771ms        51.79%     733.409ms      17.684us       0.000us         0.00%      41.747ms       1.007us         41473  
         

In [None]:
from transformers import Trainer, TrainingArguments
from torch.profiler import profile, record_function, ProfilerActivity


lm_train = lm_train.take(100)

model_name = 'haystack-v0'
training_args = TrainingArguments(
    f"{model_name}-pretrain-scholar",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    optim="adamw_torch",
    per_device_train_batch_size=1,
    fp16=True,
    max_steps=10000,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_train,
    eval_dataset=lm_test,
)
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 