In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [2]:
from transformers import AutoTokenizer
import haystack
import torch
import importlib

  from .autonotebook import tqdm as notebook_tqdm
INFO:AnglE:Prompt is set, the prompt will be automatically applied during the encoding phase. To disable prompt setting, please configure set_prompt(prompt=None)


In [3]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

json_config = {
  "attention_bias": False,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "rope_scaling": None,
  "rope_theta": 10000.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.0.dev0",
  "use_cache": False,
  "vocab_size": 256000
}

config = haystack.HaystackConfig.from_dict(json_config)

model = haystack.HaystackForCausalLM.from_pretrained("google/gemma-2b", config=config)
input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s]
Some weights of HaystackForCausalLM were not initialized from the model checkpoint at google/gemma-2b and are newly initialized: ['model.db.db.weight', 'model.db.embedding_to_hidden.bias', 'model.db.embedding_to_hidden.weight', 'model.db.hidden_to_key.bias', 'model.db.hidden_to_key.weight', 'model.db.keys.bias', 'model.db.keys.weight', 'model.layers.0.db_layer.db.weight', 'model.layers.0.db_layer.embedding_to_hidden.bias', 'model.layers.0.db_layer.embedding_to_hidden.weight', 'model.layers.0.db_layer.hidden_to_key.bias', 'model.layers.0.db_layer.hidden_to_key.weight', 'model.layers.0.db_layer.keys.bias', 'model.layers.0.db_layer.keys.weight', 'model.layers.1.db_layer.db.weight', 'model.layers.1.db_layer.embedding_to_hidden.bias', 'model.layers.1.db_layer.embedding_to_hidden.weight', 'model.layers.1.db_layer.hidden_to_key.bias', 'model.layers.1.db_layer.hidden_to_key.weight', 'model.layers.1.db_layer.keys.bias', 'm

In [4]:
from datasets import load_dataset
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [6]:
tokenized_datasets["train"][1]


{'input_ids': [2, 589, 164672, 4016, 81321, 5638, 589, 235248, 108],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
block_size = 200
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [8]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [9]:
tokenizer.decode(lm_datasets["train"][10]["input_ids"])

" Media.Vision. The original scenario was written Kazuki Yamanobe , while the script was written by Hiroyuki Fujii , Koichi Majima , Kishiko Miyagi , Seiki Nagakawa and Takayuki Shouji . Its story was darker and more somber than that of its predecessor . \n<bos> The majority of material created for previous games , such as the BLiTZ system and the design of maps , was carried over . Alongside this , improvements were made to the game 's graphics and some elements were expanded , such as map layouts , mission structure , and the number of playable units per mission . A part of this upgrade involved creating unique polygon models for each character 's body . In order to achieve this , the cooperative elements incorporated into the second game were removed , as they took up a large portion of memory space needed for the improvements . They also adjusted the difficulty settings and ease of play so they could appeal to new players while retaining the essential components of the series '"

In [10]:
from transformers import Trainer, TrainingArguments

model_name = 'haystack-v0'
training_args = TrainingArguments(
    f"{model_name}-finetuned-wikitext2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    optim="adafactor",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)
trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 21.96 GiB of which 1.24 GiB is free. Including non-PyTorch memory, this process has 20.71 GiB memory in use. Of the allocated memory 20.09 GiB is allocated by PyTorch, and 395.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)