In [1]:
import os

import pandas as pd
import torch
from torchinfo import summary
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets, DatasetDict

In [2]:
# Maximum token length per example; the tokenization and filtering cells read this.
MODEL_MAX_LENGTH = 1024

# Name of the pretrained checkpoint handed to both `from_pretrained` calls.
llm_model_name = "acrastt/Marx-3B-V2"

# The tokenizer is configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(llm_model_name, padding_side="left")

# Weights are loaded as float16 and device placement is left to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.tie_weights()

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Freeze every parameter of the base model - adapters are trained later.
model.requires_grad_(False)

# Gradient checkpointing reduces the number of stored activations.
model.gradient_checkpointing_enable()

# Per the transformers docs, this enables gradients for the input embeddings,
# which is useful when fine-tuning adapter weights while the model weights stay fixed.
model.enable_input_require_grads()

In [4]:
# Display the configuration object attached to the loaded model.
model.config

LlamaConfig {
  "_name_or_path": "acrastt/Marx-3B-V2",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 3200,
  "initializer_range": 0.02,
  "intermediate_size": 8640,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 26,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.33.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [5]:
# Display torchinfo's layer-by-layer parameter-count summary of the model.
summary(model)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   (102,400,000)
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-2                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-3                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-4                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-5                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-6                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-7                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-8                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-9                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-10                     (12

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Walks ``model.named_parameters()`` and sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, then prints both
    totals together with the trainable share as a percentage.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs (e.g. a ``torch.nn.Module``).

    Returns:
        None. The result is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params}\tall params: {all_param}\ttrainable%: {trainable_pct}"
    )

In [7]:
# Report trainable vs. total parameter counts; after the freeze cell none should be trainable.
print_trainable_parameters(model)

trainable params: 0	all params: 3426473600	trainable%: 0.0


In [8]:
# Register '[PAD]' as the tokenizer's padding token.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Both splits come from the same parquet file in ../data:
# the first 90% of rows for training, the remaining 10% for validation.
parquet_path = os.path.join(os.getcwd(), os.pardir, "data", "total_data.parquet")
dataset_train, dataset_valid = (
    load_dataset("parquet", data_files=parquet_path, split=split_spec)
    for split_spec in ("train[:90%]", "train[90%:]")
)

In [9]:
def tokenization(batch) -> dict:
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Sequences are truncated to ``MODEL_MAX_LENGTH`` tokens.

    Args:
        batch: mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=MODEL_MAX_LENGTH)
    return encoded

In [10]:
# Tokenize the training split in batches, then keep only records strictly
# shorter than MODEL_MAX_LENGTH. Since `tokenization` truncates to that length,
# any record that reached the limit is dropped here.
tokenized_dataset_train = (
    dataset_train
    .map(tokenization, batched=True)
    .filter(lambda rec: len(rec["input_ids"]) < MODEL_MAX_LENGTH)
)
tokenized_dataset_train

Dataset({
    features: ['dialog', 'text', 'input_ids', 'attention_mask'],
    num_rows: 20771
})

In [11]:
# Tokenize the validation split in batches, then keep only records strictly
# shorter than MODEL_MAX_LENGTH. Since `tokenization` truncates to that length,
# any record that reached the limit is dropped here.
tokenized_dataset_valid = (
    dataset_valid
    .map(tokenization, batched=True)
    .filter(lambda rec: len(rec["input_ids"]) < MODEL_MAX_LENGTH)
)
tokenized_dataset_valid

Map:   0%|          | 0/2311 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2311 [00:00<?, ? examples/s]

Dataset({
    features: ['dialog', 'text', 'input_ids', 'attention_mask'],
    num_rows: 2309
})

In [12]:
# Bundle the two tokenized splits under the "train" and "validation" keys.
tokenized_dataset = DatasetDict({"train": tokenized_dataset_train, "validation": tokenized_dataset_valid})

In [13]:
# Resize the model's token embeddings to the tokenizer's current length
# (the tokenizer was given a '[PAD]' special token in an earlier cell).
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(32001, 3200)

In [14]:
from peft import get_peft_model, LoraConfig

# LoRA adapter settings: rank-8 adapters (alpha 16, 5% dropout, no bias terms)
# attached to the modules named "q_proj" and "v_proj", for a causal-LM task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report how much is now trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 2662400	all params: 3429142400	trainable%: 0.07764040361811746


In [18]:
# Checkpoints go to ./models/openllama-v3 and TensorBoard logs to its "runs" subfolder.
local_output_dir = os.path.join(os.getcwd(), 'models/openllama-v3')

train_args = TrainingArguments(
    output_dir=local_output_dir,
    # Batching: 8 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    # Optimisation schedule.
    optim="adamw_torch",
    learning_rate=4e-5,
    warmup_steps=20,
    num_train_epochs=2,
    fp16=True,
    # Evaluate every 50 steps, checkpoint every 400, log every 10.
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=400,
    logging_strategy="steps",
    logging_steps=10,
    logging_dir=f"{local_output_dir}/runs",
    report_to="tensorboard",
)

# silence the warnings during training; re-enable the cache for inference
model.config.use_cache = False

# mlm=False selects the collator's non-masked (causal) language-modeling mode.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=causal_lm_collator,
)
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,2.6326,2.505073
100,1.6857,1.6574
150,1.0786,1.108766
200,1.0218,1.055792
250,0.9826,1.027925
300,0.9747,1.014915
350,0.9721,1.010181
400,0.9406,1.005418
450,0.9612,1.002996
500,0.9851,1.000401


TrainOutput(global_step=1298, training_loss=1.1808484915775952, metrics={'train_runtime': 9867.5367, 'train_samples_per_second': 4.21, 'train_steps_per_second': 0.132, 'total_flos': 3.055844423778816e+17, 'train_loss': 1.1808484915775952, 'epoch': 2.0})

In [19]:
# The cache was switched off for training; turn it back on before saving.
model.config.use_cache = True

save_dir = os.path.join(os.getcwd(), os.pardir, "models", "model")

# The tokenizer was extended with a '[PAD]' token and the model's embeddings
# were resized to match, so persist the tokenizer next to the model; otherwise
# the extended vocabulary cannot be restored when the saved model is reloaded.
tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)

In [36]:
# Run the nvidia-smi CLI through IPython's shell escape to show the current GPU status.
!nvidia-smi

Tue Sep 12 23:30:06 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090       WDDM | 00000000:01:00.0  On |                  N/A |
|  0%   46C    P8               30W / 350W|   8251MiB / 24576MiB |     12%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    