In [1]:
import os

import pandas as pd
import torch
from torchinfo import summary
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, LlamaTokenizer
from datasets import load_dataset, concatenate_datasets, DatasetDict

In [21]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget. Presumably this supplies the credentials
# that the later push_to_hub(..., use_auth_token=True) call relies on — confirm.
# NOTE(review): this cell's execution count (21) is out of sequence with its position,
# so it was run late in the session; on a fresh "Run All" it executes here instead.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Names and limits used by the rest of the notebook.
llm_model_name = "acrastt/Marx-3B-V2"
MODEL_MAX_LENGTH = 1024  # token cap applied when tokenizing and filtering the data

# Tokenizer: pads on the left and opts out of the legacy tokenizer behaviour.
tokenizer = AutoTokenizer.from_pretrained(
    llm_model_name,
    padding_side="left",
    legacy=False,
)

# Base model in float16, with device placement delegated via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
# NOTE(review): the config dump shows "tie_word_embeddings": false, so this call is
# presumably a no-op for this checkpoint — confirm before removing.
model.tie_weights()

In [3]:
# Freeze every base-model weight in one call; only the adapters added later are trained.
model.requires_grad_(False)

# Reduce the number of stored activations during training.
model.gradient_checkpointing_enable()
# Presumably required so gradients can still flow from the inputs while the base
# weights are frozen and checkpointing is on — confirm.
model.enable_input_require_grads()

In [4]:
# Display the loaded model's configuration as the cell output.
model.config

LlamaConfig {
  "_name_or_path": "acrastt/Marx-3B-V2",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 3200,
  "initializer_range": 0.02,
  "intermediate_size": 8640,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 26,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.33.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [5]:
# Per-layer parameter counts from torchinfo, shown as the cell output.
summary(model)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   (102,400,000)
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-2                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-3                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-4                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-5                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-6                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-7                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-8                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-9                      (123,910,400)
│    │    └─LlamaDecoderLayer: 3-10                     (12

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Walks ``model.named_parameters()``, sums ``numel()`` over all parameters and
    over those whose ``requires_grad`` is set, and prints both totals together
    with the trainable share as a percentage. Returns nothing.

    A model that has no parameters at all reports ``trainable%: 0.0`` instead of
    raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: with zero parameters the division would otherwise fail.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params}\tall params: {all_param}\ttrainable%: {trainable_pct}"
    )

In [7]:
# Report trainable vs. total parameter counts for the model at this point in the notebook.
print_trainable_parameters(model)

trainable params: 0	all params: 3426473600	trainable%: 0.0


In [8]:
# Pad with <unk> rather than </s>. DataCollatorForLanguageModeling (used by the Trainer
# in this notebook) sets the label of every pad-token position to -100, so making EOS the
# pad token would also drop any genuine end-of-sequence token from the loss.
# NOTE(review): assumes the tokenizer defines unk_token and that it maps to id 0, which
# would match "pad_token_id": 0 in the model config dump — confirm.
tokenizer.pad_token = tokenizer.unk_token

# Both splits come from the same parquet file: the first 90% of rows for training,
# the remaining rows for validation.
data_file = os.path.join(os.getcwd(), os.pardir, "data", "total_data.parquet")
dataset_train = load_dataset("parquet", data_files=data_file, split="train[:90%]")
dataset_valid = load_dataset("parquet", data_files=data_file, split="train[90%:]")

In [9]:
def tokenization(batch) -> dict:
    """Tokenize the ``text`` entries of ``batch``, truncating at MODEL_MAX_LENGTH tokens."""
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=MODEL_MAX_LENGTH)
    return encoded

In [10]:
# Tokenize the training split in batches, then keep only records strictly shorter than
# MODEL_MAX_LENGTH tokens. Because tokenization truncates at that same length, this
# drops every record that reached the cap.
tokenized_dataset_train = (
    dataset_train
    .map(tokenization, batched=True)
    .filter(lambda rec: len(rec["input_ids"]) < MODEL_MAX_LENGTH)
)
tokenized_dataset_train

Map:   0%|          | 0/20799 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20799 [00:00<?, ? examples/s]

Dataset({
    features: ['dialog', 'text', 'input_ids', 'attention_mask'],
    num_rows: 20771
})

In [11]:
# Tokenize the validation split in batches, then keep only records strictly shorter than
# MODEL_MAX_LENGTH tokens. Because tokenization truncates at that same length, this
# drops every record that reached the cap.
tokenized_dataset_valid = (
    dataset_valid
    .map(tokenization, batched=True)
    .filter(lambda rec: len(rec["input_ids"]) < MODEL_MAX_LENGTH)
)
tokenized_dataset_valid

Map:   0%|          | 0/2311 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2311 [00:00<?, ? examples/s]

Dataset({
    features: ['dialog', 'text', 'input_ids', 'attention_mask'],
    num_rows: 2309
})

In [12]:
# Bundle the two tokenized splits under the keys the training cell reads.
tokenized_dataset = DatasetDict(
    {
        "train": tokenized_dataset_train,
        "validation": tokenized_dataset_valid,
    }
)

In [13]:
# Shuffle with a fixed seed and rebind the same name.
# NOTE(review): because the name is rebound to its own shuffled copy, re-running this
# cell alone shuffles the already-shuffled data again — run it once per kernel session.
tokenized_dataset = tokenized_dataset.shuffle(seed=47)

In [14]:
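# Left disabled: the pad token reuses an entry that is already in the vocabulary, so the
# vocabulary size does not change. Re-enable this call if genuinely new tokens are added.
# model.resize_token_embeddings(len(tokenizer))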

In [15]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: adapters are attached to the q_proj and v_proj modules only.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,  # can be 8 with llama
    lora_alpha=16,  # can be 16 with llama
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report the resulting parameter counts.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 2662400	all params: 3429136000	trainable%: 0.07764054852300988


In [16]:
# Training output and TensorBoard logs go under <cwd>/models/openllama-v3-test.
local_output_dir = os.path.join(os.getcwd(), 'models/openllama-v3-test')

train_args = TrainingArguments(
    # --- output locations and reporting ---
    output_dir=local_output_dir,
    logging_dir=f"{local_output_dir}/runs",
    report_to="tensorboard",
    # --- batching ---
    per_device_train_batch_size=8,  # can be 4 with llama
    per_device_eval_batch_size=8,  # can be 4 with llama
    gradient_accumulation_steps=4,
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=4e-5,  # many possible values here from 1e-5 to 2e-4
    warmup_steps=20,
    num_train_epochs=2,
    fp16=True,
    # --- logging / evaluation / saving cadence, all step-based ---
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=400,
)

# Language-modeling collator built from the tokenizer, with mlm switched off.
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=lm_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

model.config.use_cache = False  # silence the warnings. need to be re-enabled on inference
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,2.0731,1.933362
100,1.186,1.200085
150,1.1258,1.136534
200,1.0382,1.103335
250,1.0573,1.088031
300,1.0257,1.080974
350,1.0668,1.076182
400,1.0381,1.072547
450,1.0606,1.070119
500,1.0791,1.067931


TrainOutput(global_step=1298, training_loss=1.10738366116728, metrics={'train_runtime': 9679.4186, 'train_samples_per_second': 4.292, 'train_steps_per_second': 0.134, 'total_flos': 2.865227894928e+17, 'train_loss': 1.10738366116728, 'epoch': 2.0})

In [18]:
# Re-enable use_cache (it was switched off for training), then write the model
# to ../models/model-test relative to the working directory.
model.config.use_cache = True
adapter_dir = os.path.join(os.getcwd(), os.pardir, "models", "model-test")
model.save_pretrained(adapter_dir)

In [23]:
# Upload the model to the Hub repository. The repo id contains no placeholders,
# so it is a plain string literal rather than an f-string.
model.push_to_hub("xtrbase/positive-llm", use_auth_token=True)



adapter_model.bin:   0%|          | 0.00/10.7M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/xtrbase/positive-llm/commit/fddb7716b166b0078270bbcdf645b7bfeb575486', commit_message='Upload model', commit_description='', oid='fddb7716b166b0078270bbcdf645b7bfeb575486', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
# Shell out to nvidia-smi to record GPU memory use and utilisation at this point.
!nvidia-smi

Mon Sep 18 08:29:43 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.14                 Driver Version: 531.14       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090       WDDM | 00000000:01:00.0  On |                  N/A |
|  0%   41C    P8               28W / 350W|  15417MiB / 24576MiB |      5%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    