In [None]:
!pip install -q git+https://github.com/huggingface/transformers #huggingface transformers for downloading models weights
!pip install -qqq transformers>=4.39.0
!pip install -qqq mamba-ssm causal-conv1d>=1.2.0
!pip install -qqq accelerate
!pip install -qqq bitsandbytes --progress-bar off
!pip install flash-attn --no-build-isolation
!pip install -q torch
!pip install -q datasets #huggingface datasets to download and manipulate datasets
!pip install -q peft #Parameter efficient finetuning - for qLora Finetuning
!pip install -q trl

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [None]:
from unsloth import FastLanguageModel
import torch
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments
import torch
from transformers import AutoTokenizer, TrainingArguments
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM

# Maximum tokenized sequence length; reused when the trainer is built.
max_seq_length = 4096

# Use bfloat16 when the GPU supports it, otherwise fall back to float16.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# Load the model in 4-bit precision, leaving the modules matched by "mamba"
# unquantized. `device_map` is NOT a BitsAndBytesConfig option (it was reported
# as an unused kwarg and ignored), so it is passed to `from_pretrained` instead.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_skip_modules=["mamba"],
)

model = AutoModelForCausalLM.from_pretrained(
    "ai21labs/Jamba-v0.1",
    trust_remote_code=True,
    torch_dtype=compute_dtype,
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

# This is the adapter that actually gets trained, so it carries the real LoRA
# hyperparameters. `init_lora_weights` is left at its default so the adapter
# starts as a no-op; `init_lora_weights=False` would randomly initialise it and
# perturb the pretrained weights before any training step.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["embed_tokens", "x_proj", "in_proj", "out_proj"],
    task_type="CAUSAL_LM",
    bias="none",
)

model.add_adapter(lora_config, adapter_name="adapter_1")

Unused kwargs: ['device_map']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [None]:
from datasets import load_dataset

# Request the split explicitly: without `split=`, `load_dataset` returns a
# dict of splits rather than the single `Dataset` the trainer is given.
# NOTE(review): the recorded output of this cell lists the columns
# system / instruction / response and 4685 rows -- confirm that this dataset ID
# is the one that produced it.
dataset = load_dataset("teknium/OpenHermes-2.5", split="train")
dataset

Dataset({
    features: ['system', 'instruction', 'response'],
    num_rows: 4685
})


In [None]:
from trl import SFTTrainer
import torch
from peft import LoraConfig
from transformers import AutoTokenizer, TrainingArguments
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# The LoRA adapter being trained is the one already attached to `model` through
# `model.add_adapter(...)`, so no second LoraConfig is built here and no
# `peft_config` is handed to SFTTrainer.

# Accept either a single Dataset or a dict of splits (what `load_dataset`
# returns when no `split=` is given).
train_split = dataset["train"] if isinstance(dataset, dict) else dataset


def build_training_text(example):
    """Join the system, instruction and response fields into one training string.

    Training on the "system" column alone would never show the model an
    instruction or a response. Empty or missing-valued fields are skipped, and
    the tokenizer's EOS token (if it has one) is appended to mark the end of
    the sample.
    """
    parts = [example[column] for column in ("system", "instruction", "response")]
    text = "\n\n".join(part for part in parts if part)
    return {"text": text + (tokenizer.eos_token or "")}


train_dataset = train_split.map(build_training_text)

# Pick exactly one mixed-precision mode based on hardware support.
use_bf16 = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=TrainingArguments(
        num_train_epochs=3,
        learning_rate=2e-3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=10,
        weight_decay=0.01,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=1,
        output_dir="outputs",
        optim="adamw_8bit",
        seed=3407,
    ),
)

Map:   0%|          | 0/4685 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning with the `trainer` configured above. The call is left as the
# cell's last expression so its return value is shown as the cell output.
trainer.train()

Step,Training Loss
1,11.0494
2,10.9438
3,10.9425
4,10.9508
5,10.939
6,10.9388
7,10.9439
8,10.9359
9,10.9347
10,10.9153


In [None]:
import os

import torch
from transformers import BitsAndBytesConfig
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer

# Define model and adapter names
model_name = "ai21labs/Jamba-v0.1"
# NOTE(review): must point at a checkpoint directory written by the training
# run -- confirm the step number matches a checkpoint that actually exists.
adapters_name = "/content/outputs/checkpoint-3500"
# Hub repository that receives the merged model and tokenizer.
hub_repo_id = "Severian/Jamba-Nexus-IKM-v1"

# Define the device to load the model onto
device = "cuda"

# Start loading the base model into memory
print(f"Starting to load the base model {model_name} into memory")

# Load model in 4-bit precision, keeping the "mamba" modules unquantized so the
# base matches the configuration the adapter was trained against.
# `device_map` is a `from_pretrained` argument, not a BitsAndBytesConfig one.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_skip_modules=["mamba"],
)

# Load the base model. The quantization config is passed explicitly; a bare
# `load_in_4bit=True` would drop the skip list above.
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    attn_implementation="flash_attention_2",
    device_map={"": 0},
)

# Print status
print("Base model loaded. Now loading the adapter.")

# Load and merge the adapters
m = PeftModel.from_pretrained(m, adapters_name)
m = m.merge_and_unload()

# Print status
print("Adapter loaded and merged. Now loading the tokenizer.")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Print final status
print(f"Successfully loaded and merged the model {model_name} with adapter {adapters_name} into memory.")


# Save the merged model and tokenizer
print("Saving the merged model and tokenizer.")
m.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

# Authenticate with Hugging Face.
# The token is read from the HF_TOKEN environment variable so it never has to
# be pasted into the notebook. When the variable is unset, `token=None` leaves
# credential lookup to the library (e.g. a previously stored login).
hf_token = os.environ.get("HF_TOKEN")

# Push the merged model and tokenizer to the hub
print("Pushing the merged model and tokenizer to the hub.")
m.push_to_hub(hub_repo_id, token=hf_token)
tokenizer.push_to_hub(hub_repo_id, token=hf_token)