In [None]:
# Root directory name used when building the training output path.
model_folder = "model"
# Identifier of the base checkpoint handed to `from_pretrained` for both the
# tokenizer and the model.
model_id = "ibm-granite/granite-3.3-8b-instruct"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, \
    DataCollatorForLanguageModeling
import json
from peft import LoraConfig, TaskType, get_peft_model
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Tokenizer for the base checkpoint. When it ships without a padding token,
# fall back to the end-of-sequence token so padded tokenization has something
# to pad with.
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base causal-LM weights; device placement and dtype are both left on the
# library's 'auto' setting.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype='auto',
    device_map='auto',
    # NOTE(review): this permits running model code shipped with the
    # checkpoint — confirm it is actually required for this model.
    trust_remote_code=True,
)

In [None]:
#### Set up LoRA

In [None]:
# LoRA adapter hyperparameters for causal-LM fine-tuning: rank 8, alpha 32,
# 5% adapter dropout, no bias training, applied to the projection modules
# listed in `target_modules`.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
    ],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    inference_mode=False,
    use_rslora=False,
    use_dora=False,
)

# Rebind `model` to the PEFT-wrapped version of the base model.
# Note: this cell reassigns `model` from itself, so re-running it without
# reloading the base model wraps an already-wrapped model.
model = get_peft_model(model, config)

#### Dataset

In [None]:
# Load the training corpus: one JSON object per line, each expected to carry
# a 'text' field (that is the only field tokenized below).
#  - The encoding is pinned to UTF-8 so decoding does not depend on the
#    platform's default locale encoding.
#  - Blank / whitespace-only lines are skipped instead of crashing json.loads.
with open('data/chunks.jsonl', encoding='utf-8') as file:
    texts = [json.loads(line) for line in file if line.strip()]

dataset = Dataset.from_list(texts)

# Tokenize in batches; every example is truncated/padded to exactly 512 tokens.
tokenized_dataset = dataset.map(
    lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=512),
    batched=True
)

#### Trainer setup

In [None]:
# Hyperparameters for the fine-tuning run, grouped by concern.
training_arguments = TrainingArguments(
    output_dir=f'./{model_folder}/granite3.3-lora',
    # Schedule: 3 epochs, one example per device per step.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    learning_rate=1e-4,
    # Precision (author's notes: enable fp16 when running on CUDA;
    # bf16 is the setting used for Mac ARM).
    bf16=True,
    fp16=False,
    # Checkpointing: write a checkpoint every 5 steps, retain at most one.
    save_steps=5,
    save_total_limit=1,
    # No logging and no reporting integrations.
    logging_strategy='no',
    report_to='none',
)

# Trainer over the tokenized dataset. The collator is configured with
# mlm=False, i.e. no masked-language-modeling objective.
trainer = Trainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

#### Train

In [None]:
# Run the fine-tuning loop with the Trainer built earlier in the notebook.
# As the last expression of the cell, the value returned by `train()` is
# displayed as the cell output.
trainer.train()

#### Save LoRA

In [None]:
# Save the PEFT-wrapped model (the LoRA adapter) under the model folder.
# The path has to be an f-string: without the `f` prefix `{model_folder}` is
# not interpolated and the adapter ends up in a directory literally named
# "{model_folder}" instead of alongside the training output.
model.save_pretrained(f'./{model_folder}/granite3.3-lora-adapter')