In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
from datasets import concatenate_datasets, load_dataset

import pandas as pd
import pyarrow as pa
import json
import random
import os

In [None]:
### set your HF token
# Prefer exporting HF_TOKEN in the shell before launching Jupyter so the secret
# never ends up saved inside the notebook. `setdefault` leaves an already
# exported token untouched (a plain `= ""` assignment would wipe it) while still
# guaranteeing the key exists for the later `os.environ['HF_TOKEN']` lookups.
os.environ.setdefault("HF_TOKEN", "")

In [None]:
### Pull the raw HTML job-extraction dataset from the Hugging Face Hub

html_extraction_dataset_name = "Jiraya/html_job_extraction_dataset"
html_extraction_dataset = load_dataset(path=html_extraction_dataset_name)
# Bare last expression so the notebook renders the dataset summary.
html_extraction_dataset

In [None]:
# Section markers that separate the instruction part from the response part of
# every training example.
delimiter = dict(
    instruction="### Instruction:\n",
    response="### Response:\n",
)

# Prompt template with four positional slots, filled in this order:
#   1. instruction delimiter   2. input HTML
#   3. response delimiter      4. expected output
# Doubled braces ({{ }}) are literal braces after str.format().
# The wording can be tuned to make the fine-tuned model more performant.
prompt = """{}I am providing you with an HTML. I want you to extract Job Title, Job Location, Job ID and Job Link in a JSON format. I want you to respond only with a JSON, no descriptive text. There are exactly 10 Job Title, Job Location, Job ID and Job Link that you have to extract from the provided HTML.
Here is the expected JSON structure:
[
    {{
        'Job Title':
        'Job Location':
        'Job ID':
        'Job Link':
    }},
    {{
        'Job Title':
        'Job Location':
        'Job ID':
        'Job Link':
    }},
...
]

Again, only respond with a JSON and no desciptive text. If you do not spot job links in the HTML provided then just return an empty JSON.
Here is the HTML I want to extract Job Title, Job Location, Job ID and Job Link from,

{}

{}{}"""

In [None]:
# Make html_extraction_dataset training-ready: every example gains a
# 'formatted' column holding the full text (instruction delimiter + task prompt
# + input HTML + response delimiter + expected output).

def _add_formatted_prompt(ex):
    """Build the extra 'formatted' column for a single dataset example.

    Reads ``ex["input_html"]`` and ``ex["extracted_output"]`` and fills the four
    positional slots of the notebook-level ``prompt`` template, using the
    markers stored in ``delimiter``.

    Returns:
        dict with the single key ``'formatted'``, as expected by ``Dataset.map``.
    """
    return {
        'formatted': prompt.format(
            delimiter["instruction"],
            ex["input_html"],
            delimiter["response"],
            ex["extracted_output"],
        )
    }


# One shared formatter for both splits, so the train and test text layouts
# cannot drift apart when the template is edited.
train_val_dataset = html_extraction_dataset['train'].map(_add_formatted_prompt)

test_dataset = html_extraction_dataset['test'].map(_add_formatted_prompt)

In [None]:
# Shuffle the formatted train split, then hold out 10% of it for validation.
# The shuffle seed is fixed (it used to be random.randint(0, 1000)) so that the
# train/validation membership is identical on every Restart & Run All; a seeded
# split on top of an unseeded shuffle is not reproducible.
shuffle_seed = 42
train_val_dataset = train_val_dataset.shuffle(seed=shuffle_seed)
# NOTE: from here on `train_val_dataset` holds the split result (indexed by
# "train" / "test") instead of the flat dataset, so re-run the formatting cell
# before re-running this one.
train_val_dataset = train_val_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
train_dataset

In [None]:
# Named evaluation sets: the 10% hold-out produced by the train/validation
# split, and the dataset's own formatted test split.
val_test_dataset = dict(
    val_dataset=train_val_dataset["test"],
    test_dataset=test_dataset,
)
val_test_dataset

In [None]:
# ------------------------------------------------------------------------------
# Model identifiers and artefact locations
# ------------------------------------------------------------------------------
model_name = "Llama-3.2-1B"
full_model_name = f"meta-llama/{model_name}"  # Hub repo id of the base model
_version_ = "v1-0"
# Shared "<model>_html-extractor_<version>" stem used by both output paths.
_run_name = f"{model_name}_html-extractor_{_version_}"
# Where the trained adapter is saved after training.
new_model = f"../models/{_version_}/{_run_name}"
# Where the trainer writes checkpoints and predictions.
output_dir = f"../output/{_version_}/{_run_name}/"

# ------------------------------------------------------------------------------
# LoRA
# ------------------------------------------------------------------------------
lora_r = 16          # rank of the LoRA update matrices (attention dimension)
lora_alpha = 32      # LoRA scaling factor
lora_dropout = 0.1   # dropout probability applied inside the LoRA layers
use_dora = False     # True switches to DoRA (more memory, reportedly better results)

# ------------------------------------------------------------------------------
# bitsandbytes (4-bit quantisation of the base model)
# ------------------------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"   # name of the torch dtype used for compute
bnb_4bit_quant_type = "nf4"          # quantisation type: "fp4" or "nf4"
use_nested_quant = False             # nested (double) quantisation

# ------------------------------------------------------------------------------
# Trainer arguments
# ------------------------------------------------------------------------------
num_train_epochs = 20

# Mixed precision. A later cell may flip bf16 on, depending on the GPU.
fp16 = False
bf16 = False

# Per-GPU batch sizes. `per_device_val_batch_size` is the one handed to the
# trainer config below in this notebook; `per_device_eval_batch_size` is kept
# alongside it with the same value.
per_device_train_batch_size = 4
per_device_val_batch_size = 4
per_device_eval_batch_size = 4

# Number of forward/backward passes accumulated before each optimizer update.
gradient_accumulation_steps = 8

# NOTE(review): defined here but not forwarded in the visible trainer-config
# cell — confirm whether it should be.
gradient_checkpointing = True

# Gradient clipping (currently disabled):
# max_grad_norm = 0.3

learning_rate = 2e-5          # initial learning rate
weight_decay = 0.001          # weight decay (not applied to bias/LayerNorm weights)
optim = "paged_adamw_32bit"   # optimizer

# Learning-rate schedule; the author found constant a bit better than cosine.
lr_scheduler_type = "constant"
# Fixed number of training steps (would override num_train_epochs; disabled):
# max_steps = -1
# Fraction of steps used for linear warmup from 0 to the learning rate.
# NOTE(review): the plain "constant" schedule appears to have no warmup phase,
# so this may have no effect — confirm, or use "constant_with_warmup".
warmup_ratio = 0.03

# Batch together sequences of similar length (saves memory, speeds up training).
group_by_length = True

# Step intervals for checkpointing, logging and evaluation.
save_steps = 600
logging_steps = 1200
eval_steps = 1200

# ------------------------------------------------------------------------------
# SFT
# ------------------------------------------------------------------------------
# Intended maximum sequence length.
# NOTE(review): not forwarded to the trainer in the visible cells (the
# corresponding argument is commented out there) — confirm.
max_seq_length = 5000
# Pack several short examples into one input sequence for efficiency.
packing = True
# Device placement; the alternative {"": 0} pins the whole model to GPU 0.
# device_map = {"": 0}
device_map = "auto"

In [None]:
# Turn the dtype name from the config cell (e.g. "float16") into the matching
# torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantisation settings used when the base model is loaded.
_bnb_settings = {
    "load_in_4bit": use_4bit,                        # 4-bit precision loading
    "bnb_4bit_quant_type": bnb_4bit_quant_type,      # "nf4" or "fp4"
    "bnb_4bit_compute_dtype": compute_dtype,         # dtype used for compute
    "bnb_4bit_use_double_quant": use_nested_quant,   # nested quantisation
}
bnb_config = BitsAndBytesConfig(**_bnb_settings)

In [None]:
# When the base model is loaded in 4-bit with float16 compute, decide bf16
# training from the GPU's compute capability: major version 8 or higher turns
# it on, anything lower turns it off. Outside that case bf16 is left untouched.
if use_4bit and compute_dtype == torch.float16:
    gpu_major = torch.cuda.get_device_capability()[0]
    bf16 = gpu_major >= 8
    if bf16:
        print("Setting BF16 to True")

In [None]:
# Resolve the Hub token once. An unset or empty HF_TOKEN becomes None, so no
# empty string is sent as an explicit token and the Hub client can fall back to
# its own credential lookup (e.g. a cached login); indexing os.environ directly
# would raise KeyError when the variable is missing.
hf_token = os.environ.get("HF_TOKEN") or None

# Load the base model with the quantisation settings from `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    full_model_name,
    token=hf_token,
    quantization_config=bnb_config,
    device_map=device_map,
)
# Generation cache switched off for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# `trust_remote_code=True` was dropped: it allows code shipped in the Hub repo
# to run locally. NOTE(review): this assumes the tokenizer for
# `full_model_name` is a standard one that needs no custom code — confirm.
tokenizer = AutoTokenizer.from_pretrained(full_model_name, token=hf_token)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapter configuration, built from the values in the config cell.
# Adapters are attached to the projection modules listed here.
_lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"]

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    use_dora=use_dora,
    target_modules=_lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
# Trainer configuration assembled from the values in the config cell.
training_arguments = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_val_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    eval_steps=eval_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    # max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    # Column holding the fully formatted prompt + response text.
    dataset_text_field="formatted",
    packing=packing,
    # `eval_strategy` is the current argument name; the older
    # `evaluation_strategy` spelling has been deprecated since transformers 4.41
    # and is rejected by later releases.
    eval_strategy="steps",
    logging_strategy="steps",
    # no_cuda=True
)
training_arguments

In [None]:
# Supervised fine-tuning trainer: base model + LoRA config + datasets.
# Options the author left disabled (not passed): tokenizer=tokenizer,
# max_seq_length=max_seq_length, formatting_func=format_prompts_fn,
# compute_metrics=custom_metric.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    peft_config=peft_config,
    train_dataset=train_dataset,
    # Dict of named evaluation sets (validation hold-out and test split).
    eval_dataset=val_test_dataset,
)

In [None]:
# Run fine-tuning, then write the trained model to the `new_model` directory.
trainer.train()
trainer.model.save_pretrained(save_directory=new_model)