In [1]:
import os
import pandas as pd  # type: ignore
import torch
import transformers

from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler, TrainingArguments
from datasets import load_dataset, Dataset
from peft import PeftModel, LoraConfig, get_peft_model
from trl import SFTTrainer, setup_chat_format, SFTConfig
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [2]:
device = torch.device("mps")

In [3]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

In [5]:
df = pd.read_xml("./data/dataset.xml")
del df['index']
df.head()

Unnamed: 0,input,output
0,What is a 'customized integrated framework' in...,In the context of Audit and Assurance policy a...
1,What are some of the elements that audit and a...,Audit and assurance policies and procedures sh...
2,Why is it important to review and update the a...,It's important to review and update the audit ...
3,What is the role of an audit report in the aud...,An audit report plays a crucial role in the au...
4,What is the purpose of follow-up activities in...,Follow-up activities are designed to monitor t...


In [6]:
dataset = Dataset.from_pandas(df)

def format_chat_template(row):
    row_json = [{"role": "user", "content": row["input"]},
               {"role": "assistant", "content": row["output"]}]
    row["text"] = tokenizer.apply_chat_template(
        row_json,
        add_generation_prompt=True,
        include_special_tokens=True,
        tokenize=False
    )
    return row

dataset = dataset.map(
    format_chat_template,
)


Map:   0%|          | 0/1038 [00:00<?, ? examples/s]Using unk_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using cls_token, but it 

In [7]:
dataset = dataset.train_test_split(test_size=0.1)

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.14s/it]


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model, tokenizer = setup_chat_format(model, tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
sft_config = SFTConfig(
    output_dir="ccm_lora_weights",
    dataset_text_field="text",
    num_train_epochs=1,
    max_seq_length=512,
    learning_rate=2e-4,
)

In [17]:
dataset["train"][0]

{'input': 'Can you mention some of the channels through which the training and awareness program can be executed?',
 'output': 'Yes, the training and awareness program can be delivered through several awareness-raising activities like campaigns, booklets, posters, newsletters, websites, information sessions, briefings, e-learning modules, and emails.',
 'text': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nCan you mention some of the channels through which the training and awareness program can be executed?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nYes, the training and awareness program can be delivered through several awareness-raising activities like campaigns, booklets, posters, newsletters, websites, information sessions, briefings, e-learning modules, and emails.<|eot_id|>'}

In [18]:
trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    peft_config=peft_config,
    args=sft_config,
)
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using pad_token, but it is not set yet.

Map: 100%|██████████| 934/934 [00:00<00:00, 20893.00 examples/s]
  0%|          | 0/117 [02:33<?, ?it/s]
  0%|          | 0/117 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  1%|          | 1/117 [00:33<1:04:09, 33.18s/it]