In [1]:
import os
import pandas as pd  # type: ignore
import torch
import transformers

from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
from peft import PeftModel, LoraConfig, get_peft_model
from trl import SFTTrainer, setup_chat_format, SFTConfig
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
device = torch.device("mps")

In [3]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:24<00:00,  6.09s/it]


In [6]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [7]:
model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.08480376642881861


In [8]:
df = pd.read_xml("./data/dataset.xml")
del df['index']
df.head()

Unnamed: 0,input,output
0,What is a 'customized integrated framework' in...,In the context of Audit and Assurance policy a...
1,What are some of the elements that audit and a...,Audit and assurance policies and procedures sh...
2,Why is it important to review and update the a...,It's important to review and update the audit ...
3,What is the role of an audit report in the aud...,An audit report plays a crucial role in the au...
4,What is the purpose of follow-up activities in...,Follow-up activities are designed to monitor t...


In [19]:
dataset = Dataset.from_pandas(df)

In [20]:
def instruct_format(examples):
    inputs = examples['input']
    targets = examples['output']
    model_inputs = tokenizer(inputs, padding='max_length', max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding='max_length', max_length=128, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
    

def reg_format(row):
    instruction = row["input"]
    response = row["output"]
    return tokenizer.prepare_seq2seq_batch(
        src_texts=instruction,
        tgt_texts=response,
        return_tensors='pt',
        max_length=512,  # Adjust the max length as needed
        truncation=True  # Enable truncation
    )
    

dataset = dataset.map(
    instruct_format,
    batched=True,
    remove_columns=["input", "output"]
)


Map: 100%|██████████| 1038/1038 [00:00<00:00, 15592.88 examples/s]


In [21]:
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 934
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 104
    })
})

In [22]:
decoded_input = tokenizer.decode(dataset['train']['input_ids'][0], skip_special_tokens=False)
decoded_output = tokenizer.decode(dataset['train']['labels'][0], skip_special_tokens=False)
print(f"Input:  {decoded_input}")
print(f"Output: {decoded_output}")

Input:  <|begin_of_text|>Why is it important to test and review business continuity plans at planned intervals?[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
Output: <|begin_of_text|>Regular testing and reviewing are essential to ensure that the business continuity plans are up-to-date and effective. Changes in the business environment, operational structure, or technology may make current plans ineffective or inadequate. Regular reviews ensure that the plan is aligned with current

In [26]:
sft_config = SFTConfig(
    output_dir="ccm_lora_weights",
    dataset_text_field="text",
    num_train_epochs=1,
    max_seq_length=200,
    learning_rate=2e-5,
)

trainer_config = TrainingArguments(
    output_dir="results",
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
)

In [27]:
"""
trainer = SFTTrainer(
    model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=lora_config,
    args=sft_config,
)
"""
trainer = Trainer(
    model = model,
    args=trainer_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)
trainer.train()

  0%|          | 0/117 [00:51<?, ?it/s]
                                                 
100%|██████████| 117/117 [14:19<00:00,  7.35s/it]

{'eval_loss': 3.9342875480651855, 'eval_runtime': 47.0979, 'eval_samples_per_second': 2.208, 'eval_steps_per_second': 0.276, 'epoch': 1.0}
{'train_runtime': 859.5123, 'train_samples_per_second': 1.087, 'train_steps_per_second': 0.136, 'train_loss': 4.0795979296040334, 'epoch': 1.0}





TrainOutput(global_step=117, training_loss=4.0795979296040334, metrics={'train_runtime': 859.5123, 'train_samples_per_second': 1.087, 'train_steps_per_second': 0.136, 'train_loss': 4.0795979296040334, 'epoch': 1.0})

In [30]:
type(model)

transformers.models.llama.modeling_llama.LlamaForCausalLM

In [29]:
model = model.merge_and_unload()

In [31]:
model.save_pretrained("ccm_lora_weights")

In [None]:
model = AutoModelForCausalLM.from_pretrained("ccm_lora_weights", device_map=device)

In [32]:
messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": "What is a 'customized integrated framework' in the context of an Audit and Assurance policy and procedure?"},
]

In [33]:
conversation_history = ""
for message in messages:
    if message['role'] == 'system':
        conversation_history += f"System: {message['content']}\n"
    elif message['role'] == 'user':
        conversation_history += f"User: {message['content']}\n"

In [49]:
def generate_response(ch):
    inputs = tokenizer(ch, return_tensors='pt').to(device)
    outputs = model.generate(**inputs, max_length=1024, pad_token_id=tokenizer.pad_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.strip()

In [51]:
import pprint as pp
response = generate_response(conversation_history)
pp.pprint(response)


('System: \n'
 "User: What is a 'customized integrated framework' in the context of an Audit "
 'and Assurance policy and procedure?\n'
 'System: A customized integrated framework is a tailored approach to audit '
 'and assurance that combines various elements, such as risk assessment, audit '
 'procedures, and reporting, to meet the specific needs of an organization. It '
 'is a customized framework that integrates different components to ensure '
 "that audits are effective, efficient, and relevant to the organization's "
 'goals and objectives.\n'
 '\n'
 'In the context of an Audit and Assurance policy and procedure, a customized '
 "integrated framework would consider the organization's:\n"
 '\n'
 "1. Risk profile: The framework would take into account the organization's "
 'risk profile, including its business model, operations, and industry.\n'
 "2. Audit objectives: The framework would align with the organization's audit "
 'objectives, such as ensuring compliance with laws and 