## Required Installation

In [None]:
!pip install -q -U transformers peft accelerate optimum
!pip install datasets==2.15.0
!pip install trl
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.0/417.0 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

# Load the tokenizer from Hugging Face (the model itself is loaded later by the trainer)

In [None]:
from peft import prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig

# Hub repository of the GPTQ-quantized Llama-2 chat checkpoint used in this notebook.
model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"

# Fetch the tokenizer published alongside that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)



# Load RAFT dataset

In [None]:
from datasets import Dataset
# Location of the RAFT dataset's Arrow shard (the path points into a Drive
# folder under /content; the mount step is not shown in this notebook).
dataset_path = "/content/drive/MyDrive/Colab Notebooks/RAFT/data-00000-of-00001.arrow"

# Open the Arrow file directly as a `Dataset`.
ds = Dataset.from_file(filename=dataset_path)

In [None]:
# Display the dataset summary (feature names and row count).
ds

Dataset({
    features: ['id', 'type', 'question', 'context', 'oracle_context', 'cot_answer', 'instruction'],
    num_rows: 263
})

In [None]:
import pandas as pd
# Materialise the dataset as a DataFrame so a new column can be derived from existing ones.
df = pd.DataFrame(data=ds)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,type,question,context,oracle_context,cot_answer,instruction
0,seed_task_0,general,What did the two main things the author worked...,"{'sentences': [['No, I had not.) So although R...",\n\nWhat I Worked On\n\nFebruary 2021\n\nBefor...,assistant: \n\nReasoning:\n1. The author menti...,"<DOCUMENT>No, I had not.) So although Robert h..."
1,seed_task_1,general,Answer: Writing and programming.,{'sentences': [['Now you not only had colleagu...,\n\nWhat I Worked On\n\nFebruary 2021\n\nBefor...,assistant: \n\nQuestion: What did the person d...,"<DOCUMENT>Now you not only had colleagues, but..."
2,seed_task_2,general,What kind of writing did the author focus on b...,{'sentences': [[' What I Worked On February ...,\n\nWhat I Worked On\n\nFebruary 2021\n\nBefor...,assistant: \n\nReasoning:\n1. The context stat...,<DOCUMENT>\n\nWhat I Worked On\n\nFebruary 202...
3,seed_task_3,general,Answer: Short stories.,"{'sentences': [['I wrote simple games, a progr...",\n\nWhat I Worked On\n\nFebruary 2021\n\nBefor...,assistant: \n\nQuestion: What did the author w...,"<DOCUMENT>I wrote simple games, a program to p..."
4,seed_task_4,general,Where did the author and his friend Rich go to...,"{'sentences': [['So yes, I wanted it! [7] Ast...",\n\nWhat I Worked On\n\nFebruary 2021\n\nBefor...,assistant: \n\nReasoning:\n1. The author menti...,"<DOCUMENT>So yes, I wanted it! [7]\n\nAsterix ..."


In [None]:
def _to_chat_messages(row):
    """Build the two-turn conversation for one row.

    The 'instruction' value becomes the user turn and the 'cot_answer' value
    becomes the assistant turn.
    """
    user_turn = {'content': row['instruction'], 'role': 'user'}
    assistant_turn = {'content': row['cot_answer'], 'role': 'assistant'}
    return [user_turn, assistant_turn]


# Combine the 'instruction' and 'cot_answer' columns into a new 'messages'
# column holding a list of role/content dictionaries per row.
df['messages'] = df.apply(_to_chat_messages, axis=1)


In [None]:
import datasets
# Convert the augmented DataFrame back into a dataset and show its summary.
ds = datasets.Dataset.from_pandas(df=df)
ds

Dataset({
    features: ['id', 'type', 'question', 'context', 'oracle_context', 'cot_answer', 'instruction', 'messages'],
    num_rows: 263
})

# Chat Template for prompt

In [None]:
# Jinja chat template, written one template line per string fragment.
# Adjacent literals are concatenated by Python, so the resulting value is a
# single string: every turn is rendered as "<|role|>\n" + content + eos_token,
# and "<|assistant|>" is appended after the last turn when
# add_generation_prompt is set.
DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}\n"
    "{% if message['role'] == 'user' %}\n"
    "{{ '<|user|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'system' %}\n"
    "{{ '<|system|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'assistant' %}\n"
    "{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n"
    "{% endif %}\n"
    "{% if loop.last and add_generation_prompt %}\n"
    "{{ '<|assistant|>' }}\n"
    "{% endif %}\n"
    "{% endfor %}"
)
# Make the tokenizer render conversations with DEFAULT_CHAT_TEMPLATE.
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

In [None]:
import re
import random
from multiprocessing import cpu_count

def apply_chat_template(example, tokenizer):
    """Render one example's chat messages into a single training string.

    Args:
        example: Mapping with a ``"messages"`` entry holding a list of
            ``{"role": ..., "content": ...}`` dictionaries.
        tokenizer: Object exposing
            ``apply_chat_template(messages, tokenize=False)``.

    Returns:
        The same ``example`` mapping. Its ``"messages"`` entry starts with a
        system turn (an empty one is added when missing) and a ``"text"``
        entry holds the rendered conversation.
    """
    messages = example["messages"]
    # Prepend an empty system turn when the conversation does not start with
    # one. A new list is built rather than calling ``insert`` so the caller's
    # list object is left untouched, and the emptiness check keeps an example
    # with no turns from raising ``IndexError``.
    if not messages or messages[0]["role"] != "system":
        messages = [{"role": "system", "content": ""}, *messages]
        example["messages"] = messages
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

# All original columns are dropped after mapping, so only the columns produced
# by the mapped function remain.
column_names = list(ds.features.keys())

map_options = dict(
    num_proc=cpu_count(),  # one worker process per reported CPU
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=column_names,
    desc="Applying chat template",
)
raw_datasets = ds.map(apply_chat_template, **map_options)

  self.pid = os.fork()


Applying chat template (num_proc=2):   0%|          | 0/263 [00:00<?, ? examples/s]

In [None]:
# Display the mapped dataset summary.
raw_datasets

Dataset({
    features: ['text'],
    num_rows: 263
})

In [None]:
# Inspect the rendered text of the first example (row lookup first, then the column).
raw_datasets[0]["text"]

'<|system|>\n</s>\n<|user|>\n<DOCUMENT>No, I had not.) So although Robert had his graduate student stipend, I needed that seed funding to live on.\n\nWe originally hoped to launch in September, but we got more ambitious about the software as we worked on it. Eventually we managed to build a WYSIWYG site builder, in the sense that as you were creating pages, they looked exactly like the static ones that would be generated later, except that instead of leading to static pages, the links all referred to closures stored in a hash table on the server.\n\nIt helped to have studied art, because the main goal of an online store builder is to make users look legit, and the key to looking legit is high production values. If you get page layouts and fonts and colors right, you can make a guy running a store out of his bedroom look more legit than a big company.\n\n(If you\'re curious why my site looks so old-fashioned, it\'s because it\'s still made with this software. </DOCUMENT>\n<DOCUMENT>\n\n

## Create train/test split

In [None]:
# Hold out 10% of the examples for evaluation. The split is seeded so the same
# rows land in train/test on every run; without a seed the held-out rows change
# from run to run.
raw_datasets = raw_datasets.train_test_split(test_size=0.1, seed=42)
# create the splits
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["test"]


# Peft config

In [None]:
from peft import LoraConfig, get_peft_model
# Names of the modules that receive LoRA adapters.
lora_target_modules = ["k_proj", "o_proj", "q_proj", "v_proj"]

# LoRA settings for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Quantization Config

In [None]:
from transformers import GPTQConfig

# 4-bit GPTQ settings. `use_exllama=False` replaces `disable_exllama=True`,
# which transformers reports as deprecated and scheduled for removal.
quantization_config = GPTQConfig(bits=4, use_exllama=False)

# Keyword arguments used when the model is instantiated.
model_kwargs = dict(
    torch_dtype="auto",
    use_cache=False, # set to False as we're going to use gradient checkpointing
    device_map="auto",
    quantization_config=quantization_config,
)


Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


# SFT Training

In [None]:

from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

# path where the Trainer will save its checkpoints and logs
output_dir = 'raft-sft-output'

# based on config
training_args = TrainingArguments(
    fp16=True, # specify bf16=True instead when training on GPUs that support bf16
    do_eval=True,
    evaluation_strategy="epoch",
    # The training split has only a couple of hundred rows and the per-device
    # batch size is 1. Accumulating 128 micro-batches therefore produced a
    # single optimizer step for the whole epoch, and no training loss was ever
    # logged because logging_steps=5 was never reached. Accumulating 16 gives
    # roughly 14 updates per epoch.
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    log_level="info",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=1,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_eval_batch_size=1, # originally set to 8
    per_device_train_batch_size=1, # originally set to 8
    save_strategy="no",
    save_total_limit=None,
    seed=42,
)

In [None]:
# Build the supervised fine-tuning trainer. The model is given as a Hub id
# together with `model_init_kwargs`; examples are read from the "text" column
# with a maximum sequence length of 2048.
trainer = SFTTrainer(
    model=model_id,
    model_init_kwargs=model_kwargs,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
)

The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


Map:   0%|          | 0/236 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Using auto half precision backend


## Setting pad_token=eos_token

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

## Start Training

In [None]:
# Run fine-tuning and keep the object returned by the trainer.
train_result = trainer.train()

***** Running training *****
  Num examples = 236
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 128
  Total optimization steps = 1
  Number of trainable parameters = 8,388,608


Epoch,Training Loss,Validation Loss
0,No log,2.391823


***** Running Evaluation *****
  Num examples = 27
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
# metrics = train_result.metrics

## Save the finetuned_raft_model

In [None]:
# Directory that receives the fine-tuned model files.
output_dir_new = "finetuned_raft_model"

trainer.save_model(output_dir=output_dir_new)

Saving model checkpoint to finetuned_raft_model
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--TheBloke--Llama-2-7b-Chat-GPTQ/snapshots/d5ad9310836dd91b6ac6133e2e47f47394386cea/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "bits": 4,
    "damp_percent": 0.01,
    "desc_act": false,
    "group_size": 128,
    "model_file_base_name": "model",
    "model_name_or_path": null,
    "quant_method": "gptq",
    "sym": true,
    "true_sequential": true
  },
  "rms_norm_eps":

# Inference
## Load model

In [None]:
# `use_exllama=False` is the supported replacement for the deprecated
# `disable_exllama=True` argument.
quantization_config_loading = GPTQConfig(bits=4, use_exllama=False)

# Load the fine-tuned model from the directory it was saved to.
model = AutoModelForCausalLM.from_pretrained(
    output_dir_new,
    quantization_config=quantization_config_loading,
    device_map="auto",
)


Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--TheBloke--Llama-2-7b-Chat-GPTQ/snapshots/d5ad9310836dd91b6ac6133e2e47f47394386cea/config.json
Model config LlamaConfig {
  "_name_or_path": "TheBloke/Llama-2-7b-Chat-GPTQ",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "qu

In [None]:
# Look at the first held-out example.
raw_datasets["test"][0]

{'text': "<|system|>\n</s>\n<|user|>\n<DOCUMENT>And it wasn't, so I would.\n\nIn the summer of 2012 my mother had a stroke, and the cause turned out to be a blood clot caused by colon cancer. The stroke destroyed her balance, and she was put in a nursing home, but she really wanted to get out of it and back to her house, and my sister and I were determined to help her do it. I used to fly up to Oregon to visit her regularly, and I had a lot of time to think on those flights. </DOCUMENT>\n<DOCUMENT>I had no idea. </DOCUMENT>\n<DOCUMENT>Around this time, in the spring of 2000, I had an idea. It was clear from our experience with Viaweb that web apps were the future. Why not build a web app for making web apps? Why not let people edit code on our server through the browser, and then host the resulting applications for them? [9] You could run all sorts of services on the servers that these applications could use just by making an API call: making and receiving phone calls, manipulating ima

## Use Query only for inference

In [None]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a RAFT model you give answer based on context. You also give reason for why this is the answer.",
    },
    {"role": "user", "content": "What did Paul Graham believe was the future of web development?"},
]

# prepare the messages for the model; the ids are moved to the device the
# model was actually loaded on instead of a hard-coded "cuda"
input_ids = tokenizer.apply_chat_template(
    messages,
    truncation=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# inference: sample up to 500 new tokens
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=500,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
# the decoded sequence contains the prompt followed by the generated answer
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0
}



<|system|>
You are a RAFT model you give answer based on context. You also give reason for why this is the answer.</s>
<|user|>
What did Paul Graham believe was the future of web development?</s>
<|assistant|>
Paul Graham believed that the future of web development would involve more use of machine learning and artificial intelligence. He believed that these technologies would allow for more personalized and intelligent web experiences. He also believed that the web would become more decentralized and that blockchain technology would play a larger role in the future of the web.


## Use Query along with retrieved context for inference
* This is taken from the test set we created earlier.

In [None]:
# User prompt for the second inference run: three <DOCUMENT> blocks followed
# by the question. The "\n" sequences are ordinary escapes, so they become
# real newlines in the resulting string.
context_and_query = """
\n<DOCUMENT>And it wasn't, so I would.\n\nIn the summer of 2012 my mother had a stroke, and the cause turned out to be a blood clot caused by colon cancer. The stroke destroyed her balance, and she was put in a nursing home, but she really wanted to get out of it and back to her house, and my sister and I were determined to help her do it. I used to fly up to Oregon to visit her regularly, and I had a lot of time to think on those flights. </DOCUMENT>\n<DOCUMENT>I had no idea. </DOCUMENT>\n<DOCUMENT>Around this time, in the spring of 2000, I had an idea. It was clear from our experience with Viaweb that web apps were the future. Why not build a web app for making web apps? Why not let people edit code on our server through the browser, and then host the resulting applications for them? [9] You could run all sorts of services on the servers that these applications could use just by making an API call: making and receiving phone calls, manipulating images, taking credit card payments, etc.\n\nI got so excited about this idea that I couldn't think about anything else. </DOCUMENT>\n
What did Paul Graham believe was the future of web development?
"""

In [None]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a RAFT model you give answer based on context. You also give reason for why this is the answer.",
    },
    {"role": "user", "content": context_and_query},
]

# prepare the messages for the model; the ids are moved to the device the
# model was actually loaded on instead of a hard-coded "cuda"
input_ids = tokenizer.apply_chat_template(
    messages,
    truncation=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# inference: sample up to 500 new tokens
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=500,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
# the decoded sequence contains the prompt followed by the generated answer
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0
}



<|system|>
You are a RAFT model you give answer based on context. You also give reason for why this is the answer.</s>
<|user|>


<DOCUMENT>And it wasn't, so I would.

In the summer of 2012 my mother had a stroke, and the cause turned out to be a blood clot caused by colon cancer. The stroke destroyed her balance, and she was put in a nursing home, but she really wanted to get out of it and back to her house, and my sister and I were determined to help her do it. I used to fly up to Oregon to visit her regularly, and I had a lot of time to think on those flights. </DOCUMENT>
<DOCUMENT>I had no idea. </DOCUMENT>
<DOCUMENT>Around this time, in the spring of 2000, I had an idea. It was clear from our experience with Viaweb that web apps were the future. Why not build a web app for making web apps? Why not let people edit code on our server through the browser, and then host the resulting applications for them? [9] You could run all sorts of services on the servers that these applications 

In [None]:
# NOTE(review): bare string expression that is not assigned to any name; it
# looks like a reformatted copy of the context-and-question prompt kept for
# reading convenience - confirm whether this cell is still needed.
"""

<DOCUMENT>And it wasn't, so I would.

In the summer of 2012 my mother had a stroke, and the cause turned out to be a blood clot caused by colon cancer. The stroke destroyed her balance, and she was put in a nursing home, but she really wanted to get out of it and back to her house, and my sister and I were determined to help her do it. I used to fly up to Oregon to visit her regularly, and I had a lot of time to think on those flights. </DOCUMENT>

<DOCUMENT>I had no idea. </DOCUMENT>

<DOCUMENT>Around this time, in the spring of 2000, I had an idea. It was clear from our experience with Viaweb that web apps were the future. Why not build a web app for making web apps? Why not let people edit code on our server through the browser, and then host the resulting applications for them? [9] You could run all sorts of services on the servers that these applications could use just by making an API call: making and receiving phone calls, manipulating images, taking credit card payments, etc.



I got so excited about this idea that I couldn't think about anything else. </DOCUMENT>


What did Paul Graham believe was the future of web development?
"""

Reference:
1. https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Mistral/Supervised_fine_tuning_(SFT)_of_an_LLM_using_Hugging_Face_tooling.ipynb