# (QLoRA) Fine-tuning Mistral-7B-Instruct

### Imports

In [None]:
!pip install auto-gptq
!pip install optimum
!pip install bitsandbytes

In [None]:
# resolving "No inf checks were recorded for this optimizer." issue
# !pip uninstall torch -y
# !pip install torch==2.1

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import transformers

### Load model

In [None]:
# Hub id of the checkpoint used throughout the notebook
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    revision="main",          # which version of model to use in repo
    trust_remote_code=False,  # prevents running custom model files on your machine
    device_map="auto",        # automatically figures out how to best use CPU + GPU for loading model
)

### Load tokenizer

In [None]:
# tokenizer that belongs to the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

### Using Base Model

In [None]:
model.eval() # model in evaluation mode (dropout modules are deactivated)

# craft prompt
comment = """
<test data replace this with your own data
Name,Age,Department,JoiningDate,Salary
Alice,29,Engineering,2021-05-14,75000
Bob,35,Marketing,2019-11-01,68000
Charlie,42,HR,2018-03-22,72000
Diana,26,Finance,2022-07-30,64000>
write this in json"""
prompt = f'''[INST] {comment} [/INST]'''

# tokenize input and move the whole encoding (input_ids AND attention_mask)
# to the device the model lives on, instead of hard-coding "cuda"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# generate output; unpacking the encoding also forwards the attention mask
outputs = model.generate(**inputs, max_new_tokens=140)

print(tokenizer.batch_decode(outputs)[0])

#### Prompt Engineering

In [None]:
# sample CSV-like payload (placeholder data) that the prompt asks the model to convert to JSON
comment = """
<test data replace this with your own data 
Name,Age,Department,JoiningDate,Salary
Alice,29,Engineering,2021-05-14,75000
Bob,35,Marketing,2019-11-01,68000
Charlie,42,HR,2018-03-22,72000
Diana,26,Finance,2022-07-30,64000 >
write this in json"""

In [None]:
# System-style instructions prepended to every prompt.
# (The misspelled name is kept on purpose: another cell uses the same spelling.)
intstructions_string = """You reply in json. convert this data into json. the first row is the header \n"""


def prompt_template(comment):
    """Build the full prompt for one request.

    The instructions and ``comment`` are placed between ``[INST]`` and
    ``[/INST]`` tags. ``intstructions_string`` is read at call time, so
    re-assigning it later changes the prompts produced afterwards.

    Args:
        comment: the user text / data to embed in the prompt.

    Returns:
        The formatted prompt string.
    """
    return f'''[INST] {intstructions_string} \n{comment} \n[/INST]'''

# wrap the sample data in the instruction template and show the resulting prompt
prompt = prompt_template(comment)
print(prompt)

In [None]:
# tokenize input and move the whole encoding (input_ids AND attention_mask)
# to the device the model lives on, instead of hard-coding "cuda"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# generate output; unpacking the encoding also forwards the attention mask
outputs = model.generate(**inputs, max_new_tokens=140)

print(tokenizer.batch_decode(outputs)[0])

### Prepare Model for Training

In [None]:
model.train() # model in training mode (dropout modules are activated)

# enable gradient checkpointing
# NOTE(review): presumably trades extra compute for lower activation memory — confirm in the transformers docs
model.gradient_checkpointing_enable()

# enable quantized training; `model` is rebound to the object returned by PEFT,
# so re-running this cell applies the preparation again on the already-prepared model
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter settings: only the modules named "q_proj" receive adapters
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# wrap the model so that it becomes the LoRA-trainable version
model = get_peft_model(model, config)

# report how many parameters are trainable
model.print_trainable_parameters()

trainable params: 2,097,152 || all params: 264,507,392 || trainable%: 0.7929


### Preparing Training Dataset

In [None]:
import pickle

In [None]:
# load the pickled dataset from the working directory
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only open a data.pkl that you created yourself or otherwise trust
with open('data.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
# display the loaded object (the recorded output shows a DatasetDict with train/test splits and an 'example' column)
data

DatasetDict({
    train: Dataset({
        features: ['example'],
        num_rows: 49
    })
    test: Dataset({
        features: ['example'],
        num_rows: 10
    })
})

In [None]:
# create tokenize function
def tokenize_function(examples, max_length=2000):
    """Tokenize the ``"example"`` column of a batch.

    Args:
        examples: mapping with an ``"example"`` entry holding the text(s)
            to tokenize (as passed by ``map(..., batched=True)``).
        max_length: maximum number of tokens kept per text; longer texts
            are truncated. Defaults to 2000, the value previously hard-coded.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch
        (requested with ``return_tensors="np"``).

    Side effects:
        Sets ``tokenizer.truncation_side`` to ``"left"`` so that truncation
        drops the beginning of a text and keeps its end.
    """
    # extract text
    text = examples["example"]

    # tokenize and truncate text (from the left, keeping the tail of each example)
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length,
    )

    return tokenized_inputs

# tokenize training and validation datasets
# batched=True hands tokenize_function a batch of examples per call instead of one row at a time
tokenized_data = data.map(tokenize_function, batched=True)

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# setting pad token: the end-of-sequence token is reused as the padding token
# NOTE(review): with pad == eos the collator presumably treats EOS positions as padding
# when building labels — confirm this is the intended behaviour for the loss
tokenizer.pad_token = tokenizer.eos_token
# data collator (mlm=False: masked-language-modelling is disabled)
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)


### Fine-tuning Model

In [None]:
# hyperparameters
lr, batch_size, num_epochs = 2e-4, 4, 10

# define training arguments
training_args = transformers.TrainingArguments(
    output_dir="<dir-name>",
    # --- optimisation ---
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    warmup_steps=2,
    optim="paged_adamw_8bit",
    fp16=True,
    # --- batching ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # --- log, evaluate and checkpoint once per epoch ---
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# configure trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=data_collator,
)

# train model
# caching is switched off while training (silences the warnings) ...
model.config.use_cache = False
try:
    trainer.train()
finally:
    # ... and is re-enabled for inference even if training fails or is interrupted,
    # so the model is never left with use_cache=False by accident
    model.config.use_cache = True



Epoch,Training Loss,Validation Loss
0,0.9042,0.79672
1,0.8196,0.734464
2,0.7337,0.693645
4,0.6736,0.656725
5,0.6092,0.643762
6,0.6016,0.635596
8,0.5784,0.628546
9,0.3899,0.628335




### Push model to hub

In [None]:
# option 1: interactive login from inside the notebook
# (presumably prompts for a Hugging Face access token — confirm in the huggingface_hub docs)
from huggingface_hub import notebook_login
notebook_login()

# # option 2: key login
# NOTE(review): if this option is used, do not save or commit the notebook with a real token pasted in
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

In [None]:
# Placeholders must be string literals; the bare <...> form is a syntax error.
# Replace both with your own values before running.
hf_name = "<hf-username>"  # Hub user or organisation that owns the repository
model_id = hf_name + "/" + "<dir-name>"  # full repo id: "<hf-username>/<dir-name>"

In [None]:
# upload the model object to the explicit repository id
model.push_to_hub(model_id)

# Trainer.push_to_hub takes a commit message, not a repo id, as its first
# positional argument; the target repository comes from the training arguments
# (hub_model_id, falling back to the name of output_dir). Passing model_id here
# would only have set the commit message to the repo id.
trainer.push_to_hub(commit_message="End of training")

### Load Fine-tuned Model

In [None]:
!pip install auto-gptq
!pip install optimum
!pip install bitsandbytes

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# from peft import prepare_model_for_kbit_training
# from peft import LoraConfig, get_peft_model
# from datasets import load_dataset
# import transformers

In [None]:
# load model from hub
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# base checkpoint the adapter is applied on top of
model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
# adapter repository on the Hub — defined once so the placeholder only has to be replaced in one place
adapter_id = "<hf-username>/<dir-name>"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=False,
    revision="main",
)

# adapter configuration and adapter weights come from the same repository
config = PeftConfig.from_pretrained(adapter_id)
model = PeftModel.from_pretrained(model, adapter_id)

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

### Use Fine-tuned Model

In [None]:
# sample CSV-like payload (placeholder data) to send to the fine-tuned model
comment = """
<test data replace this with your own data 
Name,Age,Department,JoiningDate,Salary
Alice,29,Engineering,2021-05-14,75000
Bob,35,Marketing,2019-11-01,68000
Charlie,42,HR,2018-03-22,72000
Diana,26,Finance,2022-07-30,64000 >
"""

# Instructions prepended to every prompt.
# (The misspelled name is kept on purpose: another cell uses the same spelling.)
intstructions_string = """convert into json format first row is the header \n"""


def prompt_template(comment):
    """Build the full prompt for one request.

    The instructions and ``comment`` are placed between ``[INST]`` and
    ``[/INST]`` tags. ``intstructions_string`` is read at call time, so
    re-assigning it later changes the prompts produced afterwards.

    Args:
        comment: the user text / data to embed in the prompt.

    Returns:
        The formatted prompt string.
    """
    return f'''[INST] {intstructions_string} \n{comment} \n[/INST]'''


# wrap the sample data in the instruction template and show the resulting prompt
prompt = prompt_template(comment)
print(prompt)

In [None]:
model.eval()  # evaluation mode for inference

# move the whole encoding (input_ids AND attention_mask) to the model's device
# instead of hard-coding "cuda"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# unpacking the encoding also forwards the attention mask to generate()
outputs = model.generate(**inputs, max_new_tokens=1000)

print(tokenizer.batch_decode(outputs)[0])

In [None]:
model.eval()  # evaluation mode for inference

# move the whole encoding (input_ids AND attention_mask) to the model's device
# instead of hard-coding "cuda"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# unpacking the encoding also forwards the attention mask to generate()
outputs = model.generate(**inputs, max_new_tokens=1000)

print(tokenizer.batch_decode(outputs)[0])