**Set up development environment**

In [37]:
import argparse
import os
import torch
import logging
import mlflow
import traceback
import bitsandbytes as bnb
import pandas as pd

from random import randrange
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from transformers.utils import is_accelerate_available, is_bitsandbytes_available
from trl import SFTTrainer

In [30]:
print("Is accelerate available? ", is_accelerate_available())
print("Is bitsandbytes available? ", is_bitsandbytes_available())
!transformers-cli env

Is accelerate available?  True
Is bitsandbytes available?  True

Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` version: 4.38.2
- Platform: Linux-5.15.0-97-generic-x86_64-with-glibc2.35
- Python version: 3.11.7
- Huggingface_hub version: 0.20.3
- Safetensors version: 0.4.2
- Accelerate version: 0.26.1
- Accelerate config: 	- compute_environment: LOCAL_MACHINE
	- distributed_type: NO
	- mixed_precision: fp16
	- use_cpu: False
	- debug: False
	- num_processes: 1
	- machine_rank: 0
	- num_machines: 1
	- gpu_ids: all
	- rdzv_backend: static
	- same_network: True
	- main_training_function: main
	- downcast_bf16: no
	- tpu_use_cluster: False
	- tpu_use_sudo: False
	- tpu_env: []
- PyTorch version (GPU?): 2.1.2+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: <fill in>
- Using distributed or

In [31]:
# Show the GPU model, driver/CUDA versions and current GPU memory usage.
!nvidia-smi

Mon Mar 25 14:54:35 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          Off | 00000000:00:10.0 Off |                    0 |
| N/A   28C    P0              35W / 250W |      3MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

**Create and prepare Dataset**

There are several ways to create datasets for LLM fine-tuning, including:
- using existing open-source datasets
- using LLMs to create synthetic datasets
- using humans to create datasets 
- using a combination of the above methods

We use an already existing dataset from Hugging Face called: [CodeInstructions](https://huggingface.co/datasets/TokenBender/code_instructions_122k_alpaca_style)

In [19]:
# Load the existing instruction dataset from the Hugging Face Hub.
dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style", split="train")

# Keep a random subset of 100 examples. The fixed seed makes the subset (and
# everything derived from it) identical after a kernel restart.
dataset = dataset.shuffle(seed=42).select(range(100))

# Show the dataset schema and the fields of the first example.
print(dataset)
print("Instruction: ", dataset[0]["instruction"])
print("input: ",dataset[0]["input"])
print("Output: ", dataset[0]["output"])


Dataset({
    features: ['output', 'instruction', 'input', 'text'],
    num_rows: 100
})
Instruction:  Generate a code to sort a list of strings alphabetically.
input:  
Output:  def sortStrings(string_list):
    return sorted(string_list)


**Prepare Prompt**

In [20]:
from typing import Dict

# Prompt layout applied to every example. The three positional slots receive
# the instruction, the input and the output (in that order). The space before
# each line break is part of the template.
MISTRAL_TEMPLATE = (
    "<s>[INST] Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request. \n"
    "\n"
    "### Instruction: {} \n"
    "\n"
    "### Input: {} [/INST] \n"
    "\n"
    "{}</s>"
)


def create_prompt(sample: Dict):
    """Build the formatted record for one dataset row.

    Args:
        sample: Row providing the "instruction", "input" and "output" keys.

    Returns:
        A dict with "text" (the row rendered through MISTRAL_TEMPLATE)
        followed by the "instruction", "input" and "output" values unchanged.
    """
    instruction, context, answer = (
        sample[key] for key in ("instruction", "input", "output")
    )
    rendered = MISTRAL_TEMPLATE.format(instruction, context, answer)
    return {
        "text": rendered,
        "instruction": instruction,
        "input": context,
        "output": answer,
    }

**Format Dataset**

In [21]:
# Render every row through create_prompt. The original columns are dropped so
# that only the keys returned by create_prompt remain.
dataset = dataset.map(create_prompt, remove_columns=dataset.column_names)

# Split the dataset into 80 training samples and 20 test samples. The fixed
# seed keeps the split identical across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

print(dataset["train"][60]["text"])

# Save both splits to disk, creating the target directory first so the export
# does not fail on a fresh checkout.
os.makedirs("../data", exist_ok=True)
dataset["train"].to_json("../data/train_dataset.json", orient="records")
dataset["test"].to_json("../data/test_dataset.json", orient="records")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

<s>[INST] Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction: Create a data structure for storing book titles and authors. 

### Input:  [/INST] 

HashMap<String, String> books = new HashMap<String, String>(); 

// Add book titles and authors
books.put("Harry Potter", "JK Rowling");
books.put("The Lord of the Rings", "JRR Tolkien");
books.put("The Alchemist", "Paulo Coelho");
books.put("The Catcher in the Rye", "JD Salinger");</s>


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

28904

**Load Training Dataset**

In [22]:
# Read the training examples back from the JSON file on disk.
train_dataset = load_dataset(
    "json",
    data_files="../data/train_dataset.json",
    split="train",
)

Generating train split: 0 examples [00:00, ? examples/s]

**Load Model and Tokenizer**

1. Define quantization configuration
2. Load model 
3. Load tokenizer
4. Inference with base model

In [33]:
# Checkpoint id shared by the model loader and the tokenizer loader.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Quantization settings handed to the model loader.
bnb_config = BitsAndBytesConfig(
    # load the weights in 4 bits
    load_in_4bit=True,
    # use NF4 (normalized fp4) as the 4-bit data type
    bnb_4bit_quant_type="nf4",
    # double quantization: quantize the weights and also the first
    # quantization constants
    bnb_4bit_use_double_quant=True,
    # computations run in float16
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # apply the quantization settings defined above
    quantization_config=bnb_config,
    # let the loader choose the device placement (puts the model on the GPU)
    device_map="auto",
    # attention implementation to use in the model
    attn_implementation="flash_attention_2",
)

# Load the matching tokenizer. Padding uses the unk token so that the pad
# token stays different from the eos token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.unk_token

# Disabled: explicitly registering the start/end/unknown special tokens.
#tokenizer.add_special_tokens({"bos_token": "<s>"})
#tokenizer.add_special_tokens({"eos_token": "</s>"})
#tokenizer.add_special_tokens({"unk_token": "<unk>"})

loading configuration file config.json from cache at /home/ssimon/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/config.json
Model config MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at /home/ssimon/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instr

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /home/ssimon/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

loading file tokenizer.model from cache at /home/ssimon/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa79e0df6b32407ed873/tokenizer.model
loading file tokenizer.json from cache at /home/ssimon/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/snapshots/41b61a33a2483885c981aa

In [44]:
# Instruction-only prompt wrapped in [INST] ... [/INST]. The space before the
# first line break is part of the prompt.
prompt = (
    "[INST] Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request. \n"
    "\n"
    "### Instruction: Print hello world in python c and c++.[/INST]"
)

# Tokenize the prompt and move the resulting tensors to the GPU.
model_input = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate with the base model in eval mode and without tracking gradients,
# then decode the first returned sequence back to text.
base_model.eval()
with torch.no_grad():
    generated = base_model.generate(
        **model_input,
        max_new_tokens=1000,
        pad_token_id=tokenizer.unk_token_id,
    )
    completion = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(completion)

[INST] Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction: Print hello world in python c and c++.[/INST]
ikt


**Define LoRA Config**

1. Find trainable layers
2. Define LoRA Config
3. Get PEFT Model

In [34]:
def find_all_linear_names(model, linear_cls=None):
    """Collect the leaf names of all layers of a given class inside ``model``.

    Args:
        model: Object exposing ``named_modules()`` that yields
            ``(dotted_name, module)`` pairs.
        linear_cls: Layer class (or tuple of classes) to look for. Defaults to
            ``bnb.nn.Linear4bit``, which keeps the original behavior.

    Returns:
        Alphabetically sorted list of the distinct leaf names (the last
        component of each dotted module name), with ``"lm_head"`` excluded.
    """
    if linear_cls is None:
        linear_cls = bnb.nn.Linear4bit

    # Only the last path component is kept, so "a.b.q_proj" and "c.q_proj"
    # both contribute "q_proj"; the set removes the duplicates.
    leaf_names = {
        name.rsplit(".", 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # Drop the output head if it was picked up (noted as needed for 16-bit).
    leaf_names.discard("lm_head")

    # Sorted so the result does not depend on set iteration order.
    return sorted(leaf_names)


# Names of the 4-bit linear layers found in the quantized base model.
modules = find_all_linear_names(base_model)

# Expected names: ["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"]
# ("lm_head" is filtered out by find_all_linear_names).

print(modules)

['v_proj', 'q_proj', 'gate_proj', 'o_proj', 'k_proj', 'up_proj', 'down_proj']


In [35]:
# LoRA adapter hyperparameters.
lora_config = LoraConfig(
    # type of task to perform
    task_type="CAUSAL_LM",
    # modules the adapter is applied to
    target_modules="all-linear",
    # rank of the update matrices (int); a lower rank gives smaller update
    # matrices with fewer trainable parameters
    r=256,
    # alpha parameter for LoRA scaling
    lora_alpha=128,
    # dropout probability for the LoRA layers
    lora_dropout=0.05,
    # bias type for LoRA
    bias="none",
)

In [38]:
# Wrap the base model with the adapters described by lora_config.
# NOTE(review): PEFT's quantization guide calls prepare_model_for_kbit_training
# on k-bit models before get_peft_model; confirm whether that step is needed here.
model = get_peft_model(base_model, lora_config)

# Report the number of trainable parameters and their share of the total.
trainable, total = model.get_nb_trainable_parameters()
percentage = trainable / total * 100
print(f"Trainable: {trainable} | total: {total} | Percentage: {percentage:.4f}%")

Trainable: 671088640 | total: 7912820736 | Percentage: 8.4810%


**Define training arguments**

**Define Supervised Fine-Tuning Trainer**

**Start Fine-Tuning**