In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 guardrail-ml==0.0.12 tensorboard
!apt-get -qq install poppler-utils tesseract-ocr
!pip install -q unstructured["local-inference"]==0.7.4 pillow

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m122.9/244.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
from guardrail.client import (
    run_metrics,
    run_simple_metrics,
    create_dataset)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [111]:
from transformers import TrainingArguments


# Training arguments
train_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    max_grad_norm=0.3,
    save_steps=100,
    logging_steps=10,
    weight_decay=0.001,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    max_steps=100,
    optim="paged_adamw_32bit",
    fp16=False,  # Disable mixed-precision training
    group_by_length=True
)


In [4]:
# Quantization config
bb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,
    quant_batch_axis=0,
    quant_reduce_range=False
)

In [74]:
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")

In [75]:
len(dataset)

18612

In [76]:
from sklearn.model_selection import train_test_split

**The reason we have to perform all the computations below is simply because of the use of 'load_datasets'. 'load_dataset' ofteb represents datasets as a list of dictionaries where each distionary represents an individual example. When we load our dataset from hugging face, the entire dataset is loaded with 18,612 examples which are all represented as a dictionary with keys corresponding to different columns in the dataset. Hence using train_test_split is difficult as the function splits the list of dictionaries and not the individual examples themselves. Therefore, it is dividing the list into training and test sets but each set is still a list of dictionaries**

In [84]:
#initialize lists to store specific fields
instructions = []
inputs = []
outputs = []
prompts = []

#iterate through the dataset to extract the fields
for example in dataset:
    instructions.append(example["instruction"])
    inputs.append(example["input"])
    outputs.append(example["output"])
    prompts.append(example["prompt"])


In [85]:
#split the instructions list
instructions_train, instructions_temp = train_test_split(instructions, test_size=0.3, random_state=42)
instructions_validation, instructions_test = train_test_split(instructions_temp, test_size=0.5, random_state=42)

#split the inputs list
inputs_train, inputs_temp = train_test_split(inputs, test_size=0.3, random_state=42)
inputs_validation, inputs_test = train_test_split(inputs_temp, test_size=0.5, random_state=42)

#split the outputs list
outputs_train, outputs_temp = train_test_split(outputs, test_size=0.3, random_state=42)
outputs_validation, outputs_test = train_test_split(outputs_temp, test_size=0.5, random_state=42)

#split the prompts list
prompts_train, prompts_temp = train_test_split(prompts, test_size=0.3, random_state=42)
prompts_validation, prompts_test = train_test_split(prompts_temp, test_size=0.5, random_state=42)


In [86]:
len(prompts_train) #check lengths

13028

**Below is an attempt to re-incorporate the separated column lists back as objects of dataset.Dataset to unify the test set and validation set where each example contains all required fields and can be used for training and testing the model**

In [87]:
test_set = []
validation_set = []

for i in range(len(instructions_test)):
    example = {
        "instruction": instructions_test[i],
        "input": inputs_test[i],
        "output": outputs_test[i],
        "prompt": prompts_test[i]
    }
    test_set.append(example)

for i in range(len(instructions_validation)):
    example = {
        "instruction": instructions_validation[i],
        "input": inputs_validation[i],
        "output": outputs_validation[i],
        "prompt": prompts_validation[i]
    }
    validation_set.append(example)


In [88]:
from datasets import Dataset

test_dataset = Dataset.from_dict({key: [example[key] for example in test_set] for key in test_set[0]})
validation_dataset = Dataset.from_dict({key: [example[key] for example in validation_set] for key in validation_set[0]})


In [89]:
len(test_dataset)

2792

In [90]:
len(validation_dataset)

2792

In [91]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    "TinyPixel/Llama-2-7B-bf16-sharded",
    quantization_config=bb_config
)


Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

In [92]:
!pip install trl



In [93]:
from trl import SFTTrainer

In [94]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha = 16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)


In [95]:
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

In [96]:
from transformers import AutoTokenizer

In [97]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [60]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [63]:
print(dataset.column_names)

['instruction', 'input', 'output', 'prompt']


**Below we will define a function that formats the text fields before calling it in the SFTTrainer under formatting_func**

In [107]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts


In [112]:
max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset = validation_dataset,
    peft_config=peft_config,
    formatting_func=formatting_prompts_func,
    #dataset_text_field="combined_text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=train_args,
)



Map:   0%|          | 0/18612 [00:00<?, ? examples/s]

Map:   0%|          | 0/2792 [00:00<?, ? examples/s]

In [113]:
trainer.train() #train the model

Step,Training Loss
10,0.8656
20,0.8774
30,0.8884
40,0.9125
50,1.0141
60,0.7872
70,0.7833
80,0.7332
90,0.7431
100,1.0194


TrainOutput(global_step=100, training_loss=0.8624306869506836, metrics={'train_runtime': 808.4291, 'train_samples_per_second': 0.495, 'train_steps_per_second': 0.124, 'total_flos': 1269797209374720.0, 'train_loss': 0.8624306869506836, 'epoch': 0.02})