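# Introduction

This notebook fine-tunes the Phi-1.5 language model on an Alpaca-style code-instruction dataset using 4-bit quantization with LoRA adapters (QLoRA), and then runs a short inference test with the saved model.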

* Datasets:
    * https://huggingface.co/datasets/TokenBender/code_instructions_122k_alpaca_style
* Models:
    * https://huggingface.co/microsoft/phi-1_5

In [1]:
!pip install -U accelerate transformers trl datasets bitsandbytes peft

Collecting transformers
  Downloading transformers-4.39.2-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl.metadata (1.8 kB)
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.7.3-py3-none-any.whl.metadata (7.7 kB)
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2024.2.0,>=2023.1.0 (from fsspec[http]<=2024.2.0,>=2023.1.0->datase

In [2]:
# For Kaggle, if you get `TypeError: expected string or bytes-like object` when importing datasets.
# Workaround: remove the fsspec files from the conda site-packages directory and
# reinstall fsspec on its own (--no-deps: its dependencies are not reinstalled).
# NOTE(review): the path is hard-coded for a Python 3.10 conda layout under /opt/conda;
# skip or adapt this cell on other environments.
!rm -r /opt/conda/lib/python3.10/site-packages/fsspec*
!pip install --force-reinstall --no-deps fsspec

In [4]:
import os
import torch

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
    logging,
    BitsAndBytesConfig
)
from trl import SFTTrainer
from peft import LoraConfig

2024-03-29 12:00:40.479653: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 12:00:40.479749: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 12:00:40.615208: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Configuration

In [5]:
# --- Model and output location ---
model_name = 'microsoft/phi-1_5'
out_dir = 'outputs/phi_1_5_code_alpaca_qlora'

# --- Batching and data loading ---
# batch_size is used as the per-device batch size for both training and evaluation.
batch_size = 2
gradient_accumulation_steps = 16
context_length = 1024
num_workers = os.cpu_count()

# --- Training length ---
# Epoch-wise training: max_steps = -1 and epochs > 0.
# Step-wise training:  epochs = -1 and max_steps > 0.
# The two must never both be -1.
max_steps = -1
epochs = 1

# --- Optimisation and precision ---
learning_rate = 2e-4
bf16 = False
fp16 = True

# --- Logging and checkpointing ---
logging_steps = 50
save_steps = 50

## Load Dataset 

In [6]:
# Load the code-instruction dataset listed in the introduction; the result is a
# DatasetDict whose only split is 'train'.
dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 169M/169M [00:03<00:00, 44.9MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Show the available splits with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 121959
    })
})


In [8]:
# Inspect one raw record to see the fields available for prompt formatting.
print(dataset['train'][0])

{'instruction': 'Create a function to calculate the sum of a sequence of integers.', 'input': '[1, 2, 3, 4, 5]', 'output': '# Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Create a function to calculate the sum of a sequence of integers. ### Input: [1, 2, 3, 4, 5] ### Output: # Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum'}


In [9]:
# Hold out 5% of the examples for validation. The fixed seed makes the shuffled
# split identical on every run, so validation losses stay comparable between runs
# and no validation example ends up in the training set after a re-run.
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True, seed=42)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 115861
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 6098
})


In [10]:
def preprocess_function(example):
    """
    Render one dataset record as a single prompt string.

    Args:
        example: Mapping with the keys 'instruction', 'input' and 'output'.

    Returns:
        One string (not a list) made of three sections, "### Instruction:",
        "### Input:" and "### Response:", separated by blank lines. It is
        passed to SFTTrainer as `formatting_func`.
    """
    sections = (
        f"### Instruction:\n{example['instruction']}",
        f"### Input:\n{example['input']}",
        f"### Response:\n{example['output']}",
    )
    return "\n\n".join(sections)

## Model

In [11]:
# Quantization configuration.
# Quantization configuration: 4-bit NF4 weights with double quantization.
# The compute dtype follows the `bf16` flag (bfloat16 when set, float16 otherwise).
compute_dtype = torch.bfloat16 if bf16 else torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
)

In [12]:
# Load the base causal LM named in the configuration, quantized according to
# `quant_config` (the printed model shows its linear layers as Linear4bit).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config
)

config.json:   0%|          | 0.00/864 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [13]:
print(model)

# Walk the parameters once, recording each tensor's element count and whether
# it requires gradients, then derive both totals from that list.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
total_trainable_params = sum(size for size, trainable in param_sizes if trainable)

print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (dense): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear4bit(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_laye

## Tokenizer

In [14]:
# Load the tokenizer that belongs to the base model; use_fast=False requests the
# non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True,
    use_fast=False
)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the tokenizer ships without a pad token of its own - confirm.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [15]:
# Confirm which token is now used for padding.
print(tokenizer.pad_token)

<|endoftext|>


## Training

In [16]:
# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 16, dropout 0.1,
# and bias='none'.
# No target_modules are listed here, so presumably PEFT's default selection applies;
# the model printed after trainer construction shows adapters on q_proj and v_proj
# but not on k_proj.
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=16,
    bias='none',
    task_type='CAUSAL_LM',
)

In [17]:
# Settings shared by the epoch-wise and the step-wise training modes.
common_training_kwargs = dict(
    output_dir=f"{out_dir}/logs",
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    logging_steps=logging_steps,
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    report_to='tensorboard',
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)

if max_steps == -1 and epochs > 0:
    # Epoch-wise training: evaluate and checkpoint once per epoch.
    training_args = TrainingArguments(
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=epochs,
        **common_training_kwargs,
    )
elif max_steps > 0 and epochs == -1:
    # Step-wise training: evaluate and checkpoint on a step schedule.
    training_args = TrainingArguments(
        evaluation_strategy='steps',
        save_strategy='steps',
        save_steps=save_steps,
        max_steps=max_steps,
        **common_training_kwargs,
    )
else:
    # Fail here rather than leaving `training_args` undefined (or stale from an
    # earlier run of this cell) and hitting a confusing error in a later cell.
    raise ValueError(
        "Invalid training length: set either max_steps = -1 with epochs > 0 "
        "(epoch-wise) or epochs = -1 with max_steps > 0 (step-wise); "
        f"got max_steps={max_steps}, epochs={epochs}."
    )

In [18]:
# Build the supervised fine-tuning trainer. It receives the raw train/validation
# splits and formats each record with `preprocess_function`; sequences are limited
# to `context_length` tokens via max_seq_length.
# peft_config: the LoRA settings; the model printed after this cell shows
# lora.Linear4bit wrappers, so the adapters are attached during construction.
# packing=True: the decoded training batches show several examples joined by
# <|endoftext|> inside one sequence.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    packing=True,
    peft_config=peft_params,
    formatting_func=preprocess_function
)

Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2466 > 2048). Running this sequence through the model will result in indexing errors


Generating train split: 0 examples [00:00, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [19]:
# Re-inspect the model after the SFTTrainer was constructed: the printed structure
# now contains lora.Linear4bit wrappers, and the trainable count below covers only
# parameters with requires_grad=True.
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_

In [20]:
# Sanity check: decode the first sequence of the first six training batches to
# see what the model is actually fed.
dataloader = trainer.get_train_dataloader()
separator = '#' * 50
for batch_idx, batch in enumerate(dataloader):
    first_sequence = batch['input_ids'][0]
    print(tokenizer.decode(first_sequence))
    print(separator)
    if batch_idx >= 5:
        break



### Input:


### Response:
def max_of_three(x, y, z):
      max_num = x
      if y > max_num:
          max_num = y
      if z > max_num:
          max_num = z
      return max_num<|endoftext|>### Instruction:
Generate two user classes called "User" and "Admin" with the following fields —

### Input:


### Response:
class User:
      def __init__(self, name, username, email, password):
          self.name = name
          self.username = username
          self.email = email
          self.password = password

class Admin:
      def __init__(self, name, username, email, password):
          self.name = name
          self.username = username
          self.email = email
          self.password = password<|endoftext|>### Instruction:
Create a HTML/CSS webpage where a user can input their favorite color and the background color of the page will change accordingly.

### Input:
Not applicable

### Response:
<!DOCTYPE html>
<html>
<head>
      <title>Color Picker</title>
      <script typ

In [21]:
# Run fine-tuning; the value returned by train() is kept in `history`.
history = trainer.train()

Epoch,Training Loss,Validation Loss
0,0.6759,0.662851


In [22]:
# Save the trained model and the tokenizer into the same directory so that
# inference can load both from one path.
# NOTE(review): with load_best_model_at_end=True in the training arguments, the
# trainer's model is presumably the best checkpoint at this point - confirm.
# NOTE(review): trainer.model is LoRA-wrapped, so this likely writes the adapter
# weights rather than a merged full model - confirm.
trainer.model.save_pretrained(f"{out_dir}/best_model")
trainer.tokenizer.save_pretrained(f"{out_dir}/best_model")

('outputs/phi_1_5_code_alpaca_qlora/best_model/tokenizer_config.json',
 'outputs/phi_1_5_code_alpaca_qlora/best_model/special_tokens_map.json',
 'outputs/phi_1_5_code_alpaca_qlora/best_model/vocab.json',
 'outputs/phi_1_5_code_alpaca_qlora/best_model/merges.txt',
 'outputs/phi_1_5_code_alpaca_qlora/best_model/added_tokens.json')

## Inference

In [1]:
from transformers import (
    AutoModelForCausalLM, 
    logging, 
    pipeline,
    AutoTokenizer
)

In [2]:
# The fine-tuned model and its tokenizer were saved to the same directory.
best_model_dir = 'outputs/phi_1_5_code_alpaca_qlora/best_model/'

model = AutoModelForCausalLM.from_pretrained(best_model_dir)
tokenizer = AutoTokenizer.from_pretrained(best_model_dir)

  return self.fget.__get__(instance, owner)()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Text-generation pipeline around the fine-tuned model and its tokenizer, placed
# on the 'cuda' device. Generation is limited by max_length=256 and uses the
# tokenizer's EOS token id as the end-of-sequence id.
# NOTE(review): max_length presumably counts the prompt tokens as well; if the
# intent is 256 generated tokens, max_new_tokens may be the right knob - confirm.
pipe = pipeline(
    task='text-generation', 
    model=model, 
    tokenizer=tokenizer, 
    max_length=256,
    device='cuda',
    eos_token_id=tokenizer.eos_token_id
)

In [4]:
# logging.set_verbosity(logging.CRITICAL)

In [5]:
# Test prompt laid out with the same "### Instruction / ### Input / ### Response"
# sections that `preprocess_function` produced for training; the response section
# is left empty for the model to complete.
prompt = """### Instruction:
Given a (N + 1) * N Matrix, assign each column of 1st row of matrix, the subsequent row of Matrix.

### Input:
test_list = [[5, 8, 10], [2, 0, 9], [5, 4, 2], [2, 3, 9]] Output : {5: [2, 0, 9], 8: [5, 4, 2], 10: [2, 3, 9]} 

### Response:
"""

In [6]:
# Show the exact prompt text that will be passed to the pipeline.
print(prompt)

### Instruction:
Given a (N + 1) * N Matrix, assign each column of 1st row of matrix, the subsequent row of Matrix.

### Input:
test_list = [[5, 8, 10], [2, 0, 9], [5, 4, 2], [2, 3, 9]] Output : {5: [2, 0, 9], 8: [5, 4, 2], 10: [2, 3, 9]} 

### Response:



In [7]:
# Generate a completion for the prompt. The first returned item carries the text
# under 'generated_text'; the printed text starts with the prompt itself.
result = pipe(prompt)
first_generation = result[0]
print(first_generation['generated_text'])

### Instruction:
Given a (N + 1) * N Matrix, assign each column of 1st row of matrix, the subsequent row of Matrix.

### Input:
test_list = [[5, 8, 10], [2, 0, 9], [5, 4, 2], [2, 3, 9]] Output : {5: [2, 0, 9], 8: [5, 4, 2], 10: [2, 3, 9]} 

### Response:
test_list = [[5, 8, 10], [2, 0, 9], [5, 4, 2], [2, 3, 9]]

# Create a dictionary to store the result
result = {}

# Iterate through the list
for i in range(len(test_list[0])):
    # Initialize the result dictionary
    result[i] = []
    
    # Iterate through the list
    for j in range(len(test_list)):
        # Append the value of the current column to the result dictionary
        result[i].append(test_list[j][i])

# Print the result
print(result
