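# Introduction

This notebook fine-tunes the Qwen1.5-0.5B causal language model on the CodeAlpaca-20k instruction dataset using TRL's `SFTTrainer`, then reloads the saved model and runs a few inference prompts.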

* Datasets:
    * https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k
* Models:
    * https://huggingface.co/Qwen/Qwen1.5-0.5B

In [1]:
!pip install -U accelerate transformers trl datasets bitsandbytes peft tensorboard

Collecting accelerate
  Downloading accelerate-0.29.1-py3-none-any.whl.metadata (18 kB)
Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard
  Downloading tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
Downloading accelerate-0.29.1-py3-none-any.whl (297 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.3/297.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hDownloading tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m8.0 MB/s[0m et

In [2]:
# For Kaggle, if you get `TypeError: expected string or bytes-like object` when importing datasets.
!rm -r /opt/conda/lib/python3.10/site-packages/fsspec*
!pip install --force-reinstall --no-deps fsspec

rm: cannot remove '/opt/conda/lib/python3.10/site-packages/fsspec*': No such file or directory
Collecting fsspec
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Downloading fsspec-2024.3.1-py3-none-any.whl (171 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.10.0
    Uninstalling fsspec-2023.10.0:
      Successfully uninstalled fsspec-2023.10.0
Successfully installed fsspec-2024.3.1


In [3]:
import os
import torch

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
    logging,
    BitsAndBytesConfig
)
from trl import SFTTrainer
from peft import LoraConfig

## Configuration

In [4]:
# Per-device batch size; with gradient accumulation below, one optimizer step
# covers batch_size * gradient_accumulation_steps = 512 sequences per device.
batch_size = 2
# os.cpu_count() may return None when the CPU count cannot be determined;
# fall back to 0 so the data loader gets an integer worker count.
num_workers = os.cpu_count() or 0
# max_steps = -1 for epoch-wise training.
# epochs = -1 for step-wise training.
# Both cannot be -1.
max_steps = -1
epochs = 10
# Mixed-precision flags forwarded to TrainingArguments.
bf16 = True
fp16 = False
gradient_accumulation_steps = 256
# Maximum sequence length (in tokens) given to SFTTrainer.
seq_length = 512
# Logging / checkpoint interval; passed to TrainingArguments.
logging_steps = 50
save_steps = 50
learning_rate = 0.0001
model_name = 'Qwen/Qwen1.5-0.5B'
out_dir = 'outputs/qwen_05b_code_alpaca'

## Load Dataset 

In [5]:
# Load the CodeAlpaca-20k instruction dataset by its Hugging Face Hub id.
dataset = load_dataset('sahil2801/CodeAlpaca-20k')

In [6]:
# Show the available splits, their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 20022
    })
})


In [7]:
# Inspect one raw record to see the instruction / input / output fields.
print(dataset['train'][0])

{'instruction': 'Create an array of length 5 which contains all even numbers between 1 and 10.', 'input': '', 'output': 'arr = [2, 4, 6, 8, 10]'}


In [8]:
# Hold out 5% of the samples for validation. The fixed seed makes the shuffled
# split identical across kernel restarts, so validation losses stay comparable
# between runs.
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True, seed=42)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 19020
})
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 1002
})


In [9]:
def preprocess_function(example):
    """
    Render one dataset record as a single prompt string.

    example: mapping with 'instruction', 'input' and 'output' entries.

    Returns one string made of the "### Instruction:", "### Input:" and
    "### Response:" sections separated by blank lines. It is handed to
    SFTTrainer as its `formatting_func`.
    """
    sections = (
        f"### Instruction:\n{example['instruction']}",
        f"### Input:\n{example['input']}",
        f"### Response:\n{example['output']}",
    )
    return "\n\n".join(sections)

## Model

In [10]:
# Load the pretrained weights, then cast them to bfloat16 only when bf16
# training is enabled in the configuration.
model = AutoModelForCausalLM.from_pretrained(model_name)
if bf16:
    model = model.to(dtype=torch.bfloat16)

In [11]:
print(model)
# Gather (element count, requires_grad) once per parameter, then report the
# overall total and the subset that will receive gradients.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_sizes)
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(count for count, trainable in param_sizes if trainable)
print(f"{total_trainable_params:,} training parameters.")

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Line

## Tokenizer

In [14]:
# Load the slow (non-fast) tokenizer for the base model.
# `trust_remote_code` is deliberately not set: the model above loads as a built-in
# transformers class without it, so there is no need to allow code from the Hub
# repository to be executed here.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Check which token the tokenizer uses for padding.
print(tokenizer.pad_token)

<|endoftext|>


## Training

In [12]:
# Settings shared by the epoch-wise and step-wise training modes.
common_training_kwargs = dict(
    output_dir=f"{out_dir}/logs",
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    report_to='tensorboard',
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)

if max_steps == -1 and epochs > 0:
    # Epoch-wise training: evaluate, log and checkpoint once per epoch.
    training_args = TrainingArguments(
        evaluation_strategy='epoch',
        logging_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=epochs,
        **common_training_kwargs,
    )
elif max_steps > 0 and epochs == -1:
    # Step-wise training: evaluate, log and checkpoint on a step interval.
    training_args = TrainingArguments(
        evaluation_strategy='steps',
        logging_strategy='steps',
        save_strategy='steps',
        save_steps=save_steps,
        max_steps=max_steps,
        **common_training_kwargs,
    )
else:
    # Fail here instead of leaving `training_args` undefined (or stale from an
    # earlier run of this cell) and crashing later when the trainer is built.
    raise ValueError(
        "Invalid training configuration: set exactly one of "
        f"max_steps (> 0) or epochs (> 0) and the other to -1; "
        f"got max_steps={max_steps}, epochs={epochs}."
    )

In [13]:
# Supervised fine-tuning trainer. Each record is rendered to text by
# `preprocess_function`, and packing is enabled with sequences limited to
# `seq_length` tokens.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    formatting_func=preprocess_function,
    packing=True,
    max_seq_length=seq_length,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
# Decode the first sequence of the first six training batches to sanity-check
# what the trainer actually feeds the model.
num_preview_batches = 6
dataloader = trainer.get_train_dataloader()
for _, batch in zip(range(num_preview_batches), dataloader):
    print(tokenizer.decode(batch['input_ids'][0]))
    print('#'*50)

 corners, a light shadow, and a maximum width of 500px.

### Input:


### Response:
.card {
    border-radius: 10px;
    box-shadow: 0 0 5px rgba(0, 0, 0, 0.2);
    max-width: 500px;
}<|endoftext|>### Instruction:
Write a code snippet to remove all white spaces from a given string in JavaScript.

### Input:
" Hello World! "

### Response:
let outputString = "Hello World!";
outputString = outputString.replace(/\s/g, '');  // removes white spaces
console.log(outputString);<|endoftext|>### Instruction:
Generate a list of all even numbers between 20 and 40.

### Input:


### Response:
even_numbers = []
for num in range(20, 41): 
  if num % 2 == 0: 
    even_numbers.append(num) 
  
print(even_numbers) 

# Output: [20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40]<|endoftext|>### Instruction:
Generate a python code that takes a list of integers, prints out an array that is all True if the values are in the input list and all False otherwise.

### Input:
list_of_numbers = [5, 7, 10, 2]

### Respons

In [15]:
# Run fine-tuning; the value returned by `train()` is kept in `history`.
history = trainer.train()

Epoch,Training Loss,Validation Loss
0,1.5644,0.958602
1,0.8401,0.833305
2,0.7304,0.81839
3,0.6492,0.825941
4,0.5744,0.858309
5,0.4973,0.908918
6,0.4285,0.961321
7,0.3603,1.043058
8,0.2466,1.106919


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [16]:
# Write the model weights and the tokenizer files side by side so the
# directory can be reloaded on its own for inference.
best_model_dir = f"{out_dir}/best_model"
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

('outputs/qwen_05b_code_alpaca/best_model/tokenizer_config.json',
 'outputs/qwen_05b_code_alpaca/best_model/special_tokens_map.json',
 'outputs/qwen_05b_code_alpaca/best_model/vocab.json',
 'outputs/qwen_05b_code_alpaca/best_model/merges.txt',
 'outputs/qwen_05b_code_alpaca/best_model/added_tokens.json')

## Inference

In [1]:
from transformers import (
    AutoModelForCausalLM, 
    logging, 
    pipeline,
    AutoTokenizer
)

In [2]:
# Reload the fine-tuned model and its tokenizer from the directory written
# at the end of training.
best_model_dir = 'outputs/qwen_05b_code_alpaca/best_model/'
model = AutoModelForCausalLM.from_pretrained(best_model_dir)
tokenizer = AutoTokenizer.from_pretrained(best_model_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Generation settings forwarded to the pipeline: a cap on the number of newly
# generated tokens and the tokenizer's EOS token id.
generation_kwargs = {
    'max_new_tokens': 512,
    'eos_token_id': tokenizer.eos_token_id,
}
# Text-generation pipeline running on the CUDA device.
pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device='cuda',
    **generation_kwargs
)

In [4]:
# Optional: uncomment to silence transformers warnings during generation.
# logging.set_verbosity(logging.CRITICAL)

In [5]:
# Prompt in the same "### Instruction / ### Input / ### Response" layout used
# for training; it ends right after the response header so the model writes
# the answer. Note the single trailing space at the end of the input line.
prompt = (
    "### Instruction:\n"
    "Given a (N + 1) * N Matrix, assign each column of 1st row of matrix, the subsequent row of Matrix.\n"
    "\n"
    "### Input:\n"
    "test_list = [[5, 8, 10], [2, 0, 9], [5, 4, 2], [2, 3, 9]] Output : {5: [2, 0, 9], 8: [5, 4, 2], 10: [2, 3, 9]} \n"
    "\n"
    "### Response:\n"
)

In [6]:
# Print the prompt to verify its layout before generation.
print(prompt)

### Instruction:
Given a (N + 1) * N Matrix, assign each column of 1st row of matrix, the subsequent row of Matrix.

### Input:
test_list = [[5, 8, 10], [2, 0, 9], [5, 4, 2], [2, 3, 9]] Output : {5: [2, 0, 9], 8: [5, 4, 2], 10: [2, 3, 9]} 

### Response:



In [7]:
# Generate a completion; the returned text contains the prompt followed by
# the model's response.
result = pipe(prompt)
print(result[0]['generated_text'])

### Instruction:
Given a (N + 1) * N Matrix, assign each column of 1st row of matrix, the subsequent row of Matrix.

### Input:
test_list = [[5, 8, 10], [2, 0, 9], [5, 4, 2], [2, 3, 9]] Output : {5: [2, 0, 9], 8: [5, 4, 2], 10: [2, 3, 9]} 

### Response:
def assign_columns(test_list):
    result = []
    for row in test_list:
        result.append([row[0], row[1], row[2]])
    return result


In [8]:
# Instruction-only prompt: the input section is left empty, matching how
# records with an empty 'input' field are rendered for training.
prompt = (
    "### Instruction:\n"
    "Write Python code for merge sort.\n"
    "\n"
    "### Input:\n"
    "\n"
    "\n"
    "### Response:\n"
)

result = pipe(prompt)
print(result[0]['generated_text'])

### Instruction:
Write Python code for merge sort.

### Input:


### Response:
def merge_sort(arr): 
    if len(arr) > 1: 
        mid = len(arr)//2 
        left = arr[:mid] 
        right = arr[mid:] 
  
        merge_sort(left) 
        merge_sort(right) 
  
        i = j = k = 0
        while i < len(left) and j < len(right): 
            if left[i] < right[j]: 
                arr[k] = left[i] 
                i+=1
            else: 
                arr[k] = right[j] 
                j+=1
            k+=1
        while i < len(left): 
            arr[k] = left[i] 
            i+=1
            k+=1
        while j < len(right): 
            arr[k] = right[j] 
            j+=1
            k+=1


In [9]:
# Instruction-only prompt: the input section is left empty, matching how
# records with an empty 'input' field are rendered for training.
prompt = (
    "### Instruction:\n"
    "Write a program to find the LCM of two numbers.\n"
    "\n"
    "### Input:\n"
    "\n"
    "\n"
    "### Response:\n"
)

result = pipe(prompt)
print(result[0]['generated_text'])

### Instruction:
Write a program to find the LCM of two numbers.

### Input:


### Response:
def lcm(a, b):
    return (a*b)//gcd(a,b)


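## Best Logs

Training log from the best run so far; validation loss bottoms out at 0.798754 (epoch 5) and rises afterwards.

```
[130/130 28:52, Epoch 9/10]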
Epoch	Training Loss	Validation Loss
0	1.835000	1.282727
1	1.159700	0.910980
2	0.718100	0.837208
3	0.876900	0.809701
4	0.609100	0.799354
5	0.756900	0.798754
6	0.524900	0.808557
7	0.646700	0.828692
8	0.353500	0.872785
```