In [1]:
!pip3 install bitsandbytes==0.41.3
!pip3 install peft==0.11.1
!pip3 install trl==0.8.6
!pip3 install accelerate==0.30.1
!pip3 install datasets==2.19.2
!pip3 install transformers==4.41.2
!pip3 install tensorboard==2.17.0










In [2]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import (LoraConfig, get_peft_model, get_peft_model_state_dict)
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

2024-07-28 13:37:59.546033: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-28 13:37:59.565072: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-28 13:37:59.570972: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-28 13:37:59.584910: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
dataset_name = "Vezora/Tested-22k-Python-Alpaca"
model_id = "tiiuae/falcon-rw-1b"

# Load model

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16)


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0})

model.config.use_cache = False
model.config.pretraining_tp = 1


# Load Tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Prompts for testing

In [6]:
prompts = [
    """Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?""",
    """Write a Python function that returns the maximum value of the given set of integers: 1, 5, 23, and 9. The function should only use one line of code and must utilize the lambda function. Additionally, the function should be able to handle any set of integers provided as input, and should return an error message if any non-integer values are detected. The output should be displayed in a formatted table with the following columns: "Input Set", "Maximum Value". The table should also include a row for the given set of integers as well as a row for a randomly generated set of 10 integers between 1 and 100. Lastly, the function should be written in SQL and should be able to query a database table containing the input sets and their corresponding maximum values.""",
    """Create a function that takes two parameters, a number and a string, and returns an array with those two values in it. The string parameter should be checked if it is a valid string and if it contains only alphabetical characters. If the string is not valid (length less than 10 characters) or contains non-alphabetical characters, the function should return an empty array. Additionally, the number parameter should be restricted to a range between -1000 and 1000 (inclusive). The function should also check if the number is a prime number and return an empty array if it is not.""",
    """Given a string, convert it to upper case using only basic string manipulation operations. The string may contain punctuation marks, special characters, and spaces. String: This string should be in upper case!""",
    """Find the sum of the first 1000 prime numbers that are greater than 100.""",
    """Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.""",
    """Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in descending order. Additionally, the function should handle nested arrays correctly by recursively flattening them before removing duplicates.""",
    """Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.""",
    """Find the index of the element 'c' in the following list, but the list may contain duplicates and the element 'c' may appear multiple times.""",
    """Write a function for finding the minimum value in a given array, with a time complexity requirement of O(n log n), where n is the length of the array."""
]

# Generate inference

In [7]:
%%time
def generate_inference(prompt):
  inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)

  outputs = model.generate(**inputs, max_length=300)
  text = tokenizer.batch_decode(outputs)[0]
  return text

CPU times: user 2 μs, sys: 1 μs, total: 3 μs
Wall time: 5.25 μs


# PEFT parameters

In [8]:
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

BATCH_SIZE = 64
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 300

In [9]:
peft_params = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training params

In [10]:
training_params = TrainingArguments(
  output_dir="./results",
  num_train_epochs=3,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  gradient_accumulation_steps=4,
  optim="paged_adamw_32bit",
  save_steps=100,
  logging_steps=100,
  learning_rate=2e-4,
  eval_strategy="steps",
  weight_decay=0.001,
  fp16=False,
  bf16=True,
  max_grad_norm=0.3,
  max_steps=-1,
  warmup_ratio=0.03,
  group_by_length=True,
  lr_scheduler_type="constant",
  report_to="tensorboard"
)


# Load Dataset

In [11]:
dataset = load_dataset(dataset_name)


In [12]:
def generate_prompt(data_point):
    return f"""Write a python code for following problem statement
### Instruction:
{data_point["instruction"]}
### Response:
{data_point["output"]}"""

CUTOFF_LEN = 3056
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [13]:
train_val = dataset["train"].train_test_split(
    test_size=200, shuffle=True, seed=42
)
train_data = (
    train_val["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].map(generate_and_tokenize_prompt)
)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [14]:
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()


trainable params: 837,632 || all params: 1,312,462,848 || trainable%: 0.0638


In [15]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True,
)

# Before fine-tuning inference

In [16]:
for prompt in prompts:
    print("************************ PROMPT ************************")
    full_prompt = "Write a python code for following problem statement \n" + prompt
    print(prompt)
    print("************************ Inference ************************")
    print(generate_inference(full_prompt))
    print("="*100)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


************************ PROMPT ************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
************************ Inference ************************


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
Write a python code for following problem statement
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engine

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Write a Python function that returns the maximum value of the given set of integers: 1, 5, 23, and 9. The function should only use one line of code and must utilize the lambda function. Additionally, the function should be able to handle any set of integers provided as input, and should return an error message if any non-integer values are detected. The output should be displayed in a formatted table with the following columns: "Input Set", "Maximum Value". The table should also include a row for the given set of integers as well as a row for a randomly generated set of 10 integers between 1 and 100. Lastly, the function should be written in SQL and should be able to query a database table containing the input sets and their corresponding maximum values.
Write a Python function that returns the maximum value of the given set of integers: 1, 5, 23, and 9. The function should only use one line of code and must utilize the lambda functi

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Create a function that takes two parameters, a number and a string, and returns an array with those two values in it. The string parameter should be checked if it is a valid string and if it contains only alphabetical characters. If the string is not valid (length less than 10 characters) or contains non-alphabetical characters, the function should return an empty array. Additionally, the number parameter should be restricted to a range between -1000 and 1000 (inclusive). The function should also check if the number is a prime number and return an empty array if it is not.
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem stateme

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Given a string, convert it to upper case using only basic string manipulation operations. The string may contain punctuation marks, special characters, and spaces. String: This string should be in upper case!
Write a python code for following problem statement 
Given a string, convert it to upper case using only basic string manipulation operations. The string may contain punctuation marks, special characters, and spaces. String: This string should be in upper case!
Write a python code for following problem statement 
Given a string, convert it to upper case using only basic string manipulation operations. The string may contain punctuation marks, special characters, and spaces. String: This string should be in upper case!
Write a python code for following problem statement 
Given a string, convert it to upper case using only basic string manipulation operations. The string may contain punctuation marks, special characters, and space

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Find the sum of the first 1000 prime numbers that are greater than 100.
Write a python code for following problem statement 
Find the sum of the first 1000 prime numbers that are greater than 100.
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a 

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.
Write a python code for following problem statement 
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.
Write a python code for following problem statement 
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.
Write a python code for following problem statement 
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.
Write a python code for following problem statement 
Provide a function that finds the length of a string, but without u

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in descending order. Additionally, the function should handle nested arrays correctly by recursively flattening them before removing duplicates.
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for fo

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.
Write a python code for following problem statement 
Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following problem statement 
Write a python code for following pr

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a python code for following problem statement 
Find the index of the element 'c' in the following list, but the list may contain duplicates and the element 'c' may appear multiple times.
[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,
************************ PROMPT ************************
Write a function for finding the minimum value in a given array, with a time complexity requirement of O(n log n), where n is the length of the array.
************************ Inference ************************
Write a python code for following problem statement 
Write a function for finding the minimum value in a given array, with a time com

In [17]:
import datetime
import time

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

1722173970338

In [19]:
%%time
OUTPUT_DIR="/home/ec2-user/SageMaker"
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_params,
    data_collator=data_collator
)
model.config.use_cache = False
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))

# Clear cache to free up memory
torch.cuda.empty_cache()

model = torch.compile(model)

trainer.train()
model.save_pretrained(OUTPUT_DIR)

Step,Training Loss,Validation Loss
100,1.4958,1.53203
200,1.4415,1.401628
300,1.3023,1.260871
400,1.228,1.197126
500,1.1772,1.157251
600,1.1498,1.128124
700,1.1257,1.105676
800,1.1052,1.087476
900,1.089,1.074441
1000,1.0801,1.062072






CPU times: user 1h 25min 5s, sys: 54.1 s, total: 1h 25min 59s
Wall time: 1h 26min 21s


In [None]:
import datetime
import time

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

In [None]:
for prompt in prompts:
    print("************************ PROMPT ************************")
    full_prompt = "Write a python code for following problem statement \n" + prompt
    print(prompt)
    print("************************ Inference ************************")
    print(generate_inference(full_prompt))
    print("="*100)

In [None]:
model.push_to_hub("swapnilj/falcon-rw-1b-sj")