In [2]:
!pip3 install bitsandbytes==0.41.3
!pip3 install peft==0.12.0
!pip3 install trl==0.8.6
!pip3 install datasets==2.19.2
!pip3 install transformers==4.43.3
!pip3 install tensorboard==2.17.0
!pip3 install accelerate==0.33.0









In [3]:
import os
import torch

from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
    BitsAndBytesConfig
)
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [4]:
dataset_name = "Vezora/Tested-22k-Python-Alpaca"
model_id = "tiiuae/falcon-rw-1b"

# Load tokenizer

In [5]:
def load_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return tokenizer

# Quantization config for QLoRA

In [6]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16)

# Load model

In [7]:
def load_model(model_id, is_local=False):
    print("Loading: ", model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        #load_in_8bit=True,
        #quantization_config=bnb_config,
        device_map={"": 0},
        local_files_only = is_local
    )
    
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    return model

# Generate Inference

In [8]:
def generate_inference(model, tokenizer):
    pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, max_length=256)

    for prompt in prompts:
        result = pipe(prompt)
        print("******************************* PROMPT *******************************")
        print(prompt)
        print("******************************* Inference *******************************")
        print(result[0]['generated_text'])
        print("="*100)

# Test Inference Prompts

In [9]:
prompts = [
    """Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?""",
    """Write a Python function that returns the maximum value of the given set of integers: 1, 5, 23, and 9. The function should only use one line of code and must utilize the lambda function. Additionally, the function should be able to handle any set of integers provided as input, and should return an error message if any non-integer values are detected. The output should be displayed in a formatted table with the following columns: "Input Set", "Maximum Value". The table should also include a row for the given set of integers as well as a row for a randomly generated set of 10 integers between 1 and 100. Lastly, the function should be written in SQL and should be able to query a database table containing the input sets and their corresponding maximum values.""",
    """Create a function that takes two parameters, a number and a string, and returns an array with those two values in it. The string parameter should be checked if it is a valid string and if it contains only alphabetical characters. If the string is not valid (length less than 10 characters) or contains non-alphabetical characters, the function should return an empty array. Additionally, the number parameter should be restricted to a range between -1000 and 1000 (inclusive). The function should also check if the number is a prime number and return an empty array if it is not.""",
    """Given a string, convert it to upper case using only basic string manipulation operations. The string may contain punctuation marks, special characters, and spaces. String: This string should be in upper case!""",
    """Find the sum of the first 1000 prime numbers that are greater than 100.""",
    """Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.""",
    """Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in descending order. Additionally, the function should handle nested arrays correctly by recursively flattening them before removing duplicates.""",
    """Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.""",
    """Find the index of the element 'c' in the following list, but the list may contain duplicates and the element 'c' may appear multiple times.""",
    """Write a function for finding the minimum value in a given array, with a time complexity requirement of O(n log n), where n is the length of the array."""
]

# Pre Fine-tuning inference

In [10]:
model = load_model(model_id) 
tokenizer = load_tokenizer(model_id)
generate_inference(model, tokenizer)

Loading:  tiiuae/falcon-rw-1b


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


******************************* PROMPT *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
******************************* Inference *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in 

******************************* PROMPT *******************************
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.
******************************* Inference *******************************
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.
The function should return the length of the string, or an error if the string is not a valid UTF-8 string.
The function should not use any built-in string length functions or methods.
The function should not use any iteration or recursion.
The function should not use any built-in string length functions or methods.
The function should not use any iteration or recursion.
The function should not use any built-in string length functions or methods.
The function should not use any iteration or recursion.
The function s

# PEFT Parameters

In [11]:
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

BATCH_SIZE = 64
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 300

# LoRA parameters


In [12]:
peft_params = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

# Pre process fine-tune data record 

In [13]:
def preprocess_function(example):
    """
    Formatting function returning a list of samples (kind of necessary for SFT API).
    """
    text = f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
    return text

# Load Dataset

In [14]:
import datasets
from datasets import Dataset

dataset = load_dataset(dataset_name)

# Split Datasets into multiple equal parts

In [15]:
# instruction = dataset["train"]["instruction"]
# input = dataset["train"]["input"]
# output = dataset["train"]["output"]

# temp_dataset_1 = Dataset.from_dict({"instruction": instruction[:5500], "input": input[:5500], "output": output[:5500]})
# dataset_1 = datasets.DatasetDict({"train": temp_dataset_1})

# temp_dataset_2 = Dataset.from_dict({"instruction": instruction[5500:11000], "input": input[5500:11000], "output": output[5500:11000]})
# dataset_2 = datasets.DatasetDict({"train": temp_dataset_2})

# temp_dataset_3 = Dataset.from_dict({"instruction": instruction[11000:16500], "input": input[11000:16500], "output": output[11000:16500]})
# dataset_3 = datasets.DatasetDict({"train": temp_dataset_3})

# temp_dataset_4 = Dataset.from_dict({"instruction": instruction[16500:], "input": input[16500:], "output": output[16500:]})
# dataset_4 = datasets.DatasetDict({"train": temp_dataset_4})


# datasets = [dataset_1, dataset_2, dataset_3, dataset_4]

# test train split

In [16]:
def train_test_split(dataset):
    train_val = dataset["train"].train_test_split(test_size=4000, shuffle=True, seed=42)
    train_data = train_val["train"]
    val_data = train_val["test"]

    return train_data, val_data

In [17]:
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()

# for param in model.parameters():
#     if param.requires_grad:
#         param.data = param.data.float()

trainable params: 837,632 || all params: 1,312,462,848 || trainable%: 0.0638


# Training Params

In [18]:
OUTPUT_DIR="/home/ec2-user/SageMaker/falcon/LoRA"
#OUTPUT_DIR="/kaggle/working/falcon/LoRA"
training_args = TrainingArguments(
  output_dir=OUTPUT_DIR,
  num_train_epochs=3,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  gradient_accumulation_steps=4,
  optim="paged_adamw_32bit",
  save_steps=1000,
  logging_steps=1000,
  learning_rate=0.0001,
  eval_strategy="steps",
  weight_decay=0.001,
  fp16=True,
  bf16=False,
  max_grad_norm=0.3,
  max_steps=-1,
  warmup_ratio=0.03,
  group_by_length=True,
  lr_scheduler_type="constant",
  report_to="tensorboard"
)

# Model trainer

In [19]:
def train(train_dataset, eval_dataset, model, tokenizer, output_path):
    
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        max_seq_length=context_length,
        tokenizer=tokenizer,
        args=training_args,
        formatting_func=preprocess_function,
        packing=True
    )
    
    print("Model output path: ", f"{output_path}/model/fine_tuned")
    
    history = trainer.train()
    model.save_pretrained(f"{output_path}/model/fine_tuned")
    #tokenizer.save_pretrained(f"{output_path}/tokenizer/fine_tuned")
    return model


In [20]:
context_length = 256

# finetune model with dataset 1

In [21]:
import datetime
import time

current_timestamp_ms = int(time.time() * 1000)
print("finetuning with dataset 1", current_timestamp_ms)

finetuning with dataset 1 1722998765757


In [22]:
BASE_DIR = "/home/ec2-user/SageMaker/falcon/LoRA/full-dataset"
#BASE_DIR = "/kaggle/working/gpt-2/dataset"
train_data, val_data = train_test_split(dataset)

#index = "full-data"
#model = load_model(model_id)
#tokenizer = load_tokenizer(model_id)
# model = get_peft_model(model, peft_params)
# model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
model = train(train_data, val_data, model, tokenizer, f"{BASE_DIR}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

Fine-Tuning:  tiiuae/falcon-rw-1b


Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1280 > 1024). Running this sequence through the model will result in indexing errors


Model output path:  /home/ec2-user/SageMaker/falcon/LoRA/full-dataset/model/fine_tuned


Step,Training Loss,Validation Loss
1000,1.7093,1.517562
2000,1.4616,1.418721
3000,1.4074,1.383986
4000,1.3738,1.363913
5000,1.3601,1.350129
6000,1.3511,1.339514
7000,1.337,1.332394
8000,1.3306,1.325872




1723004495237

# inference after finetuning with dataset 1

In [23]:
model = load_model(f"{BASE_DIR}/model/fine_tuned") 
#tokenizer = load_tokenizer(f"{BASE_DIR}1/tokenizer/fine_tuned")
generate_inference(model, tokenizer)

Loading:  /home/ec2-user/SageMaker/falcon/LoRA/full-dataset/model/fine_tuned
******************************* PROMPT *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
******************************* Inference *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, 

******************************* PROMPT *******************************
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.
******************************* Inference *******************************
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.

### Input:


### Response:
Here's a possible implementation of the function:

```python
def find_length(string):
    if len(string) == 0:
          return len(string)
         return len(string)
         return len(string)
        return len(string)
      return len(string)
    return len(string)
    return len(string)
    return len(string)
   return len(string)
   return len(string)
   return len(string)
   return len(string)
   return len(string)
   return len(string)
   return len(string)
   return len(string)


# Finetune with model with dataset 2

In [None]:
current_timestamp_ms = int(time.time() * 1000)
print("dataset 2 finetuning", current_timestamp_ms)

In [None]:
train_data, val_data = train_test_split(dataset_2)

model_id = f"{BASE_DIR}1/model/fine_tuned"
index = 2
model = load_model(model_id)
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
model = train(train_data, val_data, model, tokenizer, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

# inference after finetuning with dataset 2

In [None]:
model = load_model(f"{BASE_DIR}2/model/fine_tuned") 
generate_inference(model, tokenizer)

# Finetune with model with dataset 3

In [None]:
current_timestamp_ms = int(time.time() * 1000)
print("dataset 3 finetuning", current_timestamp_ms)

In [None]:
train_data, val_data = train_test_split(dataset_3)

model_id = f"{BASE_DIR}2/model/fine_tuned"
index = 3
model = load_model(model_id)
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
model = train(train_data, val_data, model, tokenizer, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

# inference after finetuning with dataset 3

In [None]:
model = load_model(f"{BASE_DIR}3/model/fine_tuned") 
generate_inference(model, tokenizer)

# Finetune with model with dataset 4

In [None]:
current_timestamp_ms = int(time.time() * 1000)
print("dataset 4 finetuning", current_timestamp_ms)

In [None]:
train_data, val_data = train_test_split(dataset_4)

model_id = f"{BASE_DIR}3/model/fine_tuned"
index = 4
model = load_model(model_id)
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
model = train(train_data, val_data, model, tokenizer, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

# inference after finetuning with dataset 4

In [None]:
model = load_model(f"{BASE_DIR}4/model/fine_tuned") 
generate_inference(model, tokenizer)