In [1]:
!pip3 install bitsandbytes==0.41.3
!pip3 install peft==0.12.0
!pip3 install trl==0.8.6
!pip3 install datasets==2.19.2
!pip3 install transformers==4.43.3
!pip3 install tensorboard==2.17.0
!pip3 install accelerate==0.33.0
!pip3 install sentencepiece==0.2.0









In [2]:
import os
import torch
import transformers

from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
    BitsAndBytesConfig
)
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [3]:
dataset_name = "Vezora/Tested-22k-Python-Alpaca"
model_id = "TinyLlama/TinyLlama_v1.1"

# load tokenizer


In [4]:
def load_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, 
        trust_remote_code=True, 
        use_fast=False,
        add_eos_token=True,
        add_bos_token=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return tokenizer

# Quantization config for QLoRA

In [5]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16)

# Load model

In [6]:
def load_model(model_id, is_local=False):
    
    model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        local_files_only=is_local,
        device_map={"": 0})

    model.config.use_cache = False
    model.config.pretraining_tp = 1
    return model

# Test Inference Prompts

In [7]:
prompts = [
    """Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?""",
    """Write a Python function that returns the maximum value of the given set of integers: 1, 5, 23, and 9. The function should only use one line of code and must utilize the lambda function. Additionally, the function should be able to handle any set of integers provided as input, and should return an error message if any non-integer values are detected. The output should be displayed in a formatted table with the following columns: "Input Set", "Maximum Value". The table should also include a row for the given set of integers as well as a row for a randomly generated set of 10 integers between 1 and 100. Lastly, the function should be written in SQL and should be able to query a database table containing the input sets and their corresponding maximum values.""",
    """Create a function that takes two parameters, a number and a string, and returns an array with those two values in it. The string parameter should be checked if it is a valid string and if it contains only alphabetical characters. If the string is not valid (length less than 10 characters) or contains non-alphabetical characters, the function should return an empty array. Additionally, the number parameter should be restricted to a range between -1000 and 1000 (inclusive). The function should also check if the number is a prime number and return an empty array if it is not.""",
    """Given a string, convert it to upper case using only basic string manipulation operations. The string may contain punctuation marks, special characters, and spaces. String: This string should be in upper case!""",
    """Find the sum of the first 1000 prime numbers that are greater than 100.""",
    """Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.""",
    """Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in descending order. Additionally, the function should handle nested arrays correctly by recursively flattening them before removing duplicates.""",
    """Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.""",
    """Find the index of the element 'c' in the following list, but the list may contain duplicates and the element 'c' may appear multiple times.""",
    """Write a function for finding the minimum value in a given array, with a time complexity requirement of O(n log n), where n is the length of the array."""
]

# Generate Inference

In [8]:
def generate_inference(model, tokenizer):
    pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, max_length=256)

    for prompt in prompts:
        result = pipe(prompt)
        print("******************************* PROMPT *******************************")
        print(prompt)
        print("******************************* Inference *******************************")
        print(result[0]['generated_text'])
        print("="*100)

# Pre Fine-tuning inference

In [9]:
TOKENIZER = load_tokenizer(model_id)
model = load_model(model_id)
generate_inference(model, TOKENIZER)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


******************************* PROMPT *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
******************************* Inference *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in 

******************************* PROMPT *******************************
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.
******************************* Inference *******************************
Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion. The 2018-19 season is now over.
The 2018-19 season is now over. The 2019-20 season will begin on September 1, 2019.
The 2018-19 season is now over. The 2019-20 season will begin on September 1, 2019. The 2019-20 season will be a 10-game season.
The 2018-19 season is now over. The 2019-20 season will begin on September 1, 2019. The 2019-20 season will be a 10-game season. The 2019-20 season will be a 10-game season.
The 2018-19 season is now over. The 2019-20 season will begin on September 1, 
***********************

# Load dataset 

In [10]:
import datasets
from datasets import Dataset

dataset = load_dataset(dataset_name)
instruction = dataset["train"]["instruction"]
input = dataset["train"]["input"]
output = dataset["train"]["output"]

temp_dataset_1 = Dataset.from_dict({"instruction": instruction[:5500], "input": input[:5500], "output": output[:5500]})
dataset_1 = datasets.DatasetDict({"train": temp_dataset_1})

temp_dataset_2 = Dataset.from_dict({"instruction": instruction[5500:11000], "input": input[5500:11000], "output": output[5500:11000]})
dataset_2 = datasets.DatasetDict({"train": temp_dataset_2})

temp_dataset_3 = Dataset.from_dict({"instruction": instruction[11000:16500], "input": input[11000:16500], "output": output[11000:16500]})
dataset_3 = datasets.DatasetDict({"train": temp_dataset_3})

temp_dataset_4 = Dataset.from_dict({"instruction": instruction[16500:], "input": input[16500:], "output": output[16500:]})
dataset_4 = datasets.DatasetDict({"train": temp_dataset_4})


datasets = [dataset_1, dataset_2, dataset_3, dataset_4]

In [11]:
def preprocess_function(example):
    """
    Formatting function returning a list of samples (kind of necessary for SFT API).
    """
    text = f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
    return text

# Train test split 

In [12]:
def train_test_split(dataset):
    train_val = dataset["train"].train_test_split(test_size=400, shuffle=True, seed=42)
    train_data = train_val["train"]
    val_data = train_val["test"]

    return train_data, val_data

# Tokenize the prompts

In [13]:
def generate_and_tokenize_prompt(prompt):
    return TOKENIZER(preprocess_function(prompt))

# tokenize the train and test datasets

In [14]:
# train_data, val_data = train_test_split(dataset)

# tokenized_train_dataset = train_data.map(generate_and_tokenize_prompt)
# tokenized_val_dataset = val_data.map(generate_and_tokenize_prompt)

# LoRA parameters

In [15]:
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

BATCH_SIZE = 64
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 300

# LoRA config

In [16]:
peft_params = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

In [17]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_params)

model.print_trainable_parameters()

trainable params: 13,160,448 || all params: 1,113,208,832 || trainable%: 1.1822


# Training params

In [18]:
OUTPUT_DIR="/home/ec2-user/SageMaker/tinyllama/LoRA"
#OUTPUT_DIR="/kaggle/working/tinyllama"
CONTEXT_LENGTH = 1024
training_args = TrainingArguments(
  output_dir=OUTPUT_DIR,
  num_train_epochs=3,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  gradient_accumulation_steps=4,
  optim="paged_adamw_32bit",
  save_steps=300,
  logging_steps=300,
  learning_rate=0.0001,
  eval_strategy="steps",
  weight_decay=0.001,
  fp16=True,
  bf16=False,
  max_grad_norm=0.3,
  max_steps=-1,
  warmup_ratio=0.03,
  group_by_length=True,
  lr_scheduler_type="constant",
  report_to="tensorboard"
)

# Model trainer

In [19]:
def train(train_dataset, eval_dataset, model, tokenizer, output_path):
    
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        max_seq_length=CONTEXT_LENGTH,
        tokenizer=TOKENIZER,
        args=training_args,
        formatting_func=preprocess_function,
        packing=True,
        data_collator=transformers.DataCollatorForLanguageModeling(TOKENIZER, mlm=False),
    )
    
    print("Model output path: ", f"{output_path}/model/fine_tuned")
    
    history = trainer.train()
    model.save_pretrained(f"{output_path}/model/fine_tuned")
    #tokenizer.save_pretrained(f"{output_path}/tokenizer/fine_tuned")
    return model

In [20]:
# context_length = 1024
    
# trainer = SFTTrainer(
#     model=model,
#     #train_dataset=tokenized_train_dataset,
#     #eval_dataset=tokenized_val_dataset,
#     train_dataset=train_data,
#     eval_dataset=val_data,
#     max_seq_length=context_length,
#     tokenizer=TOKENIZER,
#     args=training_args,
#     formatting_func=preprocess_function,
#     packing=True,
#     data_collator=transformers.DataCollatorForLanguageModeling(TOKENIZER, mlm=False),
# )

# print("Model output path: ", f"{OUTPUT_DIR}/model/fine_tuned")

# history = trainer.train()
# model.save_pretrained(f"{OUTPUT_DIR}/model/fine_tuned")
# #TOKENIZER.save_pretrained(f"{OUTPUT_DIR}/fine_tuned")


# finetune model with dataset 1

In [21]:
import datetime
import time

current_timestamp_ms = int(time.time() * 1000)
print("finetuning with dataset 1", current_timestamp_ms)

finetuning with dataset 1 1723280852907


In [22]:
BASE_DIR = "/home/ec2-user/SageMaker/tinyllama/LoRA/dataset-"
#BASE_DIR = "/kaggle/working/tinyllama/dataset-"
train_data, val_data = train_test_split(dataset_1)

index = 1
model_id = "TinyLlama/TinyLlama_v1.1"
model = load_model(model_id)
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
model = train(train_data, val_data, model, TOKENIZER, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

trainable params: 13,160,448 || all params: 1,113,208,832 || trainable%: 1.1822
Fine-Tuning:  TinyLlama/TinyLlama_v1.1
Model output path:  /home/ec2-user/SageMaker/tinyllama/LoRA/dataset-1/model/fine_tuned


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
300,0.7404,0.702278
600,0.8954,1.840713




1723283031878

# inference after finetuning with dataset 1

In [23]:
model_ft = load_model(f"{BASE_DIR}{index}/model/fine_tuned", True)
generate_inference(model_ft,TOKENIZER)

******************************* PROMPT *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
******************************* Inference *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in 

******************************* PROMPT *******************************
Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in descending order. Additionally, the function should handle nested arrays correctly by recursively flattening them before removing duplicates.
******************************* Inference *******************************
Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in des

# finetune model with dataset 2

In [24]:
current_timestamp_ms = int(time.time() * 1000)
print("finetuning with dataset 2", current_timestamp_ms)

finetuning with dataset 2 1723283104224


In [25]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
#os.environ["PYTORCH_CUDA_ALLOC_CONF"]= "max_split_size_mb:50"
torch.cuda.empty_cache()

In [26]:
BASE_DIR = "/home/ec2-user/SageMaker/tinyllama/LoRA/dataset-"
#BASE_DIR = "/kaggle/working/tinyllama/dataset-"
train_data, val_data = train_test_split(dataset_2)

index = 2
#model_id = f"{BASE_DIR}{index-1}/model/fine_tuned"
#model = load_model(model_id, True)
model = get_peft_model(model_ft, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
model = train(train_data, val_data, model, TOKENIZER, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

trainable params: 13,160,448 || all params: 1,113,208,832 || trainable%: 1.1822
Fine-Tuning:  TinyLlama/TinyLlama_v1.1
Model output path:  /home/ec2-user/SageMaker/tinyllama/LoRA/dataset-2/model/fine_tuned


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
300,0.6962,0.668945




1723284591544

# inference after finetuning with dataset 2

In [27]:
model_ft = load_model(f"{BASE_DIR}{index}/model/fine_tuned", True)
generate_inference(model_ft,TOKENIZER)

******************************* PROMPT *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
******************************* Inference *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in 

******************************* PROMPT *******************************
Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.
******************************* Inference *******************************
Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value. ### Example
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібібліоні#
#Createібіблі
******************************* PROMPT *******************************
Find the index of the element 'c' in the following

# finetune model with dataset 3

In [28]:
current_timestamp_ms = int(time.time() * 1000)
print("finetuning with dataset 3", current_timestamp_ms)

finetuning with dataset 3 1723284664348


In [29]:
BASE_DIR = "/home/ec2-user/SageMaker/tinyllama/LoRA/dataset-"
#BASE_DIR = "/kaggle/working/tinyllama/dataset-"
train_data, val_data = train_test_split(dataset_3)

index = 3
#model_id = f"{BASE_DIR}{index-1}/model/fine_tuned"
#model = load_model(model_id)
model = get_peft_model(model_ft, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
model = train(train_data, val_data, model, TOKENIZER, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

trainable params: 13,160,448 || all params: 1,113,208,832 || trainable%: 1.1822
Fine-Tuning:  TinyLlama/TinyLlama_v1.1
Model output path:  /home/ec2-user/SageMaker/tinyllama/LoRA/dataset-3/model/fine_tuned


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
300,1.0035,1.081792




1723286528031

# inference after finetuning with dataset 3

In [30]:
model_ft = load_model(f"{BASE_DIR}{index}/model/fine_tuned", True)
generate_inference(model_ft,TOKENIZER)

******************************* PROMPT *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
******************************* Inference *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in 

******************************* PROMPT *******************************
Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in descending order. Additionally, the function should handle nested arrays correctly by recursively flattening them before removing duplicates.
******************************* Inference *******************************
Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in des

# finetune model with dataset 4

In [31]:
current_timestamp_ms = int(time.time() * 1000)
print("finetuning with dataset 4", current_timestamp_ms)

finetuning with dataset 4 1723286601832


In [32]:
BASE_DIR = "/home/ec2-user/SageMaker/tinyllama/LoRA/dataset-"
#BASE_DIR = "/kaggle/working/tinyllama/dataset-"
train_data, val_data = train_test_split(dataset_4)

index = 4
#model_id = f"{BASE_DIR}{index-1}/model/fine_tuned"
#model = load_model(model_id)
model = get_peft_model(model_ft, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
model = train(train_data, val_data, model, TOKENIZER, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

trainable params: 13,160,448 || all params: 1,113,208,832 || trainable%: 1.1822
Fine-Tuning:  TinyLlama/TinyLlama_v1.1
Model output path:  /home/ec2-user/SageMaker/tinyllama/LoRA/dataset-4/model/fine_tuned


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
300,1.6658,1.745128
600,1.8486,2.164718




1723288983809

# inference after finetuning with dataset 4

In [33]:
model_ft = load_model(f"{BASE_DIR}{index}/model/fine_tuned", True)
generate_inference(model_ft,TOKENIZER)

******************************* PROMPT *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?
******************************* Inference *******************************
Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in 

******************************* PROMPT *******************************
Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.
******************************* Inference *******************************
Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.
******************************* PROMPT *******************************
Find the index of the element 'c' in the following list, but the list may contain duplicates and the element 'c' may appear multiple times.
******************************* Inference *******************************
Find the index of the element 'c' in the following list, but the list may contain duplicates and the element 'c' may appear multiple times.
******************************* PROMPT *******************************
Write a function for finding the minimum value in a given array, with a time complexity requirement of O(n log n), where n is the length 