In [None]:
!pip3 install bitsandbytes==0.41.3
!pip3 install peft==0.11.1
!pip3 install trl==0.8.6
!pip3 install accelerate==0.30.1
!pip3 install datasets==2.19.2
!pip3 install transformers==4.41.2
!pip3 install tensorboard==2.17.0

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import (LoraConfig, get_peft_model, get_peft_model_state_dict)
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [None]:
dataset_name = "Vezora/Tested-22k-Python-Alpaca"
model_id = "tiiuae/falcon-rw-1b"

# Bits and Bytes config

In [None]:
# Quantization settings handed to `from_pretrained` by load_model():
# 4-bit loading, "nf4" quantization type, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)


# Load Tokenizer

In [None]:
# Tokenizer of the base checkpoint. Padding is applied on the right and the
# end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Prompts for testing

In [None]:
# Fixed list of ten problem statements used by the inference cells of this
# notebook. Each inference loop prepends
# "Write a python code for following problem statement \n" to the prompt and
# prints the text generated for it, so the same prompts are run against the
# base checkpoint and against the output of each fine-tuning stage.
prompts = [
    """Can you design a program in Python that can predict the likelihood of a certain material becoming unstable under high pressure and temperature conditions? This can be calculated using the Gurney equation, which takes into account parameters such as the chemical composition and crystal structure of the material. Additionally, can you provide a real-world scenario where this program could be applied in the field of material science and engineering?""",
    """Write a Python function that returns the maximum value of the given set of integers: 1, 5, 23, and 9. The function should only use one line of code and must utilize the lambda function. Additionally, the function should be able to handle any set of integers provided as input, and should return an error message if any non-integer values are detected. The output should be displayed in a formatted table with the following columns: "Input Set", "Maximum Value". The table should also include a row for the given set of integers as well as a row for a randomly generated set of 10 integers between 1 and 100. Lastly, the function should be written in SQL and should be able to query a database table containing the input sets and their corresponding maximum values.""",
    """Create a function that takes two parameters, a number and a string, and returns an array with those two values in it. The string parameter should be checked if it is a valid string and if it contains only alphabetical characters. If the string is not valid (length less than 10 characters) or contains non-alphabetical characters, the function should return an empty array. Additionally, the number parameter should be restricted to a range between -1000 and 1000 (inclusive). The function should also check if the number is a prime number and return an empty array if it is not.""",
    """Given a string, convert it to upper case using only basic string manipulation operations. The string may contain punctuation marks, special characters, and spaces. String: This string should be in upper case!""",
    """Find the sum of the first 1000 prime numbers that are greater than 100.""",
    """Provide a function that finds the length of a string, but without using any built-in string length functions or methods, and without using any iteration or recursion.""",
    """Create a function that removes duplicates from an array and returns an array of the unique values. The function should only use O(1) additional space and have a time complexity of O(n), where n is the length of the input array. The input array can contain integers, floating-point numbers, strings, and nested arrays. The output array should be sorted in descending order. Additionally, the function should handle nested arrays correctly by recursively flattening them before removing duplicates.""",
    """Create a 3x4 NumPy array of random integers from 0 to 5, where each row should have at least one unique value.""",
    """Find the index of the element 'c' in the following list, but the list may contain duplicates and the element 'c' may appear multiple times.""",
    """Write a function for finding the minimum value in a given array, with a time complexity requirement of O(n log n), where n is the length of the array."""
]

# Load Model

In [None]:
def load_model(model_id, force_download=False):
  """Load a causal language model with the notebook's quantization config.

  Args:
    model_id: Hub identifier or local directory passed to
      ``AutoModelForCausalLM.from_pretrained``.
    force_download: When True, bypass the local cache and fetch the files
      again. Defaults to False so that repeated calls (this notebook loads
      models many times) reuse the cached checkpoint instead of downloading
      it on every call.

  Returns:
    The loaded model, with ``config.use_cache`` set to False and
    ``config.pretraining_tp`` set to 1.
  """
  print("Loading model: ", model_id)
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
      quantization_config=bnb_config,
      device_map="auto",
      force_download=force_download)

  # The KV cache is switched off on the config before the model is handed to
  # the training / inference helpers.
  model.config.use_cache = False
  model.config.pretraining_tp = 1
  return model

# Inference

In [None]:
%%time
def generate_inference(prompt, model):
  """Generate text for a single prompt and return the decoded sequence.

  Args:
    prompt: Text to feed to the model.
    model: Model exposing ``device`` and ``generate`` (as returned by
      ``load_model``).

  Returns:
    The first decoded sequence from ``tokenizer.batch_decode``; generation is
    capped at ``max_length=500``.
  """
  inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
  # The tokenizer produces CPU tensors, while load_model() places the model
  # with device_map="auto"; move the inputs to the model's device so that
  # generate() does not receive tensors on a different device.
  inputs = inputs.to(model.device)

  outputs = model.generate(**inputs, max_length=500)
  text = tokenizer.batch_decode(outputs)[0]
  return text

# PEFT parameters

In [None]:
# LoRA hyper-parameters consumed by the LoraConfig cell.
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
# Falcon names its linear layers `query_key_value` / `dense` (attention) and
# `dense_h_to_4h` / `dense_4h_to_h` (MLP). The Llama-style names
# (`q_proj`, `k_proj`, `gate_proj`, ...) do not exist in this architecture, so
# with them only `lm_head` would have matched and received an adapter.
LORA_TARGET_MODULES = [
    "query_key_value",
    "dense",
    "dense_h_to_4h",
    "dense_4h_to_h",
    "lm_head",
]

# Batch-size bookkeeping: the effective batch is split into micro-batches and
# the remainder is made up by gradient accumulation.
BATCH_SIZE = 64
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 300

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning, assembled from the
# LORA_* constants; no bias terms are trained.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
)

# Load Dataset

In [None]:
import datasets
from datasets import Dataset

# Columns copied into every stage, and the row boundaries of the four
# consecutive fine-tuning stages (the last stage runs to the end of the split).
STAGE_COLUMNS = ("instruction", "input", "output")
STAGE_BOUNDS = [(0, 5500), (5500, 11000), (11000, 16500), (16500, None)]

dataset = load_dataset(dataset_name)
# Read each column once; the per-stage datasets are sliced from these.
train_columns = {name: dataset["train"][name] for name in STAGE_COLUMNS}


def _make_stage(start, stop):
    """Return a DatasetDict whose "train" split holds source rows start..stop."""
    rows = {name: values[start:stop] for name, values in train_columns.items()}
    return datasets.DatasetDict({"train": Dataset.from_dict(rows)})


dataset_1, dataset_2, dataset_3, dataset_4 = (
    _make_stage(start, stop) for start, stop in STAGE_BOUNDS
)

# Named `dataset_splits` (not `datasets`) so the imported `datasets` module is
# not rebound to a list for the cells that follow; the column lists likewise
# avoid the builtin name `input`.
dataset_splits = [dataset_1, dataset_2, dataset_3, dataset_4]

# Training params

In [None]:
def get_training_params():
  """Return the ``TrainingArguments`` used for every fine-tuning stage.

  The values are fixed: checkpoints go to ``./results``, training runs for
  3 epochs with a per-device batch of 4 and 4 accumulation steps, in bf16,
  with a constant learning-rate schedule and TensorBoard reporting.
  """
  settings = {
      # where results are written and how often
      "output_dir": "./results",
      "save_steps": 100,
      "logging_steps": 100,
      "eval_strategy": "steps",
      "report_to": "tensorboard",
      # schedule length and batching
      "num_train_epochs": 3,
      "max_steps": -1,
      "per_device_train_batch_size": 4,
      "per_device_eval_batch_size": 4,
      "gradient_accumulation_steps": 4,
      "group_by_length": True,
      # optimizer
      "optim": "paged_adamw_32bit",
      "learning_rate": 2e-4,
      "lr_scheduler_type": "constant",
      "warmup_ratio": 0.03,
      "weight_decay": 0.001,
      "max_grad_norm": 0.3,
      # numeric precision
      "fp16": False,
      "bf16": True,
  }
  return TrainingArguments(**settings)


In [None]:
def generate_prompt(data_point):
    """Render one dataset row as the training prompt text.

    Args:
        data_point: Mapping with "instruction" and "output" keys and an
            optional "input" key.

    Returns:
        The prompt string: a fixed header line, an ``### Instruction:``
        section, an ``### Input:`` section (only when the row carries a
        non-blank "input" value, which was previously dropped), and a
        ``### Response:`` section holding the expected output.
    """
    extra_input = (data_point["input"] if "input" in data_point else "") or ""
    # Rows without extra input produce exactly the same text as before.
    input_section = f"### Input:\n{extra_input}\n" if extra_input.strip() else ""
    return (
        "Write a python code for following problem statement\n"
        "### Instruction:\n"
        f"{data_point['instruction']}\n"
        f"{input_section}"
        "### Response:\n"
        f"{data_point['output']}"
    )

# Maximum number of tokens kept per example; longer prompts are truncated.
CUTOFF_LEN = 3056
def tokenize(prompt, add_eos_token=True):
    """Tokenize a prompt and attach labels for causal-LM training.

    Args:
        prompt: Text to tokenize.
        add_eos_token: When True, append the tokenizer's EOS token unless the
            sequence already ends with it or already fills ``CUTOFF_LEN``.

    Returns:
        The tokenizer output with "input_ids" and "attention_mask" (possibly
        extended by the EOS token) and a "labels" entry that is a copy of
        "input_ids".
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]
    # Check for an empty sequence before looking at the last token: indexing
    # [-1] on an empty list would raise IndexError.
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < CUTOFF_LEN:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    """Build the prompt text for one row and return its tokenized form."""
    return tokenize(generate_prompt(data_point))

In [None]:
def train_test_split(dataset):
  """Split ``dataset["train"]`` and tokenize both parts.

  100 rows are held out for validation using a shuffled split with seed 42;
  every row of both parts is then passed through
  ``generate_and_tokenize_prompt``.

  Returns:
    A ``(train_data, val_data)`` tuple of the mapped splits.
  """
  parts = dataset["train"].train_test_split(test_size=100, shuffle=True, seed=42)
  train_data = parts["train"].map(generate_and_tokenize_prompt)
  val_data = parts["test"].map(generate_and_tokenize_prompt)
  return train_data, val_data

In [None]:
# Disabled alternative collator, kept for reference:
# data_collator = DataCollatorForCompletionOnlyLM(
#     tokenizer=tokenizer, response_template="### Response:", pad_to_multiple_of=8, return_tensors="pt"
# )

# Collator used by train(): pads each batch (to a multiple of 8) and returns
# PyTorch tensors.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

In [None]:
%%time
def train(train_data, val_data, training_params, llm, path):
  """Fine-tune ``llm`` and save the result to ``path``.

  Args:
    train_data: Tokenized training dataset.
    val_data: Tokenized evaluation dataset.
    training_params: ``TrainingArguments`` for the run.
    llm: The (PEFT-wrapped) model to train.
    path: Directory passed to ``llm.save_pretrained``.

  Returns:
    The same ``llm`` object that was passed in, after training and saving.
  """
  llm.config.use_cache = False
  trainer = transformers.Trainer(
      model=llm,
      train_dataset=train_data,
      eval_dataset=val_data,
      args=training_params,
      data_collator=data_collator
  )

  # Two steps of the previous implementation are intentionally gone:
  #  * the `state_dict` override that wrapped get_peft_model_state_dict():
  #    save_pretrained() already extracts the adapter weights itself, and
  #    filtering them twice can leave the saved adapter empty;
  #  * `llm = torch.compile(model)`: it compiled the notebook-global `model`
  #    rather than the `llm` argument, ran after the Trainer had already been
  #    given the model (so it never affected training), and rebound `llm`
  #    right before it was saved and returned.

  # Clear cache to free up memory
  torch.cuda.empty_cache()

  trainer.train()
  llm.save_pretrained(path, save_embedding_layers=True)
  return llm

# Output before fine tuning

In [None]:
# Load the base model once for all prompts; loading it inside the loop
# repeated the full checkpoint load for every single prompt.
eval_model = load_model(model_id)
for prompt in prompts:
    print("************************ PROMPT ************************")
    full_prompt = "Write a python code for following problem statement \n" + prompt
    print(full_prompt)
    print("************************ Inference ************************")
    print(generate_inference(full_prompt, eval_model))
    print("="*100)

# Drop the evaluation copy so its memory can be reclaimed before the next
# model is loaded.
del eval_model
torch.cuda.empty_cache()

In [None]:
# for index, dataset in enumerate(datasets):
    
#     BASE_DIR = "/home/ec2-user/falcon/15-june/dataset-"
#     OUTPUT_DIR = f"{BASE_DIR}{index}"

#     train_data, val_data = train_test_split(dataset)
#     training_args = get_training_params()
    # path = ""

#     if index == 0:
#         path = model_id
#     else:
#         path = f"{BASE_DIR}{index-1}"

#     print("Dataset: ", index)
#     print("Model: ", path)

#     model = load_model(path)
#     model = get_peft_model(model, peft_params)
#     model.print_trainable_parameters()

#     print("Fine-Tuning: ", path)
#     train(train_data, val_data, get_training_params(), model, f"{BASE_DIR}{index}")

In [None]:
BASE_DIR = "/home/ec2-user/SageMaker/falcon/dataset-"

In [None]:
import datetime
import time

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

In [None]:
# Stage 1: fine-tune the base checkpoint on dataset_1 and save the result to
# f"{BASE_DIR}{index}".
train_data, val_data = train_test_split(dataset_1)
training_args = get_training_params()

model_id = "tiiuae/falcon-rw-1b"
index = 1
model = load_model(model_id)
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
# Pass the arguments built above instead of constructing a second,
# identical TrainingArguments object inline.
model = train(train_data, val_data, training_args, model, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

# Inference after fine-tuning with dataset-1

In [None]:
# Load the stage-1 output once for all prompts (it was previously reloaded
# for every prompt). The path is derived from BASE_DIR, the same prefix the
# training cell saves under, so the two cannot drift apart.
eval_model = load_model(f"{BASE_DIR}1")
for prompt in prompts:
    print("************************ PROMPT ************************")
    full_prompt = "Write a python code for following problem statement \n" + prompt
    print(full_prompt)
    print("************************ Inference ************************")
    print(generate_inference(full_prompt, eval_model))
    print("="*100)

# Drop the evaluation copy so its memory can be reclaimed before the next
# model is loaded.
del eval_model
torch.cuda.empty_cache()

In [None]:
#model.push_to_hub("swapnilj/falcon-rw-1b-sj")

In [None]:
import datetime
import time

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

In [None]:
# Stage 2: continue from the directory written by stage 1, train on
# dataset_2 and save the result to f"{BASE_DIR}{index}".
# NOTE(review): model_id points at a directory produced by save_pretrained()
# on a PEFT model; confirm it loads as intended before a new adapter is
# stacked on top with get_peft_model().
train_data, val_data = train_test_split(dataset_2)
training_args = get_training_params()

model_id = "/home/ec2-user/SageMaker/falcon/dataset-1"
index = 2
model = load_model(model_id)
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
# Pass the arguments built above instead of constructing a second,
# identical TrainingArguments object inline.
model = train(train_data, val_data, training_args, model, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

# Inference after fine-tuning with dataset-2

In [None]:
# Load the stage-2 output once for all prompts (it was previously reloaded
# for every prompt). The path is derived from BASE_DIR, the same prefix the
# training cell saves under, so the two cannot drift apart.
eval_model = load_model(f"{BASE_DIR}2")
for prompt in prompts:
    print("************************ PROMPT ************************")
    full_prompt = "Write a python code for following problem statement \n" + prompt
    print(full_prompt)
    print("************************ Inference ************************")
    print(generate_inference(full_prompt, eval_model))
    print("="*100)

# Drop the evaluation copy so its memory can be reclaimed before the next
# model is loaded.
del eval_model
torch.cuda.empty_cache()

# Dataset 3

In [None]:
# Stage 3: continue from the directory written by stage 2, train on
# dataset_3 and save the result to f"{BASE_DIR}{index}".
# NOTE(review): model_id points at a directory produced by save_pretrained()
# on a PEFT model; confirm it loads as intended before a new adapter is
# stacked on top with get_peft_model().
current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

train_data, val_data = train_test_split(dataset_3)
training_args = get_training_params()

model_id = "/home/ec2-user/SageMaker/falcon/dataset-2"
index = 3
model = load_model(model_id)
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
# Pass the arguments built above instead of constructing a second,
# identical TrainingArguments object inline.
model = train(train_data, val_data, training_args, model, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

# Inference after fine-tuning with dataset-3

In [None]:
# Load the stage-3 output once for all prompts (it was previously reloaded
# for every prompt). The path is derived from BASE_DIR, the same prefix the
# training cell saves under, so the two cannot drift apart.
eval_model = load_model(f"{BASE_DIR}3")
for prompt in prompts:
    print("************************ PROMPT ************************")
    full_prompt = "Write a python code for following problem statement \n" + prompt
    print(full_prompt)
    print("************************ Inference ************************")
    print(generate_inference(full_prompt, eval_model))
    print("="*100)

# Drop the evaluation copy so its memory can be reclaimed before the next
# model is loaded.
del eval_model
torch.cuda.empty_cache()

# Dataset 4

In [None]:
current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

In [None]:
# Stage 4: continue from the directory written by stage 3, train on
# dataset_4 and save the result to f"{BASE_DIR}{index}".
# NOTE(review): model_id points at a directory produced by save_pretrained()
# on a PEFT model; confirm it loads as intended before a new adapter is
# stacked on top with get_peft_model().
train_data, val_data = train_test_split(dataset_4)
training_args = get_training_params()

model_id = "/home/ec2-user/SageMaker/falcon/dataset-3"
index = 4
model = load_model(model_id)
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()
print("Fine-Tuning: ", model_id)
# Pass the arguments built above instead of constructing a second,
# identical TrainingArguments object inline.
model = train(train_data, val_data, training_args, model, f"{BASE_DIR}{index}")

current_timestamp_ms = int(time.time() * 1000)
current_timestamp_ms

In [None]:
# Same ten problem statements as `prompts`. Derived from that list instead of
# repeating the literals verbatim, so the two can no longer drift apart.
prompts_after = list(prompts)

# After fine-tuning

In [None]:
# Load the stage-4 output once for all prompts (it was previously reloaded
# for every prompt). The path is derived from BASE_DIR, the same prefix the
# training cell saves under, so the two cannot drift apart.
eval_model = load_model(f"{BASE_DIR}4")
for prompt in prompts:
    print("************************ PROMPT ************************")
    full_prompt = "Write a python code for following problem statement \n" + prompt
    print(full_prompt)
    print("************************ Inference ************************")
    print(generate_inference(full_prompt, eval_model))
    print("="*100)

# Drop the evaluation copy so its memory can be reclaimed afterwards.
del eval_model
torch.cuda.empty_cache()

In [None]:
#watch nvidia-smi -l 20 -q -d MEMORY,UTILIZATION,TEMPERATURE,COMPUTE,PERFORMANCE -f /home/ec2-user/stats.log

In [None]:
# Same ten problem statements as `prompts`. Derived from that list instead of
# repeating the literals verbatim, so the two can no longer drift apart.
prompts_after = list(prompts)

In [None]:
!kill 27880

In [None]:
!pip3 install tensorflow
%time import tensorflow as tf

In [None]:
%tensorboard --logdir /home/ec2-user/SageMaker/results