In [None]:
!pip install accelerate peft bitsandbytes transformers trl

import pandas as pd
from datasets import Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

In [None]:
# Run this cell only when working in Google Colab: it mounts Google Drive at
# /content/drive, the root of the dataset and output paths configured below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Path of the CSV file that is read into the training DataFrame.
dataset = "/content/drive/MyDrive/Colab Notebooks/dataset1.csv"
# Model identifier passed to get_model_and_tokenizer().
model_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
# Directory used as TrainingArguments.output_dir (checkpoints are written here).
output_model = "/content/drive/MyDrive/neuralpy-v1"

In [9]:
# Read the raw CSV and discard the 'prompt' and 'input' columns in one step;
# only "instruction" and "output" are read when the training text is built.
df = pd.read_csv(dataset).drop(columns=['prompt', 'input'])

### Data preparation

In [10]:
def formatted_train(input,response)->str:
    """Render one training example as a user turn followed by an assistant turn.

    Args:
        input: Text placed in the ``<|user|>`` turn.
        response: Text placed in the ``<|assistant|>`` turn.

    Returns:
        The two turns joined into a single string, each terminated by ``</s>``.
    """
    template = "<|user|>\n{}</s>\n<|assistant|>\n{}</s>"
    return template.format(input, response)

In [11]:
def prepare_train_data(data_df):
    """Build the fine-tuning dataset from a table of instruction/output pairs.

    Args:
        data_df: pandas DataFrame with "instruction" and "output" columns.
            It is not modified.

    Returns:
        A ``datasets.Dataset`` with the input columns plus a "text" column holding
        each example rendered by ``formatted_train``. Rows where "instruction" or
        "output" is missing are skipped.
    """
    # dropna/reset_index return new frames, so the caller's DataFrame is left
    # untouched. Rows lacking either field cannot form a training example.
    rows = data_df.dropna(subset=["instruction", "output"]).reset_index(drop=True)
    # Reuse the shared template helper instead of duplicating the format string.
    rows = rows.assign(
        text=[
            formatted_train(instruction, output)
            for instruction, output in zip(rows["instruction"], rows["output"])
        ]
    )
    return Dataset.from_pandas(rows)

In [12]:
# Convert the loaded DataFrame into the Dataset (with its "text" column) used for training.
data = prepare_train_data(df)

### Load the Model (not the base version)

In [13]:
def get_model_and_tokenizer(mode_id):
    """Load the tokenizer and the 4-bit quantized causal LM for ``mode_id``.

    Args:
        mode_id: Model identifier or path accepted by ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to its
        EOS token; the model config has ``use_cache`` disabled and
        ``pretraining_tp`` set to 1.
    """
    tok = AutoTokenizer.from_pretrained(mode_id)
    # Pad with the EOS token.
    tok.pad_token = tok.eos_token

    # 4-bit NF4 quantization with double quantization and float16 compute.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
    llm = AutoModelForCausalLM.from_pretrained(
        mode_id,
        quantization_config=quant_config,
        device_map="auto",
    )

    llm.config.use_cache = False
    llm.config.pretraining_tp = 1
    return llm, tok

In [None]:
# Load the quantized model and its tokenizer for the configured model id.
model, tokenizer = get_model_and_tokenizer(model_id)

### Setting up the LoRA

In [15]:
# LoRA adapter settings; handed to SFTTrainer through its `peft_config` argument.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
def tokenize(example, max_length=1024):
    """Tokenize the "text" field of a batch, truncating to ``max_length`` tokens.

    Args:
        example: Batch mapping with a "text" entry (as supplied by
            ``Dataset.map(..., batched=True)``).
        max_length: Maximum number of tokens kept per sequence.

    Returns:
        The tokenizer output for the batch, with sequences left unpadded.
    """
    # No padding here: padding every example to `max_length` would store and
    # process up to `max_length` tokens for even the shortest sample, and since
    # the pad token is the EOS token it would fill `input_ids` with EOS tokens.
    # Batches are expected to be padded to their own longest sequence by the
    # trainer's data collator instead.
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
    )

tokenized_data = data.map(tokenize, batched=True)

In [17]:
# Trainer hyperparameters. Training length is controlled by `max_steps` alone:
# a positive `max_steps` overrides `num_train_epochs`, so the latter is not set.
training_arguments = TrainingArguments(
    output_dir=output_model,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,  # 1 x 16 sequences per device per optimizer step
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    max_steps=250,  # the merge cell loads the resulting `checkpoint-250`
    fp16=True,
    report_to=[],
)

In [None]:
# Build the supervised fine-tuning trainer from the quantized model, the tokenized
# dataset, the LoRA configuration and the training arguments defined above.
trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_data,
        peft_config=peft_config,
        args=training_arguments,
    )

In [None]:
# Run fine-tuning; checkpoints are written under `output_model`.
trainer.train()

### Merging the LoRA with the base model

In [None]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch

# Reload the base model in float16 (not quantized) so the adapter can be merged
# into it. `model_id` is reused so training and merging always use the same base;
# no `trust_remote_code` is passed, so no code from the Hub repository is executed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Checkpoint written by the training run (max_steps=250) under `output_model`.
model_path = f"{output_model}/checkpoint-250"

# Attach the trained LoRA adapter to the base model.
peft_model = PeftModel.from_pretrained(model, model_path, device_map="auto")

# Fold the adapter weights into the base weights and drop the PEFT wrapper.
model = peft_model.merge_and_unload()

In [21]:
def formatted_prompt(question)-> str:
    """Wrap a question in the chat layout used to prompt the model.

    Args:
        question: Text placed in the ``<|user|>`` turn.

    Returns:
        The user turn terminated by ``</s>``, followed by an opening
        ``<|assistant|>`` tag for the model to continue from.
    """
    template = "<|user|>\n{}</s>\n<|assistant|>"
    return template.format(question)

### Inference from the LLM

In [22]:
from transformers import GenerationConfig
from time import perf_counter

def generate_response(user_input):
    """Generate a reply to ``user_input`` and print it with the elapsed time.

    Uses the notebook-level ``model`` and ``tokenizer``. Prints the decoded
    sequence (special tokens skipped), a ``---`` separator, and the seconds spent
    on tokenization, generation, decoding and printing.

    Args:
        user_input: The question to wrap with ``formatted_prompt``.

    Returns:
        None; the result is printed.
    """
    prompt = formatted_prompt(user_input)

    # Sampling settings. `penalty_alpha` is deliberately not set: it configures
    # contrastive search, which is not the strategy used when `do_sample=True`.
    generation_config = GenerationConfig(
        do_sample=True,
        top_k=50,
        temperature=0.7,
        repetition_penalty=1.2,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id,
    )
    start_time = perf_counter()

    # Tokenize once and move the tensors to the model's device instead of
    # assuming a CUDA device is present.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, generation_config=generation_config)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    print('---')
    output_time = perf_counter() - start_time
    print(f"Time taken for inference: {round(output_time,2)} seconds")

In [23]:
# Try the model on a sample coding prompt; the reply and timing are printed.
generate_response(user_input='write a python function code to find average of integer array')

<|user|>
write a python function code to find average of integer array 
<|assistant|>
def avg(arr):
    sum=0
    for i in range(len(arr)):
        sum += arr[i]
    
    return sum/len(arr)
  
---
Time taken for inference: 3.78 seconds
