In [1]:
import os, torch, logging
from datasets import load_dataset, load_metric
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import transformers
import pandas as pd

In [2]:
data = pd.read_csv('final_data1.tsv', sep="\t")

data.head()

Unnamed: 0,Question,Answer
0,What is the key challenge with full fine-tunin...,"Full fine-tuning of large models like GPT-3, w..."
1,What is Low-Rank Adaptation (LoRA)?,LoRA is a method that freezes the pre-trained ...
2,How does LoRA compare to full fine-tuning in t...,LoRA can reduce the number of trainable parame...
3,What is the impact of LoRA on inference latency?,LoRA introduces no additional inference latenc...
4,Can LoRA be combined with other adaptation met...,"Yes, LoRA is orthogonal to many prior methods ..."


In [3]:
instruction = "<s>[INST] Answer the following question: "
data["text"] = (
    instruction + data["Question"] + "[/INST] " + data["Answer"] + " </s>"
)

# Drop other columns so that only the 'text' column remains
data = data[["text"]]

In [5]:
import pyarrow as pa
from datasets import Dataset, DatasetDict

training_data = Dataset(pa.Table.from_pandas(data.reset_index(drop=True)))

In [9]:
# Model and tokenizer names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
refined_model = "llama2-7b-neuralearn-qlora-ft"

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [10]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.05,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
train_params = TrainingArguments(
    output_dir="./results_neuralearn",
    num_train_epochs=10,
    save_steps=50,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    warmup_steps=2,
    logging_steps=50,
    fp16=True,
    seed=42,
    optim="paged_adamw_8bit",
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params,
)



Map:   0%|          | 0/454 [00:00<?, ? examples/s]

In [12]:
# Training
fine_tuning.train()

Step,Training Loss
50,0.553


KeyboardInterrupt: 

In [13]:
# save model
fine_tuning.save_model('llama2-ft-neuralearn')

In [14]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

output_dir = "llama2-ft-neuralearn"

# load base LLM model and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [16]:
prompt = "What is lora?"
input_prompt = "<s>[INST]You are a helpful assistant designed to help people study machine learning. Be concise in your answers. " + prompt + "[/INST] " 
inputs = tokenizer(input_prompt, return_tensors="pt").to("cuda")

output = model.generate(**inputs, max_new_tokens=512, do_sample=True, top_p=0.9,temperature=0.9)

output_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(output_text)

[INST]You are a helpful assistant designed to help people study machine learning. Be concise in your answers. What is lora?[/INST]  LoRA is a linear layer that uses Optimal Step Size Adaptive for better efficiency during training and velocity adaptation during inference. It is used to control the learning rate and velocity of the model.  Do you have any questions about LoRA?  Yes, please ask! There are many more details to explore.  This assistive AI is here to help you explore more about LoRA and the world of machine learning. Feel free to ask any question!  The LoRA linear layer is a tool for efficiency in training and velocity adaptation during inference. It is used to control the learning rate and velocity of the model. The name LoRA is derived from the concept of adaptive step sizes being optimized for better efficiency. The layer aims to improve the efficiency of training and velocity adaptation during inference. By using LoRA, the layer can automatically adapt the learning rate 

In [19]:
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
)

# Merge LoRA and base model
merged_model = model.merge_and_unload()
# merged_model.save_pretrained("merged_model",safe_serialization=True)
# tokenizer.save_pretrained("merged_model")

merged_model.push_to_hub("RNDRandoM/llama2-7b-neuralearn-lora-ft", token="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")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


RuntimeError: [enforce fail at inline_container.cc:424] . unexpected pos 9534073536 vs 9534073432

