In [1]:
import os, torch, logging
from datasets import load_dataset, load_metric
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import transformers
import pandas as pd

In [4]:
data = pd.read_csv('final_data1.tsv', sep="\t")

data.head()

Unnamed: 0,Question,Answer
0,What is the key challenge with full fine-tunin...,"Full fine-tuning of large models like GPT-3, w..."
1,What is Low-Rank Adaptation (LoRA)?,LoRA is a method that freezes the pre-trained ...
2,How does LoRA compare to full fine-tuning in t...,LoRA can reduce the number of trainable parame...
3,What is the impact of LoRA on inference latency?,LoRA introduces no additional inference latenc...
4,Can LoRA be combined with other adaptation met...,"Yes, LoRA is orthogonal to many prior methods ..."


In [5]:
instruction = "<s>[INST] Answer the following question: "
data["text"] = (
    instruction + data["Question"] + "[/INST] " + data["Answer"] + " </s>"
)

# Drop other columns so that only the 'text' column remains
data = data[["text"]]

In [6]:
import pyarrow as pa
from datasets import Dataset, DatasetDict

training_data = Dataset(pa.Table.from_pandas(data.reset_index(drop=True)))

In [7]:
# Model and tokenizer names
base_model_name = "microsoft/Orca-2-7b"
refined_model = "orca2-7b-neuralearn-qlora-ft"

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

quant_8bits = BitsAndBytesConfig(
    load_in_8bit=True,
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto"
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Downloading tokenizer_config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/582 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/146 [00:00<?, ?B/s]

In [9]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.05,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
train_params = TrainingArguments(
    output_dir="./results_neuralearn_orca",
    num_train_epochs=5,
    save_steps=50,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    warmup_steps=2,
    logging_steps=20,
    fp16=True,
    seed=42,
    optim="paged_adamw_8bit",
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params,
)



Map:   0%|          | 0/454 [00:00<?, ? examples/s]

In [10]:
# Training
fine_tuning.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
20,1.9471
40,1.6313
60,1.3019
80,1.285
100,1.2322
120,1.1079
140,1.1069
160,1.0995
180,1.0974
200,0.9835


TrainOutput(global_step=285, training_loss=1.1848941384700307, metrics={'train_runtime': 378.6944, 'train_samples_per_second': 5.994, 'train_steps_per_second': 0.753, 'total_flos': 2.172347346419712e+16, 'train_loss': 1.1848941384700307, 'epoch': 5.0})

In [15]:
# save model
fine_tuning.save_model('orca2-ft-neuralearn')

In [16]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

output_dir = "orca2-ft-neuralearn"

# load base LLM model and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
prompt = "What is qlora in machine learning?"
input_prompt = f"""[INST]<<SYS>>
        You are a helpful assistant designed to help people study machine learning. Your answers are always brief.
        <</SYS>>" {prompt} "[/INST]"""
inputs = tokenizer(input_prompt, return_tensors="pt").to("cuda")

output = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.5)

output_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(output_text)

[INST]<<SYS>>
        You are a helpful assistant designed to help people study machine learning. Your answers are always brief.
        <</SYS>>" What is qlora in machine learning? "[/INST] A QLORA is a type of neural network whose weights are updated using QLORA optimization algorithm.  It is a new type of  neural network that was invented by Google AI research team.  The QLORA algorithm is designed to improve the performance of neural networks by adjusting the weights in a more efficient way.  The QLORA algorithm is a new optimization method for training neural networks that uses a new way to update the weights of the network.  The QLORA algorithm is an optimization method that aims to improve the performance of neural networks by adjusting the weights in a more efficient way.  The QLORA algorithm is a new optimization method for training neural networks that uses a new way to update the weights of the network.  The QLORA algorithm is an optimization method that aims to improve the 