<a href="https://colab.research.google.com/github/saravanan-nj/notebooks/blob/main/qlora-tiny-llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Requirements

In [5]:
!pip install -q accelerate==1.3.0 peft==0.14.0 bitsandbytes==0.45.2 transformers==4.48.3 trl==0.14.0 datasets==3.1.0 pretty_midi # fsspec==2024.10.0

In [6]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModel,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM

In [7]:
# Load the training dataset
from google.colab import drive
import pandas as pd

df = pd.read_csv('/content/imdb/train_data.txt', delimiter=":::", names=["index", "title", "genre", "description"])

  df = pd.read_csv('/content/imdb/train_data.txt', delimiter=":::", names=["index", "title", "genre", "description"])


In [15]:
dataset_df = pd.DataFrame()
def get_text(row):
  return f"""
### Instruction:
Given a movie description, read the description, understand and analyse the story of the movie based on the given description and return the genre of the movie.

### Description:
{row["description"]}

### Genre:
{row["genre"]}
"""
dataset_df["description"] = df["description"]
dataset_df["genre"] = df["genre"]
dataset_df["instruction"] = "Given a movie description, read the description, understand and analyse the story of the movie based on the given description and return the genre of the movie."
dataset_df["text"] = dataset_df.apply(get_text, axis=1)

In [16]:
from datasets import Dataset
dataset = Dataset.from_pandas(dataset_df)

In [19]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [20]:
model_name = "crumb/nano-mistral"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [26]:
device_map = "auto"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [27]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = SFTConfig(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=100,
    learning_rate=1e-5,
    weight_decay=0.01,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    max_seq_length=512,
    packing=False
)

In [28]:
def formatted_prompts_func(example):
  output_texts = []
  for i in range(len(example["description"])):
    output_texts.append(f"""
### Instruction:
Given a movie description, read the description, understand and analyse the story of the movie based on the given description and return the genre of the movie.

### Description:
{example["description"][i]}

### Genre:
{example["genre"][i]}
""")
  return output_texts

response_template = " ### Genre:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)


trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_arguments,
    processing_class=tokenizer
)
trainer.train()
trainer.model.save_pretrained("imdb-classifier")

Map:   0%|          | 0/9560 [00:00<?, ? examples/s]

Step,Training Loss
100,3.5621
200,3.4911
300,3.3981
400,3.207
500,3.1478
600,3.0597
700,2.9239
800,2.8596
900,2.7836
1000,2.7356


In [29]:
!zip -r imdb-classifier.zip results imdb-classifier

  adding: results/ (stored 0%)
  adding: results/checkpoint-2390/ (stored 0%)
  adding: results/checkpoint-2390/adapter_model.safetensors (deflated 7%)
  adding: results/checkpoint-2390/README.md (deflated 66%)
  adding: results/checkpoint-2390/training_args.bin (deflated 51%)
  adding: results/checkpoint-2390/trainer_state.json (deflated 73%)
  adding: results/checkpoint-2390/special_tokens_map.json (deflated 73%)
  adding: results/checkpoint-2390/optimizer.pt (deflated 9%)
  adding: results/checkpoint-2390/adapter_config.json (deflated 54%)
  adding: results/checkpoint-2390/rng_state.pth (deflated 25%)
  adding: results/checkpoint-2390/scheduler.pt (deflated 56%)
  adding: results/checkpoint-2390/tokenizer.json (deflated 85%)
  adding: results/checkpoint-2390/tokenizer.model (deflated 55%)
  adding: results/checkpoint-2390/tokenizer_config.json (deflated 69%)
  adding: results/runs/ (stored 0%)
  adding: results/runs/Feb09_13-18-11_41e14a874267/ (stored 0%)
  adding: results/runs/Feb

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"