<a href="https://colab.research.google.com/github/ram130849/llm-finetuning/blob/main/llama2_7b_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install transformers langchain trl optimum peft accelerate streamlit bitsandbytes -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [21]:
import transformers
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
import torch
from trl import SFTTrainer
from accelerate import Accelerator
from trl.trainer import ConstantLengthDataset

In [50]:
from dataclasses import dataclass, field

In [23]:
# Authenticate with the Hugging Face Hub. login() is called without arguments, so no
# token is hardcoded in the notebook; the widget shown in this cell's output comes
# from this call.
# NOTE(review): presumably needed for the meta-llama/Llama-2-7b-hf checkpoint named in
# the (currently commented-out) model-loading cell — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
# Fetch the train split of the paired Stack Exchange dataset, then shuffle it with a
# fixed seed before recording the column names.
dataset = load_dataset("AlexHung29629/stack-exchange-paired-128K", split="train")
dataset = dataset.shuffle(seed=42)
original_columns = dataset.column_names

Downloading readme:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/82.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/128000 [00:00<?, ? examples/s]

In [11]:
# Display the column names captured right after the dataset was loaded.
original_columns

['prompt', 'chosen', 'rejected']

In [15]:
# Peek at the first record to see what the text in each column looks like.
dataset[0]

{'prompt': 'Question: According to [DxO tests](http://www.dxomark.com/index.php/Cameras/Camera-Sensor-Ratings/%28type%29/usecase_landscape), cameras have 10 to 12 stops of dynamic range. Is that correct? Noise can completely screw some lowers values (easily resulting in loss of some stops).\n\nAlso [Norman Koren says](http://www.normankoren.com/digital_tonality.html) that a digital camera\'s original dynamic range can be 9 to 11 stops, but prints have "only" 6.5 stops.\n\nIn a section on dynamic range, Wikipedia says the human eye has a contrast ratio of around [6.5 stops](http://en.wikipedia.org/wiki/Human_eye#Dynamic_range). If that is the case, why is the human eye clearly much better than cameras to record scenes with high dynamic range?\n\nAnswer: ',
 'chosen': 'The main reason for this is that the human eye registers brightness on a logarithmic scale, whereas digital sensors are linear. Take a look [at this site](http://www.petapixel.com/2011/05/05/biology-for-photographers-why-i

In [32]:
# # load the base model in 4-bit quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# base_model = AutoModelForCausalLM.from_pretrained(
#     "meta-llama/Llama-2-7b-hf",        # "meta-llama/Llama-2-7b-hf"
#     quantization_config=bnb_config,
#     device_map={"": 0},
#     trust_remote_code=True,
#     use_auth_token=True,
# )
# base_model.config.use_cache = False

# # add LoRA layers on top of the quantized base model
# peft_config = LoraConfig(
#     r=8,
#     lora_alpha=16,
#     lora_dropout=0.05,
#     target_modules=["q_proj", "v_proj"],
#     bias="none",
#     task_type="CAUSAL_LM",
# )

In [35]:
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", return_tensors='pt', trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"
# Placeholder while the tokenizer load above is commented out. Anything that calls
# tokenizer.tokenize(...) (e.g. chars_token_ratio) will fail on None, so restore the
# lines above before building the datasets.
tokenizer = None

In [40]:
def prepare_sample_text(example):
    """Build the training text for one record.

    Args:
        example: Mapping that provides the ``'prompt'`` and ``'chosen'`` keys;
            any other keys are ignored.

    Returns:
        The prompt followed by a single space and the chosen answer.
    """
    return "{} {}".format(example["prompt"], example["chosen"])

In [47]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """Measure the average number of characters per token on a dataset sample.

    Args:
        dataset: Iterable of examples accepted by ``prepare_sample_text``.
        tokenizer: Object whose ``tokenize(text)`` returns a sized sequence.
        nb_examples: Maximum number of examples to read from ``dataset``.

    Returns:
        Total characters divided by total tokens over the examples read.

    Raises:
        ZeroDivisionError: If no tokens were counted (for example when no
            examples were read).
    """
    char_count = 0
    token_count = 0
    # range() is the first zip argument, so once the limit is reached zip stops
    # without pulling one more example from the dataset.
    limited = zip(range(nb_examples), dataset)
    for _, sample in tqdm(limited, total=nb_examples):
        formatted = prepare_sample_text(sample)
        char_count += len(formatted)
        token_count += len(tokenizer.tokenize(formatted))
    return char_count / token_count

In [45]:
def create_datasets(dataset, tokenizer, test_size=0.005, seed=None, seq_length=2048):
    """Split ``dataset`` and wrap both splits in ``ConstantLengthDataset`` objects.

    Args:
        dataset: Object exposing ``train_test_split(test_size=..., seed=...)``
            whose result is indexable by ``"train"`` and ``"test"``.
        tokenizer: Tokenizer handed to ``chars_token_ratio`` and to both
            ``ConstantLengthDataset`` instances.
        test_size: Forwarded to ``train_test_split``. Defaults to the value
            that used to be hard-coded (0.005).
        seed: Forwarded to ``train_test_split``. The default ``None`` keeps the
            previous behaviour; pass an integer to seed the split.
        seq_length: ``seq_length`` given to both ``ConstantLengthDataset``
            instances. Defaults to the value that used to be hard-coded (2048).

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple. The training wrapper is
        built with ``infinite=True`` and the validation wrapper with
        ``infinite=False``.
    """
    splits = dataset.train_test_split(test_size=test_size, seed=seed)
    train_data = splits["train"]
    valid_data = splits["test"]

    # The ratio is measured on the training split only and reused for both wrappers.
    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    def _wrap(split, infinite):
        # Both wrappers share every setting except the data and the infinite flag.
        return ConstantLengthDataset(
            tokenizer,
            split,
            formatting_func=prepare_sample_text,
            infinite=infinite,
            seq_length=seq_length,
            chars_per_token=chars_per_token,
        )

    return _wrap(train_data, True), _wrap(valid_data, False)

In [48]:
# train_dataset, eval_dataset = create_datasets(dataset,tokenizer)

In [57]:
# Trainer configuration for the SFT run; arguments are grouped by purpose.
training_args = TrainingArguments(
    # Run identity and reporting
    output_dir="./results",
    run_name="sft_llama2",
    report_to="wandb",
    # Step budget, logging and checkpoint cadence
    max_steps=500,
    logging_steps=10,
    save_steps=10,
    # Batching
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=False,
    remove_unused_columns=False,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    weight_decay=0.05,
    # Memory and precision
    gradient_checkpointing=False,
    bf16=True,
)

In [59]:
# Show the output directory configured on training_args.
training_args.output_dir

'./results'

In [61]:
# trainer = SFTTrainer(
#     model=base_model,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     peft_config=peft_config,
#     packing=True,
#     max_seq_length=None,
#     tokenizer=tokenizer,
#     args=training_args         # HF Trainer arguments
# )
# trainer.train()

In [None]:
# model = AutoPeftModelForCausalLM.from_pretrained(
#     training_args.output_dir, # location of saved SFT model
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float16,
#     load_in_4bit=True,
#     is_trainable=True,
# )
# model_ref = AutoPeftModelForCausalLM.from_pretrained(
#     training_args.output_dir,  # same model as the main one
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float16,
#     load_in_4bit=True,
# )

# dpo_trainer = DPOTrainer(
#     model,
#     model_ref,
#     args=training_args,
#     beta=script_args.beta,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     peft_config=peft_config,
# )
# dpo_trainer.train()
# dpo_trainer.save_model()