# Preprocess data (Implementation derived from data preparation file by litgpt)



In [1]:
!nvidia-smi

Tue Jan  2 23:09:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
def install_dependencies():
    !pip install -Uqq  git+https://github.com/huggingface/peft.git
    !pip install -Uqq transformers datasets accelerate bitsandbytes
    !pip install -Uqq wandb

# uncomment the following line to install the required dependencies
install_dependencies()

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
model_name = ("meta-llama/Llama-2-7b-chat-hf","meta-llama/Llama-2-7b-chat-hf")
run_name = 'Llama-2-7b-lora-int8-bias-nonCoT'
dataset = 'yc4142/bias-nonCoT'
peft_name = 'Llama-2-7b-lora-int8-bias-nonCoT'
output_dir = 'Llama-2-7b-lora-int8-bias-nonCoT-results'

model_name[1],dataset,peft_name,run_name

('meta-llama/Llama-2-7b-chat-hf',
 'yc4142/bias-nonCoT',
 'Llama-2-7b-lora-int8-bias-nonCoT',
 'Llama-2-7b-lora-int8-bias-nonCoT')

In [4]:
report_to = "wandb" # "none"

if report_to != "none":
    import wandb
    wandb.login()
#7cc4240d81ca82afc28e021b820ba11f43bbcd3e

[34m[1mwandb[0m: Currently logged in as: [33myc4142[0m ([33mcapstone_columbia[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
wandb.init(project=run_name,config={
    "model": model_name[1],
    "dataset":dataset
})

In [6]:
!huggingface-cli login
#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


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

In [6]:
import huggingface_hub
repo_id = f'{huggingface_hub.whoami()["name"]}/{peft_name}'
print(repo_id)

In [7]:
import json
import sys
from pathlib import Path
from typing import Optional
from transformers import AutoTokenizer
import requests
from datasets import Dataset
from datasets import load_dataset
from tqdm import tqdm

def prepare(
    model_name:str = "togethercomputer/RedPajama-INCITE-instruct-3B-v1",
    test_split_fraction: float = 0.03865,  # to get exactly 2000 test samples,
    mask_inputs: bool = False,  # as in alpaca-lora
    seed:int = 42,
    dataset: str = "yc4142/bias-CoT",
    ignore_index: int = 0,
    max_seq_length:int = 2048,
) -> list:
    """Prepare the Alpaca dataset for instruction tuning.

    The output is a training and test dataset saved as `train.pt` and `test.pt`,
    which stores the preprocessed and tokenized prompts and labels.
    """


    # Partition the dataset into train and test
    print("Loading dataset...")
    split_dataset = load_dataset(dataset, split='train').train_test_split(test_size=test_split_fraction, seed = seed)
    train_set = split_dataset['train']
    test_set = split_dataset['test']
    print("example of original CoT data:")

    for key in train_set[3].keys():
      print((key, train_set[3][key]))

    print(f"train has {len(train_set):,} samples")
    print(f"test has {len(test_set):,} samples")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name,add_eos_token=True)
    tokenizer.pad_token_id = 0
    tokenizer.add_special_tokens({'eos_token':'<eos>'})



    print("Processing train split ...")
    train_data = []
    for i in tqdm(range(0, len(train_set))):
      train_data.append(prepare_sample(
          example=train_set[i],
          tokenizer=tokenizer,
          max_length=max_seq_length,
          mask_inputs=mask_inputs,
          ignore_index=ignore_index,
          )
      )

   # train_set = [
    #    prepare_sample(
     #       example=sample,
      #      tokenizer=tokenizer,
       #     max_length=max_seq_length,
        #    mask_inputs=mask_inputs,
         #   ignore_index=ignore_index,
        #)
        #for sample in tqdm(train_set)
    #]



    print("Processing test split ...")
    test_data = []
    for i in tqdm(range(0, len(test_set))):
      test_data.append(prepare_sample(
          example=test_set[i],
          tokenizer=tokenizer,
          max_length=max_seq_length,
          mask_inputs=mask_inputs,
          ignore_index=ignore_index,
          )
      )
    #test_set = [
     #   prepare_sample(
      #      example=sample,
       #     tokenizer=tokenizer,
        #    max_length=max_seq_length,
         #   mask_inputs=mask_inputs,
          #  ignore_index=ignore_index,
        #)
        #for sample in tqdm(test_set)
    #]

    return tokenizer, Dataset.from_list(train_data), Dataset.from_list(test_data)



def prepare_sample(example: dict, tokenizer: AutoTokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict:
    """Processes a single sample.

    Each sample in the dataset consists of:
    - instruction: A string describing the task
    - input: A string holding a special input value for the instruction.
        This only applies to some samples, and in others this is empty.
    - output: The response string

    This function processes this data to produce a prompt text and a label for
    supervised training. The prompt text is formed as a single message including both
    the instruction and the input. The label/target is the same message but with the
    response attached.

    Finally, both the prompt and the label get tokenized. If desired, all tokens
    in the label that correspond to the original input prompt get masked out (default).
    """
    full_prompt = generate_prompt(example)
    full_prompt_and_response = full_prompt + example["output"]
    encoded_full_prompt = tokenizer(
        full_prompt + "<eos>",  # add the end-of-stream token
        truncation=True,
        max_length= max_length,
        padding="max_length",
        )
    encoded_full_prompt_and_response = tokenizer(
        full_prompt_and_response + "<eos>",
        truncation=True,
        max_length= max_length,
        padding="max_length",
        )

    # The labels are the full prompt with response, but with the prompt masked out
    #labels = encoded_full_prompt_and_response['input_ids'].copy()
    #if mask_inputs:
     # for i in range(0, len(tokenizer(full_prompt)['input_ids'])):

      #  labels[i] = ignore_index
        #labels[: len(encoded_full_prompt)] = ignore_index

    return {
        "instruction": example["instruction"],
        "input": example["input"],
        "output": example["output"],
        "input_ids": encoded_full_prompt_and_response['input_ids'],
        "input_ids_no_response": encoded_full_prompt['input_ids'],
        "labels": encoded_full_prompt_and_response['input_ids'][1:]
    }


def generate_prompt(example: dict) -> str:
    """Generates a standardized message to prompt the model with an instruction, optional input and a
    'response' field."""

    if example["input"]:
        return (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"
        )
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{example['instruction']}\n\n### Response:"
    )


In [8]:
tokenizer, train_data, eval_data = prepare(
    model_name = model_name[1],
    test_split_fraction = 0.03865,
    mask_inputs = False,
    seed = 42,
    dataset = dataset,
    ignore_index = -1,
    max_seq_length = 2048,
)

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train has 5,886 samples
test has 237 samples
Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processing train split ...


100%|██████████| 5886/5886 [00:10<00:00, 538.30it/s]


Processing test split ...


100%|██████████| 237/237 [00:00<00:00, 515.88it/s]


In [9]:
print(train_data[3].keys())
for key in train_data[3].keys():
  print((key, train_data[3][key]))


dict_keys(['instruction', 'input', 'output', 'input_ids', 'input_ids_no_response', 'labels'])
('instruction', 'Identify if the following context contains profession bias toward policeman.\n        If there is profession bias toward policeman in the context, reply "Yes, there is profession bias toward policeman.". \n        If there is no profession bias toward policeman, reply "No, there is no profession bias toward policeman.".')
('input', '\n        context: Ask a policeman for help if you ever get lost. He will help you find your parents with his walkie talkie.\n        ')
('output', 'Yes, there is profession bias toward policeman.')
('input_ids', [1, 13866, 338, 385, 15278, 393, 16612, 263, 3414, 29892, 3300, 2859, 411, 385, 1881, 393, 8128, 4340, 3030, 29889, 14350, 263, 2933, 393, 7128, 2486, 1614, 2167, 278, 2009, 29889, 13, 13, 2277, 29937, 2799, 4080, 29901, 13, 7648, 1598, 565, 278, 1494, 3030, 3743, 6351, 24003, 11183, 13665, 11422, 29889, 13, 4706, 960, 727, 338, 6351, 2400

# Fine tune

yc4142/Llama-2-7b-lora-int8-bias-nonCoT


In [11]:
import transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
print("Loading model for model: ", model_name[0])
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_use_double_quant=False,
    bnb_8bit_quant_type="nf8",
    bnb_8bit_compute_dtype=torch.float16
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name[0],
    quantization_config = bnb_config,
    device_map={"": 0},
)

Loading model for model:  meta-llama/Llama-2-7b-chat-hf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r= 8,
 lora_alpha=16,
 target_modules=["q_proj", "v_proj", "k_proj"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.CAUSAL_LM
)

# prepare int-8 model for training
model = prepare_model_for_kbit_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 6,291,456 || all params: 6,744,707,072 || trainable%: 0.09327989982127426


In [13]:
!pip install trl



In [14]:
import transformers
eval_steps = 100
save_steps = 100
logging_steps = 20

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset= eval_data,
    args=transformers.TrainingArguments(
        num_train_epochs=5,
        learning_rate=3e-4,
        logging_steps=logging_steps,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=eval_steps,
        save_steps=save_steps,
        output_dir=output_dir,
        report_to=report_to if report_to else "none",
        save_total_limit=3,
        load_best_model_at_end=True,
        push_to_hub=False,
        auto_find_batch_size=True,
        warmup_steps=100,
        weight_decay = 0.01,
        per_device_train_batch_size = 2,
        per_device_eval_batch_size = 2,
        gradient_accumulation_steps = 32
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [17]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
trainer.train()

RuntimeError: ignored

In [None]:
trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)