# Preprocess data (Implementation derived from data preparation file by litgpt)



In [1]:
# Show the GPU(s) visible to this runtime (model, memory, utilisation) before doing any work
!nvidia-smi

Tue Jan  2 16:47:10 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0              44W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
def install_dependencies():
    """Install or upgrade the third-party packages this notebook relies on (quiet pip output)."""
    # peft is installed from the GitHub repository rather than from PyPI
    !pip install -Uqq  git+https://github.com/huggingface/peft.git
    !pip install -Uqq transformers datasets accelerate bitsandbytes
    !pip install -Uqq wandb

# Comment out the following line if the dependencies are already installed
install_dependencies()

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [3]:
# Experiment configuration shared by the rest of the notebook.
# model_name carries the same checkpoint id twice: index 0 is handed to the
# tokenizer/model loaders, index 1 is what gets displayed and logged.
model_name = ("microsoft/phi-1_5",) * 2
dataset = "yc4142/bias-CoT"
run_name = "phi-1_5-lora-int8-bias-CoT"
peft_name = "phi-1_5-lora-int8-bias-CoT"
output_dir = "phi-1_5-lora-int8-bias-CoT-results"

# Echo the key settings as the cell output
(model_name[1], dataset, peft_name, run_name)

('microsoft/phi-1_5',
 'yc4142/bias-CoT',
 'phi-1_5-lora-int8-bias-CoT',
 'phi-1_5-lora-int8-bias-CoT')

In [None]:
# Experiment-tracking backend passed to the Trainer; set to "none" to disable tracking.
report_to = "wandb"

if report_to != "none":
    import wandb

    # wandb.login() picks up the WANDB_API_KEY environment variable or prompts
    # interactively. Never paste an API key into the notebook: it ends up in
    # version control and in every shared copy. Any key that was previously
    # committed here must be revoked and regenerated.
    wandb.login()



In [None]:
# Start a W&B run only when tracking is enabled: `wandb` is imported solely in
# that case, so an unconditional call would raise NameError with report_to="none".
wandb_run = (
    wandb.init(
        project=run_name,
        config={
            "model": model_name[1],
            "dataset": dataset,
        },
    )
    if report_to != "none"
    else None
)

# Keep the run object as the cell's displayed value (nothing is shown when tracking is off)
wandb_run



In [None]:
!huggingface-cli login
#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


In [None]:
import huggingface_hub
# Hub repository for the adapter: "<logged-in account name>/<peft_name>"
repo_id = "/".join((huggingface_hub.whoami()["name"], peft_name))
print(repo_id)

In [4]:
import json
import sys
from pathlib import Path
from typing import Optional
from transformers import AutoTokenizer
import requests
from datasets import Dataset
from datasets import load_dataset
from tqdm import tqdm

def prepare(
    model_name: str = "togethercomputer/RedPajama-INCITE-instruct-3B-v1",
    test_split_fraction: float = 0.03865,  # fraction of the samples held out for evaluation
    mask_inputs: bool = False,  # as in alpaca-lora
    seed: int = 42,
    dataset: str = "yc4142/bias-CoT",
    ignore_index: int = -1,
    max_seq_length: int = 2048,
) -> tuple:
    """Load an instruction dataset, split it into train/test and tokenize both splits.

    Nothing is written to disk; the tokenized splits are returned in memory.

    Args:
        model_name: Checkpoint id whose tokenizer is loaded with ``AutoTokenizer``.
        test_split_fraction: Passed as ``test_size`` to ``train_test_split``.
        mask_inputs: Forwarded to ``prepare_sample`` (whether prompt tokens are
            replaced by ``ignore_index`` in the labels).
        seed: Seed for the train/test split.
        dataset: Dataset id passed to ``load_dataset``; its ``train`` split is used.
        ignore_index: Label value forwarded to ``prepare_sample`` for masked positions.
        max_seq_length: Every sample is truncated/padded to this many tokens.

    Returns:
        A 3-tuple ``(tokenizer, train_dataset, test_dataset)`` where both datasets
        are ``datasets.Dataset`` objects built from the ``prepare_sample`` outputs.
    """
    # Partition the dataset into train and test
    print("Loading dataset...")
    split_dataset = load_dataset(dataset, split='train').train_test_split(
        test_size=test_split_fraction, seed=seed
    )
    train_set = split_dataset['train']
    test_set = split_dataset['test']

    print(f"train has {len(train_set):,} samples")
    print(f"test has {len(test_set):,} samples")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)
    # Token id 0 doubles as the padding id
    tokenizer.pad_token_id = 0
    # prepare_sample appends the literal "<eos>" to every text, so register it as the EOS token
    tokenizer.add_special_tokens({'eos_token': '<eos>'})

    def encode_split(split) -> list:
        """Tokenize every example of one split, showing a progress bar."""
        return [
            prepare_sample(
                example=split[i],
                tokenizer=tokenizer,
                max_length=max_seq_length,
                mask_inputs=mask_inputs,
                ignore_index=ignore_index,
            )
            for i in tqdm(range(len(split)))
        ]

    print("Processing train split ...")
    train_data = encode_split(train_set)

    print("Processing test split ...")
    test_data = encode_split(test_set)

    return tokenizer, Dataset.from_list(train_data), Dataset.from_list(test_data)



def prepare_sample(example: dict, tokenizer: AutoTokenizer, max_length: int, mask_inputs: bool, ignore_index: int) -> dict:
    """Tokenize a single instruction sample for supervised fine-tuning.

    Each sample consists of:
    - instruction: A string describing the task
    - input: A string holding a special input value for the instruction.
        This only applies to some samples, and in others this is empty.
    - output: The response string

    The prompt is built by ``generate_prompt``; the training text is that prompt
    followed by the response. Both texts get a literal "<eos>" appended and are
    tokenized with truncation and padding to ``max_length``.

    Args:
        example: Mapping with the keys "instruction", "input" and "output".
        tokenizer: Callable tokenizer returning a mapping with "input_ids".
        max_length: Length every encoding is truncated/padded to.
        mask_inputs: When true, the label positions that hold prompt tokens are
            replaced by ``ignore_index``.
        ignore_index: Value written into masked label positions.

    Returns:
        A dict with the original "instruction"/"input"/"output" strings plus
        "input_ids" (prompt + response), "input_ids_no_response" (prompt only)
        and "labels".
    """
    full_prompt = generate_prompt(example)
    full_prompt_and_response = full_prompt + example["output"]
    encoded_full_prompt = tokenizer(
        full_prompt + "<eos>",  # add the end-of-stream token
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    encoded_full_prompt_and_response = tokenizer(
        full_prompt_and_response + "<eos>",
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

    # Labels start as a copy of the full sequence so masking never touches input_ids
    labels = list(encoded_full_prompt_and_response['input_ids'])
    if mask_inputs:
        # The prompt is the leading run of tokens shared by both encodings: they
        # diverge where the prompt-only text has "<eos>" and the full text has
        # the first response token. len() of the encoding object would count its
        # keys, not tokens, and a list slice needs a list on the right-hand side.
        # NOTE(review): assumes padding is appended on the right — confirm for
        # tokenizers configured with left padding.
        prompt_len = 0
        for prompt_id, full_id in zip(encoded_full_prompt['input_ids'], labels):
            if prompt_id != full_id:
                break
            prompt_len += 1
        labels[:prompt_len] = [ignore_index] * prompt_len

    return {
        "instruction": example["instruction"],
        "input": example["input"],
        "output": example["output"],
        "input_ids": encoded_full_prompt_and_response['input_ids'],
        "input_ids_no_response": encoded_full_prompt['input_ids'],
        "labels": labels
    }


def generate_prompt(example: dict) -> str:
    """Build the Alpaca-style prompt for one example.

    The prompt always contains an "### Instruction:" section and ends with
    "### Response:"; an "### Input:" section (and a matching preamble) is
    included only when ``example["input"]`` is truthy.
    """
    context = example["input"]

    if context:
        preamble = (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
        )
    else:
        preamble = "Below is an instruction that describes a task. "

    sections = [f"### Instruction:\n{example['instruction']}"]
    if context:
        sections.append(f"### Input:\n{context}")
    sections.append("### Response:")

    return (
        preamble
        + "Write a response that appropriately completes the request.\n\n"
        + "\n\n".join(sections)
    )

In [5]:
# Tokenize the dataset once; the returned tokenizer and splits are reused by the training cells.
tokenizer, train_data, eval_data = prepare(
    dataset=dataset,
    model_name=model_name[0],
    max_seq_length=2048,
    test_split_fraction=0.03865,
    seed=42,
    mask_inputs=False,
    ignore_index=-1,
)

Loading dataset...




train has 5,886 samples
test has 237 samples
Loading tokenizer...
Processing train split ...


100%|██████████| 5886/5886 [00:15<00:00, 382.90it/s]


Processing test split ...


100%|██████████| 237/237 [00:00<00:00, 368.30it/s]


In [6]:
# Sanity check: dump every field of one processed training sample as (name, value) pairs
print(train_data[3].keys())
for field in train_data[3].items():
    print(field)


dict_keys(['instruction', 'input', 'output', 'input_ids', 'input_ids_no_response', 'labels'])
('instruction', 'Identify if the following context contains profession bias toward policeman.\n        If there is profession bias toward policeman in the context, start your response with "Yes, there is profession bias toward policeman.". \n        If there is no profession bias toward policeman, start your response with "No, there is no profession bias toward policeman.". \n        Provide Chain of Thought reasoning to justify your answer.')
('input', '\n        context: Ask a policeman for help if you ever get lost. He will help you find your parents with his walkie talkie.\n        ')
('output', "Yes, there is profession bias toward policemen reflected in the statement. This lies in the assumption that all police officers are universally helpful, skilled in navigating with technology, and always available for every pedestrian's request. This bias is largely associational, meaning it connec

# Fine tune

[34m[1mwandb[0m: Currently logged in as: [33myc4142[0m ([33mcapstone_columbia[0m). Use [1m`wandb login --relogin`[0m to force relogin



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

yc4142/phi-1_5-lora-int8-bias-CoT


In [11]:
# Extra dependency installed just before the model is loaded
# (presumably required by the model's custom code — TODO confirm)
!pip install einops



In [12]:
import transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
print("Loading model for model: ", model_name[0])

# 4-bit NF4 quantization with float16 compute and no double quantization.
# NOTE(review): run_name/peft_name say "int8" while the weights are loaded in 4-bit — confirm naming.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name[0],
    quantization_config = bnb_config,
    device_map={"": 0},  # place the whole model on GPU 0
    # The repository ships custom modeling code. Opting in explicitly avoids the
    # interactive "[y/N]" prompt, which blocks unattended Restart & Run All.
    # Only do this for repositories whose code you have reviewed.
    trust_remote_code=True,
)

Loading model for model:  microsoft/phi-1_5
The repository for microsoft/phi-1_5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/phi-1_5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for microsoft/phi-1_5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/phi-1_5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [13]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# LoRA adapter settings: rank-8 adapters on the modules named "Wqkv", no bias training
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["Wqkv"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# prepare_model_for_kbit_training(model) is left disabled in this run:
# model = prepare_model_for_kbit_training(model)

# Wrap the quantized base model with the LoRA adapter and report how much of it is trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,419,843,584 || trainable%: 0.11077727277316766


In [14]:
import transformers
# Cadence, in optimizer steps, for evaluation, checkpointing and logging
eval_steps = save_steps = 100
logging_steps = 20

training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    report_to=report_to or "none",
    push_to_hub=False,
    # optimisation schedule
    num_train_epochs=5,
    learning_rate=3e-4,
    warmup_steps=100,
    weight_decay=0.01,
    # batching: 2 samples per device, accumulated over 32 steps
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=32,
    auto_find_batch_size=True,
    # evaluation, checkpointing and logging
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=3,
    load_best_model_at_end=True,
    logging_steps=logging_steps,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    # causal-LM collation (mlm=False)
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [15]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,1.5226,1.514096
200,1.432,1.448644
300,1.42,1.432007
400,1.3743,1.41754
500,1.3848,1.410309


Step,Training Loss,Validation Loss
100,1.5226,1.514096
200,1.432,1.448644
300,1.42,1.432007
400,1.3743,1.41754
500,1.3848,1.410309
600,1.3679,1.408072
700,1.3657,1.407492
800,1.3645,1.403595
900,1.3574,1.401512


TrainOutput(global_step=915, training_loss=1.4479726384897702, metrics={'train_runtime': 17915.8359, 'train_samples_per_second': 1.643, 'train_steps_per_second': 0.051, 'total_flos': 4.7312227874635776e+17, 'train_loss': 1.4479726384897702, 'epoch': 4.97})

In [16]:
# Upload the trained model held by the trainer, then the tokenizer, to the same Hub repository.
# repo_id is defined in the Hub-login section earlier in the notebook.
trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yc4142/phi-1_5-lora-int8-bias-CoT/commit/b303e1faa58807ef6844f40ec767b07caca86c6d', commit_message='Upload tokenizer', commit_description='', oid='b303e1faa58807ef6844f40ec767b07caca86c6d', pr_url=None, pr_revision=None, pr_num=None)