# Preprocess data (Implementation derived from data preparation file by litgpt)



In [1]:
# Show the GPU(s) visible to this runtime, with driver/CUDA versions and current memory use.
!nvidia-smi

Thu Jan  4 17:03:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
def install_dependencies():
    !pip install -Uqq  git+https://github.com/huggingface/peft.git
    !pip install -Uqq transformers datasets accelerate bitsandbytes
    !pip install -Uqq wandb

# uncomment the following line to install the required dependencies
install_dependencies()

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
# Run configuration.
# `model_name` is a pair of identical checkpoint ids: index 0 is used when loading
# the tokenizer/model, index 1 when reporting the model name.
model_name = ("microsoft/phi-1_5",) * 2
dataset = "yc4142/stockmarket-CoT"

# The experiment-tracking run and the published adapter share one identifier;
# checkpoints are written to a sibling "-results" directory.
run_name = peft_name = "phi-1_5-lora-int8-double-stockmarket-CoT"
output_dir = f"{peft_name}-results"

model_name[1], dataset, peft_name, run_name

('microsoft/phi-1_5',
 'yc4142/stockmarket-CoT',
 'phi-1_5-lora-int8-double-stockmarket-CoT',
 'phi-1_5-lora-int8-double-stockmarket-CoT')

In [4]:
report_to = "wandb"  # set to "none" to disable experiment tracking

if report_to != "none":
    import wandb

    # SECURITY: never paste an API key into the notebook (not even as a comment);
    # notebooks get shared and committed. Enter the key at the login prompt or
    # provide it through the environment instead. Any key that was previously
    # stored in this cell should be treated as leaked and rotated.
    wandb.login()



[34m[1mwandb[0m: Currently logged in as: [33myc4142[0m ([33mcapstone_columbia[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Start a W&B run only when W&B reporting is enabled: `wandb` is imported solely
# in that case, so an unconditional call raises NameError when report_to == "none".
if report_to != "none":
    wandb.init(
        project=run_name,
        config={
            "model": model_name[1],
            "dataset": dataset,
        },
    )



In [6]:
!huggingface-cli login
#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



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

In [7]:
import huggingface_hub

# Target Hub repository: "<name of the logged-in account>/<peft_name>".
hub_username = huggingface_hub.whoami()["name"]
repo_id = f"{hub_username}/{peft_name}"
print(repo_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


yc4142/phi-1_5-lora-int8-double-stockmarket-CoT


In [8]:
import json
import sys
from pathlib import Path
from typing import Optional
from transformers import AutoTokenizer
import requests
from datasets import Dataset
from datasets import load_dataset
from tqdm import tqdm

def prepare(
    model_name: str = "togethercomputer/RedPajama-INCITE-instruct-3B-v1",
    test_split_fraction: float = 0.03865,
    seed: int = 42,
    dataset: str = "yc4142/bias-CoT",
    ignore_index: int = 0,
    max_seq_length: int = 2048,
) -> tuple:
    """Load a CoT instruction dataset, split it, and tokenize both splits.

    Nothing is written to disk; the tokenized data is returned in memory.

    Args:
        model_name: Hub id of the checkpoint whose tokenizer is loaded.
        test_split_fraction: Fraction of the dataset's ``train`` split that is
            held out as the test/eval set.
        seed: Seed passed to ``train_test_split`` so the split is reproducible.
        dataset: Hub id of the dataset; its ``train`` split is loaded.
        ignore_index: Forwarded unchanged to ``prepare_sample``.
        max_seq_length: Every sample is truncated/padded to this many tokens.

    Returns:
        A 3-tuple ``(tokenizer, train_dataset, test_dataset)`` where both
        datasets are ``datasets.Dataset`` objects built from the dictionaries
        returned by ``prepare_sample``.
    """
    # Partition the dataset into train and test
    print("Loading dataset...")
    split_dataset = load_dataset(dataset, split="train").train_test_split(
        test_size=test_split_fraction, seed=seed
    )
    train_set = split_dataset["train"]
    test_set = split_dataset["test"]

    # Show one raw example (row 3 when available) as a sanity check; the row is
    # fetched once instead of once per key.
    print("example of original CoT data:")
    preview = train_set[min(3, len(train_set) - 1)]
    for key, value in preview.items():
        print((key, value))

    print(f"train has {len(train_set):,} samples")
    print(f"test has {len(test_set):,} samples")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)
    # Token id 0 doubles as the padding id.
    tokenizer.pad_token_id = 0
    # prepare_sample appends the literal text "<eos>" to every sequence, so it is
    # registered here as the tokenizer's end-of-sequence token.
    # NOTE(review): if "<eos>" is not already in the vocabulary this adds a new
    # token id -- confirm the model's embedding matrix covers it.
    tokenizer.add_special_tokens({"eos_token": "<eos>"})

    def _tokenize_split(split, label: str) -> list:
        """Run prepare_sample over every row of `split`, with a progress bar."""
        print(f"Processing {label} split ...")
        return [
            prepare_sample(
                example=sample,
                tokenizer=tokenizer,
                max_length=max_seq_length,
                ignore_index=ignore_index,
            )
            for sample in tqdm(split)
        ]

    train_data = _tokenize_split(train_set, "train")
    test_data = _tokenize_split(test_set, "test")

    return tokenizer, Dataset.from_list(train_data), Dataset.from_list(test_data)



def prepare_sample(example: dict, tokenizer: AutoTokenizer, max_length: int, ignore_index: int) -> dict:
    """Tokenize a single instruction-tuning sample.

    Each sample in the dataset consists of:
    - instruction: A string describing the task
    - input: A string holding a special input value for the instruction.
        This only applies to some samples, and in others this is empty.
    - output: The response string

    Two texts are built and tokenized, each with the literal "<eos>" marker
    appended and truncated/padded to ``max_length`` tokens:
    the prompt alone, and the prompt followed by the response.

    Args:
        example: Row with "instruction", "input" and "output" keys.
        tokenizer: Callable tokenizer returning a mapping with "input_ids".
        max_length: Fixed length every encoded sequence is truncated/padded to.
        ignore_index: Currently unused (no prompt masking is applied); kept so
            existing callers keep working.

    Returns:
        A dict with the three original text fields plus:
        - "input_ids": token ids of prompt + response
        - "input_ids_no_response": token ids of the prompt alone
        - "labels": a copy of "input_ids"
    """
    full_prompt = generate_prompt(example)
    full_prompt_and_response = full_prompt + example["output"]

    def _encode(text: str) -> list:
        """Return fixed-length token ids for `text` followed by the end-of-stream marker."""
        return tokenizer(
            text + "<eos>",  # add the end-of-stream token
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )["input_ids"]

    prompt_ids = _encode(full_prompt)
    input_ids = _encode(full_prompt_and_response)

    return {
        "instruction": example["instruction"],
        "input": example["input"],
        "output": example["output"],
        "input_ids": input_ids,
        "input_ids_no_response": prompt_ids,
        # Labels must have the same length as input_ids: Hugging Face causal-LM
        # models shift the labels by one position internally, so dropping the
        # first token here (as was done before) misaligned the two sequences.
        "labels": list(input_ids),
    }


def generate_prompt(example: dict) -> str:
    """Build the Alpaca-style prompt text for one example.

    The prompt ends with the "### Response:" header so the answer can be
    appended directly after it. When the example's "input" field is empty,
    the "### Input:" section is left out and a shorter preamble is used.
    """
    context = example["input"]
    instruction = example["instruction"]

    if context:
        preamble = (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
        )
        sections = f"### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n"
    else:
        preamble = "Below is an instruction that describes a task. "
        sections = f"### Instruction:\n{instruction}\n\n"

    request_line = "Write a response that appropriately completes the request.\n\n"
    return preamble + request_line + sections + "### Response:"

In [9]:
# Build the tokenizer plus the tokenized train/eval splits for the configured
# checkpoint and dataset.
tokenizer, train_data, eval_data = prepare(
    model_name=model_name[0],
    dataset=dataset,
    test_split_fraction=0.03865,
    seed=42,
    max_seq_length=2048,
    ignore_index=0,
)

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


example of original CoT data:
('output', "People will have a neutral perspective about the stock given the headline “U.K.’s Largest Pension Fund Plans to Halve Hedge-Fund Holdings.” This is because from a financial market’s point of view, the impact of this action is largely balanced. \n\nLet's first understand the active participants here, the U.K.’s Largest Pension Fund and Hedge Funds. Pension funds are traditionally conservative investors that assemble vast pools of money to guarantee payments to employees once they retire. Hedge funds, on the other hand, are typically more aggressive in their investment strategies, often using leverage and derivatives in order to maximize returns. \n\nTo start with, when a pension fund reduces its exposure to hedge funds, it lessens its own volatility in the portfolio. Given that hedge funds often involve higher-risk strategies, a scaling back of such holdings could potentially reduce potential losses in scenarios of market volatility. From this v

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processing train split ...


100%|██████████| 9174/9174 [00:27<00:00, 334.20it/s]


Processing test split ...


100%|██████████| 369/369 [00:01<00:00, 313.99it/s]


In [10]:
# Inspect one processed training sample (row 3): its field names, then every field.
sample = train_data[3]
print(sample.keys())
for key, value in sample.items():
    print((key, value))


dict_keys(['instruction', 'input', 'output', 'input_ids', 'input_ids_no_response', 'labels'])
('instruction', 'Identify people\'s perspective on stock market as a reaction to the following context. If you infer that people expect the stock price to go down, start your answer with "People will have bearish perspective about the stock." If you infer that people expect the stock price to go up, start your answer with "People will have bullish perspective about the stock." If you infer that people expect the stock price to stay constant, start your answer with "People will have neutral perspective about the stock." ')
('input', '\n        context: U.K.’s Largest Pension Fund Plans to Halve Hedge-Fund Holdings\n        ')
('output', "People will have a neutral perspective about the stock given the headline “U.K.’s Largest Pension Fund Plans to Halve Hedge-Fund Holdings.” This is because from a financial market’s point of view, the impact of this action is largely balanced. \n\nLet's first u

# Fine tune

In [11]:
!pip install einops



In [12]:
import transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

print("Loading model for model: ", model_name[0])

# Plain 8-bit (LLM.int8) loading. The former `bnb_8bit_use_double_quant`,
# `bnb_8bit_quant_type` and `bnb_8bit_compute_dtype` arguments were dropped:
# BitsAndBytesConfig has no such options (double quantization, quant type and
# compute dtype exist only as `bnb_4bit_*` settings for 4-bit loading), so they
# had no effect on how the model was quantized.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name[0],
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on GPU 0
    # The checkpoint ships custom modelling code; opting in here avoids the
    # interactive "[y/N]" prompt that otherwise blocks Restart & Run All.
    # Only do this for repositories whose code you have reviewed and trust.
    trust_remote_code=True,
)

Loading model for model:  microsoft/phi-1_5
The repository for microsoft/phi-1_5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/phi-1_5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for microsoft/phi-1_5 contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/phi-1_5.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# LoRA adapter hyper-parameters: rank-8 adapters on the "Wqkv" modules only,
# with scaling alpha 16, 5% dropout and no trainable bias terms.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["Wqkv"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# NOTE(review): the k-bit preparation step below is disabled in this notebook;
# confirm whether that is deliberate before re-enabling it.
# model = prepare_model_for_kbit_training(model)

# Wrap the quantized base model with the LoRA adapter and report how many
# parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,419,843,584 || trainable%: 0.11077727277316766


In [14]:
import transformers

# Cadence (in optimizer steps) for evaluation, checkpointing and logging.
# Evaluation and checkpointing share the same interval.
eval_steps = 100
save_steps = 100
logging_steps = 20

training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    report_to=report_to or "none",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=3e-4,
    warmup_steps=100,
    weight_decay=0.01,
    # Batch sizing: 2 samples per device step, accumulated over 32 steps.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=32,
    auto_find_batch_size=True,
    # Logging / evaluation / checkpointing.
    logging_steps=logging_steps,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Causal-LM collator (mlm=False).
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=lm_collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [15]:
# Run fine-tuning; the value returned by train() is displayed as the cell result.
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,1.7923,1.770854
200,1.7114,1.700733
300,1.6885,1.681693
400,1.6729,1.671842
500,1.6682,1.664704
600,1.6576,1.663386
700,1.6517,1.659803
800,1.6552,1.65575
900,1.6495,1.657915
1000,1.6513,1.651805


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/toke

TrainOutput(global_step=1430, training_loss=1.6936440981351413, metrics={'train_runtime': 28464.2896, 'train_samples_per_second': 1.611, 'train_steps_per_second': 0.05, 'total_flos': 7.394151460188979e+17, 'train_loss': 1.6936440981351413, 'epoch': 4.99})

In [16]:
# Upload the model held by the trainer, then the tokenizer, to the Hub repository `repo_id`.
# The result of the last call (the tokenizer upload) is displayed as the cell result.
trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yc4142/phi-1_5-lora-int8-double-stockmarket-CoT/commit/37b415e1ff961984aaac2178d1ee3e6791dde3d6', commit_message='Upload tokenizer', commit_description='', oid='37b415e1ff961984aaac2178d1ee3e6791dde3d6', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
# Save the final model into the run's configured output directory. The previous
# hard-coded absolute path pointed at one developer's local machine (and at a
# differently named experiment), so it was not usable on other runtimes.
trainer.save_model(trainer.args.output_dir)