In [1]:
# standard python imports
import os
import pandas as pd
import polars as pl
import torch

# huggingface libraries
from transformers import (
    AutoModelForCausalLM,
    LlamaForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from datasets import load_dataset, Dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM # , setup_chat_format
from trl.commands.cli import train

import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance

In [2]:
import wandb

wandb.init(
    project="optim00",  # Change this to your project name
    name="local_run_02",
    config={
        "model_name": "quant_for_local",
        "task": "response_only",
        "timestamp": "2024.11.18.18_02"
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrichard-archer[0m ([33myale-som[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
base_model = "/home/richardarcher/Dropbox/Sci24_LLM_Polarization/project_/weights_local/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659"
new_model = "../weights/sft/run00"

PATH_data_to_train_on = "../data/1_train_test_split/df_train.csv"
PATH_data_to_test_on = "../data/1_train_test_split/df_test.csv"

In [4]:
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Match input dtype
)

model = LlamaForCausalLM.from_pretrained(base_model, quantization_config=nf4_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    tokenizer_file=os.path.join(base_model, 'tokenizer.json'),
    tokenizer_config_file=os.path.join(base_model, 'tokenizer_config.json'),
    special_tokens_map_file=os.path.join(base_model, 'special_tokens_map.json'),
    trust_remote_code=True
)

In [6]:
tokenizer.pad_token_id = 128004 # tokenizer.convert_tokens_to_ids("<|finetune_right_pad_id|>")
model.config.pad_token_id =128004 # tokenizer.convert_tokens_to_ids("<|finetune_right_pad_id|>")

In [7]:
peft_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)
# model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [8]:
def print_trainable_params(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    trainable_percentage = 100 * trainable_params / total_params
    print(f"{trainable_percentage:.2f}% of parameters are trainable")

# Call the function after applying PEFT
print_trainable_params(model)

0.04% of parameters are trainable


#### HANDLE DATA

In [9]:
def create_prompt(review):
    system_prompt = f"You read Yelp reviews and return a number (1, 2, 3, 4, or 5) that represents your besst guess of the number of star ratings that were given by that reviewer. Return just the number 1, 2, 3, 4, or 5, with no context, explanation, or special symbols."
    prompt = f"Here is the review to evaluate: [[[{review}]]]. Remember, you read Yelp reviews and return a number (1, 2, 3, 4, or 5) that represents your besst guess of the number of star ratings that were given by that reviewer. Return just the number 1, 2, 3, 4, or 5, with no context, explanation, or special symbols."
        
    return system_prompt, prompt

In [10]:
df_train = pl.read_csv(PATH_data_to_train_on)
df_test = pl.read_csv(PATH_data_to_test_on)

# TEMP: JUST TO SEE HOW IT GOES

In [11]:
df_train = df_train.sample(n=100_000, seed=0)
df_test = df_test.sample(n=10_000, seed=0)

In [12]:
lst_system_prompt, lst_prompt = [], []
for row in df_train.iter_rows(named=True):
    system_prompt, prompt = create_prompt(row["text"])
    lst_system_prompt.append(system_prompt)
    lst_prompt.append(prompt)
df_train = df_train.with_columns(pl.Series(lst_system_prompt).alias("instruction"), pl.Series(lst_prompt).alias("input"))
output = [int(i) for i in df_train["stars"].to_list()]
df_train = df_train.with_columns(pl.Series(output).alias("output"))

In [13]:
lst_system_prompt, lst_prompt = [], []
for row in df_test.iter_rows(named=True):
    system_prompt, prompt = create_prompt(row["text"])
    lst_system_prompt.append(system_prompt)
    lst_prompt.append(prompt)
df_test = df_test.with_columns(pl.Series(lst_system_prompt).alias("instruction"), pl.Series(lst_prompt).alias("input"))
output = [int(i) for i in df_test["stars"].to_list()]
df_test = df_test.with_columns(pl.Series(output).alias("output"))

In [14]:
train_dataset = Dataset.from_polars(df_train)
test_dataset = Dataset.from_polars(df_test)

In [15]:
# train_dataset.shape[0]
# test_dataset.shape[0]

In [16]:
# def TO_GET_LEN(tokenizer):
#     def TO_GET_LEN_INNER(row):
#         row_json = [{"role": "system", "content": row["instruction"]},
#                     {"role": "user", "content": row["input"]},
#                     {"role": "assistant", "content": row["output"]}]
# 
#         row["list_of_tokens"] = tokenizer.apply_chat_template(row_json, tokenize=True)
#         return row
# 
#     return TO_GET_LEN_INNER
# 
# train_dataset = train_dataset.map(
#     TO_GET_LEN(tokenizer),
# )
# 
# test_dataset = test_dataset.map(
#     TO_GET_LEN(tokenizer),
# )
# 
# 
# max_seq_length_needed1 = max(train_dataset.map(lambda x: {"length": len(x["list_of_tokens"])})["length"])+1
# max_seq_length_needed2 = max(test_dataset.map(lambda x: {"length": len(x["list_of_tokens"])})["length"])+1
# max_seq_length_needed=max(max_seq_length_needed1, max_seq_length_needed2)

In [17]:
# print(f"{max_seq_length_needed=}")

In [18]:
max_seq_length_needed = 1_638

### max seq length needed for the train set: 

don't run the above again, takes too long 

In [19]:
def format_but_not_tokenize(example):
    test = example["instruction"]
    # assert isinstance(test, list), "Input 'example' must be a list, this is probably because formatting function needs >1 eg"
    # assert not isinstance(test, str), "Input 'example' must be a list, not a string"
    
    output_texts = []
    
    if isinstance(test, list):
        K_range = len(test)
        
        for i in range(K_range):
            row_json = [{"role": "system", "content": example['instruction'][i]},
                {"role": "user", "content": example['input'][i]},
                {"role": "assistant", "content": example['output'][i]}]
            text = tokenizer.apply_chat_template(row_json, tokenize=False, add_generation_prompt=False)
            
            output_texts.append(text)
    
    elif isinstance(test, str):
        # K_range = 1
        row_json = [{"role": "system", "content": example['instruction']},
            {"role": "user", "content": example['input']},
            {"role": "assistant", "content": example['output']}]
        text = tokenizer.apply_chat_template(row_json, tokenize=False, add_generation_prompt=False)
        
        output_texts.append(text)
    else:
        assert False, "ERROR: WHAT IS GOING INTO FORMAT_BUT_NOT_TOKENIZE???" 
    
    return output_texts

# tests

In [20]:
response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [21]:
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.config.use_cache = False  # Disable KV cache during training

In [22]:
training_args = SFTConfig(
    max_seq_length=max_seq_length_needed,
    output_dir=new_model,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4, # 4
    # optim="adamw_torch",
    # optim="paged_adamw_32bit",
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=100,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    # TK TK OPTIM1
    bf16=True, # was false
    group_by_length=True,
    # TK TK OPTIM2
    gradient_checkpointing=True,  # Enable gradient checkpointing
    report_to="wandb",
    run_name="local000"
)

In [23]:
trainer = SFTTrainer(
    model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=training_args,
    formatting_func=format_but_not_tokenize,
    data_collator=collator,
    # ADDED THE BELOW IF IT BREAKS REMOVE IT OR FIX
    # compute_metrics=custom_evals,  # Add this line
)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [24]:
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [9]:
!ls ../weights/sft/run00/checkpoint-1000

adapter_config.json	   rng_state.pth	    tokenizer.json
adapter_model.safetensors  scheduler.pt		    trainer_state.json
optimizer.pt		   special_tokens_map.json  training_args.bin
README.md		   tokenizer_config.json
