In [1]:
# standard python imports
import os
# import pandas as pd
import torch

# huggingface libraries

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    # HfArgumentParser,
    # TrainingArguments,
    pipeline,
    # logging,
    LlamaForCausalLM
)
from peft import (
#     LoraConfig,
    PeftModel,
#     prepare_model_for_kbit_training,
#     get_peft_model,
)
from datasets import load_dataset, Dataset
# from trl import SFTTrainer, setup_chat_format

# import wandb

import polars as pl
# import pandas as pd

from transformers.pipelines.pt_utils import KeyDataset

from sklearn.metrics import mean_squared_error


In [2]:
def create_prompt(review):
    """Build the system and user messages asking for a 1-5 star guess.

    Args:
        review: The review text; it is interpolated between ``[[[`` and
            ``]]]`` in the user message.

    Returns:
        A ``(system_prompt, prompt)`` tuple of strings.
    """
    # NOTE(review): the wording (including the "besst" spelling) is kept
    # verbatim; presumably it matches the prompt used when the adapter was
    # fine-tuned -- confirm before correcting it.
    instruction_tail = (
        " read Yelp reviews and return a number (1, 2, 3, 4, or 5) that"
        " represents your besst guess of the number of star ratings that"
        " were given by that reviewer. Return just the number 1, 2, 3, 4,"
        " or 5, with no context, explanation, or special symbols."
    )
    system_message = "You" + instruction_tail
    user_message = f"Here is the review to evaluate: [[[{review}]]]. Remember, you{instruction_tail}"
    return system_message, user_message

In [3]:
# Sanity check: list the files in the imported-data directory before loading.
!ls data/imported/

testing.csv  training.csv  val.csv


In [4]:
# Only the validation split is loaded in this notebook; the training-split
# read below is kept commented out for reference.
# df_train = pl.read_csv("../data/1_train_test_split/df_train.csv")
df_val = pl.read_csv("data/imported/val.csv")

In [5]:
# Build one (system_prompt, prompt) pair per review, then attach both as
# new columns on the validation frame.
prompt_pairs = [create_prompt(review_text) for review_text in df_val["text"].to_list()]
lst_system_prompt = [pair[0] for pair in prompt_pairs]
lst_prompt = [pair[1] for pair in prompt_pairs]
df_val = df_val.with_columns(
    pl.Series("system_prompt", lst_system_prompt),
    pl.Series("prompt", lst_prompt),
)

In [6]:
# Plain-Python copies of the review texts and their true star scores.
test_texts = df_val.get_column("text").to_list()
test_labels = df_val.get_column("score").to_list()

# Hugging Face Dataset view of the same frame (including the prompt columns).
data_ = Dataset.from_polars(df_val)

In [7]:
# Directory holding the local snapshot of the base model weights.
# The default is the original author's machine-specific location; set the
# LLAMA_BASE_MODEL_PATH environment variable to point at a different snapshot
# without editing the notebook.
base_model = os.environ.get(
    "LLAMA_BASE_MODEL_PATH",
    "/home/richardarcher/Dropbox/Sci24_LLM_Polarization/project_/weights_local/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659",
)

# HERE

In [8]:
# !ls weights/run00

In [9]:
# Training step of the adapter checkpoint to evaluate. Steps previously
# toggled in this cell: 500, 2000, 3000, 4000, 5000, 6500.
CHECKPOINT_STEP = 4000
PATH_adapter_custom_weights = f"weights/run00/checkpoint-{CHECKPOINT_STEP}"

In [10]:
# Explicit paths to the tokenizer's JSON files inside the snapshot directory:
# tokenizer_file, tokenizer_config_file and special_tokens_map_file.
tokenizer_files = {
    f"{stem}_file": os.path.join(base_model, f"{stem}.json")
    for stem in ("tokenizer", "tokenizer_config", "special_tokens_map")
}

tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    **tokenizer_files,
    trust_remote_code=True,
    padding_side="left",
)

# Left padding is set again on the instance, as in the original cell.
tokenizer.padding_side = "left"

In [11]:
# Quantization settings: load weights in 4-bit using the "nf4" quant type,
# with float16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # load_in_8bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Match input dtype

)

# Load the base model from the local snapshot with the quantization config above.
model = LlamaForCausalLM.from_pretrained(base_model, quantization_config=nf4_config)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
# Attach the fine-tuned adapter to the base model and immediately merge its
# weights in, rebinding `model` to the merged result.
# NOTE(review): `model` is both input and output here, so re-running this cell
# would load the adapter onto the already-merged model -- restart the kernel
# (or reload the base model) before re-running.
model = PeftModel.from_pretrained(model, PATH_adapter_custom_weights).merge_and_unload()



In [13]:
# Make sure both the tokenizer and the model config have a padding token id,
# falling back to the end-of-sequence token when none is defined.
# `is None` is used (rather than truthiness) so a legitimate pad id of 0 is
# not mistaken for "missing".
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Take the pad id from the tokenizer so model and tokenizer agree on a single
# integer id; the model config's eos_token_id may be a list of ids
# (NOTE(review): confirm against this snapshot's config.json).
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

In [14]:
def remove_header(text, K_times):
    """Drop everything up to and including the first K "<|end_header_id|>" markers.

    Args:
        text: The string to trim.
        K_times: Maximum number of markers to strip. If fewer markers are
            present, everything after the last one is returned; a count of
            zero or less leaves the text unchanged.

    Returns:
        The portion of ``text`` following the last stripped marker.
    """
    # A maxsplit of K leaves the remainder after the K-th marker as the last
    # piece; clamp at 0 because a negative maxsplit would mean "split all".
    pieces = text.split("<|end_header_id|>", max(K_times, 0))
    return pieces[-1]

In [15]:
def create_format_chat_template(tokenizer):
    """Return a row-mapping function bound to ``tokenizer``.

    Args:
        tokenizer: Any object exposing ``apply_chat_template``.

    Returns:
        A function that takes a row (a mapping with ``"system_prompt"`` and
        ``"prompt"`` keys), stores the rendered chat string under ``"text"``
        on that same row, and returns the row.
    """
    def format_chat_template(row):
        messages = [
            {"role": "system", "content": row["system_prompt"]},
            {"role": "user", "content": row["prompt"]},
        ]
        # Render to a string (no tokenization) and append the generation
        # prompt to the rendered conversation.
        rendered = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        row["text"] = rendered
        return row

    return format_chat_template

In [16]:
# Batch size handed to the generation pipeline; the inference loop also uses
# it to decide how often to print progress (every batch_size * 4 rows).
batch_size = 8

In [17]:
# Text-generation pipeline wrapping the merged model and its tokenizer.
# NOTE(review): `model` is already instantiated when passed in here -- confirm
# whether torch_dtype / device_map still have any effect in that case.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # torch_dtype=torch.float32,
    torch_dtype=torch.float16,
    device_map="auto",
    batch_size=batch_size, # CHANGE TO FOUR IF TOO SLOW
    # Caps each generation at 5 new tokens; the prompt asks for a single digit.
    max_new_tokens=5,
)

In [18]:
# Render every row's system/user prompts through the chat template.
# Note: the mapped function writes its result to the "text" column, so after
# this cell data_["text"] holds the rendered prompt rather than the raw review.
data_ = data_.map(
    create_format_chat_template(tokenizer)
)

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

In [None]:
# Run generation over the rendered prompts, collecting one cleaned string per row.
res = []
total_rows = data_.shape[0]
report_every = batch_size * 4

for ix, out in enumerate(pipe(KeyDataset(data_, "text")), start=1):
    # Periodic progress line: rows processed so far / total rows.
    if ix % report_every == 0:
        print(f"{ix}/{total_rows}")

    # Strip up to three "<|end_header_id|>" markers from the generated text
    # (presumably the system, user and assistant headers of the echoed
    # prompt -- confirm), then trim surrounding whitespace.
    res.append(remove_header(out[0]["generated_text"], 3).strip())

32/622
64/622
96/622
128/622
160/622
192/622
224/622
256/622
288/622
320/622
352/622


In [None]:
def _parse_star_rating(raw_text, position):
    """Convert one generated string into an integer rating.

    A clean integer string is converted directly. Otherwise the first
    character that is one of 1-5 is used (e.g. "4." or "4 stars"), so a single
    slightly noisy generation does not discard a whole inference run.

    Args:
        raw_text: The cleaned model output for one row.
        position: Zero-based row index, used only in the error message.

    Returns:
        The parsed integer.

    Raises:
        ValueError: If no rating can be extracted; the message includes the
            row index and the offending text.
    """
    try:
        return int(raw_text)
    except ValueError:
        pass
    for ch in raw_text:
        if ch in "12345":
            return int(ch)
    raise ValueError(
        f"Could not parse a 1-5 star rating from model output #{position}: {raw_text!r}"
    )


res_int = [_parse_star_rating(text, idx) for idx, text in enumerate(res)]

In [None]:
# Mean squared error between the true scores and the model's predictions.
# Arguments follow sklearn's (y_true, y_pred) order; the value is the same
# either way because squared error is symmetric.
val_mse = mean_squared_error(test_labels, res_int)

In [None]:
# Show the validation MSE rounded to two decimals (with thousands separators).
print(f"{val_mse:,.2f}")

In [None]:
# Attach the parsed predictions to the validation frame as a new column.
prediction_column = pl.Series("8b_quant_prediction_finetuned", res_int)
df_val = df_val.with_columns(prediction_column)

In [None]:
# Derive the output file's checkpoint tag from the adapter path so the file
# name always matches the checkpoint that was actually evaluated
# (e.g. "weights/run00/checkpoint-4000" -> "..._check4000.csv").
checkpoint_tag = os.path.basename(os.path.normpath(PATH_adapter_custom_weights)).split("-")[-1]
output_path = f"data/outputs/8b_quantized_predictions_for_eval_set_check{checkpoint_tag}.csv"

# Create the output directory if needed so the write cannot fail on a missing
# folder after the long inference run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_val.write_csv(output_path)