In [1]:
# standard python imports
import os
import torch

# huggingface libraries

from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    LlamaForCausalLM
)
# from peft import (
#     LoraConfig,
#     PeftModel,
#     prepare_model_for_kbit_training,
#     get_peft_model,
# )

from datasets import Dataset # NOTE: this is NOT the torch dataset type, which can cause confusion

import polars as pl

from transformers.pipelines.pt_utils import KeyDataset

import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
def create_prompt(review):
    system_prompt = f"You read student essays reviews and return a score from 0 to 60 that represents your besst guess of the number of rating given by the grader. Return just the number 0, 1, ..., 60 with no context, explanation, or special symbols."
    prompt = f"Here is the review to evaluate: [[[{review}]]]. You read student essays reviews and return a score from 0 to 60 that represents your besst guess of the number of rating given by the grader. Return just the number 0, 1, ..., 60 with no context, explanation, or special symbols."

    return system_prompt, prompt


In [3]:
df_val = pl.read_csv("../data/1_clean/val.csv")

In [4]:
lst_system_prompt, lst_prompt = [], []
for row in df_val.iter_rows(named=True):
    system_prompt, prompt = create_prompt(row["text"])
    lst_system_prompt.append(system_prompt)
    lst_prompt.append(prompt)
df_val = df_val.with_columns(pl.Series(lst_system_prompt).alias("system_prompt"), pl.Series(lst_prompt).alias("prompt"))

In [5]:
test_texts = df_val["text"].to_list()
test_labels = df_val["score"].to_list()

data_ = Dataset.from_polars(df_val)

In [6]:
base_model = "../models/llama_base/snapshots/0e9e39f249a16976918f6564b8830bc894c89659"

In [7]:
!ls ../models/llama_base/snapshots/0e9e39f249a16976918f6564b8830bc894c89659

config.json			  model.safetensors.index.json
generation_config.json		  original
LICENSE				  README.md
model-00001-of-00004.safetensors  special_tokens_map.json
model-00002-of-00004.safetensors  tokenizer_config.json
model-00003-of-00004.safetensors  tokenizer.json
model-00004-of-00004.safetensors  USE_POLICY.md


In [8]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    tokenizer_file=os.path.join(base_model, 'tokenizer.json'),
    tokenizer_config_file=os.path.join(base_model, 'tokenizer_config.json'),
    special_tokens_map_file=os.path.join(base_model, 'special_tokens_map.json'),
    trust_remote_code=True,
    padding_side='left'
)

tokenizer.padding_side = 'left'

In [9]:
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # load_in_8bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Match input dtype

)

model = LlamaForCausalLM.from_pretrained(base_model, quantization_config=nf4_config)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# model = PeftModel.from_pretrained(model, PATH_adapter_custom_weights)
# model = model.merge_and_unload() # This line merges the weights

In [11]:
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

In [12]:
def remove_header(text, K_times):
    for _ in range(K_times):
        if "<|end_header_id|>" in text:
            text = text.split("<|end_header_id|>", 1)[1]
    return text

In [13]:
def create_format_chat_template(tokenizer):
    def format_chat_template(row):
        row_json = [{"role": "system", "content": row["system_prompt"]},
                    {"role": "user", "content": row["prompt"]}]

        # row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
        row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False, add_generation_prompt=True)
        return row
    return format_chat_template

In [14]:
batch_size = 8

In [15]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # torch_dtype=torch.float32,
    torch_dtype=torch.float16,
    device_map="auto",
    batch_size=batch_size, # CHANGE TO FOUR IF TOO SLOW
    max_new_tokens=5,
)

In [16]:
data_ = data_.map(
    create_format_chat_template(tokenizer)
)

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

In [17]:
res = []
ix = 0
for out in pipe(KeyDataset(data_, "text")):
    ix = ix + 1
    if ix % batch_size == 0:
        print(f"{ix}/{data_.shape[0]}")

    cleaned_text = remove_header(out[0]["generated_text"], 3).strip()
    res.append(cleaned_text)

8/622
16/622
24/622
32/622
40/622
48/622
56/622
64/622
72/622
80/622
88/622
96/622
104/622
112/622
120/622
128/622
136/622
144/622
152/622
160/622
168/622
176/622
184/622
192/622
200/622
208/622
216/622
224/622
232/622
240/622
248/622
256/622
264/622
272/622
280/622
288/622
296/622
304/622
312/622
320/622
328/622
336/622
344/622
352/622
360/622
368/622
376/622
384/622
392/622
400/622
408/622
416/622
424/622
432/622
440/622
448/622
456/622
464/622
472/622
480/622
488/622
496/622
504/622
512/622
520/622
528/622
536/622
544/622
552/622
560/622
568/622
576/622
584/622
592/622
600/622
608/622
616/622


In [18]:
res_int = [int(i) for i in res]

In [19]:
llm_predicted, true_values = np.array(res_int), np.array(test_labels)

In [20]:
test_mse = mean_squared_error(llm_predicted, true_values)

In [25]:
print(f"Test MSE: {test_mse:,.2f}")

Test MSE: 1,020.89


In [21]:
df_val = df_val.with_columns(pl.Series(res_int).alias("8b_quant_prediction"))
df_val.write_csv("../outputs/predictions/8b_quantized_base.csv")