In [1]:
# standard python imports
import os

import numpy as np
import torch


from tqdm import tqdm

# huggingface libraries

from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    LlamaForCausalLM
)

import polars as pl

from torch.utils.data import Dataset

In [2]:
def create_prompt(review):
    system_prompt = f"You read student essays reviews and return a score from 0 to 60 that represents your besst guess of the number of rating given by the grader. Return just the number 0, 1, ..., 60 with no context, explanation, or special symbols."
    prompt = f"Here is the review to evaluate: [[[{review}]]]. You read student essays reviews and return a score from 0 to 60 that represents your besst guess of the number of rating given by the grader. Return just the number 0, 1, ..., 60 with no context, explanation, or special symbols."

    return system_prompt, prompt


In [3]:
base_model = "../models/llama_base/snapshots/0e9e39f249a16976918f6564b8830bc894c89659"

In [4]:
def add_prompts_to_df(df):
    lst_system_prompt, lst_prompt = [], []
    for row in df.iter_rows(named=True):
        system_prompt, prompt = create_prompt(row["text"])
        lst_system_prompt.append(system_prompt)
        lst_prompt.append(prompt)
    df = df.with_columns(pl.Series(lst_system_prompt).alias("system_prompt"), pl.Series(lst_prompt).alias("prompt"))
    return df

In [5]:
class CustomDataset(Dataset):
    def __init__(self, embeddings, labels):
        # embeddings: list of numpy arrays or torch tensors
        # labels: list of scalars
        self.X = torch.tensor(embeddings, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.float32)  # or long, depending on your task

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [6]:
def df_to_dataset(df, batch_size, model, tokenizer):
    model.eval()

    embeddings = []
    labels = []

    rows = df.to_dicts()  # returns a list of row dictionaries
    with torch.no_grad():
        for i in tqdm(range(0, len(df), batch_size)):
            # if i % (batch_size * 1_000) == 0:
            #     print(f"CURRENTLY OPERATING ON IX={i}/{len(df)}")
            #     wandb.log({"ix": i})
            batch_rows = rows[i: i + batch_size]

            # Prepare batched input
            batch_messages = [
                [
                    {"role": "system", "content": r["system_prompt"]},
                    {"role": "user", "content": r["prompt"]}
                ]
                for r in batch_rows
            ]

            # Tokenize the entire batch at once
            inputs_message = tokenizer.apply_chat_template(
                batch_messages,
                add_generation_prompt=True,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to("cuda")

            # Single forward pass for the entire batch
            with torch.no_grad():
                outputs = model(
                    inputs_message,
                    output_hidden_states=True,
                    return_dict=True
                )
            # Extract embeddings for the entire batch at once
            hidden_states = outputs.hidden_states
            # Convert to float32 before moving to CPU and then NumPy
            embeddings_batch = hidden_states[-2][:, -1, :].to(dtype=torch.float32).cpu().numpy()

            # Add them to a growing list
            for j, r in enumerate(batch_rows):
                embeddings.append(embeddings_batch[j])
                labels.append(r["score"])

        # Convert to a Dataset
        dataset = CustomDataset(np.array(embeddings), labels)
    return dataset





In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    tokenizer_file=os.path.join(base_model, 'tokenizer.json'),
    tokenizer_config_file=os.path.join(base_model, 'tokenizer_config.json'),
    special_tokens_map_file=os.path.join(base_model, 'special_tokens_map.json'),
    trust_remote_code=True,
    padding_side='left'
)

tokenizer.padding_side = 'left'

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # load_in_8bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # Match input dtype

)

model = LlamaForCausalLM.from_pretrained(base_model, quantization_config=nf4_config)

# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     device_map="auto",
#     # device_map="balanced",
#     torch_dtype=torch.bfloat16
# )

if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id



`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
df_val = pl.read_csv("../data/1_clean/val.csv")

df_val = add_prompts_to_df(df_val)

dataset_val = df_to_dataset(df=df_val, batch_size=4, model=model, tokenizer=tokenizer)

torch.save(dataset_val, "../data/2_ready_for_training/mymethod/val.pt")

100%|██████████| 156/156 [00:44<00:00,  3.48it/s]


In [9]:
df_testing = pl.read_csv("../data/1_clean/testing.csv")

df_testing = add_prompts_to_df(df_testing)

dataset_testing = df_to_dataset(df=df_testing, batch_size=4, model=model, tokenizer=tokenizer)

torch.save(dataset_testing, "../data/2_ready_for_training/mymethod/testing.pt")

100%|██████████| 488/488 [02:17<00:00,  3.55it/s]


In [12]:
df_training = pl.read_csv("../data/1_clean/training.csv")

df_training = add_prompts_to_df(df_training)

dataset_training = df_to_dataset(df=df_training, batch_size=4, model=model, tokenizer=tokenizer)

torch.save(dataset_training, "../data/2_ready_for_training/mymethod/training.pt")

 14%|█▍        | 374/2601 [02:22<14:09,  2.62it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 912.00 MiB. GPU 0 has a total capacity of 23.62 GiB of which 845.75 MiB is free. Including non-PyTorch memory, this process has 22.42 GiB memory in use. Of the allocated memory 19.51 GiB is allocated by PyTorch, and 2.46 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)