In [None]:
# standard python imports
import os
# import pandas as pd
import numpy as np

import torch

# from tqdm import tqdm
from tqdm.notebook import tqdm

# huggingface libraries

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    # HfArgumentParser,
    # TrainingArguments,
    pipeline,
    # logging,
    LlamaForCausalLM
)
from peft import (
#     LoraConfig,
    PeftModel,
#     prepare_model_for_kbit_training,
#     get_peft_model,
)
# from datasets import load_dataset, Dataset
# from trl import SFTTrainer, setup_chat_format

# import wandb

import polars as pl
# import pandas as pd

from transformers.pipelines.pt_utils import KeyDataset

In [2]:
from torch.utils.data import Dataset

In [3]:
def create_prompt(review):
    """Build the (system_prompt, prompt) pair used to ask for a star rating.

    Args:
        review: text of the Yelp review; it is interpolated into the user
            prompt between ``[[[`` and ``]]]``.

    Returns:
        A 2-tuple ``(system_prompt, prompt)`` of strings. The system prompt is
        constant; the user prompt embeds the review and repeats the task.
    """
    # The task wording is shared by both messages. It is kept verbatim
    # (including the "besst" spelling) because any change to the prompt text
    # changes what the model sees.
    task_description = (
        " read Yelp reviews and return a number (1, 2, 3, 4, or 5) that represents"
        " your besst guess of the number of star ratings that were given by that"
        " reviewer. Return just the number 1, 2, 3, 4, or 5, with no context,"
        " explanation, or special symbols."
    )

    system_prompt = "You" + task_description
    prompt = f"Here is the review to evaluate: [[[{review}]]]. Remember, you" + task_description
    return system_prompt, prompt

In [4]:
# Read the three pre-split CSV files (train, test, validation), in that order.
df_train, df_test, df_val = (
    pl.read_csv(csv_path)
    for csv_path in (
        "../data/1_train_test_split/df_train.csv",
        "../data/1_train_test_split/df_test.csv",
        "../data/1_train_test_split/df_validation.csv",
    )
)

In [5]:
def add_prompts_to_df(df):
    """Return ``df`` with ``system_prompt`` and ``prompt`` columns added.

    Each row's ``"text"`` value is passed to ``create_prompt``; the two
    resulting strings become the new columns for that row.

    Args:
        df: polars DataFrame with a ``"text"`` column.

    Returns:
        A new DataFrame produced by ``df.with_columns`` with the two columns.
    """
    prompt_pairs = [create_prompt(record["text"]) for record in df.iter_rows(named=True)]

    system_column = pl.Series([pair[0] for pair in prompt_pairs]).alias("system_prompt")
    user_column = pl.Series([pair[1] for pair in prompt_pairs]).alias("prompt")

    return df.with_columns(system_column, user_column)

In [6]:
# Attach the system/user prompt columns to every split (train, test, validation).
df_train, df_test, df_val = (
    add_prompts_to_df(split) for split in (df_train, df_test, df_val)
)

In [7]:
# Local snapshot directory of the Meta-Llama-3.1-8B-Instruct weights.
# Set the LLAMA_BASE_MODEL environment variable to point at the weights on
# another machine (e.g. the cluster); the original absolute path remains the
# default so existing local runs behave exactly as before.
base_model = os.environ.get(
    "LLAMA_BASE_MODEL",
    "/home/richardarcher/Dropbox/Sci24_LLM_Polarization/project_/weights_local/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659",
)

In [8]:
# Load the tokenizer from the local snapshot, pointing explicitly at the three
# tokenizer JSON files that live next to the weights.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    **{
        "tokenizer_file": os.path.join(base_model, 'tokenizer.json'),
        "tokenizer_config_file": os.path.join(base_model, 'tokenizer_config.json'),
        "special_tokens_map_file": os.path.join(base_model, 'special_tokens_map.json'),
    },
    trust_remote_code=True,
    padding_side='left',
)

# Left padding keeps the last real token of every sequence at position -1,
# which is the position the embedding extraction later in the notebook reads.
tokenizer.padding_side = 'left'

In [9]:
# 4-bit NF4 quantization settings; the compute dtype is float16.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the Llama weights from the local snapshot using the quantization
# settings defined above.
model = LlamaForCausalLM.from_pretrained(
    base_model,
    quantization_config=nf4_config,
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# Make sure both the tokenizer and the model config know a padding token.
# Compare against None explicitly: a pad id of 0 is valid but falsy, so a
# truthiness test would wrongly overwrite it.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    # Reuse the tokenizer's pad id rather than model.config.eos_token_id: the
    # config value may be a list of several EOS ids (NOTE(review): this appears
    # to be the case for Llama 3.1 Instruct - confirm), whereas the pad id must
    # be a single integer that agrees with what the tokenizer pads with.
    model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# rosetta stone! 
# with torch.no_grad():
#     for row in df_val.iter_rows(named=True):
#         
#         message = [
#             {"role": "system", "content": row["system_prompt"]},
#             {"role": "user", "content": row["prompt"]},
#         ]
# 
#         inputs_message = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt").to("cuda")
# 
#         # outputs = model(inputs_message)
#         outputs = model(
#             # **inputs_message,
#             inputs_message,
#             output_hidden_states=True,
#             return_dict=True
#         )
#         
#         logits = outputs.logits          # shape: [batch_size, seq_len, vocab_size]
#         next_token_logits = logits[0, -1, :]
#         
#         hidden_states = outputs.hidden_states # len: 33
#         second_to_last_layer = hidden_states[-2]  # shape: batch_size, seq_len, 4096
#         
#         break

In [11]:
class EmbeddingDataset(Dataset):
    """In-memory dataset pairing one embedding vector with one scalar label.

    Args:
        embeddings: the per-sample embeddings, given as a numpy array, a torch
            tensor, or a list/tuple of per-sample numpy arrays or torch
            tensors. Stored as a float32 tensor in ``self.X``.
        labels: sequence of scalar labels, one per embedding. Stored as a
            float16 tensor in ``self.y`` (dtype unchanged from the original
            implementation so previously saved datasets stay compatible).

    Raises:
        ValueError: if the number of embeddings differs from the number of
            labels.
    """

    def __init__(self, embeddings, labels):
        if isinstance(embeddings, torch.Tensor):
            # copy=True mirrors torch.tensor(), which always copies its input.
            self.X = embeddings.detach().to(dtype=torch.float32, copy=True)
        elif (
            isinstance(embeddings, (list, tuple))
            and len(embeddings) > 0
            and all(isinstance(item, torch.Tensor) for item in embeddings)
        ):
            # torch.tensor() cannot convert a list of multi-element tensors,
            # so stack them along a new leading (sample) dimension instead.
            self.X = torch.stack([item.detach() for item in embeddings]).to(torch.float32)
        else:
            # Converting to a single ndarray first avoids the very slow
            # element-by-element path torch takes for a list of ndarrays.
            self.X = torch.tensor(np.asarray(embeddings), dtype=torch.float32)

        self.y = torch.tensor(labels, dtype=torch.float16)  # or long, depending on your task

        # Fail early instead of silently dropping or mis-indexing samples.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"embeddings and labels must have the same length, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Number of (embedding, label) pairs."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
# # RUNS BUT TOO SLOW 
# # ALSO ROSETTA STONE 
# embeddings = []
# labels = []
# 
# model.eval() # should be duplicative but just in case
# with torch.no_grad():
#     # for row in df_val.iter_rows(named=True):
#     for row in tqdm(df_test.iter_rows(named=True), total=len(df_test)):
#         message = [
#             {"role": "system", "content": row["system_prompt"]},
#             {"role": "user", "content": row["prompt"]},
#         ]
# 
#         inputs_message = tokenizer.apply_chat_template(
#             message, 
#             add_generation_prompt=True, 
#             return_tensors="pt"
#         ).to("cuda")
# 
#         outputs = model(
#             inputs_message,
#             output_hidden_states=True,
#             return_dict=True
#         )
#         
#         # logits = outputs.logits          # shape: [batch_size, seq_len, vocab_size]
#         # next_token_logits = logits[0, -1, :]
# 
#         # Extract second-to-last layer hidden states
#         # `hidden_states` is a tuple of length num_layers
#         hidden_states = outputs.hidden_states 
#         # Typically: hidden_states[-2] shape: [batch_size, seq_len, hidden_dim]
#         # If you only have a single example per batch (batch_size=1), you can do:
#         embedding_vec = hidden_states[-2][0, -1, :].cpu().numpy()
#         
#         embeddings.append(embedding_vec)
#         labels.append(row["stars"])
#         
# # Convert to a Dataset
# dataset = EmbeddingDataset(embeddings, labels)

In [13]:
# Display the shape of the test split.
df_test.shape

(148609, 4)

In [14]:
# Display the shape of the training split.
df_train.shape

(850373, 4)

In [15]:
# Display the shape of the validation split.
df_val.shape

(1018, 4)

# this is the switch for which dataset is getting embedded

In [16]:
# df = df_val

In [55]:
# Scratch check of the progress-print cadence: step through indices in batches
# of 4 and print only every 20th batch start.
for i in range(0, 1_000, 4):
    if i % (4 * 20) != 0:
        continue
    print(i)

0
80
160
240
320
400
480
560
640
720
800
880
960


In [17]:
def df_to_dataset(df, batch_size = 4):
    """Embed every row of ``df`` with the global ``model`` and return a dataset.

    For each row, the ``system_prompt`` and ``prompt`` columns are rendered
    through the tokenizer's chat template, the batch is run through ``model``,
    and the hidden state at index ``-2`` of ``outputs.hidden_states`` for the
    last token position is taken as the row's embedding. The ``stars`` column
    provides the label.

    Args:
        df: polars DataFrame with ``system_prompt``, ``prompt`` and ``stars``
            columns.
        batch_size: number of rows per forward pass; must be >= 1.

    Returns:
        An ``EmbeddingDataset`` holding one embedding and one label per row.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model.eval()

    embeddings = []
    labels = []

    rows = df.to_dicts()  # list of row dictionaries
    n_rows = len(rows)
    with torch.no_grad():
        # USE TQDM LOCAL OR THE IX ON THE CLUSTER
        # for i in tqdm(range(0, n_rows, batch_size)):
        for i in range(0, n_rows, batch_size):
            # Plain-text progress every 20 batches (works without a notebook widget).
            if i % (batch_size * 20) == 0:
                print(f"CURRENTLY OPERATING ON IX={i}/{n_rows}")
            batch_rows = rows[i : i + batch_size]

            # One chat (system + user message) per row in the batch.
            batch_messages = [
                [
                    {"role": "system", "content": r["system_prompt"]},
                    {"role": "user", "content": r["prompt"]},
                ]
                for r in batch_rows
            ]

            # return_dict=True yields input_ids AND attention_mask. Without the
            # mask the model attends to the left-padding tokens, which alters
            # the embedding of every sequence shorter than the longest one in
            # its batch.
            inputs_message = tokenizer.apply_chat_template(
                batch_messages,
                add_generation_prompt=True,
                return_tensors="pt",
                padding=True,
                truncation=True,
                return_dict=True,
            ).to(model.device)

            # Single forward pass for the entire batch.
            outputs = model(
                **inputs_message,
                output_hidden_states=True,
                return_dict=True,
            )

            # hidden_states[-2] is indexed as [batch, position, hidden]; the
            # tokenizer pads on the left, so position -1 is each sequence's
            # last real token.
            embeddings_batch = outputs.hidden_states[-2][:, -1, :].cpu().numpy()

            embeddings.extend(embeddings_batch)
            labels.extend(r["stars"] for r in batch_rows)

    # Convert to a Dataset
    return EmbeddingDataset(np.array(embeddings), labels)


  0%|          | 0/255 [00:00<?, ?it/s]

In [30]:
# Embed each split and save it under a file name that matches its contents.
# The original cell wrote val -> training.pt, test -> val.pt and
# train -> testing.pt, so the saved files were mislabelled.
# NOTE: torch.save pickles the whole EmbeddingDataset object, so the class
# definition must be available when the files are loaded again.
output_dir = "../data/2_training_ready/mymethod/take00"
# Create the target directory up front so a long embedding run cannot fail at
# save time because the directory is missing.
os.makedirs(output_dir, exist_ok=True)

print("NOW OPERATING ON VAL")
dataset_val = df_to_dataset(df_val, 4)
print("NOW SAVING VAL")
torch.save(dataset_val, os.path.join(output_dir, "val.pt"))
print("VAL SAVED")

print("NOW OPERATING ON TEST")
dataset_test = df_to_dataset(df_test, 4)
print("NOW SAVING TEST")
torch.save(dataset_test, os.path.join(output_dir, "testing.pt"))
print("TEST SAVED")

print("NOW OPERATING ON TRAIN")
dataset_train = df_to_dataset(df_train, 4)
print("NOW SAVING TRAIN")
torch.save(dataset_train, os.path.join(output_dir, "training.pt"))
print("TRAIN SAVED")

In [44]:
# 
# 
