<a href="https://colab.research.google.com/github/rimbarbar/LL-LLM-Project/blob/main/LL_LLM_Project_Resub_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

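## 2: Representation

This notebook loads the tokenized IMDB subset from disk, runs each review through pre-trained `bert-base-uncased`, and saves the [CLS] token embedding of every review as its fixed-size representation.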

In [None]:
# Imports
import torch
from transformers import AutoModel
from datasets import load_from_disk
import numpy as np

In [None]:
# Read the previously tokenized dataset back from local disk
# (expected to already be subset to 500 train + 500 test)
tokenized_path = "./tokenized_imdb"
tokenized_datasets = load_from_disk(tokenized_path)

In [None]:
# Explicitly ensures we only use the train and test splits, with at most
# N_SAMPLES examples each.
N_SAMPLES = 500

# Remove 'unsupervised' split if it exists
if "unsupervised" in tokenized_datasets:
    del tokenized_datasets["unsupervised"]

# Re-subset train/test. The cap is clamped to the split length so that a split
# with fewer than N_SAMPLES rows is kept whole instead of raising on an
# out-of-range index.
# NOTE(review): select(range(n)) keeps the FIRST n rows; if the saved split is
# label-sorted rather than shuffled this gives a skewed sample — confirm upstream.
for split in ("train", "test"):
    split_size = min(N_SAMPLES, len(tokenized_datasets[split]))
    tokenized_datasets[split] = tokenized_datasets[split].select(range(split_size))

# Verify dataset sizes
print("Train samples:", len(tokenized_datasets["train"]))
print("Test samples:", len(tokenized_datasets["test"]))

# Load pre-trained BERT model
model = AutoModel.from_pretrained("bert-base-uncased")

Train samples: 500
Test samples: 500


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Function to extract embeddings
def extract_embeddings(batch):
    """Compute one embedding vector per example for a batch of tokenized reviews.

    Args:
        batch: Mapping of column name -> batched values, as passed by
            ``map(..., batched=True)``. Only ``"input_ids"`` and
            ``"attention_mask"`` are forwarded to the model; every other
            column is ignored. Sequences are assumed to be padded to a common
            length so they stack into one tensor — TODO confirm against the
            tokenization step.

    Returns:
        dict with the single key ``"embeddings"`` holding a NumPy array whose
        first axis indexes the examples of the batch.
    """
    # Build the inputs on whatever device the global `model` lives on, so the
    # function keeps working if the model is moved to an accelerator.
    device = next(model.parameters()).device
    # as_tensor accepts Python lists as well as arrays/tensors and avoids a
    # redundant copy when the column is already a tensor.
    inputs = {
        k: torch.as_tensor(v, device=device)
        for k, v in batch.items()
        if k in ("input_ids", "attention_mask")
    }
    with torch.no_grad():  # inference only, no autograd graph needed
        outputs = model(**inputs)
    # Using CLS token embedding as compressed representation of entire review
    embeddings = outputs.last_hidden_state[:, 0, :]
    # NumPy conversion requires the tensor to be on the CPU
    return {"embeddings": embeddings.cpu().numpy()}

# Run the extractor over the dataset in batches of 8; the returned
# "embeddings" values are stored alongside the existing columns
tokenized_datasets = tokenized_datasets.map(
    extract_embeddings,
    batched=True,
    batch_size=8,
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Persist the dataset, now carrying the "embeddings" column, to local disk
tokenized_datasets.save_to_disk("./embeddings_imdb")

# Sanity check on the first training example. The stored vector is wrapped in
# an ndarray first (presumably it comes back as a plain list) so that .shape
# and slicing are available.
first_train_row = tokenized_datasets["train"][0]
embedding_example = np.array(first_train_row["embeddings"])
print("Embedding shape (hidden_dim):", embedding_example.shape)
print("Sample embedding values:", embedding_example[:5])

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Embedding shape (hidden_dim): (768,)
Sample embedding values: [ 0.09268344 -0.32289329 -0.12178016 -0.02571688 -0.38968098]
