# Representation Notebook
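In this notebook, I handle feature extraction and vectorization: the tokenized IMDB dataset is passed through a pre-trained BERT model (`bert-base-uncased`), and the final-layer hidden state of the first token of each example is stored as that example's embedding.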

In [None]:
# Imports
import torch
from transformers import AutoModel
from datasets import load_from_disk

In [None]:
# Inputs for this notebook: the on-disk directory holding the tokenized
# dataset and the Hugging Face identifier of the pre-trained model.
TOKENIZED_DATASET_DIR = "./tokenized_imdb"
PRETRAINED_MODEL_NAME = "bert-base-uncased"

# Read the tokenized dataset back from local disk.
tokenized_datasets = load_from_disk(TOKENIZED_DATASET_DIR)

# Instantiate the pre-trained BERT model used to produce embeddings.
model = AutoModel.from_pretrained(PRETRAINED_MODEL_NAME)

In [None]:
# Function to extract embeddings
def extract_embeddings(batch):
    """Compute one embedding vector per example in a batch.

    Only the ``input_ids`` and ``attention_mask`` columns of ``batch`` are
    forwarded to the module-level ``model``; every other column is ignored.
    The embedding is the final-layer hidden state at sequence position 0.

    Args:
        batch: Mapping of column name to a list of per-example values, as
            passed by ``Dataset.map(..., batched=True)``. Assumes every
            sequence in the batch has the same length (i.e. already padded
            or truncated) so the lists stack into a rectangular tensor --
            TODO confirm against the tokenization step.

    Returns:
        A dict with the single key ``"embeddings"`` mapping to a NumPy array
        with one row per example in the batch.
    """
    # Create the inputs on whichever device the model's weights live on, so
    # the function keeps working if the model is moved to an accelerator.
    device = next(model.parameters()).device
    inputs = {
        k: torch.tensor(v, device=device)
        for k, v in batch.items()
        if k in ("input_ids", "attention_mask")
    }
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the hidden state at position 0 of every sequence. The tensor is
    # moved to host memory first because NumPy cannot wrap device tensors;
    # on CPU the .cpu() call is a no-op.
    return {"embeddings": outputs.last_hidden_state[:, 0, :].cpu().numpy()}

# Apply embedding extraction.
# With batched=True, Dataset.map defaults to 1000 examples per call, which
# would push 1000 sequences through BERT in a single forward pass and can
# exhaust memory. Use a small explicit batch size instead; only the chunk
# size changes, not which examples are processed.
tokenized_datasets = tokenized_datasets.map(
    extract_embeddings,
    batched=True,
    batch_size=32,
)

In [None]:
# Persist the processed dataset (including the "embeddings" column added by
# the map step) to local disk.
EMBEDDINGS_OUTPUT_DIR = "./embeddings_imdb"
tokenized_datasets.save_to_disk(EMBEDDINGS_OUTPUT_DIR)