In [1]:
import os

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the tokenizer for the LLaMA model.
# The access token for this gated model is read from the HF_TOKEN environment
# variable rather than being embedded in the notebook. If the variable is
# unset, token=None is passed and authentication is left to the library's
# defaults (e.g. credentials stored by a prior Hugging Face login).
# NOTE(security): the token that used to be hardcoded on this line was exposed
# in the notebook and must be revoked in the Hugging Face account settings.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    token=os.environ.get("HF_TOKEN"),
)

# Ensure that the padding token is set: when the tokenizer defines none, reuse
# the end-of-sequence token so padded tokenization below has a token to use.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the rotten_tomatoes dataset
dataset = load_dataset("rotten_tomatoes")

# Function to preprocess a single data batch
def preprocess_function(examples):
    """Tokenize a batch of texts as classification prompts and attach labels.

    Args:
        examples: A batch mapping that provides a "text" sequence and a
            "label" sequence (indexed below by those two keys).

    Returns:
        The output of the module-level ``tokenizer`` for the prompts, padded
        and truncated to 512 tokens, with an extra "labels" entry holding the
        batch labels as a torch tensor.
    """
    # Wrap every raw text in the fixed instruction prefix.
    prompts = []
    for text in examples["text"]:
        prompts.append(f"Classify the sentiment of this text: {text}")

    # Fixed-length encoding: each prompt is padded or cut to exactly 512 tokens.
    encoded = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # Store the labels alongside the tokenized inputs.
    encoded["labels"] = torch.tensor(examples["label"])
    return encoded

# Run preprocess_function over every split in batches, using 4 worker
# processes, and drop the original columns so only the features returned by
# preprocess_function remain.
source_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    preprocess_function,
    remove_columns=source_columns,
    num_proc=4,
    batched=True,
)

# Make indexing into the datasets return PyTorch tensors
tokenized_datasets.set_format(type="torch")

# Show the splits and their features after preprocessing
print(tokenized_datasets)

# Inspect the first preprocessed training example
print(tokenized_datasets["train"][0])

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 8530/8530 [00:00<00:00, 179409.15 examples/s]
Generating validation split: 100%|██████████| 1066/1066 [00:00<00:00, 261225.06 examples/s]
Generating test split: 100%|██████████| 1066/1066 [00:00<00:00, 341047.14 examples/s]
Map (num_proc=4): 100%|██████████| 8530/8530 [00:01<00:00, 6596.10 examples/s]
Map (num_proc=4): 100%|██████████| 1066/1066 [00:00<00:00, 3878.57 examples/s]
Map (num_proc=4): 100%|██████████| 1066/1066 [00:00<00:00, 3701.91 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1066
    })
})
{'input_ids': tensor([    1,  4134,  1598,   278, 19688,   310,   445,  1426, 29901,   278,
         7679,   338,  2731,  1312,   304,   367,   278, 29871, 29906, 29896,
          303,  6462, 29915, 29879,   716,   376,   378,   273,   376,   322,
          393,   540, 29915, 29879,  2675,   304,  1207,   263,  8536,  1161,
         1584,  7621,  1135,   564, 29876,  1025,  1364,  4495,  2256,   387,
          914,  1919,  1444,   273, 29899, 16398,   566,  1109,  5625,  1004,
          470,  1886,   854,  2377,   284,   869,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
     


