-
Notifications
You must be signed in to change notification settings - Fork 25.1k
Description
🐛 Describe the bug
A simple training run does not work as expected and fails with the following error:
File [~/Projects/ml/lib/python3.10/site-packages/torch/nn/functional.py:2551](http://localhost:8888/lab/workspaces/auto-m/tree/tags-classifier/~/Projects/ml/lib/python3.10/site-packages/torch/nn/functional.py#line=2550), in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2545 # Note [embedding_renorm set_grad_enabled]
2546 # XXX: equivalent to
2547 # with torch.no_grad():
2548 # torch.embedding_renorm_
2549 # remove once script supports set_grad_enabled
2550 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2551 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Placeholder storage has not been allocated on MPS device!
I've been Googling around for a long time now and I've done everything I can think of that might be my fault... I can't get the below to work on my MBP via MPS or CPU....
Reproduce Script:
#!/usr/bin/env python
import pandas as pd # For loading data
import numpy as np
import torch
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
# Use the Metal (MPS) backend when PyTorch reports it as available; otherwise
# fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Fraction of all rows used for training, and fraction of the held-out rows
# that is used as the validation split.
train_perc = 0.9
validate_perc = 0.5

dataframe = pd.read_csv('model-data/tags-training-data.csv')
dataframe = dataframe.convert_dtypes()

# Split by row position: the leading rows train, the remainder is held out.
num_training_examples = int(dataframe.shape[0] * train_perc)
dataframe_train = dataframe.iloc[:num_training_examples]
dataframe_test = dataframe.iloc[num_training_examples:]

# NOTE(review): the validation rows are the leading slice of the test rows,
# so the two splits overlap — confirm this is intended.
num_validate_examples = int(dataframe_test.shape[0] * validate_perc)
dataframe_validate = dataframe_test.iloc[:num_validate_examples]

dataset = DatasetDict()
dataset['train'] = Dataset.from_pandas(dataframe_train)
dataset['test'] = Dataset.from_pandas(dataframe_test)
dataset['validation'] = Dataset.from_pandas(dataframe_validate)

# Every column other than the input text column ("query") is a label.
labels = [label for label in dataset['train'].features if label != 'query']
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in enumerate(labels)}

# The tokenizer is loaded without a `device` argument: `from_pretrained` has
# no such parameter, so the keyword would only be forwarded to the tokenizer's
# constructor as an unrecognised extra (presumably ending up in its stored
# init kwargs — TODO confirm). Tensors are moved to the device later.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def preprocess_data(examples):
    """Tokenize a batch of queries and attach one label row per query.

    Args:
        examples: A batch mapping column names to lists of values. It must
            contain the "query" column and one column per entry in the
            module-level ``labels`` list.

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding a list of rows; each row has one value per label, in the
        order of ``labels``.
    """
    queries = examples["query"]
    # Pad/truncate every query to exactly 256 tokens.
    batch = tokenizer(queries, padding="max_length", truncation=True, max_length=256)

    # Rows are examples, columns follow the order of `labels`.
    targets = np.zeros((len(queries), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = examples[name]

    batch["labels"] = targets.tolist()
    return batch
# Tokenize every split in batches and drop the original columns, so only the
# fields produced by preprocess_data remain.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
# Return PyTorch tensors when the dataset is indexed.
encoded_dataset.set_format('torch')

# BERT with a classification head sized to the number of label columns.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

batch_size = 8
# Name of the metric used to pick the best checkpoint; multi_label_metrics
# reports a value under this key.
metric_name = "f1"

# Evaluate and checkpoint once per epoch, then reload the best checkpoint.
args = TrainingArguments(
    "bert-finetuned-sem_eval-english",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and accuracy for multi-label output.

    Args:
        predictions: Raw model scores of shape (batch_size, num_labels).
        labels: Ground-truth label matrix passed to the scikit-learn metrics.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        A dict with the keys 'f1', 'roc_auc' and 'accuracy'. Note that the
        ROC AUC is computed from the thresholded predictions, not from the
        probabilities.
    """
    # Squash the raw scores with a sigmoid.
    probabilities = torch.nn.Sigmoid()(torch.Tensor(predictions))

    # Binarize: 1 where the probability reaches the threshold, 0 elsewhere.
    binarized = np.zeros(probabilities.shape)
    binarized[np.where(probabilities >= threshold)] = 1

    return {
        'f1': f1_score(y_true=labels, y_pred=binarized, average='micro'),
        'roc_auc': roc_auc_score(labels, binarized, average='micro'),
        'accuracy': accuracy_score(labels, binarized),
    }
def compute_metrics(p: EvalPrediction):
    """Adapt an EvalPrediction to multi_label_metrics.

    Args:
        p: Evaluation output carrying ``predictions`` and ``label_ids``.

    Returns:
        The metrics dict produced by multi_label_metrics.
    """
    scores = p.predictions
    # When predictions arrive as a tuple, only its first element is scored.
    if isinstance(scores, tuple):
        scores = scores[0]
    return multi_label_metrics(predictions=scores, labels=p.label_ids)
# Wire the model, training arguments, encoded splits, tokenizer and metric
# callback into a Trainer, then start fine-tuning.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()  # <-- This line causes the error
Versions
PyTorch version: 2.5.0.dev20240715
Is debug build: False
CUDA used to build PyTorch: None
ROCM used to build PyTorch: N/A
OS: macOS 13.6.7 (arm64)
GCC version: Could not collect
Clang version: 15.0.0 (clang-1500.1.0.2.5)
CMake version: version 3.27.8
Libc version: N/A
Python version: 3.10.9 (main, Jan 11 2023, 09:18:18) [Clang 14.0.6 ] (64-bit runtime)
Python platform: macOS-13.6.7-arm64-arm-64bit
Is CUDA available: False
CUDA runtime version: No CUDA
CUDA_MODULE_LOADING set to: N/A
GPU models and configuration: No CUDA
Nvidia driver version: No CUDA
cuDNN version: No CUDA
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
CPU:
Apple M1 Max
Versions of relevant libraries:
[pip3] numpy==1.26.4
[pip3] onnx==1.16.1
[pip3] onnxconverter-common==1.14.0
[pip3] onnxruntime==1.18.0
[pip3] skl2onnx==1.17.0
[pip3] torch==2.5.0.dev20240715
[pip3] torchaudio==2.4.0.dev20240715
[pip3] torchvision==0.20.0.dev20240715
[conda] numpy 1.24.2 pypi_0 pypi
[conda] torch 2.1.0.dev20230416 pypi_0 pypi
[conda] torchaudio 2.1.0.dev20230416 pypi_0 pypi
[conda] torchvision 0.16.0.dev20230416 pypi_0 pypi