Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support pickling for PyTorch datasets to allow multiprocessing in dataloader #75

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 104 additions & 90 deletions muspy/datasets/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Base Dataset classes."""
import json
import warnings
from functools import partial
from pathlib import Path
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -34,6 +35,14 @@
from tensorflow.data import Dataset as TFDataset
from torch.utils.data import Dataset as TorchDataset

# PyTorch is an optional dependency: record whether it could be imported so
# that callers can raise a helpful error only when it is actually needed.
try:
    from torch.utils.data import Dataset as TorchDataset

    TORCH_AVAILABLE = True
except ImportError:
    # The Torch* dataset classes in this module subclass ``TorchDataset`` at
    # module level, so the name must exist even without torch; otherwise
    # importing this module would fail with a NameError.
    TorchDataset = object  # type: ignore
    TORCH_AVAILABLE = False


RemoteDatasetT = TypeVar("RemoteDatasetT", bound="RemoteDataset")
FolderDatasetT = TypeVar("FolderDatasetT", bound="FolderDataset")
Expand Down Expand Up @@ -284,96 +293,8 @@ def to_pytorch_dataset(
"Only one of `representation` and `factory` can be given."
)

try:
# pylint: disable=import-outside-toplevel
from torch.utils.data import Dataset as TorchDataset
except ImportError as err:
raise ImportError("Optional package pytorch is required.") from err

class TorchMusicFactoryDataset(TorchDataset):
    """A PyTorch dataset built from a Music dataset.

    Items are produced on access by applying `factory` to the
    corresponding item of `dataset`.

    Parameters
    ----------
    dataset : :class:`muspy.Dataset`
        Dataset object to base on.
    factory : Callable
        Function to be applied to the Music objects. The input
        is a Music object, and the output is an array or a
        tensor.
    subset : str
        Label of the subset, shown in the string representation.
        Defaults to "Full".
    indices : sequence of int, optional
        Indices into `dataset` that make up this subset. Indices
        that are not smaller than ``len(dataset)`` are dropped and
        the remaining ones are sorted. Defaults to None, in which
        case every item of `dataset` is used.

    """

    def __init__(
        self,
        dataset: Dataset,
        factory: Callable,
        subset: str = "Full",
        indices: Sequence[int] = None,
    ):
        super().__init__()
        self.dataset = dataset
        self.factory = factory
        self.subset = subset
        if indices is None:
            self.indices = None
        else:
            # Keep only indices that fall inside the dataset, in
            # ascending order.
            size = len(self.dataset)
            self.indices = sorted(idx for idx in indices if idx < size)

    def __repr__(self) -> str:
        # Each label is paired with its own attribute; the fields
        # used to be swapped.
        return (
            f"TorchMusicFactoryDataset(dataset={self.dataset}, "
            f"factory={self.factory}, subset={self.subset})"
        )

    def __getitem__(self, index):
        if self.indices is None:
            return self.factory(self.dataset[index])
        # Translate the subset position into an index of the
        # underlying dataset.
        return self.factory(self.dataset[self.indices[index]])

    def __len__(self) -> int:
        if self.indices is None:
            return len(self.dataset)
        return len(self.indices)

class TorchRepresentationDataset(TorchMusicFactoryDataset):
    """A PyTorch dataset of Music objects in a fixed representation.

    Parameters
    ----------
    dataset : :class:`muspy.Dataset`
        Dataset object to base on.
    representation : str
        Target representation. See
        :func:`muspy.to_representation()` for available
        representation.
    subset : str
        Label of the subset, shown in the string representation.
        Defaults to "Full".
    indices : sequence of int, optional
        Indices forwarded to the parent class. Defaults to None.
    **kwargs
        Keyword arguments passed on to the ``to_representation``
        method of each Music object.

    """

    def __init__(
        self,
        dataset: Dataset,
        representation: str,
        subset: str = "Full",
        indices: Sequence[int] = None,
        **kwargs: Any,
    ):
        # The converter closes over the requested representation and
        # its keyword arguments.
        def factory(music):
            return music.to_representation(representation, **kwargs)

        self.representation = representation
        super().__init__(dataset, factory, subset, indices)

    def __repr__(self) -> str:
        fields = (
            f"dataset={self.dataset}",
            f"representation={self.representation}",
            f"subset={self.subset}",
        )
        return f"TorchRepresentationDataset({', '.join(fields)})"
if not TORCH_AVAILABLE:
raise ImportError("Optional package pytorch is required.")

# No split
if splits is None:
Expand Down Expand Up @@ -1241,3 +1162,96 @@ def __init__(
ignore_exceptions=ignore_exceptions,
use_converted=use_converted,
)


class TorchMusicFactoryDataset(TorchDataset):
    """A PyTorch dataset built from a Music dataset.

    Items are produced on access by applying `factory` to the
    corresponding item of `dataset`.

    Parameters
    ----------
    dataset : :class:`muspy.Dataset`
        Dataset object to base on.
    factory : Callable
        Function to be applied to the Music objects. The input
        is a Music object, and the output is an array or a
        tensor.
    subset : str
        Label of the subset, shown in the string representation.
        Defaults to "Full".
    indices : sequence of int, optional
        Indices into `dataset` that make up this subset. Indices
        that are not smaller than ``len(dataset)`` are dropped and
        the remaining ones are sorted. Defaults to None, in which
        case every item of `dataset` is used.

    """

    def __init__(
        self,
        dataset: Dataset,
        factory: Callable,
        subset: str = "Full",
        indices: Sequence[int] = None,
    ):
        super().__init__()
        self.dataset = dataset
        self.factory = factory
        self.subset = subset
        if indices is None:
            self.indices = None
        else:
            # Keep only indices that fall inside the dataset, in
            # ascending order.
            size = len(self.dataset)
            self.indices = sorted(idx for idx in indices if idx < size)

    def __repr__(self) -> str:
        # Each label is paired with its own attribute; the fields
        # used to be swapped.
        return (
            f"TorchMusicFactoryDataset(dataset={self.dataset}, "
            f"factory={self.factory}, subset={self.subset})"
        )

    def __getitem__(self, index):
        if self.indices is None:
            return self.factory(self.dataset[index])
        # Translate the subset position into an index of the
        # underlying dataset.
        return self.factory(self.dataset[self.indices[index]])

    def __len__(self) -> int:
        if self.indices is None:
            return len(self.dataset)
        return len(self.indices)


def _torch_representation_factory(music, representation: str, kwargs):
    """Return `music` converted to the given representation.

    Defined at module level so that it can be wrapped with
    :func:`functools.partial` and pickled by reference.

    Parameters
    ----------
    music
        Object providing a ``to_representation`` method.
    representation : str
        Name of the target representation.
    kwargs : dict
        Keyword arguments passed on to ``to_representation``.

    """
    convert = music.to_representation
    return convert(representation, **kwargs)


class TorchRepresentationDataset(TorchMusicFactoryDataset):
    """A PyTorch dataset of Music objects in a fixed representation.

    Parameters
    ----------
    dataset : :class:`muspy.Dataset`
        Dataset object to base on.
    representation : str
        Target representation. See
        :func:`muspy.to_representation()` for available
        representation.
    subset : str
        Label of the subset, shown in the string representation.
        Defaults to "Full".
    indices : sequence of int, optional
        Indices forwarded to the parent class. Defaults to None.
    **kwargs
        Keyword arguments passed on to the ``to_representation``
        method of each Music object.

    """

    def __init__(
        self,
        dataset: Dataset,
        representation: str,
        subset: str = "Full",
        indices: Sequence[int] = None,
        **kwargs: Any,
    ):
        self.representation = representation
        # A partial over a module-level function is used as the
        # factory, rather than a function defined inside this method.
        super().__init__(
            dataset,
            factory=partial(
                _torch_representation_factory,
                representation=representation,
                kwargs=kwargs,
            ),
            subset=subset,
            indices=indices,
        )

    def __repr__(self) -> str:
        fields = (
            f"dataset={self.dataset}",
            f"representation={self.representation}",
            f"subset={self.subset}",
        )
        return f"TorchRepresentationDataset({', '.join(fields)})"
13 changes: 13 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test cases for dataset module."""
import pickle
import shutil

import pytest
Expand Down Expand Up @@ -102,6 +103,18 @@ def test_to_pytorch_dataset():
assert pytorch_dataset[0] is not None


def test_pickle_pytorch_dataset():
    """Check that a PyTorch dataset survives a pickle round trip.

    PyTorch datasets must support pickling so that the dataloader can
    use multiple workers when assembling a batch.
    """
    source = Music21Dataset("demos")
    pytorch_dataset = source.to_pytorch_dataset(representation="pitch")
    # Serialize and immediately restore the dataset, then make sure
    # the restored copy can still produce its first item.
    restored = pickle.loads(pickle.dumps(pytorch_dataset))
    assert restored[0] is not None


def test_to_tensorflow_dataset():
tf.config.set_visible_devices([], "GPU")
dataset = Music21Dataset("demos")
Expand Down