Issue description
While using the DataLoader from PyTorch 0.4.1:
With num_workers > 0, the workers place the returned tensors in shared memory, but the shared-memory file handles are not released after the tensor has been handed over to the main process and the handles are no longer needed. If the returned tensors are kept alive, e.g. by storing them in a list, the process eventually runs out of file handles.
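A common mitigation for this class of failure, offered here as a sketch rather than part of the original report, is to switch PyTorch's tensor-sharing strategy from file descriptors to the file system before the DataLoader is created:

```python
import torch.multiprocessing

# Share tensors via files on disk instead of per-tensor file descriptors,
# so the number of in-flight shared tensors is no longer bounded by the
# process's fd limit. This trades fd exhaustion for /dev/shm usage; see
# the torch.multiprocessing docs for the caveats. Call this before
# constructing the DataLoader.
torch.multiprocessing.set_sharing_strategy('file_system')
```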
Code example
```python
import torch
from torch.utils.data import Dataset

class testSet(Dataset):
    def __init__(self):
        super(testSet, self).__init__()

    def __len__(self):
        return 1000000

    def __getitem__(self, index):
        return {"index": index}

test_data = testSet()
test_data_loader = torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=1,
    num_workers=1,
)

index = []
for sample in test_data_loader:
    index.append(sample['index'])
```
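Each tensor appended to the list keeps its shared-memory handle alive. As a workaround sketch (my assumption, not from the report), copying the value out of the shared-memory tensor before storing it avoids accumulating handles:

```python
index = []
for sample in test_data_loader:
    # .item() copies the single value out of the shared-memory tensor,
    # so the underlying handle can be freed once the batch is dropped.
    index.append(sample['index'].item())
```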
The error:
```
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-5-cf6ed576bc1c> in <module>()
----> 1 for sample in test_data_loader:
2 #print(sample['index'])
3 index.append(sample['index'])
~/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py in __next__(self)
328 while True:
329 assert (not self.shutdown and self.batches_outstanding > 0)
--> 330 idx, batch = self._get_batch()
331 self.batches_outstanding -= 1
332 if idx != self.rcvd_idx:
~/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _get_batch(self)
307 raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
308 else:
--> 309 return self.data_queue.get()
310
311 def __next__(self):
~/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/queues.py in get(self)
335 res = self._reader.recv_bytes()
336 # unserialize the data after having released the lock
--> 337 return _ForkingPickler.loads(res)
338
339 def put(self, obj):
~/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py in rebuild_storage_fd(cls, df, size)
149 fd = multiprocessing.reduction.rebuild_handle(df)
150 else:
--> 151 fd = df.detach()
152 try:
153 storage = storage_from_cache(cls, fd_id(fd))
~/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py in detach(self)
56 '''Get the fd. This should only be called once.'''
57 with _resource_sharer.get_connection(self._id) as conn:
---> 58 return reduction.recv_handle(conn)
59
60
~/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/reduction.py in recv_handle(conn)
180 '''Receive a handle over a local connection.'''
181 with socket.fromfd(conn.fileno(), socket.AF_UNIX, socket.SOCK_STREAM) as s:
--> 182 return recvfds(s, 1)[0]
183
184 def DupFd(fd):
~/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/reduction.py in recvfds(sock, size)
159 if len(ancdata) != 1:
160 raise RuntimeError('received %d items of ancdata' %
--> 161 len(ancdata))
162 cmsg_level, cmsg_type, cmsg_data = ancdata[0]
163 if (cmsg_level == socket.SOL_SOCKET and
RuntimeError: received 0 items of ancdata
```
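The traceback bottoms out in `recvfds`, where the main process fails to receive a file descriptor over a Unix socket because no descriptors can be duplicated anymore. A diagnostic sketch for checking the fd limit and the active sharing strategy (the quoted default values are assumptions about a typical Linux setup):

```python
import resource
import torch.multiprocessing

# Soft/hard limits on open file descriptors for this process; on many
# Linux systems the soft limit defaults to 1024, which a long loop like
# the one above can exhaust.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"fd limit: soft={soft}, hard={hard}")

# 'file_descriptor' is the default strategy on Linux and is the one that
# consumes a file descriptor per shared tensor.
print(torch.multiprocessing.get_sharing_strategy())
```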
System Info
- PyTorch
- OS: Ubuntu 16.04
- PyTorch version: 0.4.1