Skip to content

Too many open files error #11201

@whucdf

Description

@whucdf

Issue description

While using the dataloader from pytorch 0.4.1:
With num_workers > 0, the workers store the tensors in shared memory, but they do not release the shared-memory file handles after returning the tensors to the main process, even though the handles are no longer needed. The process will then run out of file handles if one stores the tensors in a list.

Code example


from torch.utils.data import Dataset
class testSet(Dataset):
    """Minimal Dataset yielding ``{"index": i}`` for each index.

    Used to reproduce the DataLoader shared-memory file-handle leak:
    the samples themselves are trivial, so any fd exhaustion comes from
    the worker/queue machinery, not the data.
    """
    def __init__(self):
        super(testSet, self).__init__()
    def __len__(self):
        # Large enough that iterating the full loader exhausts the
        # process's open-file limit if handles are leaked per batch.
        return 1000000
    def __getitem__(self, index):
        return {"index": index}

import torch

# Build the dataset and a single-worker loader; num_workers > 0 is what
# routes tensors through shared memory and triggers the handle leak.
test_data = testSet()
test_data_loader = torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=1,
    num_workers=1,
)
# Retaining every returned tensor keeps its shared-memory handle alive,
# eventually exhausting the file-descriptor limit.
index = [sample['index'] for sample in test_data_loader]

The error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-5-cf6ed576bc1c> in <module>()
----> 1 for sample in test_data_loader:
      2     #print(sample['index'])
      3     index.append(sample['index'])

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py in __next__(self)
    328         while True:
    329             assert (not self.shutdown and self.batches_outstanding > 0)
--> 330             idx, batch = self._get_batch()
    331             self.batches_outstanding -= 1
    332             if idx != self.rcvd_idx:

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _get_batch(self)
    307                 raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
    308         else:
--> 309             return self.data_queue.get()
    310 
    311     def __next__(self):

~/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/queues.py in get(self)
    335             res = self._reader.recv_bytes()
    336         # unserialize the data after having released the lock
--> 337         return _ForkingPickler.loads(res)
    338 
    339     def put(self, obj):

~/anaconda3/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py in rebuild_storage_fd(cls, df, size)
    149         fd = multiprocessing.reduction.rebuild_handle(df)
    150     else:
--> 151         fd = df.detach()
    152     try:
    153         storage = storage_from_cache(cls, fd_id(fd))

~/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py in detach(self)
     56             '''Get the fd.  This should only be called once.'''
     57             with _resource_sharer.get_connection(self._id) as conn:
---> 58                 return reduction.recv_handle(conn)
     59 
     60 

~/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/reduction.py in recv_handle(conn)
    180         '''Receive a handle over a local connection.'''
    181         with socket.fromfd(conn.fileno(), socket.AF_UNIX, socket.SOCK_STREAM) as s:
--> 182             return recvfds(s, 1)[0]
    183 
    184     def DupFd(fd):

~/anaconda3/envs/pytorch/lib/python3.6/multiprocessing/reduction.py in recvfds(sock, size)
    159             if len(ancdata) != 1:
    160                 raise RuntimeError('received %d items of ancdata' %
--> 161                                    len(ancdata))
    162             cmsg_level, cmsg_type, cmsg_data = ancdata[0]
    163             if (cmsg_level == socket.SOL_SOCKET and

RuntimeError: received 0 items of ancdata

System Info

  • PyTorch
  • OS: Ubuntu 16.04
  • PyTorch version: 0.4.1

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions