import glob
import gzip
import os.path
import re
import warnings

import numpy as np

from .. import snapshot, util
from . import HaloCatalogue, HaloParticleIndices, logger
from .details.number_mapping import (
    NonMonotonicHaloNumberMapper,
    SimpleHaloNumberMapper,
    create_halo_number_mapper,
)


class AHFCatalogue(HaloCatalogue):
    """
    Class to handle catalogues produced by the Amiga Halo Finder (AHF).
    """

    def __init__(self, sim, make_grp=None, get_all_parts=None, use_iord=None, ahf_basename=None,
                 dosort=None, only_stat=None, write_fpos=True, halo_numbers='file-order',
                 ignore_missing_substructure=True,
                 **kwargs):
        """Initialize an AHFCatalogue.

        **kwargs**:

        *use_iord*: if True, the particle IDs in the Amiga catalogue are taken
            to refer to the iord array. If False, they are the particle
            offsets within the file. If None, the parameter defaults to True
            for Gadget snapshots (including HDF5 outputs), False otherwise.

        *ahf_basename*: specify the basename of the AHF halo catalogue files;
            the code will append 'halos', 'particles', and 'substructure' to
            this basename to load the catalogue data.

        *halo_numbers*: specify how to number the halos. Options are:

            'ahf': use the halo numbers written in the AHF halos file, or
                zero-based indexing if none is present

            'file-order' (default): zero-based indexing of the halos, in file
                order

            'v1': one-based indexing of the halos, compatible with pynbody v1.x

            'length-order': sort by the number of particles in each halo, with
                the halo with the most particles being halo 0

            'length-order-v1': as length-order, but indexing from halo 1,
                compatible with dosort=True in pynbody v1

        *ignore_missing_substructure*: if True (default), the code will not
            raise an exception if the substructure file is missing or corrupt.
            If False, it will raise an exception.

        Deprecated kwargs:

        *make_grp*: if True, a 'grp' array is created in the underlying
            snapshot, specifying the lowest-level halo that any given particle
            belongs to. If False, no such array is created; if None, the
            behaviour is determined by the configuration system.

        *get_all_parts*: if True, the particle file is loaded for all halos.
            It is suggested to leave this as None, since loading everything up
            front is memory-intensive; the default behaviour is to load the
            data as it is needed.

        *dosort*: equivalent to halo_numbers='length-order-v1'

        *only_stat*: specify that you only wish to collect the halo properties
            stored in the AHF_halos file and not worry about particle
            information [deprecated, use get_dummy_halo instead]
        """
        if use_iord is None:
            use_iord = isinstance(
                sim.ancestor,
                (snapshot.gadget.GadgetSnap, snapshot.gadgethdf.GadgetHDFSnap)
            )
        self._use_iord = use_iord
        self._only_stat = only_stat
        self._try_writing_fpos = write_fpos

        if only_stat:
            warnings.warn(DeprecationWarning("only_stat keyword is deprecated; instead, use the "
                                             "catalogue's get_dummy_halo method"))

        if ahf_basename is not None:
            self._ahfBasename = ahf_basename
        else:
            self._ahfBasename = self._infer_ahf_basename(sim)

        self._determine_format_revision_from_filename()

        logger.info("AHFCatalogue loading halo properties")
        self._load_ahf_halo_properties(self._ahfBasename + 'halos')

        if dosort:
            warnings.warn(DeprecationWarning("dosort keyword is deprecated; instead pass "
                                             "halo_numbers='length-order-v1'"))
            halo_numbers = 'length-order-v1'

        number_mapper = self._setup_halo_numbering(halo_numbers)
        super().__init__(sim, number_mapper)

        self._remap_host_halo_property()

        if self._use_iord:
            self._init_iord_to_fpos()

        try:
            self._load_ahf_substructure(self._ahfBasename + 'substructure')
        except (KeyError, ValueError, FileNotFoundError):
            if not ignore_missing_substructure:
                raise
            logger.error("Unable to load AHF substructure file; continuing without. To expose the "
                         "underlying problem as an exception, pass ignore_missing_substructure=False "
                         "to the AHFCatalogue constructor")

        if make_grp:
            warnings.warn(DeprecationWarning("make_grp keyword is deprecated; instead, use the "
                                             "catalogue's get_group_array method"))
            self.base['grp'] = self.get_group_array()

        if get_all_parts:
            warnings.warn(DeprecationWarning("get_all_parts keyword is deprecated; instead, use the "
                                             "catalogue's load_all method"))
            self.load_all()

        logger.info("AHFCatalogue loaded")

    def _setup_halo_numbering(self, halo_numbers):
        has_id = 'ID' in self._halo_properties
        if has_id:
            self._ahf_own_number_mapper = create_halo_number_mapper(self._halo_properties['ID'])
        else:
            # if no explicit IDs, AHF implicitly numbers starting at 0 in file order
            self._ahf_own_number_mapper = SimpleHaloNumberMapper(0, self._num_halos)

        if halo_numbers == 'v1':
            number_mapper = SimpleHaloNumberMapper(1, self._num_halos)
        elif halo_numbers == 'file-order':
            number_mapper = SimpleHaloNumberMapper(0, self._num_halos)
        elif halo_numbers == 'length-order' or halo_numbers == 'length-order-v1':
            npart = self._halo_properties['npart']
            osort = np.argsort(-npart)  # this is better than argsort(npart)[::-1], because it's stable
            if halo_numbers.endswith('v1'):
                number_mapper = NonMonotonicHaloNumberMapper(osort, ordering=True, start_index=1)
            else:
                number_mapper = NonMonotonicHaloNumberMapper(osort, ordering=True, start_index=0)
        elif halo_numbers == 'ahf':
            number_mapper = self._ahf_own_number_mapper
        else:
            raise ValueError(f"halo_numbers keyword {halo_numbers} not recognised")
        return number_mapper
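
    # Illustrative sketch of the schemes above, for assumed npart = [10, 500, 50]:
    #   'file-order'      -> halos numbered 0, 1, 2 in file order
    #   'v1'              -> halos numbered 1, 2, 3 in file order
    #   'length-order'    -> the 500-particle halo becomes halo 0, then 50, then 10
    #   'length-order-v1' -> as 'length-order', but numbering starts from halo 1
    #   'ahf'             -> whatever the ID column of the AHF_halos file says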

    def _remap_host_halo_property(self):
        """When loaded from the .halos file, hostHalo is the AHF halo number of the host halo.
        This maps it onto whatever halo number pynbody is using."""
        if 'hostHalo' in self._halo_properties:
            host_halo = self._halo_properties['hostHalo']
            mask = host_halo != -1
            host_halo[mask] = self.number_mapper.index_to_number(
                self._ahf_own_number_mapper.number_to_index(host_halo[mask])
            )
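
    # For example, with halo_numbers='v1' and AHF numbering from zero, an AHF
    # host id of 12 maps to file index 12 and hence to pynbody halo number 13;
    # entries of -1 (no host) are left untouched.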

    def _determine_format_revision_from_filename(self):
        # New-format AHF outputs place a dot immediately before the redshift tag
        # (e.g. '.z0.000.'), so we check whether the character preceding the
        # final 'z' in the basename is a '.'
        self._is_new_format = self._ahfBasename.split("z")[-2][-1] == "."

    @classmethod
    def _infer_ahf_basename(cls, sim):
        candidates = cls._list_candidate_ahf_basenames(sim)
        if len(candidates) == 1:
            candidate = candidates[0]
        elif len(candidates) == 0:
            raise FileNotFoundError("No candidate AHF catalogue found; try specifying a catalogue "
                                    "using the ahf_basename keyword")
        else:
            candidate = candidates[0]
            warnings.warn(f"Multiple candidate AHF catalogues found; using {candidate}. To specify "
                          "a different catalogue, use the ahf_basename keyword, or move the other "
                          "catalogues.")
        # strip any .gz suffix, then the trailing 'particles' (9 characters),
        # leaving the basename to which 'halos' etc. can be appended
        return util.cutgz(candidate)[:-9]

    def _write_fpos(self):
        try:
            np.savetxt(self._ahfBasename + 'fpos', self._fpos, fmt="%d")
        except OSError:
            warnings.warn("Unable to write AHF_fpos file; performance will be reduced. Pass "
                          "write_fpos=False to the halo constructor to suppress this message, or "
                          "use the load_all() method to remove the need for storing fpos information.")
            self._try_writing_fpos = False

    @property
    def base(self):
        return self._base()

    def _get_file_positions(self):
        """Get the starting position of each halo's particle information within the
        AHF_particles file, for faster access later"""
        if not hasattr(self, "_fpos"):
            if os.path.exists(self._ahfBasename + 'fpos'):
                self._fpos = np.loadtxt(self._ahfBasename + 'fpos', dtype=int)
            else:
                self._fpos = np.empty(len(self.number_mapper), dtype=int)
                with util.open_(self._ahfBasename + 'particles') as f:
                    nhalo = int(f.readline().strip())
                    assert nhalo == len(self.number_mapper)
                    for hnum in range(nhalo):
                        npart = int(f.readline().strip())
                        assert npart == self._halo_properties['npart'][hnum]
                        self._fpos[hnum] = f.tell()
                        for i in range(npart):
                            f.readline()
                if self._try_writing_fpos:
                    if not os.path.exists(self._ahfBasename + 'fpos'):
                        self._write_fpos()
        return self._fpos
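
    # The AHF_fpos cache written above is a plain text file with one integer
    # byte offset per halo, e.g.:
    #   18
    #   1042
    #   9980
    # (the offsets shown are illustrative only)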

    def _load_ahf_particle_block(self, f, nparts):
        """Load the particles for the next halo described in particle file f"""
        if self._is_new_format:
            if not isinstance(f, gzip.GzipFile):
                data = np.fromfile(
                    f,
                    dtype=int,
                    sep=" ",
                    count=nparts * 2
                )[::2]
                data = np.ascontiguousarray(data)
            else:
                # unfortunately with gzipped files there does not
                # seem to be an efficient way to load nparts lines
                data = np.empty(nparts, dtype=int)
                for i in range(nparts):
                    data[i] = int(f.readline().split()[0])
            data = self._ahf_to_pynbody_particle_ids(data)
        else:
            if not isinstance(f, gzip.GzipFile):
                data = np.fromfile(f, dtype=int, sep=" ", count=nparts)
            else:
                # see comment above on gzipped files
                data = np.empty(nparts, dtype=int)
                for i in range(nparts):
                    data[i] = int(f.readline())
        data.sort()
        return data
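
    # In the new format, each particle line holds an (id, type) pair, e.g.:
    #   1234 1
    #   5678 0
    # and only the first column is kept; in the old format each line is a bare id.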

    def _ahf_to_pynbody_particle_ids(self, data):
        ng = len(self.base.gas)
        nd = len(self.base.dark)
        ns = len(self.base.star)
        nds = nd + ns
        if self._use_iord:
            data = self._iord_to_fpos.map_ignoring_order(data)
        elif isinstance(self.base, snapshot.ramses.RamsesSnap):
            # AHF expects only three families (DM, star, gas, in this order)
            # and generates iords on disc according to this rule.
            # For classical Ramses snapshots this is perfectly adequate, but for
            # more modern outputs that have extra families (tracers, BHs, ...)
            # we need to offset the ids to return the correct slicing.
            # TODO: These tests on snapshot type might not be necessary, as
            # using the family logic properly should be general. They are
            # currently kept to ensure 100% backwards compatibility with
            # previous behaviour, as this code is not explicitly checked by the
            # pynbody test distribution.
            if len(self.base) != nd + ns + ng:  # we have families beyond the base three
                # First identify DM, star and gas particles in AHF
                ahf_dm_mask = data < nd
                ahf_star_mask = (data >= nd) & (data < nds)
                ahf_gas_mask = data >= nds
                # Then offset them by the DM family start, to account for
                # additional families before it, e.g. gas tracers
                data[np.where(ahf_dm_mask)] += self.base._get_family_slice('dm').start
                # Star ids used to start at NDM; now they start with the star family slice
                offset = self.base._get_family_slice('star').start - nd
                data[np.where(ahf_star_mask)] += offset
                # Gas ids were greater than NDM + NSTAR; now they start with the gas slice
                offset = self.base._get_family_slice('gas').start - nds
                data[np.where(ahf_gas_mask)] += offset
        elif not isinstance(self.base, snapshot.nchilada.NchiladaSnap):
            hi_mask = data >= nds
            data[np.where(hi_mask)] -= nds
            data[np.where(~hi_mask)] += ng
        else:
            st_mask = (data >= nd) & (data < nds)
            g_mask = data >= nds
            data[np.where(st_mask)] += ng
            data[np.where(g_mask)] -= ns
        return data
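
    # Sketch of the default (non-Ramses, non-Nchilada) branch above, for assumed
    # family sizes ng=2, nd=3, ns=1:
    #   AHF ordering     : dark 0-2, star 3, gas 4-5
    #   pynbody ordering : gas 0-1, dark 2-4, star 5
    # so ids >= nd+ns drop by nds into the gas block, and the rest shift up by ng.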

    def _get_particle_indices_one_halo(self, halo_number):
        fpos = self._get_file_positions()
        file_index = self.number_mapper.number_to_index(halo_number)
        with util.open_(self._ahfBasename + 'particles') as f:
            f.seek(fpos[file_index], 0)
            ids = self._load_ahf_particle_block(f, nparts=self._halo_properties['npart'][file_index])
        return ids

    def _get_all_particle_indices(self):
        boundaries = np.cumsum(np.concatenate(([0], self._halo_properties['npart'])))
        boundaries = np.vstack((boundaries[:-1], boundaries[1:])).T
        particle_ids = np.empty(boundaries[-1, 1], dtype=int)
        with util.open_(self._ahfBasename + 'particles') as f:
            f.readline()  # skip the overall halo count
            for nparts, (start, end) in zip(self._halo_properties['npart'], boundaries):
                f.readline()  # skip the per-halo particle count
                particle_ids[start:end] = self._load_ahf_particle_block(f, nparts=nparts)
        return HaloParticleIndices(particle_ids=particle_ids, boundaries=boundaries)

    def get_properties_one_halo(self, i):
        index = self.number_mapper.number_to_index(i)
        return {key: self._halo_properties[key][index] for key in self._halo_properties}

    def get_properties_all_halos(self, with_units=True) -> dict:
        return self._halo_properties

    def _load_ahf_halo_properties(self, filename):
        # Note: we need to open in 'rt' mode in case the AHF catalogue is gzipped.
        with util.open_(filename, "rt") as f:
            first_line = f.readline()
            lines = f.readlines()

        # get all the property names from the first, commented line, stripping
        # the '#' and the trailing column numbers
        fields = first_line.replace("#", "").split()
        keys = [re.sub(r'\([0-9]*\)', '', field) for field in fields]

        if self._is_new_format:
            # fix for column 0 being a non-column in some versions of the AHF output
            if keys[0] == '':
                keys = keys[1:]

        self._halo_properties = {k: [] for k in keys}
        self._num_halos = len(lines)

        for line in lines:
            values = [
                float(x) if any(_ in x for _ in (".", "e", "nan", "inf"))
                else int(x)
                for x in line.split()
            ]
            # XXX Unit issues! AHF uses distances in Mpc/h, possibly masses as well
            if not self._is_new_format:
                values = values[1:]
            for key, value in zip(keys, values):
                self._halo_properties[key].append(value)

        for key in keys:
            self._halo_properties[key] = np.array(self._halo_properties[key])
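
    # An AHF_halos header typically looks like (truncated):
    #   #ID(1) hostHalo(2) numSubStruct(3) Mvir(4) npart(5) ...
    # which parses to keys ['ID', 'hostHalo', 'numSubStruct', 'Mvir', 'npart', ...]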

    def _load_ahf_substructure(self, filename):
        with util.open_(filename) as f:
            lines = f.readlines()

        logger.info("AHFCatalogue loading substructure")

        if len(lines[0].split()) != 2:
            # some variants of the format start with a line count, which we ignore
            lines = lines[1:]

        self._halo_properties['children'] = children = [[] for _ in range(self._num_halos)]
        self._halo_properties['parent'] = parent = np.empty(self._num_halos, dtype=int)
        parent.fill(-1)

        for halo_line, children_line in zip(lines[::2], lines[1::2]):
            haloid, _nsubhalos = (int(x) for x in halo_line.split())
            halo_index = self._ahf_own_number_mapper.number_to_index(haloid)
            halo_number = self.number_mapper.index_to_number(halo_index)
            children_ahf_numbering = [int(x) for x in children_line.split()]
            children_index = self._ahf_own_number_mapper.number_to_index(children_ahf_numbering)
            children_pynbody_numbering = self.number_mapper.index_to_number(children_index)
            children[halo_index] = children_pynbody_numbering
            for child_index in children_index:
                parent[child_index] = halo_number
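
    # The substructure file alternates a "haloid nsubhalos" line with a line
    # listing that halo's children, e.g.:
    #   0 2
    #   15 23
    #   4 1
    #   9
    # meaning halo 0 has children 15 and 23, and halo 4 has the single child 9.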

    @staticmethod
    def _list_candidate_ahf_basenames(sim):
        # Use a set to ensure that no duplicates can be produced. Duplicates
        # could arise in an edge case where _filename is a directory: a
        # trailing "/" would lead to a first detection here and a second one
        # again below.
        candidates = set(glob.glob(f"{sim._filename}*z*particles*"))
        if os.path.isdir(sim._filename):
            candidates = candidates.union(glob.glob(os.path.join(sim._filename, "*z*particles*")))
        return list(candidates)

    @classmethod
    def _can_load(cls, sim, ahf_basename=None, **kwargs):
        if ahf_basename:
            return True
        candidates = cls._list_candidate_ahf_basenames(sim)
        return len(candidates) > 0