In [1]:
%pip uninstall -y lsdb hipscat

# %pip install git+http://hipscat.rubin.science
# %pip install git+http://lsdb.rubin.science

%pip install lsdb

Found existing installation: lsdb 0.2.7.dev31+g79080ad
Uninstalling lsdb-0.2.7.dev31+g79080ad:
  Successfully uninstalled lsdb-0.2.7.dev31+g79080ad
Found existing installation: hipscat 0.3.6.dev2+g97643b6
Uninstalling hipscat-0.3.6.dev2+g97643b6:
  Successfully uninstalled hipscat-0.3.6.dev2+g97643b6
Note: you may need to restart the kernel to use updated packages.
Collecting lsdb
  Using cached lsdb-0.2.7-py3-none-any.whl.metadata (5.9 kB)
Collecting hipscat>=0.3.5 (from lsdb)
  Using cached hipscat-0.3.7-py3-none-any.whl.metadata (6.0 kB)
Using cached lsdb-0.2.7-py3-none-any.whl (65 kB)
Using cached hipscat-0.3.7-py3-none-any.whl (77 kB)
Installing collected packages: hipscat, lsdb
Successfully installed hipscat-0.3.7 lsdb-0.2.7
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Switch for including Gaia in the output catalog. When True, the output name
# gets a '_gaia' suffix, the ZTF catalog is cross-matched against Gaia before
# Pan-STARRS, and the Gaia distance table is joined at the end. When False,
# only the ZTF x Pan-STARRS cross-match is performed.
WITH_GAIA = True

In [2]:
# Name of the output directory; tagged with '_gaia' when Gaia is included.
catalog_name = 'ztf_ps1' + ('_gaia' if WITH_GAIA else '')

In [3]:
%%time

from pathlib import Path

import dask.distributed
import lsdb
import pyarrow as pa
import pyarrow.dataset
import pyarrow.parquet

CPU times: user 13 s, sys: 7.9 s, total: 20.9 s
Wall time: 59.3 s


In [4]:
%%time

# Root directory of the shared HiPSCat catalogs.
HIPSCAT_PATH = Path('/ocean/projects/phy210048p/shared/hipscat/catalogs/')

# Pan-STARRS bands for which mean PSF magnitudes and their errors are read.
PS1_BANDS = 'riz'

gaia_root = HIPSCAT_PATH / 'gaia_dr3'
ps1_root = HIPSCAT_PATH / 'ps1'

# --- ZTF DR17 coordinates ------------------------------------------------
# Read from paths relative to the working directory; keep r-band rows only.
ztf_dr17_coord = lsdb.read_hipscat(
    'hipscat/ztf_dr17_coord',
    margin_cache='hipscat/ztf_dr17_coord_2arcsec',
).query('filter == 2')

# --- Gaia EDR3 distances -------------------------------------------------
# RA and Dec are deliberately not loaded - the main Gaia catalog has them.
gaia_distance_columns = [
    'source_id',
    'r_med_geo', 'r_lo_geo', 'r_hi_geo',
    'r_med_photogeo', 'r_lo_photogeo', 'r_hi_photogeo',
    'flag',
]
gaia_distances = lsdb.read_hipscat(
    gaia_root / 'gaia_edr3_distances',
    margin_cache=str(gaia_root / 'gaia_edr3_distances_10arcs'),
    columns=gaia_distance_columns,
)

# --- Gaia DR3 main catalog -----------------------------------------------
gaia_columns = [
    'ra', 'dec',
    'source_id',
    'ruwe',
    'parallax', 'parallax_over_error',
    'pmra', 'pmdec', 'pmra_error', 'pmdec_error',
    'teff_gspphot', 'teff_gspphot_lower', 'teff_gspphot_upper',
    'logg_gspphot', 'logg_gspphot_lower', 'logg_gspphot_upper',
    # Currently not loaded:
    # 'ag_gspphot', 'ag_gspphot_lower', 'ag_gspphot_upper',
]
# Only the effective-temperature cut is active. Other cuts, currently disabled:
#   ruwe < 1.4
#   parallax_over_error > 10.0
#   (teff_gspphot_upper - teff_gspphot_lower) < 400
#   logg_gspphot_lower > 4.5
#   (logg_gspphot_upper - logg_gspphot_lower) < 0.2
gaia_cut = "teff_gspphot_upper < 3800"
gaia = lsdb.read_hipscat(
    gaia_root / 'gaia',
    # margin_cache has to be passed as str() because of
    # https://github.com/astronomy-commons/lsdb/issues/380
    margin_cache=str(gaia_root / 'gaia_10arcs'),
    columns=gaia_columns,
).query(gaia_cut)

# --- Pan-STARRS ----------------------------------------------------------
# Mean coordinates plus, per band, the mean PSF magnitude and its error.
ps1_columns = [
    'raMean', 'decMean',
    *(f'{b}MeanPSFMag' for b in PS1_BANDS),
    *(f'{b}MeanPSFMagErr' for b in PS1_BANDS),
]
# Colour cuts on (r - i) and (i - z), each widened by the summed magnitude
# errors, plus an upper limit on the error in every band.
ps1_cut = (
    "((rMeanPSFMag - iMeanPSFMag) + (rMeanPSFMagErr + iMeanPSFMagErr)) > 0.42"
    " and ((iMeanPSFMag - zMeanPSFMag) + (iMeanPSFMagErr + zMeanPSFMagErr)) > 0.23"
    " and rMeanPSFMagErr < 0.1 and iMeanPSFMagErr < 0.1 and zMeanPSFMagErr < 0.1"
)
panstarrs = lsdb.read_hipscat(
    ps1_root / 'ps1_otmo',
    margin_cache=str(ps1_root / 'ps1_otmo_10arcs'),
    columns=ps1_columns,
).query(ps1_cut)



CPU times: user 2.21 s, sys: 290 ms, total: 2.5 s
Wall time: 4.12 s


In [5]:
%%time

# Every cross-match keeps only the single nearest neighbour within 1 arcsec.
nearest_within_1arcsec = dict(radius_arcsec=1, n_neighbors=1)

# Start from the ZTF catalog and optionally attach Gaia; Gaia columns get
# the '_gaia' suffix, ZTF columns keep their names.
catalog = ztf_dr17_coord
if WITH_GAIA:
    catalog = catalog.crossmatch(
        gaia,
        suffixes=['', '_gaia'],
        **nearest_within_1arcsec,
    )

# Attach Pan-STARRS; no suffix is added on either side.
catalog = catalog.crossmatch(
    panstarrs,
    suffixes=['', ''],
    **nearest_within_1arcsec,
)

# The join has to come after the cross-matches so the margin cache is not
# lost, see https://github.com/astronomy-commons/lsdb/issues/382
if WITH_GAIA:
    catalog = catalog.join(
        gaia_distances,
        left_on='source_id_gaia',
        right_on='source_id',
        suffixes=['', '_edr3dist'],
    )

CPU times: user 15min 2s, sys: 21.4 s, total: 15min 24s
Wall time: 15min 26s


In [8]:
%%time

# Write the cross-matched catalog to a Parquet directory named after
# `catalog_name`, running the work on a 16-worker Dask client that only
# lives for the duration of this block.
worker_count = 16
with dask.distributed.Client(n_workers=worker_count) as client:
    # Show the client summary (dashboard link, workers) in the cell output.
    display(client)
    # NOTE(review): `_ddf` is a private attribute of the lsdb catalog,
    # presumably its underlying Dask DataFrame - confirm it is stable API.
    dask_frame = catalog._ddf
    dask_frame.to_parquet(catalog_name)

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 16
Total threads: 128,Total memory: 247.07 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36643,Workers: 16
Dashboard: http://127.0.0.1:8787/status,Total threads: 128
Started: Just now,Total memory: 247.07 GiB

0,1
Comm: tcp://127.0.0.1:39389,Total threads: 8
Dashboard: http://127.0.0.1:36501/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:44627,
Local directory: /var/tmp/dask-scratch-space/worker-wmp426pj,Local directory: /var/tmp/dask-scratch-space/worker-wmp426pj

0,1
Comm: tcp://127.0.0.1:33365,Total threads: 8
Dashboard: http://127.0.0.1:37973/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:39791,
Local directory: /var/tmp/dask-scratch-space/worker-pvxot2er,Local directory: /var/tmp/dask-scratch-space/worker-pvxot2er

0,1
Comm: tcp://127.0.0.1:34055,Total threads: 8
Dashboard: http://127.0.0.1:40205/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:37141,
Local directory: /var/tmp/dask-scratch-space/worker-im47pn65,Local directory: /var/tmp/dask-scratch-space/worker-im47pn65

0,1
Comm: tcp://127.0.0.1:42179,Total threads: 8
Dashboard: http://127.0.0.1:34779/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:41139,
Local directory: /var/tmp/dask-scratch-space/worker-ayusb0x5,Local directory: /var/tmp/dask-scratch-space/worker-ayusb0x5

0,1
Comm: tcp://127.0.0.1:33073,Total threads: 8
Dashboard: http://127.0.0.1:41163/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:46131,
Local directory: /var/tmp/dask-scratch-space/worker-f61sti98,Local directory: /var/tmp/dask-scratch-space/worker-f61sti98

0,1
Comm: tcp://127.0.0.1:34119,Total threads: 8
Dashboard: http://127.0.0.1:42587/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:34203,
Local directory: /var/tmp/dask-scratch-space/worker-8fd0jwix,Local directory: /var/tmp/dask-scratch-space/worker-8fd0jwix

0,1
Comm: tcp://127.0.0.1:45915,Total threads: 8
Dashboard: http://127.0.0.1:42965/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:45491,
Local directory: /var/tmp/dask-scratch-space/worker-p6x0y73r,Local directory: /var/tmp/dask-scratch-space/worker-p6x0y73r

0,1
Comm: tcp://127.0.0.1:38209,Total threads: 8
Dashboard: http://127.0.0.1:40637/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:36147,
Local directory: /var/tmp/dask-scratch-space/worker-3kc3cs4b,Local directory: /var/tmp/dask-scratch-space/worker-3kc3cs4b

0,1
Comm: tcp://127.0.0.1:37253,Total threads: 8
Dashboard: http://127.0.0.1:37393/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:45701,
Local directory: /var/tmp/dask-scratch-space/worker-jr4b6pw3,Local directory: /var/tmp/dask-scratch-space/worker-jr4b6pw3

0,1
Comm: tcp://127.0.0.1:46257,Total threads: 8
Dashboard: http://127.0.0.1:36967/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:42807,
Local directory: /var/tmp/dask-scratch-space/worker-rw0eu0ny,Local directory: /var/tmp/dask-scratch-space/worker-rw0eu0ny

0,1
Comm: tcp://127.0.0.1:36593,Total threads: 8
Dashboard: http://127.0.0.1:44803/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:38981,
Local directory: /var/tmp/dask-scratch-space/worker-mhd0y18t,Local directory: /var/tmp/dask-scratch-space/worker-mhd0y18t

0,1
Comm: tcp://127.0.0.1:34679,Total threads: 8
Dashboard: http://127.0.0.1:34505/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:38107,
Local directory: /var/tmp/dask-scratch-space/worker-shsz8zzb,Local directory: /var/tmp/dask-scratch-space/worker-shsz8zzb

0,1
Comm: tcp://127.0.0.1:41077,Total threads: 8
Dashboard: http://127.0.0.1:42867/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:44771,
Local directory: /var/tmp/dask-scratch-space/worker-5onxrujh,Local directory: /var/tmp/dask-scratch-space/worker-5onxrujh

0,1
Comm: tcp://127.0.0.1:36691,Total threads: 8
Dashboard: http://127.0.0.1:39163/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:40743,
Local directory: /var/tmp/dask-scratch-space/worker-xgiv526i,Local directory: /var/tmp/dask-scratch-space/worker-xgiv526i

0,1
Comm: tcp://127.0.0.1:43451,Total threads: 8
Dashboard: http://127.0.0.1:36939/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:36569,
Local directory: /var/tmp/dask-scratch-space/worker-7dt9ypof,Local directory: /var/tmp/dask-scratch-space/worker-7dt9ypof

0,1
Comm: tcp://127.0.0.1:33721,Total threads: 8
Dashboard: http://127.0.0.1:43097/status,Memory: 15.44 GiB
Nanny: tcp://127.0.0.1:33909,
Local directory: /var/tmp/dask-scratch-space/worker-8hf1iib8,Local directory: /var/tmp/dask-scratch-space/worker-8hf1iib8


This may cause some slowdown.
Consider scattering data ahead of time and using futures.
2024-07-26 11:40:56,585 - tornado.application - ERROR - Uncaught exception GET /individual-cluster-memory/ws (10.8.11.30)
HTTPServerRequest(protocol='http', host='127.0.0.1:8787', method='GET', uri='/individual-cluster-memory/ws', version='HTTP/1.1', remote_ip='10.8.11.30')
Traceback (most recent call last):
  File "/ocean/projects/phy210048p/malanche/ztfdr17-psdr2-gaiadr3/cenv/lib/python3.12/site-packages/tornado/websocket.py", line 938, in _accept_connection
    open_result = handler.open(*handler.open_args, **handler.open_kwargs)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/ocean/projects/phy210048p/malanche/ztfdr17-psdr2-gaiadr3/cenv/lib/python3.12/site-packages/tornado/web.py", line 3301, in wrapper
    return method(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/ocean/projects/phy210048p/malanche/ztfdr17-psdr2-gaiadr3/cenv/lib/pyt

TimeoutError: timed out after 60 s.

2024-07-26 12:04:02,292 - distributed.scheduler - ERROR - Workers ['tcp://127.0.0.1:34119'] did not shut down within 30s; force closing
2024-07-26 12:04:02,372 - distributed.scheduler - ERROR - 1/1 nanny worker(s) did not shut down within 30s: {'tcp://127.0.0.1:34119'}
Traceback (most recent call last):
  File "/ocean/projects/phy210048p/malanche/ztfdr17-psdr2-gaiadr3/cenv/lib/python3.12/site-packages/distributed/utils.py", line 832, in wrapper
    return await func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/ocean/projects/phy210048p/malanche/ztfdr17-psdr2-gaiadr3/cenv/lib/python3.12/site-packages/distributed/scheduler.py", line 6377, in restart_workers
    raise TimeoutError(
TimeoutError: 1/1 nanny worker(s) did not shut down within 30s: {'tcp://127.0.0.1:34119'}
2024-07-26 12:04:02,373 - distributed.scheduler - ERROR - 1/1 nanny worker(s) did not shut down within 30s: {'tcp://127.0.0.1:34119'}
Traceback (most recent call last):
  File "/ocean/projects/phy210048

In [9]:
%%time

# Re-read the Parquet output written above and order the rows by 'oid'.
unsorted_dataset = pa.dataset.dataset(catalog_name)
dataset = unsorted_dataset.sort_by('oid')

# Partition the sorted output by 'fieldid', using the filename partitioning
# flavor and taking the field's type from the dataset schema itself.
fieldid_schema = pa.schema([dataset.schema.field('fieldid')])
by_fieldid = pa.dataset.partitioning(
    schema=fieldid_schema,
    flavor="filename",
)

pa.dataset.write_dataset(
    dataset,
    f"{catalog_name}_sorted",
    format='parquet',
    partitioning=by_fieldid,
)

CPU times: user 4min 44s, sys: 11min 58s, total: 16min 42s
Wall time: 7min 32s
