# Unit test data

This directory contains very small toy data sets that are used
for unit tests.

## Object catalog: small_sky

This "object catalog" is 131 randomly generated radec values. 

- All radec positions are in the Healpix pixel order 0, pixel 11.
- IDs are integers from 700-831.

The following are imports and paths that are used throughout the notebook.

In [None]:
import shutil
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
from dask.distributed import Client
from hats_import import (
    ImportArguments,
    pipeline_with_client,
    IndexArguments,
    CollectionArguments,
)

import hats
from hats.catalog import PartitionInfo, TableProperties
from hats.io.file_io import remove_directory
from hats.pixel_math import HealpixPixel
import lsdb
from hats.pixel_math.spatial_index import healpix_to_spatial_index

## Assumes you also have a working local branch of hats-import
hats_import_dir = "../../../hats-import/tests/data/"

# Scratch directory shared by the whole notebook; it is handed to the dask
# client below as its local directory.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

# One worker with one thread, reused by every pipeline run in this notebook.
client = Client(local_directory=tmp_dir, threads_per_worker=1, n_workers=1)

### small_sky

This "object catalog" is 131 randomly generated radec values. 

- All radec positions are in the Healpix pixel order 0, pixel 11.
- IDs are integers from 700-831.

This catalog was generated with the following snippet:

In [None]:
# Clear any previous output before regenerating. `ignore_errors=True` (as used
# for the association table in this notebook) keeps the cell runnable when
# ./small_sky does not exist yet, e.g. after an interrupted earlier run.
remove_directory("./small_sky", ignore_errors=True)
with tempfile.TemporaryDirectory() as pipeline_tmp:
    # Import the raw CSV input from the hats-import test data into ./small_sky.
    # The temporary directory is passed to the pipeline as its `tmp_dir` and is
    # deleted when the `with` block exits.
    args = ImportArguments(
        input_path=Path(hats_import_dir) / "small_sky",
        output_path=".",
        file_reader="csv",
        output_artifact_name="small_sky",
        tmp_dir=pipeline_tmp,
    )
    pipeline_with_client(args, client)

### small_sky_order1

This catalog has the same data points as other small sky catalogs,
but is coerced to spreading these data points over partitions at order 1, instead
of order 0.

This means there are 4 leaf partition files, instead of just 1, and so can
be useful for confirming reads/writes over multiple leaf partition files.

NB: Setting `constant_healpix_order` coerces the import pipeline to create
leaf partitions at order 1.

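This catalog was generated with the following snippet, which builds it inside the
`small_sky_o1_collection` collection along with a margin cache and an index on `id`: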

In [None]:
# Clear any previous output before regenerating. `ignore_errors=True` (as used
# for the association table in this notebook) keeps the cell runnable when the
# collection directory does not exist yet.
remove_directory("./small_sky_o1_collection", ignore_errors=True)
with tempfile.TemporaryDirectory() as pipeline_tmp:
    # The collection is described in three chained steps: the object catalog,
    # its margin cache, and an index on the `id` column.
    args = (
        CollectionArguments(
            output_artifact_name="small_sky_o1_collection",
            output_path=".",
            tmp_dir=pipeline_tmp,
            addl_hats_properties={"obs_regime": "Optical", "default_index": "id"},
        )
        .catalog(
            input_path=Path(hats_import_dir) / "small_sky",
            file_reader="csv",
            output_artifact_name="small_sky_order1",
            # Forces the leaf partitions to order 1 (see the NB in the text above).
            constant_healpix_order=1,
        )
        .add_margin(margin_threshold=7200, output_artifact_name="small_sky_order1_margin", is_default=True)
        .add_index(
            indexing_column="id",
            output_artifact_name="small_sky_order1_id_index",
            include_healpix_29=False,
            compute_partition_size=200_000,
        )
    )

    pipeline_with_client(args, client)

### small_sky_to_small_sky_order1

Association table that maps (pretty naively) the `small_sky` to `small_sky_order1`. Note that these are the *same* catalog data, but the stored pixels are at different healpix orders.

Note also that this doesn't really create a catalog! This is faking out a "soft" association catalog, which just contains the partition join information, and not the actual matching rows. It's not generated by any "import pipeline", but just through writing the files directly.

This catalog was generated using the following snippet:

In [None]:
out_catalog_name = "small_sky_to_small_sky_order1"

# Start from an empty catalog directory.
remove_directory(out_catalog_name, ignore_errors=True)
Path(out_catalog_name).mkdir(parents=True, exist_ok=True)

# Partition listing: the four order-1 pixels 44 through 47.
partition_info = PartitionInfo.from_healpix([HealpixPixel(1, p) for p in np.arange(44, 48)])
partition_info.write_to_file(f"{out_catalog_name}/partition_info.csv")

# Keep the properties object itself in `properties`; the original bound the
# return value of `to_properties_file` to this name instead.
properties = TableProperties(
    catalog_name=out_catalog_name,
    catalog_type="association",
    total_rows=131,
    hats_primary_table_url="small_sky",
    hats_col_assn_primary="id",
    hats_col_assn_primary_assn="id_small_sky",
    hats_assn_join_table_url="small_sky_order1",
    hats_col_assn_join="id",
    hats_col_assn_join_assn="id_small_sky_order1",
    hats_assn_leaf_files=False,
    assn_max_separation=0,
)
properties.to_properties_file(out_catalog_name)

### small_sky_npix_alt_suffix

Copies small_sky but changes the parquet file suffix.

In [None]:
# hats does not constrain the suffix,
# but the suffix should make the file recognizable as parquet for compatibility with other libraries.
npix_suffix = ".parq"  # could also include the compression, e.g., ".snappy.parquet"
out_catalog_name = "small_sky_npix_alt_suffix"

sso = hats.read_hats("small_sky")
out_catalog_info = sso.catalog_info.copy_and_update(catalog_name=out_catalog_name, npix_suffix=npix_suffix)

# Copy each leaf file of small_sky to the matching path (with the new suffix)
# in the output catalog.
for pixel in sso.get_healpix_pixels():
    source_file = hats.io.paths.pixel_catalog_file(sso.catalog_base_dir, pixel)
    target_file = hats.io.paths.pixel_catalog_file(out_catalog_name, pixel, npix_suffix=npix_suffix)
    target_file.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(source_file, target_file)

# Write the catalog-level files next to the copied data.
hats.io.write_parquet_metadata(out_catalog_name)
out_catalog_info.to_properties_file(out_catalog_name)
sso.partition_info.write_to_file(hats.io.paths.get_partition_info_pointer(out_catalog_name))

### small_sky_npix_as_dir

Copies small_sky but makes Npix a directory.

In [None]:
import shutil
import hats

# A trailing slash as the suffix makes each Npix entry a directory.
npix_suffix = "/"
out_catalog_name = "small_sky_npix_as_dir"

sso = hats.read_hats("small_sky")
out_catalog_info = sso.catalog_info.copy_and_update(catalog_name=out_catalog_name, npix_suffix=npix_suffix)

# Copy each leaf file of small_sky into its own per-pixel directory.
for pixel in sso.get_healpix_pixels():
    source_file = hats.io.paths.pixel_catalog_file(sso.catalog_base_dir, pixel)
    pixel_dir = hats.io.paths.pixel_catalog_file(out_catalog_name, pixel, npix_suffix=npix_suffix)
    pixel_dir.mkdir(parents=True, exist_ok=True)
    # hats will only care about `pixel_dir`. It will be agnostic to filenames, given `npix_suffix = "/"`.
    shutil.copy(source_file, pixel_dir / "part0.parquet")

# Write the catalog-level files next to the copied data.
hats.io.write_parquet_metadata(out_catalog_name)
out_catalog_info.to_properties_file(out_catalog_name)
sso.partition_info.write_to_file(hats.io.paths.get_partition_info_pointer(out_catalog_name))

## Source catalog: small_sky_source

This "source catalog" is 131 detections at each of the 131 objects
in the "small_sky" catalog. These have a random magnitude, MJD, and 
band (selected from ugrizy). The full script that generated the values
can be found [here](https://github.com/delucchi-cmu/hipscripts/blob/main/twiddling/small_sky_source.py).

The catalog was generated with the following snippet, using raw data
from the test data directory of the `hats-import` repository.

NB: `pixel_threshold=3000` is set just to make sure that we're generating
a handful of files at various healpix orders.

In [None]:
# Clear any previous output before regenerating. `ignore_errors=True` (as used
# for the association table in this notebook) keeps the cell runnable when
# ./small_sky_source does not exist yet.
remove_directory("./small_sky_source", ignore_errors=True)
with tempfile.TemporaryDirectory() as pipeline_tmp:
    # Import the raw source CSV files; positions come from the
    # `source_ra` / `source_dec` columns.
    args = ImportArguments(
        input_path=Path(hats_import_dir) / "small_sky_source",
        output_path=".",
        file_reader="csv",
        ra_column="source_ra",
        dec_column="source_dec",
        catalog_type="source",
        # See the NB in the text above: chosen to get files at several orders.
        pixel_threshold=3000,
        row_group_kwargs={"num_rows": 1_000},
        highest_healpix_order=6,
        drop_empty_siblings=False,
        output_artifact_name="small_sky_source",
        skymap_alt_orders=[2, 4],
        tmp_dir=pipeline_tmp,
    )
    pipeline_with_client(args, client)

### small_sky_source_object_index

This catalog exists as an index of the SOURCE table, using the OBJECT ID
as the indexed column. This means you should be able to quickly find
partitions of SOURCES for a given OBJECT ID.

NB: 

- Setting `compute_partition_size` to something less than `1_000_000` 
  coerces the import pipeline to create smaller result partitions, 
  and so we have three distinct index partitions.
- Setting `include_healpix_29=False` keeps us from needing a row for every 
  source and lets the indexing pipeline create only one row per 
  unique objectId/Norder/Npix

This catalog was generated using the following snippet:

In [None]:
# Clear any previous output before regenerating. `ignore_errors=True` (as used
# for the association table in this notebook) keeps the cell runnable when the
# index directory does not exist yet.
remove_directory("./small_sky_source_object_index", ignore_errors=True)
with tempfile.TemporaryDirectory() as pipeline_tmp:
    # Index the small_sky_source catalog (read from the current directory)
    # on its `object_id` column.
    args = IndexArguments(
        input_catalog_path="small_sky_source",
        indexing_column="object_id",
        output_path=".",
        output_artifact_name="small_sky_source_object_index",
        include_healpix_29=False,
        compute_partition_size=200_000,
        tmp_dir=pipeline_tmp,
    )
    pipeline_with_client(args, client)

## MapCatalog - square map

Silly little map catalog that contains the number of stars in each pixel (the count is just the square of the pixel number).

In [None]:
# Pixel numbers 0 through 11, passed below with order 0.
target_pixels = np.arange(12)

# The "star count" of a pixel is simply its pixel number squared.
square_vals = target_pixels**2

# Spatial index value for each pixel at order 0.
healpix_29 = healpix_to_spatial_index(0, target_pixels)

value_frame = pd.DataFrame({"_healpix_29": healpix_29, "star_count": square_vals})

In [None]:
# Clear any previous output before regenerating. `ignore_errors=True` (as used
# for the association table in this notebook) keeps the cell runnable when
# ./square_map does not exist yet.
remove_directory("./square_map", ignore_errors=True)
with tempfile.TemporaryDirectory() as pipeline_tmp:
    # Write the in-memory frame out as CSV so the import pipeline can read it.
    csv_file = Path(pipeline_tmp) / "square_map.csv"
    value_frame.to_csv(csv_file, index=False)
    args = ImportArguments(
        constant_healpix_order=1,
        catalog_type="map",
        # Rows are located by the `_healpix_29` column, so no ra/dec columns.
        use_healpix_29=True,
        ra_column=None,
        dec_column=None,
        file_reader="csv",
        input_file_list=[csv_file],
        output_artifact_name="square_map",
        output_path=".",
        tmp_dir=pipeline_tmp,
    )

    pipeline_with_client(args, client)

### Nested catalog: small_sky_nested

Nests light curves from `small_sky_source` into the `small_sky_order1` object catalog.

In [None]:
# Clear any previous output before regenerating. `ignore_errors=True` (as used
# for the association table in this notebook) keeps the cell runnable when
# ./small_sky_nested does not exist yet.
remove_directory("./small_sky_nested", ignore_errors=True)

small_sky_object = lsdb.read_hats("small_sky_o1_collection")
small_sky_source = lsdb.read_hats("small_sky_source")

# Attach each object's sources as a nested "lc" column, matching
# object `id` to source `object_id`.
small_sky_nested = small_sky_object.join_nested(
    small_sky_source, left_on="id", right_on="object_id", nested_column_name="lc"
)

# Add the partitioning columns to every row. In the HATS layout `Dir` is the
# pixel number rounded down to a multiple of 10,000, so the floor division
# has to be scaled back up (the original omitted the multiplication).
small_sky_nested = small_sky_nested.map_partitions(
    lambda df, p: df.assign(Norder=p.order, Npix=p.pixel, Dir=(p.pixel // 10000) * 10000),
    include_pixel=True,
)
lsdb.io.to_hats(
    small_sky_nested,
    base_catalog_path="small_sky_nested",
    catalog_name="small_sky_nested",
    histogram_order=5,
    overwrite=True,
)

In [None]:
# Clear any previous output before regenerating. `ignore_errors=True` (as used
# for the association table in this notebook) keeps the cell runnable when
# ./small_sky_healpix13 does not exist yet.
remove_directory("./small_sky_healpix13", ignore_errors=True)
with tempfile.TemporaryDirectory() as pipeline_tmp:
    args = ImportArguments(
        # `input_file_list` is given a list, as in the square_map cell,
        # rather than a bare path.
        input_file_list=[Path(hats_import_dir) / "test_formats" / "small_sky_healpix13.csv"],
        output_path=".",
        file_reader="csv",
        pixel_threshold=3000,
        row_group_kwargs={"num_rows": 1_000},
        highest_healpix_order=2,
        output_artifact_name="small_sky_healpix13",
        tmp_dir=pipeline_tmp,
        # No `_healpix_29` column is added; the catalog properties point at the
        # `healpix13` column (order 13) instead.
        add_healpix_29=False,
        addl_hats_properties={"hats_col_healpix": "healpix13", "hats_col_healpix_order": 13},
    )
    pipeline_with_client(args, client)

In [None]:
# Shut down the dask client first, since its local directory lives inside
# the temporary directory that is removed next.
client.close()
# Delete the notebook-wide temporary directory created in the setup cell.
tmp_path.cleanup()