# Unit test data

This directory contains very small toy data sets that are used
for unit tests.

## Object catalog: small_sky

This "object catalog" is 131 randomly generated radec values.

- All radec positions are in the Healpix pixel order 0, pixel 11.
- IDs are integers from 700-831.

The following are imports and paths that are used throughout the notebook.

In [None]:
import shutil
import tempfile
from pathlib import Path

import lsdb
import pyarrow as pa
import pyarrow.dataset as pds
import pyarrow.parquet as pq
from dask.distributed import Client
from hats.io.file_io import remove_directory

from hats_import import pipeline_with_client, ImportArguments

# Base directory against which the raw import inputs are resolved
# (see `input_path` in the import cells below).
hats_import_dir = "."

# Scratch directory used as the Dask client's `local_directory`;
# it is removed by `tmp_path.cleanup()`.
tmp_path = tempfile.TemporaryDirectory()
tmp_dir = tmp_path.name

# Local Dask client with one worker running one thread.
client = Client(
    local_directory=tmp_dir,
    n_workers=1,
    threads_per_worker=1,
)

### small_sky

This "object catalog" is 131 randomly generated radec values.

- All radec positions are in the Healpix pixel order 0, pixel 11.
- IDs are integers from 700-831.

This catalog was generated with the following snippet:

In [None]:
# Remove any previously generated copy of the catalog directory.
remove_directory("./small_sky_object_catalog")

with tempfile.TemporaryDirectory() as pipeline_tmp:
    # Gather the import options in one mapping, then expand them into ImportArguments.
    object_import_options = {
        "input_path": Path(hats_import_dir) / "small_sky",
        "output_path": ".",
        "file_reader": "csv",
        "highest_healpix_order": 5,
        "output_artifact_name": "small_sky_object_catalog",
        "tmp_dir": pipeline_tmp,
        "addl_hats_properties": {"hats_cols_default": ["ra", "dec", "id"]},
    }
    args = ImportArguments(**object_import_options)
    pipeline_with_client(args, client)

## Source catalog: small_sky_source

This "source catalog" is 131 detections at each of the 131 objects
in the "small_sky" catalog. These have a random magnitude, MJD, and
band (selected from ugrizy). The full script that generated the values
can be found [here](https://github.com/delucchi-cmu/hipscripts/blob/main/twiddling/small_sky_source.py).

The catalog was generated with the following snippet, using raw data
from the `hats-import` file.

NB: `pixel_threshold=3000` is set just to make sure that we're generating
a handful of files at various healpix orders.

In [None]:
# Remove any previously generated copy of the catalog directory.
remove_directory("./small_sky_source_catalog")

with tempfile.TemporaryDirectory() as pipeline_tmp:
    # Gather the import options in one mapping, then expand them into ImportArguments.
    source_import_options = {
        "input_path": Path(hats_import_dir) / "small_sky_source",
        "output_path": ".",
        "file_reader": "csv",
        # The source CSVs name their position columns `source_ra` / `source_dec`.
        "ra_column": "source_ra",
        "dec_column": "source_dec",
        "catalog_type": "source",
        "highest_healpix_order": 5,
        "pixel_threshold": 3000,
        "drop_empty_siblings": False,
        "output_artifact_name": "small_sky_source_catalog",
        "tmp_dir": pipeline_tmp,
    }
    args = ImportArguments(**source_import_options)
    pipeline_with_client(args, client)

In [None]:
# The two import cells above are the only ones that pass `client` explicitly,
# so the Dask client and its scratch directory are released here.
# NOTE(review): the same two calls are repeated in the final cells of this
# notebook — confirm whether both sets are intended.
client.close()
tmp_path.cleanup()

## Nested catalog: small_sky_nested_catalog

In [None]:
# Open the two catalogs generated by the import cells above.
small_sky_object = lsdb.read_hats("small_sky_object_catalog", columns="all")
small_sky_source = lsdb.read_hats("small_sky_source_catalog")

# Attach the sources to their objects (object `id` == source `object_id`)
# as a nested column named "lc".
small_sky_nested = small_sky_object.join_nested(
    small_sky_source,
    left_on="id",
    right_on="object_id",
    nested_column_name="lc",
)

# Write the joined catalog to disk, replacing any existing output.
nested_write_options = {
    "base_catalog_path": "small_sky_nested_catalog",
    "catalog_name": "small_sky_nested_catalog",
    "histogram_order": 5,
    "overwrite": True,
}
lsdb.io.to_hats(small_sky_nested, **nested_write_options)

## Association catalog: small_sky_object_source_association

This catalog contains a mapping between small_sky objects and sources.

In [None]:
# Remove any previously generated copy of the association catalog.
remove_directory("./small_sky_object_source_association")

# Crossmatch objects against sources; overlapping column names receive the
# "_obj" / "_src" suffixes.
association = small_sky_object.crossmatch(
    small_sky_source,
    suffixes=("_obj", "_src"),
    radius_arcsec=3600,
)

# Only the two id columns and the separation are written to the association table.
association_columns = association[["id_obj", "object_id_src", "_dist_arcsec"]]

lsdb.io.to_association(
    association_columns,
    base_catalog_path="small_sky_object_source_association",
    catalog_name="small_sky_object_source_association",
    # Primary (object) side of the association.
    primary_catalog_dir="small_sky_object_catalog",
    primary_column_association="id_obj",
    primary_id_column="id",
    # Join (source) side of the association.
    join_catalog_dir="small_sky_source_catalog",
    join_column_association="object_id_src",
    join_id_column="object_id",
    separation_column="_dist_arcsec",
    overwrite=True,
)

## Malformed Catalogs: bad_schemas and wrong_files_and_rows

These datasets are designed to fail verification tests.
They are generated by mangling `small_sky_object_catalog`.

In [None]:
# Open the object catalog's parquet dataset through its _metadata file; it is
# the starting point for every malformed catalog generated below.
input_dataset_path = Path(hats_import_dir) / "small_sky_object_catalog" / "dataset"
input_ds = pds.parquet_dataset(input_dataset_path / "_metadata")

# Unit tests expect the Npix=11 data file, so pick the first fragment with that name.
npix11_fragments = (
    frag for frag in input_ds.get_fragments() if frag.path.endswith("Npix=11.parquet")
)
input_frag = next(npix11_fragments)

# Path of the fragment relative to the dataset root; reused to place the output files.
frag_key = Path(input_frag.path).relative_to(input_dataset_path)
input_tbl = input_frag.to_table()

In [None]:
def collect_and_write_metadata(output_dataset_path: Path, schema: pa.Schema | None = None) -> None:
    """Write a ``_metadata`` file describing the parquet files under a dataset directory.

    The file-level metadata of every fragment that ``pyarrow.dataset`` finds under
    ``output_dataset_path`` is collected, with its recorded file path set relative to
    the dataset root, and written to ``output_dataset_path / "_metadata"``.

    Args:
        output_dataset_path: Root directory of the dataset to scan. The ``_metadata``
            file is written directly inside it.
        schema: Schema to store in the ``_metadata`` file. When ``None``, the schema
            of the notebook-level ``input_tbl`` is used.
    """
    # Compare against None explicitly: pa.Schema defines __len__, so a schema with
    # no fields is falsy and `schema or ...` would silently discard it.
    if schema is None:
        schema = input_tbl.schema

    metadata_collector = []
    for frag in pds.dataset(output_dataset_path).get_fragments():
        frag.ensure_complete_metadata()
        file_metadata = frag.metadata
        relative_path = Path(frag.path).relative_to(output_dataset_path)
        # Store the path with forward slashes so it does not depend on the OS
        # on which the notebook is run.
        file_metadata.set_file_path(relative_path.as_posix())
        metadata_collector.append(file_metadata)

    pq.write_metadata(
        schema=schema,
        where=output_dataset_path / "_metadata",
        metadata_collector=metadata_collector,
    )

### bad_schemas

This dataset is designed to fail all schema verification tests.

```
bad_schemas/
|- dataset/
    |- _common_metadata.import_truth        # mimics schema provided by user upon import
    |- _common_metadata                     # wrong types
    |- _metadata                            # wrong file-level metadata
    |- Norder=0/Dir=0/
        |- Npix=11.parquet                  # direct copy of input
        |- Npix=11.extra_column.parquet     # extra column
        |- Npix=11.missing_column.parquet   # missing column
        |- Npix=11.wrong_dtypes.parquet     # wrong types
        |- Npix=11.wrong_metadata.parquet   # wrong metadata
```

In [None]:
output_dataset_path = Path(".") / "bad_schemas" / "dataset"
remove_directory(output_dataset_path)

# Existing files may result in unexpected metadata output, so refuse to continue
# when anything is still present in the catalog directory.
bad_schemas_root = output_dataset_path.parent
if bad_schemas_root.exists():
    if any(bad_schemas_root.iterdir()):
        raise FileExistsError("bad_schemas directory exists and is not empty. Remove it and try again.")

# Output files derived from input_frag: a plain copy plus one file per schema defect.
ffrag_out = output_dataset_path / frag_key
fextra_col = ffrag_out.with_suffix(".extra_column.parquet")
fmissing_col = ffrag_out.with_suffix(".missing_column.parquet")
fwrong_types = ffrag_out.with_suffix(".wrong_dtypes.parquet")
fwrong_metadata = ffrag_out.with_suffix(".wrong_metadata.parquet")

# Create the directory that will hold the data files.
data_file_dir = ffrag_out.parent
data_file_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Every file that should be recorded in _metadata starts out as a direct copy of input_frag.
files_recorded_in_metadata = (ffrag_out, fmissing_col, fextra_col, fwrong_types)
for file_out in files_recorded_in_metadata:
    shutil.copy(input_frag.path, file_out)

# Write a _metadata whose schema is correct except for one extra file-level metadata key.
metadata = {**(input_tbl.schema.metadata or {}), b"extra key": b"extra value"}
schema_with_extra_key = input_tbl.schema.with_metadata(metadata)
collect_and_write_metadata(output_dataset_path, schema=schema_with_extra_key)

In [None]:
# Overwrite the copied data files with tables that use incorrect schemas.

# Missing column: write the table without "dec_error".
missing_col_tbl = input_tbl.drop_columns("dec_error")
pq.write_table(missing_col_tbl, fmissing_col)

# Extra column: insert an int64 running index named "extra" at position 5.
extra_col = pa.array(range(len(input_tbl)))
extra_field = pa.field("extra", pa.int64())
extra_col_tbl = input_tbl.add_column(5, extra_field, extra_col)
pq.write_table(extra_col_tbl, fextra_col)

# Wrong metadata: replace the file-level metadata with an unrelated key.
wrong_metadata = {"bad key": "bad value"}
wrong_metadata_tbl = input_tbl.replace_schema_metadata(wrong_metadata)
pq.write_table(wrong_metadata_tbl, fwrong_metadata)

# Wrong dtypes: every field whose name starts with "ra" becomes float16.
wrong_dtypes_fields = [
    fld.with_type(pa.float16()) if fld.name.startswith("ra") else fld for fld in input_tbl.schema
]
wrong_dtypes_schema = pa.schema(wrong_dtypes_fields, metadata=input_tbl.schema.metadata)
wrong_dtypes_tbl = input_tbl.cast(wrong_dtypes_schema)
pq.write_table(wrong_dtypes_tbl, fwrong_types)

In [None]:
common_metadata_path = output_dataset_path / "_common_metadata"

# _common_metadata carries the schema with the wrong dtypes.
pq.write_metadata(schema=wrong_dtypes_schema, where=common_metadata_path)

# _common_metadata.import_truth carries the correct schema minus the hats columns.
# This mimics a schema that could have been passed as 'use_schema_file' upon import.
fimport_schema = common_metadata_path.with_suffix(".import_truth")
hats_cols = ["_healpix_29", "Norder", "Dir", "Npix"]
user_fields = [fld for fld in input_tbl.schema if fld.name not in hats_cols]
import_schema = pa.schema(user_fields)
pq.write_metadata(schema=import_schema, where=fimport_schema)

### wrong_files_and_rows

This dataset is designed to fail the following verification tests:

- Files listed in metadata match files on disk.
- Row counts in metadata match row counts on disk and (if provided) user-supplied truth.
- `hats.io.validation.is_valid_catalog`

```
wrong_files_and_rows/
|- properties                               # direct copy of input
|- dataset/
    |- _common_metadata                     # direct copy of input
    |- _metadata                            # missing file 'Npix=11.extra_file.parquet'
    |- Norder=0/Dir=0/
        |- Npix=11.parquet                  # direct copy of input
        |- Npix=11.extra_file.parquet       # added after _metadata generated
        |- Npix=11.extra_rows.parquet       # rows appended after _metadata generated
        |- (Npix=11.missing_file.parquet)   # dropped after _metadata generated
```

In [None]:
output_dataset_path = Path(".") / "wrong_files_and_rows" / "dataset"
wrong_files_root = output_dataset_path.parent
remove_directory(wrong_files_root)

# Existing files may result in unexpected metadata output, so refuse to continue
# when anything is still present in the catalog directory.
if wrong_files_root.exists():
    if any(wrong_files_root.iterdir()):
        raise FileExistsError("wrong_files_and_rows directory exists and is not empty. Remove it and try again.")

# Output files derived from input_frag.
ffrag_out = output_dataset_path / frag_key
fmissing_file = ffrag_out.with_suffix(".missing_file.parquet")
fextra_file = ffrag_out.with_suffix(".extra_file.parquet")
fextra_rows = ffrag_out.with_suffix(".extra_rows.parquet")

# Create the directory that will hold the data files.
data_file_dir = ffrag_out.parent
data_file_dir.mkdir(parents=True, exist_ok=True)

# Copy the metadata files that stay unaltered: the catalog-level `properties`
# and the dataset-level `_common_metadata`.
input_catalog_root = input_dataset_path.parent
shutil.copy(input_catalog_root / "properties", wrong_files_root / "properties")
shutil.copy(input_dataset_path / "_common_metadata", output_dataset_path / "_common_metadata")

In [None]:
# Every file that should be recorded in _metadata starts out as a direct copy of input_frag.
files_recorded_in_metadata = (ffrag_out, fmissing_file, fextra_rows)
for file_out in files_recorded_in_metadata:
    shutil.copy(input_frag.path, file_out)

# Write _metadata while the files on disk are still unmodified copies
# (no schema argument, so the function's default schema is used).
collect_and_write_metadata(output_dataset_path)

In [None]:
# Mangle the dataset now that _metadata has been written, so the files on disk
# no longer agree with it.

# Extra file: add a copy of input_frag that _metadata does not list.
shutil.copy(input_frag.path, fextra_file)

# Missing file: delete a file that _metadata does list.
fmissing_file.unlink()

# Extra rows: rewrite a listed file with four additional rows appended.
appended_rows = input_tbl.take([1, 2, 3, 4])
new_tbl = pa.concat_tables([input_tbl, appended_rows])
pq.write_table(new_tbl, fextra_rows)

In [None]:
# Close the Dask client created in the setup cell.
# NOTE(review): `client.close()` is also called right after the source-catalog
# import earlier in this notebook — confirm whether both calls are needed.
client.close()

In [None]:
# Delete the temporary directory created in the setup cell.
# NOTE(review): `tmp_path.cleanup()` is also called right after the source-catalog
# import earlier in this notebook — confirm whether both calls are needed.
tmp_path.cleanup()