This notebook presents the steps to generate merged catalog files. We need to generate these files because:

1. Bliss needs input parameters that contain the PSF, fluxes, locations, source types, and galaxy-related parameters, but the truth catalog in DC2 does not provide galaxy parameters or the PSF.
2. To get those parameters, we need to use the CosmoDC2 dataset and the truth-match table, where the galaxy parameters and PSF are stored.
3. To avoid repeating the merge every time the data is loaded, we store the merged dataset in the corresponding merged_catalog_{}.pkl files.

### Load Catalog

Use the [GCRCatalogs](https://data.lsstdesc.org/doc/install_gcr) package to load the DC2 truth catalog and CosmoDC2.




In [None]:
import pickle
import GCRCatalogs
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pandas as pd

In [2]:
# Directory that receives every intermediate pickle and the final merged catalog
# written by this notebook.
output_dir = Path("./generate_catalog_output/")
# Create it (including missing parents); does nothing if it already exists.
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Truth catalog: reuse the cached pickle when it exists, otherwise query
# GCRCatalogs once and cache the result under output_dir.
# NOTE: pickle.load can execute arbitrary code; only load caches written by this notebook.
truth_catalog_pickle_file = output_dir / "truth_catalog.pkl"
# The root directory is set regardless of whether the cache is hit.
GCRCatalogs.set_root_dir("/data/scratch/dc2_nfs/")
if not truth_catalog_pickle_file.exists():
    truth_cat = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_truth")
    print(sorted(truth_cat.list_all_quantities()))
    truth_quantities = [
        "id", "match_objectId", "cosmodc2_id", "ra", "dec", "truth_type",
        "flux_g", "flux_i", "flux_r", "flux_u", "flux_y", "flux_z",
        "redshift",
    ]
    truth_cat_data = truth_cat.get_quantities(truth_quantities)
    with truth_catalog_pickle_file.open("wb") as cache_out:
        pickle.dump(truth_cat_data, cache_out, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with truth_catalog_pickle_file.open("rb") as cache_in:
        truth_cat_data = pickle.load(cache_in)

# Build the frame and rename the "redshift" column to "redshifts" in one step.
truth_cat_df = pd.DataFrame(truth_cat_data).rename(columns={"redshift": "redshifts"})

In [None]:
# Histogram of the truth-catalog flux_r values using 100 evenly spaced bin
# edges between 50 and 10000; values outside that range are not drawn.
fig, ax = plt.subplots()
ax.hist(truth_cat_data["flux_r"], np.linspace(50, 10000, num=100))
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(xlabel="flux_r", ylabel="count", title="Truth catalog flux_r")
plt.show()

In [None]:
# CosmoDC2 galaxy parameters (shear and true ellipticity): reuse the cached
# pickle when it exists, otherwise query GCRCatalogs once and cache the result.
# NOTE: pickle.load can execute arbitrary code; only load caches written by this notebook.
galaxy_params_pickle_file = output_dir / "galaxy_params.pkl"
if not galaxy_params_pickle_file.exists():
    # Point this catalog at its own root directory via a config override.
    config_overwrite = {"catalog_root_dir": "/data/scratch/dc2_nfs/cosmoDC2_v1.1.4/"}
    galaxy_params = GCRCatalogs.load_catalog("desc_cosmodc2", config_overwrite)
    print(sorted(galaxy_params.list_all_quantities()))
    galaxy_quantities = [
        "galaxy_id",
        "shear_1", "shear_2",
        "ellipticity_1_true", "ellipticity_2_true",
    ]
    galaxy_params_data = galaxy_params.get_quantities(galaxy_quantities)
    with galaxy_params_pickle_file.open("wb") as cache_out:
        pickle.dump(galaxy_params_data, cache_out, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with galaxy_params_pickle_file.open("rb") as cache_in:
        galaxy_params_data = pickle.load(cache_in)

galaxy_params_df = pd.DataFrame(galaxy_params_data)

In [None]:
# Number of rows loaded from the CosmoDC2 galaxy-parameter table.
print(len(galaxy_params_df))

In [None]:
# Fraction of rows whose shear_1 is NaN.
print(galaxy_params_df["shear_1"].isna().sum() / len(galaxy_params_df))

In [None]:
# Summary statistics of shear_1 in the CosmoDC2 galaxy table.
galaxy_params_df["shear_1"].describe()

In [None]:
# Distribution of shear_1 in the CosmoDC2 galaxy table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(galaxy_params_df["shear_1"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(xlabel="shear_1", ylabel="count (log scale)", title="CosmoDC2 shear_1")
plt.show()

In [None]:
# Fraction of rows whose shear_2 is NaN.
print(galaxy_params_df["shear_2"].isna().sum() / len(galaxy_params_df))

In [None]:
# Summary statistics of shear_2 in the CosmoDC2 galaxy table.
galaxy_params_df["shear_2"].describe()

In [None]:
# Distribution of shear_2 in the CosmoDC2 galaxy table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(galaxy_params_df["shear_2"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(xlabel="shear_2", ylabel="count (log scale)", title="CosmoDC2 shear_2")
plt.show()

In [None]:
# Fraction of rows whose ellipticity_1_true is NaN.
print(galaxy_params_df["ellipticity_1_true"].isna().sum() / len(galaxy_params_df))

In [None]:
# Summary statistics of ellipticity_1_true in the CosmoDC2 galaxy table.
galaxy_params_df["ellipticity_1_true"].describe()

In [None]:
# Distribution of ellipticity_1_true in the CosmoDC2 galaxy table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(galaxy_params_df["ellipticity_1_true"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel="ellipticity_1_true",
    ylabel="count (log scale)",
    title="CosmoDC2 ellipticity_1_true",
)
plt.show()

In [None]:
# Fraction of rows whose ellipticity_2_true is NaN.
print(galaxy_params_df["ellipticity_2_true"].isna().sum() / len(galaxy_params_df))

In [None]:
# Summary statistics of ellipticity_2_true in the CosmoDC2 galaxy table.
galaxy_params_df["ellipticity_2_true"].describe()

In [None]:
# Distribution of ellipticity_2_true in the CosmoDC2 galaxy table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(galaxy_params_df["ellipticity_2_true"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel="ellipticity_2_true",
    ylabel="count (log scale)",
    title="CosmoDC2 ellipticity_2_true",
)
plt.show()

In [None]:
# PSF-related quantities and blendedness from the object-with-truth-match
# catalog: reuse the cached pickle when it exists, otherwise query GCRCatalogs
# once and cache the result.
# NOTE: pickle.load can execute arbitrary code; only load caches written by this notebook.
psf_params_pickle_file = output_dir / "psf_params.pkl"
if not psf_params_pickle_file.exists():
    psf_params = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_object_with_truth_match")
    print(sorted(psf_params.list_all_quantities()))
    # Every per-band quantity is requested for the six bands in the order
    # g, z, r, i, u, y, one prefix after another.
    psf_band_prefixes = ("IxxPSF_pixel", "IyyPSF_pixel", "IxyPSF_pixel", "psf_fwhm")
    psf_quantities = ["objectId", "blendedness"] + [
        f"{prefix}_{band}" for prefix in psf_band_prefixes for band in "gzriuy"
    ]
    psf_params_data = psf_params.get_quantities(psf_quantities)
    with psf_params_pickle_file.open("wb") as cache_out:
        pickle.dump(psf_params_data, cache_out, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with psf_params_pickle_file.open("rb") as cache_in:
        psf_params_data = pickle.load(cache_in)

psf_params_df = pd.DataFrame(psf_params_data)

### Merge Catalog 

In [None]:
# Step 1: left-join the CosmoDC2 galaxy parameters onto the truth rows
# (truth "cosmodc2_id" against CosmoDC2 "galaxy_id"); every truth row is kept.
truth_galaxy_df = pd.merge(
    truth_cat_df,
    galaxy_params_df,
    how="left",
    left_on="cosmodc2_id",
    right_on="galaxy_id",
)

# True where ellipticity_1_true is not NaN after the join (rows without a
# CosmoDC2 match come back with NaN in the joined columns).
cosmodc2_mask = truth_galaxy_df["ellipticity_1_true"].notna().to_numpy()
truth_galaxy_df["cosmodc2_mask"] = cosmodc2_mask

# Step 2: left-join the PSF / blendedness columns
# (truth "match_objectId" against object-table "objectId").
truth_galaxy_psf_df = pd.merge(
    truth_galaxy_df,
    psf_params_df,
    how="left",
    left_on="match_objectId",
    right_on="objectId",
)

In [None]:
# Row counts of the three input tables followed by the merged table, one per line.
for frame in (truth_cat_df, galaxy_params_df, psf_params_df, truth_galaxy_psf_df):
    print(len(frame))

In [None]:
# Report the number of missing values in every column of the merged table.
# Series.isna() is used instead of np.isnan because np.isnan raises TypeError
# on non-numeric (object/string) columns, while isna() works for any dtype and
# gives the same count on float columns.
for k, v in truth_galaxy_psf_df.items():
    print(f"{k} has {v.isna().sum()} nans")

In [None]:
# Keep only objects whose flux_r exceeds flux_min. Rows with NaN flux_r are
# dropped as well, because NaN > flux_min is False.
flux_min = 50
# .copy() makes the filtered table an independent frame, so later column
# assignments on it do not raise pandas' SettingWithCopyWarning.
truth_galaxy_psf_df = truth_galaxy_psf_df.loc[truth_galaxy_psf_df["flux_r"] > flux_min].copy()

# Distribution of the remaining flux_r values, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(truth_galaxy_psf_df["flux_r"], log=True)
ax.set(xlabel="flux_r", ylabel="count (log scale)", title=f"flux_r after cut (> {flux_min})")
plt.show()

In [None]:
# Report the number of missing values in every column of the current table.
# Series.isna() is used instead of np.isnan because np.isnan raises TypeError
# on non-numeric (object/string) columns, while isna() works for any dtype and
# gives the same count on float columns.
for k, v in truth_galaxy_psf_df.items():
    print(f"{k} has {v.isna().sum()} nans")

In [None]:
# Summary statistics of flux_r in the current merged table.
truth_galaxy_psf_df["flux_r"].describe()

In [None]:
# flux_r at the 1st, 5th, 25th, 50th, 75th, 95th and 99th percentiles.
np.quantile(truth_galaxy_psf_df["flux_r"], q=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

In [None]:
# Summary statistics of the blendedness column in the merged table.
truth_galaxy_psf_df["blendedness"].describe()

In [None]:
# Restrict blendedness to the closed interval [0, 1]; NaN entries stay NaN.
# assign() builds a new frame rather than writing a column into the filtered
# table, which avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
truth_galaxy_psf_df = truth_galaxy_psf_df.assign(
    blendedness=truth_galaxy_psf_df["blendedness"].clip(lower=0.0, upper=1.0)
)

In [None]:
# Summary statistics of blendedness, shown again after the clipping cell.
truth_galaxy_psf_df["blendedness"].describe()

In [None]:
# Distribution of blendedness in the merged table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(truth_galaxy_psf_df["blendedness"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(xlabel="blendedness", ylabel="count (log scale)", title="Merged catalog blendedness")
plt.show()

In [None]:
# Quantiles of blendedness computed over the non-NaN entries only.
blendedness = truth_galaxy_psf_df["blendedness"].values
np.nanquantile(blendedness, q=[0.01, 0.03, 0.05, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

In [None]:
# Summary statistics of shear_1 in the merged table.
truth_galaxy_psf_df["shear_1"].describe()

In [None]:
# Summary statistics of shear_2 in the merged table.
truth_galaxy_psf_df["shear_2"].describe()

In [None]:
# Distribution of shear_1 in the merged table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(truth_galaxy_psf_df["shear_1"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(xlabel="shear_1", ylabel="count (log scale)", title="Merged catalog shear_1")
plt.show()

In [None]:
# Distribution of shear_2 in the merged table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(truth_galaxy_psf_df["shear_2"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(xlabel="shear_2", ylabel="count (log scale)", title="Merged catalog shear_2")
plt.show()

In [None]:
# Summary statistics of ellipticity_1_true in the merged table.
truth_galaxy_psf_df["ellipticity_1_true"].describe()

In [None]:
# Summary statistics of ellipticity_2_true in the merged table.
truth_galaxy_psf_df["ellipticity_2_true"].describe()

In [None]:
# Distribution of ellipticity_1_true in the merged table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(truth_galaxy_psf_df["ellipticity_1_true"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel="ellipticity_1_true",
    ylabel="count (log scale)",
    title="Merged catalog ellipticity_1_true",
)
plt.show()

In [None]:
# Distribution of ellipticity_2_true in the merged table, with log-scaled counts.
fig, ax = plt.subplots()
ax.hist(truth_galaxy_psf_df["ellipticity_2_true"], log=True)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel="ellipticity_2_true",
    ylabel="count (log scale)",
    title="Merged catalog ellipticity_2_true",
)
plt.show()

### Save to File

In [None]:
# Write the merged, flux-filtered table to a pickle under output_dir; the
# flux threshold is part of the file name.
merged_catalog_file = output_dir / f"merged_catalog_with_flux_over_{flux_min}.pkl"
truth_galaxy_psf_df.to_pickle(merged_catalog_file)