This notebook presents the steps to generate merged catalog files. We need to generate such files because:

1. Bliss needs input parameters that include the PSF, fluxes, locations, source types, and galaxy-related parameters, but the truth catalog in DC2 does not provide galaxy parameters or the PSF.
2. To get those parameters, we need to use the CosmoDC2 dataset and the truth-match table, where the galaxy parameters and the PSF are stored.
3. To avoid repeating the merge every time the data is loaded, we store the merged dataset in the corresponding merged_catalog_{}.pkl files.

### Load Catalog

Use the [GCRCatalogs](https://data.lsstdesc.org/doc/install_gcr) package to load the DC2 truth catalog and CosmoDC2.




In [1]:
import pickle
import GCRCatalogs
import matplotlib.pyplot as plt
import numpy as np
import healpy as hp
from pathlib import Path
import pandas as pd

from GCRCatalogs import GCRQuery

In [2]:
# Directory used by the later cells for cache pickles and the final merged
# catalog. It is created (with parents) when missing and left untouched when it
# already exists.
output_dir = Path("./generate_new_catalog_output/")
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Reference catalog whose "ra"/"dec" columns are used below to bound the sky
# region; presumably an earlier merged catalog, judging by the file name.
# NOTE(review): absolute, machine-specific path; unpickling is only safe for
# trusted files.
ori_cat = pd.read_pickle("/data/scratch/dc2local/ori_merged_catalog_with_flux_over_50.pkl")

In [4]:
# RA/Dec extent of the reference catalog.
ori_min_ra, ori_max_ra = ori_cat["ra"].min(), ori_cat["ra"].max()
ori_min_dec, ori_max_dec = ori_cat["dec"].min(), ori_cat["dec"].max()

# Filter expressions selecting that extent widened by 0.1 on every side; they
# are passed as `filters=` when the truth catalog is queried.
ra_dec_filters = [
    f"ra >= {ori_min_ra - 0.1}",
    f"ra <= {ori_max_ra + 0.1}",
    f"dec >= {ori_min_dec - 0.1}",
    f"dec <= {ori_max_dec + 0.1}",
]

In [5]:
# Load the DC2 truth catalog restricted to the padded RA/Dec box, reading a
# local pickle instead when one already exists in output_dir.
GCRCatalogs.set_root_dir("/data/scratch/dc2_nfs/")
truth_catalog_pickle_file = output_dir / "truth_catalog.pkl"

if not truth_catalog_pickle_file.exists():
    truth_cat = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_truth")
    print(sorted(truth_cat.list_all_quantities()))
    truth_cat_data = truth_cat.get_quantities(
        [
            "id", "match_objectId", "cosmodc2_id",
            "ra", "dec",
            "truth_type",
            "flux_g", "flux_i", "flux_r", "flux_u", "flux_y", "flux_z",
            "redshift",
        ],
        filters=ra_dec_filters,
    )
    # The cache write is currently disabled; uncomment to persist the query.
    # with open(truth_catalog_pickle_file, "wb") as outp:
    #     pickle.dump(truth_cat_data, outp, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with truth_catalog_pickle_file.open("rb") as inputp:
        truth_cat_data = pickle.load(inputp)

# The catalog's "redshift" column is exposed as "redshifts" from here on.
truth_cat_df = pd.DataFrame(truth_cat_data).rename(columns={"redshift": "redshifts"})

['av', 'cosmodc2_hp', 'cosmodc2_id', 'dec', 'flux_g', 'flux_i', 'flux_r', 'flux_u', 'flux_y', 'flux_z', 'host_galaxy', 'id', 'id_string', 'is_good_match', 'is_nearest_neighbor', 'is_unique_truth_entry', 'mag_g', 'mag_i', 'mag_r', 'mag_u', 'mag_y', 'mag_z', 'match_objectId', 'match_sep', 'patch', 'ra', 'redshift', 'rv', 'tract', 'truth_type']


In [6]:
# Number of truth-catalog rows inside the padded RA/Dec box.
print(truth_cat_df.shape[0])

60489715


In [8]:
# Fraction of truth-catalog rows whose flux_r is NOT above 100.
print(1 - (truth_cat_df["flux_r"].to_numpy() > 100).mean())

0.8513013658602955


In [None]:
# Sanity check: RA then Dec extent (min, max) of the loaded truth catalog.
for coord in ("ra", "dec"):
    print(truth_cat_df[coord].min(), truth_cat_df[coord].max())

In [None]:
# Histogram of flux_r over 100 evenly spaced bin edges between 50 and 10000.
plt.hist(truth_cat_data["flux_r"], bins=np.linspace(50, 10000, 100))
plt.show()

In [None]:
# Build a native filter keeping only the CosmoDC2 healpix pixels that overlap
# the sky region of interest.
#
# The polygon is widened by the same 0.1 margin that ra_dec_filters applies to
# the truth catalog. Without it, truth objects in the margin can fall outside
# every selected pixel and come out of the left merge with NaN galaxy
# parameters.
footprint_pad = 0.1
corner_ra = np.array([
    ori_min_ra - footprint_pad, ori_max_ra + footprint_pad,
    ori_max_ra + footprint_pad, ori_min_ra - footprint_pad,
])
corner_dec = np.array([
    ori_min_dec - footprint_pad, ori_min_dec - footprint_pad,
    ori_max_dec + footprint_pad, ori_max_dec + footprint_pad,
])
vertices = hp.ang2vec(corner_ra, corner_dec, lonlat=True)

# nside=32: presumably the resolution of CosmoDC2's healpix_pixel partitioning
# (TODO confirm). inclusive=True also keeps pixels that only partly overlap.
ipix = hp.query_polygon(32, vertices, inclusive=True)
healpix_filter = GCRQuery((lambda h: np.isin(h, ipix, assume_unique=True), "healpix_pixel"))

In [None]:
# Load shear and true ellipticity for the CosmoDC2 galaxies in the selected
# healpix pixels, reading a local pickle instead when one already exists.
cosmodc2_pickle_file = output_dir / "cosmodc2.pkl"

if not cosmodc2_pickle_file.exists():
    config_overwrite = {"catalog_root_dir": "/data/scratch/dc2_nfs/cosmoDC2_v1.1.4/"}
    cosmodc2_table = GCRCatalogs.load_catalog("desc_cosmodc2", config_overwrite)
    print(sorted(cosmodc2_table.list_all_quantities()))
    cosmodc2_data = cosmodc2_table.get_quantities(
        [
            "galaxy_id",
            "shear_1", "shear_2",
            "ellipticity_1_true_dc2", "ellipticity_2_true_dc2",
        ],
        native_filters=healpix_filter,
    )
    # The cache write is currently disabled; uncomment to persist the query.
    # with open(cosmodc2_pickle_file, "wb") as outp:
    #     pickle.dump(cosmodc2_data, outp, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with cosmodc2_pickle_file.open("rb") as inputp:
        cosmodc2_data = pickle.load(inputp)

# Drop the "_dc2" suffix from the ellipticity column names.
cosmodc2_df = pd.DataFrame(cosmodc2_data)
cosmodc2_df = cosmodc2_df.rename(
    columns={
        "ellipticity_1_true_dc2": "ellipticity_1_true",
        "ellipticity_2_true_dc2": "ellipticity_2_true",
    }
)

In [None]:
# Number of CosmoDC2 rows loaded.
print(cosmodc2_df.shape[0])

In [None]:
# Fraction of CosmoDC2 rows whose shear_1 is NaN.
print(np.isnan(cosmodc2_df["shear_1"].to_numpy()).mean())

In [None]:
# Summary statistics of the CosmoDC2 shear_1 column.
cosmodc2_df.loc[:, "shear_1"].describe()

In [None]:
# Histogram of CosmoDC2 shear_1 with a log-scaled count axis.
plt.hist(cosmodc2_df.loc[:, "shear_1"], log=True)
plt.show()

In [None]:
# Fraction of CosmoDC2 rows whose shear_2 is NaN.
print(np.isnan(cosmodc2_df["shear_2"].to_numpy()).mean())

In [None]:
# Summary statistics of the CosmoDC2 shear_2 column.
cosmodc2_df.loc[:, "shear_2"].describe()

In [None]:
# Histogram of CosmoDC2 shear_2 with a log-scaled count axis.
plt.hist(cosmodc2_df.loc[:, "shear_2"], log=True)
plt.show()

In [None]:
# Fraction of CosmoDC2 rows whose ellipticity_1_true is NaN.
print(np.isnan(cosmodc2_df["ellipticity_1_true"].to_numpy()).mean())

In [None]:
# Summary statistics of the CosmoDC2 ellipticity_1_true column.
cosmodc2_df.loc[:, "ellipticity_1_true"].describe()

In [None]:
# Histogram of CosmoDC2 ellipticity_1_true with a log-scaled count axis.
plt.hist(cosmodc2_df.loc[:, "ellipticity_1_true"], log=True)
plt.show()

In [None]:
# Fraction of CosmoDC2 rows whose ellipticity_2_true is NaN.
print(np.isnan(cosmodc2_df["ellipticity_2_true"].to_numpy()).mean())

In [None]:
# Summary statistics of the CosmoDC2 ellipticity_2_true column.
cosmodc2_df.loc[:, "ellipticity_2_true"].describe()

In [None]:
# Histogram of CosmoDC2 ellipticity_2_true with a log-scaled count axis.
plt.hist(cosmodc2_df.loc[:, "ellipticity_2_true"], log=True)
plt.show()

In [None]:
# Load blendedness, second moments and per-band PSF quantities from the
# object-with-truth-match catalog, reading a local pickle instead when one
# already exists. No sky filter is applied to this query.
truth_match_pickle_file = output_dir / "psf_params.pkl"

if not truth_match_pickle_file.exists():
    truth_match = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_object_with_truth_match")
    print(sorted(truth_match.list_all_quantities()))
    truth_match_data = truth_match.get_quantities(
        [
            "objectId",
            "blendedness",
            "Ixx_pixel", "Iyy_pixel", "Ixy_pixel",
            "IxxPSF_pixel_g", "IxxPSF_pixel_z", "IxxPSF_pixel_r",
            "IxxPSF_pixel_i", "IxxPSF_pixel_u", "IxxPSF_pixel_y",
            "IyyPSF_pixel_g", "IyyPSF_pixel_z", "IyyPSF_pixel_r",
            "IyyPSF_pixel_i", "IyyPSF_pixel_u", "IyyPSF_pixel_y",
            "IxyPSF_pixel_g", "IxyPSF_pixel_z", "IxyPSF_pixel_r",
            "IxyPSF_pixel_i", "IxyPSF_pixel_u", "IxyPSF_pixel_y",
            "psf_fwhm_g", "psf_fwhm_z", "psf_fwhm_r",
            "psf_fwhm_i", "psf_fwhm_u", "psf_fwhm_y",
        ]
    )
    # The cache write is currently disabled; uncomment to persist the query.
    # with open(truth_match_pickle_file, "wb") as outp:
    #     pickle.dump(truth_match_data, outp, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with truth_match_pickle_file.open("rb") as inputp:
        truth_match_data = pickle.load(inputp)

truth_match_df = pd.DataFrame(truth_match_data)

### Merge Catalog 

In [None]:
# Step 1: attach CosmoDC2 galaxy parameters to every truth row. It is a left
# join, so truth rows without a CosmoDC2 counterpart are kept with NaNs.
truth_cat_plus_cosmodc2_df = pd.merge(
    truth_cat_df,
    cosmodc2_df,
    how="left",
    left_on="cosmodc2_id",
    right_on="galaxy_id",
)

# True only for rows where both true-ellipticity components are present.
cosmodc2_mask = (
    truth_cat_plus_cosmodc2_df["ellipticity_1_true"].notna().to_numpy()
    & truth_cat_plus_cosmodc2_df["ellipticity_2_true"].notna().to_numpy()
)
truth_cat_plus_cosmodc2_df["cosmodc2_mask"] = cosmodc2_mask

# Step 2: attach the truth-match quantities, again as a left join.
full_df = pd.merge(
    truth_cat_plus_cosmodc2_df,
    truth_match_df,
    how="left",
    left_on="match_objectId",
    right_on="objectId",
)

In [None]:
# Row counts, one per line: truth catalog, CosmoDC2, truth-match, merged frame.
print(*map(len, (truth_cat_df, cosmodc2_df, truth_match_df, full_df)), sep="\n")

In [None]:
# Report the number of missing values in each column of the merged catalog.
# Series.isna() is used instead of np.isnan because np.isnan raises TypeError
# on object/string columns; the counts for numeric columns are unchanged.
for k, v in full_df.items():
    print(f"{k} has {v.isna().sum()} nans")

In [None]:
# Keep only objects whose flux_r exceeds flux_min, then look at what remains.
flux_min = 50
full_df = full_df[full_df["flux_r"].gt(flux_min)]

# Histogram of the surviving flux_r values with a log-scaled count axis.
plt.hist(full_df.loc[:, "flux_r"], log=True)
plt.show()

In [None]:
# Report the number of missing values in each column after the flux cut.
# Series.isna() is used instead of np.isnan because np.isnan raises TypeError
# on object/string columns; the counts for numeric columns are unchanged.
for k, v in full_df.items():
    print(f"{k} has {v.isna().sum()} nans")

In [None]:
# Summary statistics of flux_r in the merged catalog.
full_df.loc[:, "flux_r"].describe()

In [None]:
# flux_r at the 1st, 5th, 25th, 50th, 75th, 95th and 99th percentiles.
np.quantile(
    full_df["flux_r"].to_numpy(),
    [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99],
)

In [None]:
# Summary statistics of blendedness before it is clipped.
full_df.loc[:, "blendedness"].describe()

In [None]:
# Restrict blendedness to the closed interval [0, 1]; NaN entries stay NaN.
full_df["blendedness"] = full_df["blendedness"].clip(lower=0.0, upper=1.0)

In [None]:
# Summary statistics of blendedness in the merged catalog.
full_df.loc[:, "blendedness"].describe()

In [None]:
# Histogram of blendedness with a log-scaled count axis.
plt.hist(full_df.loc[:, "blendedness"], log=True)
plt.show()

In [None]:
# Quantiles of blendedness, computed over the non-NaN entries only.
blendedness = full_df["blendedness"].to_numpy()
np.quantile(
    blendedness[np.logical_not(np.isnan(blendedness))],
    q=[0.01, 0.03, 0.05, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
)

In [None]:
# Summary statistics of shear_1 in the merged catalog.
full_df.loc[:, "shear_1"].describe()

In [None]:
# Summary statistics of shear_2 in the merged catalog.
full_df.loc[:, "shear_2"].describe()

In [None]:
# Histogram of merged-catalog shear_1 with a log-scaled count axis.
plt.hist(full_df.loc[:, "shear_1"], log=True)
plt.show()

In [None]:
# Histogram of merged-catalog shear_2 with a log-scaled count axis.
plt.hist(full_df.loc[:, "shear_2"], log=True)
plt.show()

In [None]:
# Summary statistics of ellipticity_1_true in the merged catalog.
full_df.loc[:, "ellipticity_1_true"].describe()

In [None]:
# Summary statistics of ellipticity_2_true in the merged catalog.
full_df.loc[:, "ellipticity_2_true"].describe()

In [None]:
# Histogram of merged-catalog ellipticity_1_true with a log-scaled count axis.
plt.hist(full_df.loc[:, "ellipticity_1_true"], log=True)
plt.show()

In [None]:
# Histogram of merged-catalog ellipticity_2_true with a log-scaled count axis.
plt.hist(full_df.loc[:, "ellipticity_2_true"], log=True)
plt.show()

### Save to File

In [None]:
# Write the flux-filtered merged catalog to output_dir; the file name records
# the flux threshold that was applied.
merged_catalog_path = output_dir / f"merged_catalog_with_flux_over_{flux_min}.pkl"
full_df.to_pickle(merged_catalog_path)