## DC2: Generate Catalog and Split Data

In [3]:
# Enable the autoreload extension in mode 2 so edited modules are re-imported
# automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [1]:
from os import environ
from pathlib import Path

import GCRCatalogs
import pandas as pd
import torch
from hydra import compose, initialize
from hydra.utils import instantiate
import pickle

from bliss.catalog import SourceType
from bliss.surveys.dc2 import DC2DataModule, wcs_from_wcs_header_str

# Generate catalog data

In [2]:
# Folder that receives every pickle cache written by this notebook.
output_dir = Path("./DC2_generate_catalog_output/")
# Create it (and any missing parents); an already-existing folder is not an error.
output_dir.mkdir(exist_ok=True, parents=True)

In [8]:
# Load the DC2 truth catalog, using a local pickle as a cache so the
# catalog only has to be queried once.
truth_catalog_pickle_file = output_dir / "truth_catalog.pkl"
GCRCatalogs.set_root_dir("/nfs/turbo/lsa-regier/lsstdesc-public/dc2")

if not truth_catalog_pickle_file.exists():
    # Cache miss: query the catalog and store the result for later runs.
    truth_cat = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_truth")
    print(sorted(truth_cat.list_all_quantities()))
    truth_quantities = [
        "id", "match_objectId", "cosmodc2_id", "ra", "dec", "truth_type",
        "flux_g", "flux_i", "flux_r", "flux_u", "flux_y", "flux_z",
        "redshift",
    ]
    truth_cat_data = truth_cat.get_quantities(truth_quantities)
    with open(truth_catalog_pickle_file, "wb") as cache_out:
        pickle.dump(truth_cat_data, cache_out, pickle.HIGHEST_PROTOCOL)
else:
    # Cache hit. NOTE: pickle.load can execute arbitrary code, so only load
    # cache files that this notebook produced.
    with open(truth_catalog_pickle_file, "rb") as cache_in:
        truth_cat_data = pickle.load(cache_in)

# The "redshift" column is exposed as "redshifts" in the DataFrame.
truth_cat_df = pd.DataFrame(truth_cat_data).rename(columns={"redshift": "redshifts"})

In [11]:
# Load shear / ellipticity parameters from the cosmoDC2 catalog, cached as a
# local pickle.
galaxy_params_pickle_file = output_dir / "galaxy_params.pkl"

if not galaxy_params_pickle_file.exists():
    # NOTE(review): the catalog root directory is only switched on a cache
    # miss, so the GCRCatalogs root directory that later cells see depends on
    # whether this cache file already existed.
    GCRCatalogs.set_root_dir("/nfs/turbo/lsa-regier")
    galaxy_params = GCRCatalogs.load_catalog("desc_cosmodc2")
    print(sorted(galaxy_params.list_all_quantities()))
    galaxy_quantities = [
        "galaxy_id",
        "shear_1", "shear_2",
        "ellipticity_1_true", "ellipticity_2_true",
    ]
    galaxy_params_data = galaxy_params.get_quantities(galaxy_quantities)
    with open(galaxy_params_pickle_file, "wb") as cache_out:
        pickle.dump(galaxy_params_data, cache_out, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: pickle.load can execute arbitrary code, so only load cache files
    # that this notebook produced.
    with open(galaxy_params_pickle_file, "rb") as cache_in:
        galaxy_params_data = pickle.load(cache_in)

galaxy_params_df = pd.DataFrame(galaxy_params_data)

['A_v', 'A_v_bulge', 'A_v_disk', 'Mag_true_Y_lsst_z0', 'Mag_true_Y_lsst_z0_no_host_extinction', 'Mag_true_g_lsst_z0', 'Mag_true_g_lsst_z0_no_host_extinction', 'Mag_true_g_sdss_z0', 'Mag_true_g_sdss_z0_no_host_extinction', 'Mag_true_i_lsst_z0', 'Mag_true_i_lsst_z0_no_host_extinction', 'Mag_true_i_sdss_z0', 'Mag_true_i_sdss_z0_no_host_extinction', 'Mag_true_r_lsst_z0', 'Mag_true_r_lsst_z0_no_host_extinction', 'Mag_true_r_sdss_z0', 'Mag_true_r_sdss_z0_no_host_extinction', 'Mag_true_u_lsst_z0', 'Mag_true_u_lsst_z0_no_host_extinction', 'Mag_true_u_sdss_z0', 'Mag_true_u_sdss_z0_no_host_extinction', 'Mag_true_y_lsst_z0', 'Mag_true_y_lsst_z0_no_host_extinction', 'Mag_true_z_lsst_z0', 'Mag_true_z_lsst_z0_no_host_extinction', 'Mag_true_z_sdss_z0', 'Mag_true_z_sdss_z0_no_host_extinction', 'R_v', 'R_v_bulge', 'R_v_disk', 'bulge_to_total_ratio_i', 'convergence', 'dec', 'dec_true', 'ellipticity_1_bulge_true', 'ellipticity_1_bulge_true_dc2', 'ellipticity_1_disk_true', 'ellipticity_1_disk_true_dc2', '

In [6]:
# Load blendedness, PSF second moments and PSF FWHM for all six bands from
# the object catalog matched to truth, cached as a local pickle.
psf_params_pickle_file = output_dir / "psf_params.pkl"

if not psf_params_pickle_file.exists():
    # NOTE(review): no root directory is set here, so the load below uses
    # whatever root directory GCRCatalogs currently has configured — confirm
    # it is the one that contains this catalog.
    psf_params = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_object_with_truth_match")
    psf_quantities = [
        "objectId", "blendedness",
        "IxxPSF_pixel_g", "IxxPSF_pixel_z", "IxxPSF_pixel_r",
        "IxxPSF_pixel_i", "IxxPSF_pixel_u", "IxxPSF_pixel_y",
        "IyyPSF_pixel_g", "IyyPSF_pixel_z", "IyyPSF_pixel_r",
        "IyyPSF_pixel_i", "IyyPSF_pixel_u", "IyyPSF_pixel_y",
        "IxyPSF_pixel_g", "IxyPSF_pixel_z", "IxyPSF_pixel_r",
        "IxyPSF_pixel_i", "IxyPSF_pixel_u", "IxyPSF_pixel_y",
        "psf_fwhm_g", "psf_fwhm_z", "psf_fwhm_r",
        "psf_fwhm_i", "psf_fwhm_u", "psf_fwhm_y",
    ]
    psf_params_data = psf_params.get_quantities(psf_quantities)
    with open(psf_params_pickle_file, "wb") as cache_out:
        pickle.dump(psf_params_data, cache_out, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: pickle.load can execute arbitrary code, so only load cache files
    # that this notebook produced.
    with open(psf_params_pickle_file, "rb") as cache_in:
        psf_params_data = pickle.load(cache_in)

### Merge Catalog

In [7]:
# Build one DataFrame per source catalog.
# The truth frame is rebuilt from the raw data here, so the
# "redshift" -> "redshifts" rename has to be re-applied; otherwise the merged
# catalog would silently carry the un-renamed column.
truth_cat_df = pd.DataFrame(truth_cat_data).rename(columns={"redshift": "redshifts"})
galaxy_params_df = pd.DataFrame(galaxy_params_data)
psf_params_df = pd.DataFrame(psf_params_data)

# Left-join galaxy parameters onto the truth rows via the cosmoDC2 id; truth
# rows without a match are kept and get NaN in the galaxy columns.
truth_galaxy_df = truth_cat_df.merge(
    galaxy_params_df,
    left_on="cosmodc2_id",
    right_on="galaxy_id",
    how="left",
)

# Left-join the PSF / blendedness columns via the matched object id; rows
# without a match are kept and get NaN in the PSF columns.
truth_galaxy_psf_df = truth_galaxy_df.merge(
    psf_params_df,
    left_on="match_objectId",
    right_on="objectId",
    how="left",
)

In [8]:
# Attach the PSF / blendedness columns to the truth+galaxy frame with a left
# join, so rows without a matching objectId are kept.
psf_join_options = {
    "how": "left",
    "left_on": "match_objectId",
    "right_on": "objectId",
}
truth_galaxy_psf_df = truth_galaxy_df.merge(psf_params_df, **psf_join_options)

In [7]:
# Build (or load from cache) the merged truth + galaxy + PSF catalog.
# The "pikle" spelling of the variable name is kept so that any other cell
# referring to it keeps working.
merged_catalog_pikle_file = output_dir / "merged_catalog.pkl"
if merged_catalog_pikle_file.exists():
    # NOTE: pickle.load can execute arbitrary code, so only load cache files
    # that this notebook produced.
    with open(merged_catalog_pikle_file, "rb") as inputp:
        truth_galaxy_psf_df = pickle.load(inputp)
    # A cache written without the rename still has a "redshift" column;
    # normalise it so both branches yield the same schema. The check avoids
    # copying the frame when nothing needs renaming.
    if "redshift" in truth_galaxy_psf_df.columns:
        truth_galaxy_psf_df = truth_galaxy_psf_df.rename(columns={"redshift": "redshifts"})
else:
    # The truth frame is rebuilt from the raw data, so the
    # "redshift" -> "redshifts" rename must be re-applied here; otherwise it
    # would be lost in the merged catalog.
    truth_cat_df = pd.DataFrame(truth_cat_data).rename(columns={"redshift": "redshifts"})
    galaxy_params_df = pd.DataFrame(galaxy_params_data)
    psf_params_df = pd.DataFrame(psf_params_data)
    # Left joins keep every truth row; unmatched rows get NaN in the joined columns.
    truth_galaxy_df = truth_cat_df.merge(
        galaxy_params_df,
        left_on="cosmodc2_id",
        right_on="galaxy_id",
        how="left",
    )
    truth_galaxy_psf_df = truth_galaxy_df.merge(
        psf_params_df,
        left_on="match_objectId",
        right_on="objectId",
        how="left",
    )
    with open(merged_catalog_pikle_file, "wb") as outp:
        pickle.dump(truth_galaxy_psf_df, outp, pickle.HIGHEST_PROTOCOL)

In [11]:
# Keep only the rows whose r-band flux is strictly above the threshold.
flux_min = 50
exceeds_flux_min = truth_galaxy_psf_df["flux_r"] > flux_min
truth_galaxy_psf_df = truth_galaxy_psf_df.loc[exceeds_flux_min]

In [13]:
# Summary statistics of the blendedness column.
truth_galaxy_psf_df.loc[:, "blendedness"].describe()

count    9.160922e+06
mean     3.976622e-02
std      1.079735e-01
min     -1.678246e+00
25%      0.000000e+00
50%      1.636379e-03
75%      2.584892e-02
max      1.000000e+00
Name: blendedness, dtype: float64

In [14]:
# Number of rows with no blendedness value.
blendedness_is_missing = truth_galaxy_psf_df["blendedness"].isna()
blendedness_is_missing.sum()

3452525

In [15]:
# Total number of rows in the catalog.
truth_galaxy_psf_df.shape[0]

12613447

In [16]:
# Clamp blendedness into [0, 1]. Series.clip is used instead of np.clip
# because numpy is never imported in this notebook (np.clip raised NameError
# on a fresh kernel). Missing values stay NaN.
truth_galaxy_psf_df["blendedness"] = truth_galaxy_psf_df["blendedness"].clip(lower=0.0, upper=1.0)

In [17]:
# Summary statistics of the blendedness column.
truth_galaxy_psf_df.loc[:, "blendedness"].describe()

count    9.160922e+06
mean     4.019095e-02
std      1.076950e-01
min      0.000000e+00
25%      0.000000e+00
50%      1.636379e-03
75%      2.584892e-02
max      1.000000e+00
Name: blendedness, dtype: float64

In [18]:
# Write the current catalog to a pickle whose file name records the flux threshold.
flux_filtered_pickle_file = output_dir / f"merged_catalog_with_flux_over_{flux_min}.pkl"
truth_galaxy_psf_df.to_pickle(flux_filtered_pickle_file)

In [None]:
# Display the catalog as this cell's output.
truth_galaxy_psf_df

# Generate split file

In [22]:
# Banner line printed around each stage.
separator = "+" * 100

# Stage 1: compose the Hydra config named "notebook_config" (config_path=".").
print(separator, flush=True)
print("initialization begins", flush=True)
with initialize(config_path=".", version_base=None):
    notebook_cfg = compose("notebook_config")
print("initialization ends", flush=True)
print(separator, flush=True)

# Stage 2: instantiate the DC2 data module from the config and prepare its data.
# NOTE(review): the saved output of this cell shows an InstantiationException
# ("Error locating target 'bliss.cached_dataset.FluxFilterTransform'",
# full_key surveys.dc2.train_transforms2) — the config target needs checking.
print(separator, flush=True)
print("load dc2", flush=True)
dc2: DC2DataModule = instantiate(notebook_cfg.surveys.dc2)
dc2.prepare_data()
print(separator, flush=True)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
initialization begins


initialization ends
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
load dc2


InstantiationException: Error locating target 'bliss.cached_dataset.FluxFilterTransform', set env var HYDRA_FULL_ERROR=1 to see chained exception.
full_key: surveys.dc2.train_transforms2