## DC2: Generate Catalog and Split Data

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pickle
from os import environ
from pathlib import Path

import GCRCatalogs
import numpy as np
import pandas as pd
import torch
from hydra import compose, initialize
from hydra.utils import instantiate

from bliss.catalog import SourceType
from bliss.surveys.dc2 import DC2DataModule, wcs_from_wcs_header_str

# Generate LSST data for Regressor

In [4]:
# Directory that receives the pickled tables written by this section; it is created
# (including missing parents) if it does not exist yet.
output_dir = Path("/data/scratch/qiaozhih/data/redshift/dc2/")
output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
import GCRCatalogs
import pandas as pd
import torch
from bliss.catalog import FullCatalog, SourceType

# Resolve GCR catalog names against the local DC2 mirror and open the
# object catalog that carries the truth-match columns.
lsst_root_dir = "/data/scratch/dc2_nfs/"
GCRCatalogs.set_root_dir(lsst_root_dir)
lsst_catalog_gcr = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_object_with_truth_match")

# Requested columns: the matched truth id, then the cModel magnitude and
# magnitude error for each of the six bands, in u-g-r-i-z-y order.
lsst_bands = "ugrizy"
lsst_quantities = ["id_truth"]
lsst_quantities += [f"mag_{band}_cModel" for band in lsst_bands]
lsst_quantities += [f"magerr_{band}_cModel" for band in lsst_bands]

lsst_catalog_sub = lsst_catalog_gcr.get_quantities(lsst_quantities)
lsst_catalog_df = pd.DataFrame(lsst_catalog_sub)



In [5]:
# Summary statistics of the raw catalog columns; the recorded output shows
# +/-inf entries in the magnitude and magnitude-error columns.
lsst_catalog_df.describe()

Unnamed: 0,mag_y_cModel,magerr_i_cModel,magerr_z_cModel,mag_i_cModel,mag_u_cModel,mag_z_cModel,id_truth,magerr_u_cModel,magerr_y_cModel,magerr_g_cModel,mag_g_cModel,mag_r_cModel,magerr_r_cModel
count,9987282.0,11018930.0,10793630.0,10934830.0,9649256.0,10292310.0,11116210.0,10575780.0,10716780.0,11013040.0,10944100.0,11017700.0,11038760.0
mean,inf,,,inf,inf,inf,8088583000.0,inf,,,inf,inf,inf
std,,,,,,,42746140000.0,,,,,,
min,11.43206,-inf,-inf,11.89937,13.99609,11.59003,676386.0,-470606300.0,-inf,-inf,12.91604,12.27066,-946832400000.0
25%,23.92812,0.0650048,0.1057525,24.67404,25.61706,24.23471,6825068000.0,0.1311172,0.1292641,0.06841042,25.51972,25.16218,0.05814535
50%,24.81313,0.1479706,0.2820019,25.61176,26.47421,25.21701,7750733000.0,0.3612945,0.3468699,0.1426473,26.35767,26.03854,0.1232727
75%,25.62102,0.2613201,0.5848774,26.27653,27.32608,26.03465,7878704000.0,0.7734282,0.7187968,0.2248982,26.93405,26.61246,0.1935008
max,inf,inf,inf,inf,inf,inf,9026235000000.0,inf,inf,inf,inf,inf,inf


In [6]:
# Turn +inf / -inf entries into NaN so that a plain dropna() can remove them.
# numpy is imported once in the notebook's top import cell rather than here,
# so later sections that use `np` do not depend on this cell having run.
lsst_catalog_df_na = lsst_catalog_df.replace([np.inf, -np.inf], np.nan)

In [7]:
# Keep only rows without a NaN in any column (this includes the former
# infinities that were converted to NaN in lsst_catalog_df_na).
lsst_catalog_df_nona = lsst_catalog_df_na.dropna()

In [8]:
# Summary statistics after removing NaN/inf rows; in the recorded output the
# magnitude-error columns still have very large finite maxima.
lsst_catalog_df_nona.describe()

Unnamed: 0,mag_y_cModel,magerr_i_cModel,magerr_z_cModel,mag_i_cModel,mag_u_cModel,mag_z_cModel,id_truth,magerr_u_cModel,magerr_y_cModel,magerr_g_cModel,mag_g_cModel,mag_r_cModel,magerr_r_cModel
count,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0
mean,24.4908,46.41381,3376.254,25.01757,26.27893,24.79081,8062859000.0,5928.891,138.7315,195.6296,25.83652,25.4072,846.2013
std,1.725277,65803.8,9325938.0,1.616828,1.637081,1.772456,39062630000.0,14987060.0,229282.9,287129.0,1.464796,1.491504,1101339.0
min,11.43206,2.846753e-05,4.018415e-05,11.89937,14.37855,11.59003,811547.0,0.0002694765,4.407217e-05,5.204819e-05,12.91604,12.27066,2.749518e-05
25%,23.68605,0.04860332,0.1059764,24.27482,25.38903,23.92407,6820354000.0,0.1747186,0.1545336,0.0518862,25.14867,24.75342,0.04200911
50%,24.63077,0.1146499,0.2538098,25.29204,26.29794,24.97002,7750373000.0,0.3796795,0.3414059,0.1101115,26.03747,25.7083,0.09313366
75%,25.40578,0.2043973,0.5142233,26.00035,27.12148,25.78308,7878367000.0,0.7866697,0.6780241,0.192691,26.71187,26.36522,0.1624493
max,47.32455,176428400.0,25864600000.0,47.54039,52.6722,52.22029,9025765000000.0,41381830000.0,590063700.0,730451500.0,50.16285,50.92666,2770628000.0


In [9]:
# Column rename map: the truth-match id becomes "id", and each cModel
# magnitude / magnitude error gets an "_lsst"-suffixed name, band by band.
_lsst_bands = "ugrizy"
new_name = {"id_truth": "id"}
new_name.update({f"mag_{band}_cModel": f"mag_{band}_lsst" for band in _lsst_bands})
new_name.update({f"magerr_{band}_cModel": f"mag_err_{band}_lsst" for band in _lsst_bands})

In [10]:
# Apply the new_name mapping to the column labels (rows are untouched).
lsst_catalog_df_nona_newname = lsst_catalog_df_nona.rename(columns=new_name)

In [50]:
# Write fixed-size train / validation subsets of the cleaned LSST photometry.
# Train = first N_TRAIN_ROWS rows, validation = last N_VAL_ROWS rows (by position).
N_TRAIN_ROWS = 200_000
N_VAL_ROWS = 100_000

# Guard against the two positional slices overlapping, which would leak
# training rows into the validation file.
assert len(lsst_catalog_df_nona_newname) >= N_TRAIN_ROWS + N_VAL_ROWS, (
    "not enough rows for disjoint train/validation subsets"
)

# `.iloc[-N_VAL_ROWS:]` includes the final row; a stop index of -1 would drop it.
lsst_catalog_df_nona_newname.iloc[-N_VAL_ROWS:].to_pickle("/data/scratch/qiaozhih/data/redshift/dc2/lsst_val_nona_100k.pkl")
lsst_catalog_df_nona_newname.iloc[:N_TRAIN_ROWS].to_pickle("/data/scratch/qiaozhih/data/redshift/dc2/lsst_train_nona_200k.pkl")

In [11]:
# Fetch the truth catalog's id/redshift columns, using a pickle file in
# output_dir as a cache so the GCR query only runs when the cache is absent.
truth_catalog_pickle_file = output_dir / "truth_catalog.pkl"
GCRCatalogs.set_root_dir("/nfs/turbo/lsa-regier/lsstdesc-public/dc2")
if not truth_catalog_pickle_file.exists():
    truth_cat = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_truth")
    print(sorted(truth_cat.list_all_quantities()))
    truth_cat_data = truth_cat.get_quantities(["id", "redshift"])
    with truth_catalog_pickle_file.open("wb") as cache_out:
        pickle.dump(truth_cat_data, cache_out, pickle.HIGHEST_PROTOCOL)
else:
    # pickle can execute code on load; only read cache files this notebook wrote.
    with truth_catalog_pickle_file.open("rb") as cache_in:
        truth_cat_data = pickle.load(cache_in)

truth_cat_df = pd.DataFrame(truth_cat_data)


In [12]:
# Left-join the LSST photometry onto the truth table by "id": every truth row
# is kept, and truth rows without a photometry match get NaN in those columns.
lsst_truth_redshift_df = pd.merge(
    truth_cat_df,
    lsst_catalog_df_nona_newname,
    how="left",
    left_on="id",
    right_on="id",
)

In [13]:
# Drop rows with a NaN in any column; after the left join this removes truth
# rows that had no matching LSST photometry.
lsst_truth_redshift_df_nona = lsst_truth_redshift_df.dropna()

In [16]:
# Remove the "match_objectId" column when it is present. The truth-catalog cell
# in this section only requests "id" and "redshift", so the column may be absent;
# errors="ignore" makes this a no-op in that case instead of raising KeyError.
lsst_truth_redshift_df_nona = lsst_truth_redshift_df_nona.drop(
    columns=["match_objectId"], errors="ignore"
)

In [17]:
# Persist the full NaN-free truth + LSST photometry table as a pickle.
lsst_truth_redshift_df_nona.to_pickle("/data/scratch/qiaozhih/data/redshift/dc2/lsst_truth_redshift_nona.pkl")

In [18]:
# Summary statistics of the joined table (redshift plus LSST photometry columns).
lsst_truth_redshift_df_nona.describe()

Unnamed: 0,redshift,id,mag_y_lsst,mag_err_i_lsst,mag_err_z_lsst,mag_i_lsst,mag_u_lsst,mag_z_lsst,mag_err_u_lsst,mag_err_y_lsst,mag_err_g_lsst,mag_g_lsst,mag_r_lsst,mag_err_r_lsst
count,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0,7691795.0
mean,1.136858,8062859000.0,24.4908,46.41381,3376.254,25.01757,26.27893,24.79081,5928.891,138.7315,195.6296,25.83652,25.4072,846.2013
std,0.6071841,39062630000.0,1.725277,65803.8,9325938.0,1.616828,1.637081,1.772456,14987060.0,229282.9,287129.0,1.464796,1.491504,1101339.0
min,0.0,811547.0,11.43206,2.846753e-05,4.018415e-05,11.89937,14.37855,11.59003,0.0002694765,4.407217e-05,5.204819e-05,12.91604,12.27066,2.749518e-05
25%,0.7289815,6820354000.0,23.68605,0.04860332,0.1059764,24.27482,25.38903,23.92407,0.1747186,0.1545336,0.0518862,25.14867,24.75342,0.04200911
50%,1.071423,7750373000.0,24.63077,0.1146499,0.2538098,25.29204,26.29794,24.97002,0.3796795,0.3414059,0.1101115,26.03747,25.7083,0.09313366
75%,1.484574,7878367000.0,25.40578,0.2043973,0.5142233,26.00035,27.12148,25.78308,0.7866697,0.6780241,0.192691,26.71187,26.36522,0.1624493
max,3.072735,9025765000000.0,47.32455,176428400.0,25864600000.0,47.54039,52.6722,52.22029,41381830000.0,590063700.0,730451500.0,50.16285,50.92666,2770628000.0


In [19]:
# Write fixed-size train / validation subsets of the truth + photometry table.
# Train = first N_TRAIN_ROWS rows, validation = last N_VAL_ROWS rows (by position).
N_TRAIN_ROWS = 200_000
N_VAL_ROWS = 100_000

# Guard against the two positional slices overlapping, which would leak
# training rows into the validation file.
assert len(lsst_truth_redshift_df_nona) >= N_TRAIN_ROWS + N_VAL_ROWS, (
    "not enough rows for disjoint train/validation subsets"
)

lsst_truth_redshift_df_nona.iloc[:N_TRAIN_ROWS].to_pickle("/data/scratch/qiaozhih/data/redshift/dc2/lsst_truth_redshift_train_nona_200k.pkl")
# `.iloc[-N_VAL_ROWS:]` includes the final row; a stop index of -1 would drop it.
lsst_truth_redshift_df_nona.iloc[-N_VAL_ROWS:].to_pickle("/data/scratch/qiaozhih/data/redshift/dc2/lsst_truth_redshift_val_nona_100k.pkl")

# Generate Catalog Data

In [2]:
# Rebind output_dir to a directory relative to the current working directory.
# From this point on the name no longer refers to the absolute directory that
# was assigned to output_dir earlier in the notebook.
output_dir = Path("./DC2_generate_catalog_output/")
output_dir.mkdir(parents=True, exist_ok=True)

In [8]:
# Fetch the truth catalog columns needed for the merged catalog, using a pickle
# file in output_dir as a cache so the GCR query only runs when it is absent.
truth_catalog_pickle_file = output_dir / "truth_catalog.pkl"
GCRCatalogs.set_root_dir("/nfs/turbo/lsa-regier/lsstdesc-public/dc2")
if not truth_catalog_pickle_file.exists():
    truth_cat = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_truth")
    print(sorted(truth_cat.list_all_quantities()))
    truth_quantities = [
        "id", "match_objectId", "cosmodc2_id", "ra", "dec", "truth_type",
        "flux_g", "flux_i", "flux_r", "flux_u", "flux_y", "flux_z",
        "redshift",
    ]
    truth_cat_data = truth_cat.get_quantities(truth_quantities)
    with truth_catalog_pickle_file.open("wb") as cache_out:
        pickle.dump(truth_cat_data, cache_out, pickle.HIGHEST_PROTOCOL)
else:
    # pickle can execute code on load; only read cache files this notebook wrote.
    with truth_catalog_pickle_file.open("rb") as cache_in:
        truth_cat_data = pickle.load(cache_in)

# Build the frame and expose the redshift column under the name "redshifts".
truth_cat_df = pd.DataFrame(truth_cat_data).rename(columns={"redshift": "redshifts"})

In [11]:
# Fetch shear and true-ellipticity columns from the cosmoDC2 catalog, cached as
# a pickle in output_dir. The GCR root directory is only changed on a cache miss.
galaxy_params_pickle_file = output_dir / "galaxy_params.pkl"
if not galaxy_params_pickle_file.exists():
    GCRCatalogs.set_root_dir("/nfs/turbo/lsa-regier")
    galaxy_params = GCRCatalogs.load_catalog("desc_cosmodc2")
    print(sorted(galaxy_params.list_all_quantities()))
    galaxy_quantities = [
        "galaxy_id",
        "shear_1", "shear_2",
        "ellipticity_1_true", "ellipticity_2_true",
    ]
    galaxy_params_data = galaxy_params.get_quantities(galaxy_quantities)
    with galaxy_params_pickle_file.open("wb") as cache_out:
        pickle.dump(galaxy_params_data, cache_out, pickle.HIGHEST_PROTOCOL)
else:
    # pickle can execute code on load; only read cache files this notebook wrote.
    with galaxy_params_pickle_file.open("rb") as cache_in:
        galaxy_params_data = pickle.load(cache_in)

galaxy_params_df = pd.DataFrame(galaxy_params_data)

['A_v', 'A_v_bulge', 'A_v_disk', 'Mag_true_Y_lsst_z0', 'Mag_true_Y_lsst_z0_no_host_extinction', 'Mag_true_g_lsst_z0', 'Mag_true_g_lsst_z0_no_host_extinction', 'Mag_true_g_sdss_z0', 'Mag_true_g_sdss_z0_no_host_extinction', 'Mag_true_i_lsst_z0', 'Mag_true_i_lsst_z0_no_host_extinction', 'Mag_true_i_sdss_z0', 'Mag_true_i_sdss_z0_no_host_extinction', 'Mag_true_r_lsst_z0', 'Mag_true_r_lsst_z0_no_host_extinction', 'Mag_true_r_sdss_z0', 'Mag_true_r_sdss_z0_no_host_extinction', 'Mag_true_u_lsst_z0', 'Mag_true_u_lsst_z0_no_host_extinction', 'Mag_true_u_sdss_z0', 'Mag_true_u_sdss_z0_no_host_extinction', 'Mag_true_y_lsst_z0', 'Mag_true_y_lsst_z0_no_host_extinction', 'Mag_true_z_lsst_z0', 'Mag_true_z_lsst_z0_no_host_extinction', 'Mag_true_z_sdss_z0', 'Mag_true_z_sdss_z0_no_host_extinction', 'R_v', 'R_v_bulge', 'R_v_disk', 'bulge_to_total_ratio_i', 'convergence', 'dec', 'dec_true', 'ellipticity_1_bulge_true', 'ellipticity_1_bulge_true_dc2', 'ellipticity_1_disk_true', 'ellipticity_1_disk_true_dc2', '

In [22]:
# Fetch blendedness and per-band PSF moments / FWHM from the object catalog,
# cached as a pickle in output_dir.
psf_params_pickle_file = output_dir / "psf_params.pkl"
if not psf_params_pickle_file.exists():
    # NOTE(review): no set_root_dir call happens here, so the catalog is resolved
    # against whichever GCR root was set last. That root differs depending on
    # whether the galaxy_params cell hit its cache — confirm the intended root.
    psf_params = GCRCatalogs.load_catalog("desc_dc2_run2.2i_dr6_object_with_truth_match")
    # Per-band columns, grouped by quantity and ordered g, z, r, i, u, y within each group.
    psf_column_prefixes = ("IxxPSF_pixel", "IyyPSF_pixel", "IxyPSF_pixel", "psf_fwhm")
    psf_quantities = ["objectId", "blendedness"]
    psf_quantities += [
        f"{prefix}_{band}" for prefix in psf_column_prefixes for band in "gzriuy"
    ]
    psf_params_data = psf_params.get_quantities(psf_quantities)
    with psf_params_pickle_file.open("wb") as cache_out:
        pickle.dump(psf_params_data, cache_out, pickle.HIGHEST_PROTOCOL)
else:
    # pickle can execute code on load; only read cache files this notebook wrote.
    with psf_params_pickle_file.open("rb") as cache_in:
        psf_params_data = pickle.load(cache_in)

### Merge Catalog

In [7]:
# Build the three source tables from the raw quantity dicts and join them.
# truth_cat_df is rebuilt from truth_cat_data here, so the "redshift" ->
# "redshifts" rename has to be applied again or it would be silently lost.
truth_cat_df = pd.DataFrame(truth_cat_data).rename(columns={"redshift": "redshifts"})
galaxy_params_df = pd.DataFrame(galaxy_params_data)
psf_params_df = pd.DataFrame(psf_params_data)

# Truth rows -> cosmoDC2 galaxy parameters (left join keeps every truth row).
truth_galaxy_df = truth_cat_df.merge(
    galaxy_params_df,
    left_on="cosmodc2_id",
    right_on="galaxy_id",
    how="left",
)
# ... -> PSF / blendedness of the matched detected object (left join again).
truth_galaxy_psf_df = truth_galaxy_df.merge(
    psf_params_df,
    left_on="match_objectId",
    right_on="objectId",
    how="left",
)

In [8]:
# NOTE(review): this repeats, with identical arguments, the truth_galaxy_df ->
# psf_params_df left join performed at the end of the preceding cell.
truth_galaxy_psf_df = truth_galaxy_df.merge(
    psf_params_df, 
    left_on="match_objectId", 
    right_on="objectId", 
    how="left" 
)

In [7]:
# Load the merged truth + galaxy + PSF catalog from its pickle cache in
# output_dir, or build it from the raw quantity dicts and write the cache.
merged_catalog_pikle_file = output_dir / "merged_catalog.pkl"
if merged_catalog_pikle_file.exists():
    # pickle can execute code on load; only read cache files this notebook wrote.
    with open(merged_catalog_pikle_file, "rb") as inputp:
        truth_galaxy_psf_df = pickle.load(inputp)
    # A cache built from an un-renamed truth table still has a "redshift" column;
    # normalise it to "redshifts" (a no-op when the column is already renamed).
    truth_galaxy_psf_df = truth_galaxy_psf_df.rename(columns={"redshift": "redshifts"})
else:
    # truth_cat_df is rebuilt from truth_cat_data, so re-apply the
    # "redshift" -> "redshifts" rename that would otherwise be lost.
    truth_cat_df = pd.DataFrame(truth_cat_data).rename(columns={"redshift": "redshifts"})
    galaxy_params_df = pd.DataFrame(galaxy_params_data)
    psf_params_df = pd.DataFrame(psf_params_data)
    # Truth rows -> cosmoDC2 galaxy parameters (left join keeps every truth row).
    truth_galaxy_df = truth_cat_df.merge(
        galaxy_params_df,
        left_on="cosmodc2_id",
        right_on="galaxy_id",
        how="left",
    )
    # ... -> PSF / blendedness of the matched detected object.
    truth_galaxy_psf_df = truth_galaxy_df.merge(
        psf_params_df,
        left_on="match_objectId",
        right_on="objectId",
        how="left",
    )
    with open(merged_catalog_pikle_file, "wb") as outp:
        pickle.dump(truth_galaxy_psf_df, outp, pickle.HIGHEST_PROTOCOL)

In [11]:
# Keep only objects whose r-band flux is strictly above the threshold.
flux_min = 50
bright_enough = truth_galaxy_psf_df["flux_r"] > flux_min
truth_galaxy_psf_df = truth_galaxy_psf_df.loc[bright_enough]

In [13]:
# Distribution of blendedness before clipping; the recorded output has a negative minimum.
truth_galaxy_psf_df["blendedness"].describe()

count    9.160922e+06
mean     3.976622e-02
std      1.079735e-01
min     -1.678246e+00
25%      0.000000e+00
50%      1.636379e-03
75%      2.584892e-02
max      1.000000e+00
Name: blendedness, dtype: float64

In [14]:
# Number of rows with no blendedness value (NaN).
truth_galaxy_psf_df["blendedness"].isna().sum()

3452525

In [15]:
# Total number of rows left after the flux filter, for comparison with the NaN count.
len(truth_galaxy_psf_df)

12613447

In [16]:
# Clamp blendedness into [0, 1]; NaN entries stay NaN. `assign` returns a new
# frame, which avoids assigning a column on the `.loc`-filtered frame (that
# pattern triggers pandas' chained-assignment warning).
truth_galaxy_psf_df = truth_galaxy_psf_df.assign(
    blendedness=truth_galaxy_psf_df["blendedness"].clip(lower=0.0, upper=1.0)
)

In [17]:
truth_galaxy_psf_df["blendedness"].describe()

count    9.160922e+06
mean     4.019095e-02
std      1.076950e-01
min      0.000000e+00
25%      0.000000e+00
50%      1.636379e-03
75%      2.584892e-02
max      1.000000e+00
Name: blendedness, dtype: float64

In [18]:
# Persist the flux-filtered merged catalog; the threshold is embedded in the file name.
flux_filtered_pickle_file = output_dir / f"merged_catalog_with_flux_over_{flux_min}.pkl"
truth_galaxy_psf_df.to_pickle(flux_filtered_pickle_file)

In [None]:
# Preview the first rows only instead of rendering the whole multi-million-row frame.
truth_galaxy_psf_df.head()

# Generate Split File

In [10]:
# Visual separator used around each progress message.
banner = "+" * 100

# Compose the Hydra config named "notebook_config" from this directory.
print(banner, flush=True)
print("initialization begins", flush=True)
with initialize(config_path=".", version_base=None):
    notebook_cfg = compose("notebook_config")
print("initialization ends", flush=True)
print(banner, flush=True)

# Instantiate the DC2 data module from the config and run its data preparation step.
print(banner, flush=True)
print("load dc2", flush=True)
dc2: DC2DataModule = instantiate(notebook_cfg.surveys.dc2)
dc2.prepare_data()
print(banner, flush=True)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
initialization begins
initialization ends
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
load dc2





++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
