# Using DC2 and CosmoDC2 to Extract Shear and Convergence #

In [1]:
# Load the autoreload extension and set it to mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

Note: the commented-out portions of the cells below illustrate attempted methods that turned out to be suboptimal (especially those taken from DC2-linked tutorials); they are kept for future reference.

In [2]:
from os import environ
from pathlib import Path

import healpy as hp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from einops import rearrange
from hydra import initialize, compose
from hydra.utils import instantiate

import GCRCatalogs
from GCRCatalogs import GCRQuery

In [3]:
# BLISS_HOME is set to the directory two levels above the current working
# directory (presumably the repository root — confirm if the notebook moves).
environ["BLISS_HOME"] = str(Path.cwd().resolve().parents[1])

# Compose the Hydra config named "config" found next to this notebook.
with initialize(version_base=None, config_path="."):
    cfg = compose(config_name="config")

# Separate copy of the config, kept under its own name.
train_dc2_cfg = cfg.copy()

CosmoDC2 is REALLY big so we don't want to load all of its data into memory when accessing attributes. We set filters below that match dc2's piece of the sky since it's a subset of Cosmo's coverage. In the below cell, we load our DC2 catalog with the fields we are interested in and find the relevant sky quadrant, allowing us to build the filters.

In [4]:
# Load the DC2 truth catalog and pull only the columns needed for matching.
# (GCRCatalogs / GCRQuery are imported in the notebook's import cell.)
GCRCatalogs.set_root_dir("/data/dc2/")
truth_cat = GCRCatalogs.load_catalog('desc_dc2_run2.2i_dr6_truth')
truth_data = truth_cat.get_quantities(["id", "cosmodc2_id", "match_objectId", "ra", "dec", "truth_type", "redshift"])

# RA/Dec bounding box of the truth catalog; the nan-aware reductions ignore
# any missing coordinates.
max_ra = np.nanmax(truth_data['ra'])
min_ra = np.nanmin(truth_data['ra'])
max_dec = np.nanmax(truth_data['dec'])
min_dec = np.nanmin(truth_data['dec'])

# Row-level filters (string expressions) restricting cosmoDC2 to that box.
pos_filters = [f'ra >= {min_ra}',f'ra <= {max_ra}', f'dec >= {min_dec}', f'dec <= {max_dec}']

# HEALPix resolution used for the pixel query below.
# NOTE(review): assumes cosmoDC2's "healpix_pixel" uses nside=32 — confirm.
COSMODC2_NSIDE = 32

# Corners of the bounding box as unit vectors (lonlat=True: inputs are degrees
# of longitude/latitude), then every pixel overlapping that polygon.
vertices = hp.ang2vec(np.array([min_ra, max_ra, max_ra, min_ra]),
                      np.array([min_dec, min_dec, max_dec, max_dec]), lonlat=True)
ipix = hp.query_polygon(COSMODC2_NSIDE, vertices, inclusive=True)

# Native filter on "healpix_pixel": keep only pixels that overlap the box.
# `assume_unique` is deliberately not set: np.isin may return wrong results
# with it when the tested values contain repeats, and the plain call is correct
# for scalars and arrays alike.
healpix_filter = GCRQuery((lambda h: np.isin(h, ipix), "healpix_pixel"))

print(truth_data.keys())

dict_keys(['ra', 'match_objectId', 'dec', 'redshift', 'cosmodc2_id', 'id', 'truth_type'])


Next, load in CosmoDC2 and select the quantities we want bounded by the aforementioned filters

In [5]:
# Override the catalog's root directory so the reader looks in the local copy.
config_overwrite = {"catalog_root_dir": "/data/dc2/cosmoDC2"}

cosmo_cat = GCRCatalogs.load_catalog("desc_cosmodc2", config_overwrite=config_overwrite)

In [6]:
# Show every quantity each catalog exposes: cosmoDC2 first, then the truth
# catalog, one list per output line.
print(
    *(catalog.list_all_quantities() for catalog in (cosmo_cat, truth_cat)),
    sep="\n",
)

['A_v', 'sed_7843_486_bulge', 'mag_i_lsst', 'is_central', 'mag_Y_lsst_no_host_extinction', 'mag_z_lsst_no_host_extinction', 'Mag_true_r_sdss_z0', 'mag_z', 'mag_r_lsst_no_host_extinction', 'sed_3184_197_bulge', 'sed_3590_222_disk', 'sed_5467_339_bulge', 'sed_3812_236', 'size_minor_disk_true', 'sersic_disk', 'Mag_true_g_lsst_z0_no_host_extinction', 'Mag_true_r_lsst_z0_no_host_extinction', 'Mag_true_g_sdss_z0', 'size_minor_true', 'position_x', 'sed_5148_319_disk_no_host_extinction', 'sed_1246_306_bulge', 'sed_17402_2596_disk', 'mag_i', 'Mag_true_r_lsst_z0', 'sed_6166_382_bulge', 'sed_1246_306_disk', 'sed_7385_458', 'sed_3812_236_bulge', 'Mag_true_g_lsst_z0', 'mag_true_r_lsst_no_host_extinction', 'mag_true_g_sdss', 'sed_6166_382_disk', 'convergence', 'sed_2998_186_no_host_extinction', 'mag_true_Y_lsst_no_host_extinction', 'mag_true_g', 'sed_9978_1489_disk', 'sed_13177_1966_bulge_no_host_extinction', 'sed_5806_360', 'sed_2407_591_disk', 'sed_2407_591', 'mag_true_u_lsst', 'Mag_true_u_lsst_z0

In [7]:
# Fetch only the lensing-related columns from cosmoDC2, restricted by the
# RA/Dec box (`pos_filters`) and the HEALPix native filter (`healpix_filter`).
cosmo_data = cosmo_cat.get_quantities(
    ["galaxy_id", "redshift", "redshift_true", "ra", "dec", "shear_1", "shear_2", "convergence"],
    filters=pos_filters,
    native_filters=healpix_filter,
)
print(cosmo_data.keys())

dict_keys(['galaxy_id', 'redshift_true', 'ra', 'shear_1', 'dec', 'convergence', 'redshift', 'shear_2'])


#### Matching Attempt 1: Using example code from ####
https://github.com/LSSTDESC/DC2-analysis/blob/253625a230d545f4ceb529aae58416ef7a768648/tutorials/matching_fof.ipynb

Note: do not actually run the commented-out cells below; the matching takes forever and does not converge.

In [8]:
# cosmo = pd.DataFrame(cosmo_data)
# tru = pd.DataFrame(truth_data)
# import FoFCatalogMatching
# results = FoFCatalogMatching.match(
#     catalog_dict={'truth': tru, 'object': cosmo}, 
#     linking_lengths=1.0,
# )

In [9]:
# truth_mask = results['catalog_key'] == 'truth'
# object_mask = ~truth_mask

# # then np.bincount will give us the number of id occurrences (like a histogram but with integer input)
# n_groups = results['group_id'].max() + 1
# n_truth = np.bincount(results['group_id'][truth_mask], minlength=n_groups)
# print(n_truth[n_truth>10])
# n_object = np.bincount(results['group_id'][object_mask], minlength=n_groups)

# # now n_truth and n_object are the number of truth/object objects in each group
# # we want to make a 2d histogram of (n_truth, n_object). 
# n_max = max(n_truth.max(), n_object.max()) + 1
# hist_2d = np.bincount(n_object * n_max + n_truth, minlength=n_max*n_max).reshape(n_max, n_max)

# plt.imshow(np.log10(hist_2d+1), extent=(-0.5, n_max-0.5, -0.5, n_max-0.5), origin='lower');
# plt.xlabel('Number of truth objects');
# plt.ylabel('Number of object objects');
# plt.colorbar(label=r'$\log(N_{\rm groups} \, + \, 1)$');

In [10]:
# # Let's further inspect the objects in the groups that have 1-to-1 truth/object match.

# # first, let's find out the IDs of the groups that have 1-to-1 truth/object match:
# one_to_one_group_mask = np.in1d(results['group_id'], np.flatnonzero((n_truth == 1) & (n_object == 1)))

# # and then we can find the row indices in the *original* truth/object catalogs for those 1-to-1 groups
# truth_idx = results['row_index'][one_to_one_group_mask & truth_mask]
# object_idx = results['row_index'][one_to_one_group_mask & object_mask]

In [11]:
# with open("friends.pkl", "wb") as f:
#     pickle.dump(results, f)
# print("saved friends")

#### Now, since that approach didn't work, let's try something that was attempted by @Xinyue: ####
https://github.com/prob-ml/bliss/blob/master/case_studies/dc2/DC2_galaxy_psf_params.ipynb

merging on galaxy_id and cosmodc2_id


In [12]:
# pandas is already imported as `pd` in the notebook's import cell, so it is
# not re-imported here.

# Wrap the two catalog dictionaries in DataFrames so they can be joined.
df_data = pd.DataFrame(truth_data)
df_galaxy = pd.DataFrame(cosmo_data)

# Inner join: keep only truth rows whose `cosmodc2_id` equals a cosmoDC2
# `galaxy_id`. Columns present in both frames (ra, dec, redshift) receive the
# pandas default suffixes: `_x` for the truth catalog, `_y` for cosmoDC2.
merge_data = df_data.merge(
    df_galaxy,
    left_on="cosmodc2_id",
    right_on="galaxy_id",
    how="inner",
)

# Variant of the merged table with every row containing a missing value removed.
merge_data_fill = merge_data.dropna()

In [13]:
# Persist the id-matched table (row index included) as CSV.
merge_data.to_csv(path_or_buf='/data/scratch/shreyasc/combined_idmatch_truth.csv')

In [14]:
# (rows, columns) of the id-matched table.
merge_data.shape

(43408507, 15)

#### Now, since we actually just care about shear and convergence at locations defined by {ra, dec}, let's try explicitly merging on those ####

In [15]:
# Inner join of the truth and cosmoDC2 tables on sky position instead of ids.
# NOTE(review): this matches on exact equality of the ra/dec values, so rows
# pair up only when both coordinates are identical in the two catalogs.
ra_dec_explicit_merge = df_data.merge(df_galaxy, how="inner", on=["ra", "dec"])

In [16]:
# Persist the position-matched table (row index included) as CSV.
ra_dec_explicit_merge.to_csv(path_or_buf='/data/scratch/shreyasc/combined_ra_dec_merge_truth.csv')

In [17]:
# (rows, columns) of the position-matched table, for comparison with the
# id-matched table's shape.
ra_dec_explicit_merge.shape

(43408508, 13)

In [18]:
# Preview the first five id-matched rows; `_x` columns come from the truth
# catalog and `_y` columns from cosmoDC2.
merge_data.head(n=5)

Unnamed: 0,ra_x,match_objectId,dec_x,redshift_x,cosmodc2_id,id,truth_type,galaxy_id,redshift_true,ra_y,shear_1,dec_y,convergence,redshift_y,shear_2
0,56.851958,15982501021353368,-38.677521,1.782795,8757401501,8757401501,1,8757401501,1.779265,56.851958,-0.02237,-38.677521,-0.022032,1.782795,0.041324
1,56.799616,15982501021353393,-38.677104,0.592967,8750694182,8750694182,1,8750694182,0.593389,56.799616,-0.00066,-38.677104,0.004835,0.592967,-0.000362
2,56.883238,15982501021353397,-38.676882,1.072259,8752812073,8752812073,1,8752812073,1.072098,56.883238,-0.009312,-38.676882,-0.006825,1.072259,0.014221
3,56.85817,15982501021353414,-38.676674,0.232714,8750045581,8750045581,1,8750045581,0.232546,56.85817,-0.001958,-38.676674,-0.000179,0.232714,0.000761
4,56.889715,15982501021353419,-38.676423,0.865789,8751841300,8751841300,1,8751841300,0.865677,56.889715,-0.00854,-38.676423,-0.004813,0.865789,0.007185


In [19]:
# Preview the first five position-matched rows; ra/dec are the join keys and
# therefore appear once, without suffixes.
ra_dec_explicit_merge.head(n=5)

Unnamed: 0,ra,match_objectId,dec,redshift_x,cosmodc2_id,id,truth_type,galaxy_id,redshift_true,shear_1,convergence,redshift_y,shear_2
0,56.851958,15982501021353368,-38.677521,1.782795,8757401501,8757401501,1,8757401501,1.779265,-0.02237,-0.022032,1.782795,0.041324
1,56.799616,15982501021353393,-38.677104,0.592967,8750694182,8750694182,1,8750694182,0.593389,-0.00066,0.004835,0.592967,-0.000362
2,56.883238,15982501021353397,-38.676882,1.072259,8752812073,8752812073,1,8752812073,1.072098,-0.009312,-0.006825,1.072259,0.014221
3,56.85817,15982501021353414,-38.676674,0.232714,8750045581,8750045581,1,8750045581,0.232546,-0.001958,-0.000179,0.232714,0.000761
4,56.889715,15982501021353419,-38.676423,0.865789,8751841300,8751841300,1,8751841300,0.865677,-0.00854,-0.004813,0.865789,0.007185


#### Sanity Check of the ra / dec from the initial merge ####

In [20]:
# Largest absolute RA disagreement between the truth catalog (`ra_x`) and
# cosmoDC2 (`ra_y`) over all id-matched rows. A signed mean can be zero while
# individual rows differ (offsetting errors cancel); the maximum absolute
# difference is 0.0 only if every matched row agrees exactly.
(merge_data["ra_x"] - merge_data["ra_y"]).abs().max()

0.0

In [21]:
# Largest absolute Dec disagreement between the truth catalog (`dec_x`) and
# cosmoDC2 (`dec_y`) over all id-matched rows. A signed mean can be zero while
# individual rows differ (offsetting errors cancel); the maximum absolute
# difference is 0.0 only if every matched row agrees exactly.
(merge_data["dec_x"] - merge_data["dec_y"]).abs().max()

0.0

In [22]:
# Mean signed offset between cosmoDC2's `redshift_true` and the truth
# catalog's `redshift` (`redshift_x`) over the id-matched rows.
merge_data["redshift_true"].sub(merge_data["redshift_x"]).mean()

0.0001810707004943611

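Note via @declan: the redshifts in cosmoDC2 and in the DC2 truth table were reported to be significantly different for common ids, which could indicate a bug in the id-matching. Update: we do not observe this in our findings; the mean difference between cosmoDC2's `redshift_true` and the truth table's `redshift` computed above is only about 1.8e-4.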