# Using DC2 and CosmoDC2 to Extract Shear and Convergence #

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

**Note:** The commented-out portions of the cells below illustrate attempted methods that turned out to be suboptimal (especially those taken from DC2-linked tutorials). They are kept for future reference.

In [2]:
import torch
import numpy as np
from os import environ
from pathlib import Path
from einops import rearrange

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from hydra import initialize, compose
from hydra.utils import instantiate
import healpy as hp
import pickle as pkl
import os

In [3]:
# Cache location of the merged lensing catalog -- modify here if different.
file_name = "lensing_catalog.pkl"
cache_dir = os.path.join("/data", "scratch", "shreyasc")
file_path = os.path.join(cache_dir, file_name)

# True when the cache file already exists on disk; the cells below are guarded
# by this flag and skip their work in that case.
file_already_populated = os.path.isfile(file_path)

CosmoDC2 is very large, so we do not want to load all of its data into memory when accessing attributes. Because DC2 covers only a subset of CosmoDC2's sky area, we set filters below that restrict CosmoDC2 to DC2's piece of the sky. In the cell below, we load the DC2 truth catalog with the fields we are interested in and compute its RA/Dec bounding box, which lets us build both the position filters and the HEALPix pixel filter.

In [4]:
if not file_already_populated:
    # GCRCatalogs is imported under the cache guard, so it is only required
    # when the catalog actually has to be rebuilt.
    import GCRCatalogs
    from GCRCatalogs import GCRQuery

    GCRCatalogs.set_root_dir("/data/scratch/dc2_nfs/")
    truth_cat = GCRCatalogs.load_catalog('desc_dc2_run2.2i_dr6_truth')
    truth_quantities = [
        "id", "cosmodc2_id", "ra", "dec", "match_objectId",
        "flux_u", "flux_g", "flux_r", "flux_i", "flux_z", "flux_y",
        "truth_type",
    ]
    truth_data = truth_cat.get_quantities(truth_quantities)

    # NaN-aware RA/Dec bounding box of the truth catalog.
    min_ra, max_ra = np.nanmin(truth_data['ra']), np.nanmax(truth_data['ra'])
    min_dec, max_dec = np.nanmin(truth_data['dec']), np.nanmax(truth_data['dec'])

    # String filters restricting a catalog query to that bounding box.
    pos_filters = [
        f'ra >= {min_ra}',
        f'ra <= {max_ra}',
        f'dec >= {min_dec}',
        f'dec <= {max_dec}',
    ]

    # Corners of the bounding box, converted to unit vectors (lon/lat in degrees).
    corner_ra = np.array([min_ra, max_ra, max_ra, min_ra])
    corner_dec = np.array([min_dec, min_dec, max_dec, max_dec])
    vertices = hp.ang2vec(corner_ra, corner_dec, lonlat=True)

    # 32 is the HEALPix nside handed to query_polygon.
    # NOTE(review): presumably this matches the nside of the "healpix_pixel" native quantity -- confirm.
    ipix = hp.query_polygon(32, vertices, inclusive=True)
    healpix_filter = GCRQuery(
        (lambda h: np.isin(h, ipix, assume_unique=True), "healpix_pixel")
    )

    # ra/dec were only needed for the bounding box; keep the remaining columns.
    truth_data = pd.DataFrame(truth_data).drop(columns=['ra', 'dec'])
    # print(truth_data.keys())

In [5]:
# truth_data

In [6]:
# truth_data.to_csv('/data/scratch/shreyasc/truth_data_only.csv')

In [7]:
# truth_data[((truth_data["match_objectId"] > -1) & (truth_data["is_unique_truth_entry"]))]

Next, load CosmoDC2 and select the quantities we want, restricted by the filters built above.

In [8]:
if not file_already_populated:
    # Point the CosmoDC2 reader at the local copy of the catalog.
    config_overwrite = dict(
        catalog_root_dir='/data/scratch/dc2_nfs/cosmoDC2'
    )

    cosmo_cat = GCRCatalogs.load_catalog('desc_cosmodc2', config_overwrite)

    # Quantities to pull from CosmoDC2. Each name is listed exactly once
    # ("galaxy_id" used to be requested twice).
    cosmo_quantities = [
        "galaxy_id", "shear_1", "shear_2", "convergence", "ra", "dec",
        "mag_true_r", "position_angle_true", "size_minor_disk_true",
        "size_disk_true", "size_minor_bulge_true",
        "size_bulge_true", "bulge_to_total_ratio_i", "redshift",
    ]

    # pos_filters restricts rows to the RA/Dec bounding box; healpix_filter is
    # passed as a native filter on "healpix_pixel".
    cosmo_data = cosmo_cat.get_quantities(
        quantities=cosmo_quantities,
        filters=pos_filters,
        native_filters=healpix_filter,
    )
    cosm_dat = pd.DataFrame(cosmo_data)
    # cosm_dat.to_csv("/data/scratch/shreyasc/cosmo_only.csv")

In [9]:
# cosm_dat

In [15]:
# PSF params - for now these have to come from the object catalog
if not file_already_populated:
    # Quantity names grouped by kind; the concatenation below keeps the same
    # overall order as a single flat list would.
    psf_ixx = [
        "IxxPSF_pixel_g", "IxxPSF_pixel_z", "IxxPSF_pixel_r",
        "IxxPSF_pixel_i", "IxxPSF_pixel_u", "IxxPSF_pixel_y",
    ]
    psf_iyy = [
        "IyyPSF_pixel_g", "IyyPSF_pixel_z", "IyyPSF_pixel_r",
        "IyyPSF_pixel_i", "IyyPSF_pixel_u", "IyyPSF_pixel_y",
    ]
    psf_ixy = [
        "IxyPSF_pixel_g", "IxyPSF_pixel_z", "IxyPSF_pixel_r",
        "IxyPSF_pixel_i", "IxyPSF_pixel_u", "IxyPSF_pixel_y",
    ]
    psf_fwhm = [
        "psf_fwhm_g", "psf_fwhm_z", "psf_fwhm_r",
        "psf_fwhm_i", "psf_fwhm_u", "psf_fwhm_y",
    ]

    match_cat = GCRCatalogs.load_catalog('desc_dc2_run2.2i_dr6_object_with_truth_match')
    # "cosmodc2_id_truth" is fetched alongside the PSF columns because it is
    # the key used later to join these rows onto the CosmoDC2 galaxies.
    psf_params = match_cat.get_quantities(
        psf_ixx + psf_iyy + psf_ixy + psf_fwhm + ["cosmodc2_id_truth"]
    )
    psf = pd.DataFrame(psf_params)

In [None]:
# pd.set_option('display.max_columns', None)
# psf

#### Matching Attempt 1: Using example code from the DC2 friends-of-friends matching tutorial ####
https://github.com/LSSTDESC/DC2-analysis/blob/253625a230d545f4ceb529aae58416ef7a768648/tutorials/matching_fof.ipynb

Note: Do not actually run the commented-out cells below; the matching takes an extremely long time and does not converge.

In [None]:
# cosmo = pd.DataFrame(cosmo_data)
# tru = pd.DataFrame(truth_data)
# import FoFCatalogMatching
# results = FoFCatalogMatching.match(
#     catalog_dict={'truth': tru, 'object': cosmo}, 
#     linking_lengths=1.0,
# )

In [None]:
# truth_mask = results['catalog_key'] == 'truth'
# object_mask = ~truth_mask

# # then np.bincount will give us the number of id occurrences (like a histogram but with integer input)
# n_groups = results['group_id'].max() + 1
# n_truth = np.bincount(results['group_id'][truth_mask], minlength=n_groups)
# print(n_truth[n_truth>10])
# n_object = np.bincount(results['group_id'][object_mask], minlength=n_groups)

# # now n_truth and n_object are the number of truth/object objects in each group
# # we want to make a 2d histogram of (n_truth, n_object).
# n_max = max(n_truth.max(), n_object.max()) + 1
# hist_2d = np.bincount(n_object * n_max + n_truth, minlength=n_max*n_max).reshape(n_max, n_max)

# plt.imshow(np.log10(hist_2d+1), extent=(-0.5, n_max-0.5, -0.5, n_max-0.5), origin='lower');
# plt.xlabel('Number of truth objects');
# plt.ylabel('Number of object objects');
# plt.colorbar(label=r'$\log(N_{\rm groups} \, + \, 1)$');

In [None]:
# # Let's further inspect the objects in the groups that have 1-to-1 truth/object match.

# # first, let's find out the IDs of the groups that have 1-to-1 truth/object match:
# one_to_one_group_mask = np.in1d(results['group_id'], np.flatnonzero((n_truth == 1) & (n_object == 1)))

# # and then we can find the row indices in the *original* truth/object catalogs for those 1-to-1 groups
# truth_idx = results['row_index'][one_to_one_group_mask & truth_mask]
# object_idx = results['row_index'][one_to_one_group_mask & object_mask]

In [None]:
# with open("friends.pkl", "wb") as f:
#     pkl.dump(results, f)
# print("saved friends")

#### Since that approach didn't work, let's try the approach used by @Xinyue: ####
https://github.com/prob-ml/bliss/blob/master/case_studies/dc2/DC2_galaxy_psf_params.ipynb

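Here we merge the catalogs directly on their IDs: CosmoDC2's `galaxy_id` against `cosmodc2_id` from the truth catalog, and then against `cosmodc2_id_truth` from the object catalog.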


In [16]:
if not file_already_populated:
    # Both joins are left joins keyed on CosmoDC2's galaxy_id, so every
    # CosmoDC2 row is kept even when the right-hand table has no match.

    # Step 1: attach the DC2 truth columns.
    cosmo_truth = pd.merge(
        cosm_dat,
        truth_data,
        how="left",
        left_on="galaxy_id",
        right_on="cosmodc2_id",
    )

    # Step 2: attach the PSF columns from the object/truth-match catalog.
    merge_with_object = pd.merge(
        cosmo_truth,
        psf,
        how="left",
        left_on="galaxy_id",
        right_on="cosmodc2_id_truth",
    )

In [None]:
# merge_with_object.shape

In [22]:
if not file_already_populated:
    # Dump to a temporary sibling file and rename it into place afterwards.
    # If the dump is interrupted, no truncated pickle is left at file_path,
    # where the os.path.isfile check would mistake it for a finished cache
    # and skip regeneration on every later run.
    tmp_path = file_path + ".tmp"
    with open(tmp_path, 'wb') as f:
        pkl.dump(merge_with_object, f)
    # os.replace overwrites the destination and is atomic on POSIX when both
    # paths are on the same filesystem.
    os.replace(tmp_path, file_path)