**Notebook Author**: Martin Rodriguez Monroy

**Date last tested**: 2022-10-23

**Ran with RSP image**: Weekly 2022_40

**A large container is recommended for this notebook.**

This notebook demonstrates how to bin survey property maps, how to perform Principal Component Analysis (PCA) on them, and how to create a galaxy number map using the Object catalog.

This notebook builds off DP0.2 Tutorial Notebook 03c_Survey_Property_Maps.ipynb, available in the <a href="https://github.com/rubin-dp0/tutorial-notebooks">tutorial-notebooks repository</a>.
Data products are accessed through the Butler, and the user is expected to be familiar with the content of the introductory Butler tutorial in that repo (04a_Introduction_to_the_Butler.ipynb).

# 1.0. Import packages

In [None]:
# general python packages
import numpy as np
import matplotlib.pyplot as plt
from astropy.visualization import ZScaleInterval, LinearStretch, ImageNormalize
from astropy.wcs import WCS
import os
import seaborn as sns
import pandas as pd
import pickle

# specific packages for statistics and principal component analysis
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy import stats
from scipy.optimize import curve_fit

# packages for working with sparse healpix maps
import healsparse as hsp
import skyproj

#packages for working with healpy healpix maps
import healpy as hp

# LSST packages
from lsst.daf.butler import Butler
import lsst.geom as geom

# allow interactive plots
%matplotlib widget

# default plot style is accessible
plt.style.use('tableau-colorblind10')

In [None]:
# Repository label and collection handed to the Butler constructor
config = 'dp02'
collections = '2.2i/runs/DP0.2'
# Single Butler instance used by every butler.get() call in this notebook
butler = Butler(config, collections=collections)

# 2.0. Display the map of magnitude limit

In [None]:
# Retrieve the i-band PSF magnitude-limit survey property map (weighted mean)
hspmap = butler.get('deepCoadd_psf_maglim_consolidated_map_weighted_mean', band='i')

In [None]:
# Keep the two resolutions of the map; nside_coverage is reused later
# when new empty maps are created.
nside_coverage, nside_sparse = hspmap.nside_coverage, hspmap.nside_sparse
print('nside_coverage = ', nside_coverage)
print('nside_sparse = ', nside_sparse)

In [None]:
# Draw the map at its native resolution on a McBryde sky projection
fig, ax = plt.subplots(figsize=(8, 5))
sp = skyproj.McBrydeSkyproj(ax=ax, lon_0=65.0)
# Limit the colour scale to 26.0-26.3
sp.draw_hspmap(hspmap, vmin=26.0, vmax=26.3)
sp.draw_colorbar(label='PSF Maglim (i-band)')
plt.show()

# Drop the plotting handles once the figure has been shown
del fig, ax, sp

# 3.0. Degrade the SP map to lower nside resolution 

In [None]:
deg_nside = 512

In [None]:
# Replace the map with a version degraded to nside = deg_nside;
# the original-resolution map is no longer referenced after this line
hspmap = hspmap.degrade(deg_nside)

In [None]:
print('nside_coverage = ', hspmap.nside_coverage)
print('nside_sparse = ', hspmap.nside_sparse)

In [None]:
# Same plot as before, now for the degraded map
fig, ax = plt.subplots(figsize=(8, 5))
sp = skyproj.McBrydeSkyproj(ax=ax, lon_0=65.0)
# Same colour limits as the full-resolution plot, to ease comparison
sp.draw_hspmap(hspmap, vmin=26.0, vmax=26.3)
sp.draw_colorbar(label='PSF Maglim (i-band)')
plt.show()

# Drop the plotting handles once the figure has been shown
del fig, ax, sp

# 4.0. Binning the SP maps on sky 

In [None]:
def equal_area_bin_edges(map_data,nbins):
    """Return bin edges that split the values into equally populated bins.

    The values are sorted and an edge is placed every
    ``len(map_data) // nbins`` values, so every bin holds (up to the
    remainder, which goes to the last bin) the same number of pixels,
    i.e. the same sky area for an equal-area pixelisation.

    Parameters
    ----------
    map_data : array-like
        Map values, one per pixel.
    nbins : int
        Number of bins; must be at least 1.

    Returns
    -------
    list
        ``nbins + 1`` increasing edges; the first is the minimum and the
        last is the maximum of ``map_data``.

    Raises
    ------
    ValueError
        If ``nbins`` is smaller than 1 or ``map_data`` is empty.
    RuntimeError
        If two or more edges coincide (too many repeated values for the
        requested number of bins).
    """
    if nbins < 1:
        raise ValueError('nbins must be a positive integer')

    # Flatten and sort so that plain lists are accepted as well as arrays
    data_sort = np.sort(np.asarray(map_data), axis=None)
    if data_sort.size == 0:
        raise ValueError('map_data must contain at least one value')

    pix_per_bin = data_sort.size // nbins

    # Lower edge of every bin, then the maximum as the closing upper edge
    binedges = [data_sort[i*pix_per_bin] for i in range(nbins)]
    binedges.append(data_sort[-1])

    if len(np.unique(binedges)) != len(binedges):
        raise RuntimeError('Your bin edges are not unique please set them manually')

    return binedges

In [None]:
# Pixel indices where the degraded map has valid values
valid_pix = hspmap.valid_pixels

In [None]:
# Sky positions of the valid pixels, returned as longitude/latitude
sp_ra, sp_dec = hspmap.valid_pixels_pos(lonlat=True)

In [None]:
# Map values at the valid pixels (pixel indices in the NEST ordering)
vals = hspmap.get_values_pix(valid_pix, nest=True)

In [None]:
print(len(valid_pix),len(vals),len(sp_ra))

In [None]:
binedges1d = equal_area_bin_edges(vals,nbins=4)

In [None]:
# Histogram of the SP values with the equal-area bin edges overlaid
fig, ax = plt.subplots(figsize=(8, 5))
_ = ax.hist(vals, bins=100)
for edge in binedges1d:
    ax.axvline(x=edge, ls='--', color='orange')
# Empty line used only to obtain a single legend entry for all the edges
ax.plot([], [], ls='--', color='orange', label='Bin edges')
ax.grid()
ax.set_yscale("log")
ax.set_xlabel('SP values')
ax.set_ylabel('Number of pixels')
ax.legend(loc="upper left")
plt.show()

del fig, ax

In [None]:
# Assign every valid pixel to a bin numbered 1..nbins.
# Bins are half-open, [edge_k, edge_k+1), and the last bin also includes
# the maximum, so pixels that sit exactly on an edge (the edges are
# themselves map values) land in the right bin instead of silently
# staying in bin 1.
sky_bins = (np.digitize(vals, binedges1d[1:-1]) + 1).astype(np.float64)

In [None]:
# Empty map with the same resolutions as the degraded SP map,
# filled with the bin index of every valid pixel
hsp_bins = hsp.HealSparseMap.make_empty(hspmap.nside_coverage, hspmap.nside_sparse, dtype=np.float64)
hsp_bins.update_values_pix(valid_pix, sky_bins,operation='replace')

In [None]:
# Show which bin each pixel belongs to
fig, ax = plt.subplots(figsize=(8, 5))
sp = skyproj.McBrydeSkyproj(ax=ax, lon_0=65.0)
sp.draw_hspmap(hsp_bins)
# The map holds bin indices, not magnitudes, so label the colorbar accordingly
sp.draw_colorbar(label='PSF Maglim bin (i-band)')
plt.show()

del fig, ax, sp

Let's load additional SP maps and degrade them to the same resolution (nside = 512).

In [None]:
# List every dataset type whose name matches *consolidated_map*
# and remember the names
sp_names = []
for dataset_type in sorted(butler.registry.queryDatasetTypes(expression="*consolidated_map*")):
    sp_names.append(dataset_type.name)
    print(dataset_type.name)

In [None]:
sp_names = ['deepCoadd_exposure_time_consolidated_map_sum',
            'deepCoadd_psf_maglim_consolidated_map_weighted_mean',
            'deepCoadd_psf_size_consolidated_map_weighted_mean',
            'deepCoadd_sky_background_consolidated_map_weighted_mean']

In [None]:
band = 'i'

In [None]:
# For every selected SP map keep its resolutions, its valid pixels and
# the values at those pixels. The band and the target resolution come
# from the variables defined above, so changing `band` or `deg_nside`
# is enough to rerun the analysis consistently.
map_dict = {}
for name in sp_names:
    hspmap_ = butler.get(name, band=band).degrade(deg_nside)
    valid_pixels_ = hspmap_.valid_pixels

    map_dict[name] = {
        'nside_coverage': hspmap_.nside_coverage,
        'nside_sparse': hspmap_.nside_sparse,
        'valid_pixels': valid_pixels_,
        'map_values': np.array(hspmap_.get_values_pix(valid_pixels_, nest=True)),
    }
    # Release the map itself; only the extracted arrays are needed
    del hspmap_
    

In [None]:
# The correlations and the PCA below compare the maps pixel by pixel, so
# every map must be defined on exactly the same pixels as the reference
# map. np.array_equal also copes with arrays of different length, where
# an element-wise `==` followed by `.all()` would fail.
for sp_name in sp_names:
    print(np.array_equal(map_dict[sp_name]['valid_pixels'], valid_pix))

In [None]:
map_dict[sp_names[0]]

In [None]:
# Pearson correlation coefficient between every pair of SP maps
n_maps = len(sp_names)
corr_matrix_p = np.zeros((n_maps, n_maps))
for row, name_row in enumerate(sp_names):
    row_vals = np.array(map_dict[name_row]['map_values'])
    for col, name_col in enumerate(sp_names):
        col_vals = map_dict[name_col]['map_values']
        # pearsonr returns (coefficient, p-value); keep the coefficient
        corr_matrix_p[row, col] = stats.pearsonr(row_vals, col_vals)[0]
print(corr_matrix_p)

In [None]:
# Spearman rank correlation coefficient between every pair of SP maps
n_maps = len(sp_names)
corr_matrix_s = np.zeros((n_maps, n_maps))
for row, name_row in enumerate(sp_names):
    row_vals = np.array(map_dict[name_row]['map_values'])
    for col, name_col in enumerate(sp_names):
        col_vals = map_dict[name_col]['map_values']
        # spearmanr returns (coefficient, p-value); keep the coefficient
        corr_matrix_s[row, col] = stats.spearmanr(row_vals, col_vals)[0]
print(corr_matrix_s)

In [None]:
# Side-by-side heatmaps of the Pearson and Spearman matrices.
# No axes are passed to sns.heatmap, so each heatmap is drawn on the
# subplot that was added just before it.
# NOTE(review): the panels use slots 1 and 3 of a 1x3 grid, leaving the
# middle slot empty -- presumably to make room for the colorbar; confirm.
fig = plt.figure()
ax1 = fig.add_subplot(1,3,1)
sns.heatmap(corr_matrix_p,vmin=-1.0,vmax=1.0,annot=True,cmap='jet',linewidth=0.5,square=True,cbar=True)
plt.title(r'$r_P$ coeff')
ax2 = fig.add_subplot(1,3,3)
plt.title(r'$r_S$ coeff')
sns.heatmap(corr_matrix_s,vmin=-1.0,vmax=1.0,annot=True,cmap='jet',linewidth=0.5,square=True,cbar=True)

del fig, ax1, ax2

# 5.0. Do principal component analysis (PCA) of the SP maps 

In [None]:
def pca_summary(pca, standardised_data, out=True):
    """Tabulate the importance of every principal component.

    Parameters
    ----------
    pca : fitted PCA-like object
        Must provide ``explained_variance_ratio_`` and ``transform``.
    standardised_data : array-like
        Data passed to ``pca.transform`` to obtain the component scores.
    out : bool, optional
        If True (default), also show the table.

    Returns
    -------
    pandas.DataFrame
        One row per component ("PC1", "PC2", ...) with the standard
        deviation of its scores, its proportion of the variance and the
        cumulative proportion.
    """
    variance_ratio = np.asarray(pca.explained_variance_ratio_)
    names = ["PC"+str(i) for i in range(1, len(variance_ratio)+1)]

    sdev = np.std(pca.transform(standardised_data), axis=0)
    # Running total in a single pass instead of re-summing every prefix
    cumulative = np.cumsum(variance_ratio)

    columns = pd.MultiIndex.from_tuples([("sdev", "Standard deviation"), ("varprop", "Proportion of Variance"), ("cumprop", "Cumulative Proportion")])
    summary = pd.DataFrame(np.column_stack([sdev, variance_ratio, cumulative]), index=names, columns=columns)

    if out:
        print("Importance of components:")
        try:
            # Rich rendering when running under IPython/Jupyter
            display(summary)
        except NameError:
            # `display` is only injected by IPython; fall back to plain text
            print(summary)
    return summary

In [None]:
def screeplot(pca, standardised_values, figsize):
    """Plot the variance of every principal component (scree plot).

    Parameters
    ----------
    pca : fitted PCA-like object
        Must provide ``transform``.
    standardised_values : array-like
        Data passed to ``pca.transform`` to obtain the component scores.
    figsize : tuple
        Figure size handed to ``plt.figure``.
    """
    # Variance of the scores along every component
    variances = np.square(np.std(pca.transform(standardised_values), axis=0))
    component_ids = np.arange(1, len(variances) + 1)
    tick_labels = [f"Comp.{comp}" for comp in component_ids]

    fig = plt.figure(figsize=figsize)
    fig.set_tight_layout(True)
    plt.plot(component_ids, variances, "o-")
    plt.xticks(component_ids, tick_labels, ha='right', rotation=50, fontsize=12)
    plt.ylabel("Variance")
    plt.grid()
    plt.show()

In [None]:
# Map name -> array of map values, ready to be turned into a DataFrame
data_dict = {name: entry['map_values'] for name, entry in map_dict.items()}

In [None]:
dataframe = pd.DataFrame(data_dict)

In [None]:
# Standardise every column (one per SP map) with sklearn's `scale`,
# so maps with different units and ranges can be compared
standard = scale(dataframe)
# Back to a DataFrame so that the column names are kept
standard_df = pd.DataFrame(standard,columns=dataframe.columns)
# Fit a PCA with the default settings to the standardised maps
pca = PCA().fit(standard_df)

In [None]:
summary = pca_summary(pca, standard_df)

In [None]:
summary.to_csv('data_frame.csv',header=True)

In [None]:
screeplot(pca, standard_df, figsize=(8,6))

In [None]:
print(pca.components_)

In [None]:
print(pca.components_[0])

In [None]:
# Project the standardised maps onto the principal components once,
# instead of repeating the transform for every component.
pca_values = pca.transform(standard_df)

for i in range(pca_values.shape[1]):
    pc_label = 'PC {0}'.format(i+1)
    print(pc_label)
    pcavalues = pca_values[:,i]

    # Store the per-pixel values of this component in a map on the degraded grid
    hsp_pca_ = hsp.HealSparseMap.make_empty(hspmap.nside_coverage, deg_nside, dtype=np.float64)
    hsp_pca_.update_values_pix(valid_pix, pcavalues,operation='replace')

    # The new map must cover exactly the pixels of the input maps
    assert (hsp_pca_.valid_pixels==valid_pix).all()

    fig, ax = plt.subplots(figsize=(8, 5))
    sp = skyproj.McBrydeSkyproj(ax=ax, lon_0=65.0)
    sp.draw_hspmap(hsp_pca_)
    # Label the colorbar with the component being shown, not with the maglim map
    sp.draw_colorbar(label=pc_label)
    plt.show()

    del fig, ax, sp

# 6.0. Load galaxies from dp02_dc2_catalogs.Object and create a galaxy number map
We previously saved the columns that we are interested in from dp02_dc2_catalogs.Object in a pickle file.


In [None]:
def cat2map(ra,dec,nside,weight=None):
    """Pixelise a catalogue into a full-sky HEALPix map (NEST ordering).

    Parameters
    ----------
    ra, dec : array-like
        Object coordinates in degrees.
    nside : int
        HEALPix nside of the output map.
    weight : array-like, optional
        One weight per object. If None, every object counts as 1.

    Returns
    -------
    map1 : numpy.ndarray
        Number of objects (or sum of weights) per pixel; pixels without
        any object are set to ``hp.UNSEEN``.
    mask : numpy.ndarray of bool
        True for pixels that contain at least one object.
    pixels : numpy.ndarray
        Sorted indices of the pixels that contain at least one object.
    pix_ra, pix_dec : numpy.ndarray
        Positions of those pixels as returned by ``hp.pix2ang`` with
        ``lonlat=True``.
    """
    theta = np.radians(90.-np.asarray(dec))
    phi = np.radians(np.asarray(ra))

    npix = hp.nside2npix(nside)
    p = np.atleast_1d(hp.ang2pix(nside,theta,phi,nest=True))

    # np.bincount accumulates the counts (or weights) per pixel in compiled
    # code; a Python loop over millions of objects is orders of magnitude slower.
    map1 = np.bincount(p, weights=weight, minlength=npix).astype(np.float64)

    unique_pix = np.unique(p)
    mask = np.zeros(npix, dtype=bool)
    mask[unique_pix] = True
    # Flag empty pixels with the HEALPix sentinel rather than with 0
    map1[~mask] = hp.UNSEEN

    pix_ra, pix_dec = hp.pix2ang(nside,unique_pix,nest=True,lonlat=True)

    return map1, mask, unique_pix, pix_ra, pix_dec

In [None]:
# Load the previously saved Object-catalog columns.
# NOTE(review): the path is specific to the author's scratch area and must
# be adapted by other users.
# NOTE(review): pickle.load can execute arbitrary code; only open pickle
# files that you created yourself or otherwise trust.
with open('/scratch/mrmonroy/survpropmap/sources_result.pkl','rb') as f:
    cat_table = pickle.load(f)

In [None]:
cat_table

In [None]:
ra = cat_table['coord_ra']
dec = cat_table['coord_dec']
refext = cat_table['refExtendedness']

In [None]:
# Keep only objects with refExtendedness equal to 1 (used here as the galaxy selection)
sel_mask = (refext==1.0)

In [None]:
sel_ra = np.array(ra[sel_mask])
sel_dec = np.array(dec[sel_mask])

In [None]:
print(len(sel_ra))
print(sel_ra.min(),sel_ra.max())
print(sel_dec.min(),sel_dec.max())

In [None]:
ngal_vals, mask, gal_pixels, pix_ra, pix_dec = cat2map(sel_ra,sel_dec,deg_nside)

In [None]:
assert (np.sum(ngal_vals[mask])==len(sel_ra))

In [None]:
# Distribution of the number of galaxies per occupied pixel
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
n,bins,_ = ax.hist(ngal_vals[mask],bins=100)
ax.grid()
ax.set_xlabel(r'$n_{gal}$')
ax.set_ylabel('Number of pixels')

del fig, ax

In [None]:
# Empty map at the degraded resolution that will hold the galaxy counts
hsp_map_ngal = hsp.HealSparseMap.make_empty(nside_coverage, deg_nside, dtype=np.float64)
# Convert the pixel positions returned by cat2map back into NEST pixel indices.
# NOTE(review): these should coincide with `gal_pixels`; the cell further
# below checks this by reading the map back at `gal_pixels`.
pixels_ngal = hp.ang2pix(deg_nside, np.radians(90. - pix_dec), np.radians(pix_ra), nest=True)
# Every pixel must appear only once, otherwise 'replace' would overwrite values
assert (len(np.unique(pixels_ngal))==len(pixels_ngal))
hsp_map_ngal.update_values_pix(pixels_ngal, ngal_vals[mask],operation='replace')

In [None]:
# Sky map of the number of galaxies per pixel
fig, ax = plt.subplots(figsize=(8, 5))
sp = skyproj.McBrydeSkyproj(ax=ax, lon_0=65.0)
sp.draw_hspmap(hsp_map_ngal)
sp.draw_colorbar(label='ngal')
plt.show()

# Drop the plotting handles once the figure has been shown
del fig, ax, sp

In [None]:
(hsp_map_ngal.get_values_pix(gal_pixels)==ngal_vals[mask]).all()

Now we evaluate the SP maps in the same regions where the $n_{gal}$ map is defined 

In [None]:
# Copy of the degraded maglim map restricted to the pixels that contain galaxies
hspmap_masked = hsp.HealSparseMap.make_empty(nside_coverage, deg_nside, dtype=np.float64)
# NOTE(review): galaxy pixels outside the SP map footprint presumably
# return the map's sentinel value here -- confirm that none occur.
hspmap_masked.update_values_pix(gal_pixels, hspmap.get_values_pix(gal_pixels, nest=True),operation='replace')

In [None]:
# Maglim map restricted to the galaxy footprint, with the same colour
# limits as the earlier maglim plots
fig, ax = plt.subplots(figsize=(8, 5))
sp = skyproj.McBrydeSkyproj(ax=ax, lon_0=65.0)
sp.draw_hspmap(hspmap_masked, vmin=26.0, vmax=26.3)
sp.draw_colorbar(label='PSF Maglim (i-band)')
plt.show()

# Drop the plotting handles once the figure has been shown
del fig, ax, sp

Let's mask all the SP maps 

In [None]:
# Evaluate every SP map at the pixels that contain galaxies, so that the
# SP values line up one-to-one with ngal_vals[mask]. The band comes from
# the `band` variable defined above instead of being hard-coded.
masked_map_dict = {}
for name in sp_names:
    hspmap_ = butler.get(name, band=band).degrade(deg_nside)
    # Both maps must share the resolution for pixel indices to be comparable
    assert (hspmap_.nside_sparse==hsp_map_ngal.nside_sparse)

    # NOTE(review): galaxy pixels outside the SP map footprint presumably
    # return the map's sentinel value -- confirm that none occur, since
    # they would distort the binning done later.
    masked_map_dict[name] = {
        'nside_coverage': hspmap_.nside_coverage,
        'nside_sparse': hspmap_.nside_sparse,
        'map_values': np.array(hspmap_.get_values_pix(gal_pixels, nest=True)),
    }
    # Release the map itself; only the extracted arrays are needed
    del hspmap_
    

# 7.0. Compute 1D relations 
We define the function to compute them 

In [None]:
def bin1d_coords(ngal_map,mask,sp_map_vals,nbins1d):
    """Average galaxy counts in equal-area bins of a survey property.

    Parameters
    ----------
    ngal_map : numpy.ndarray
        Galaxy number map.
    mask : numpy.ndarray of bool
        Selects the pixels of ``ngal_map`` to use; the selected values
        must line up one-to-one with ``sp_map_vals``.
    sp_map_vals : array-like
        Survey property value of every selected pixel.
    nbins1d : int
        Number of equal-area bins.

    Returns
    -------
    sp_in_bin1d : numpy.ndarray
        Mean survey property value in every bin.
    ngal_in_bin1d : numpy.ndarray
        Mean number of galaxies per pixel in every bin.
    err_in_bin1d : numpy.ndarray
        Standard error of that mean (standard deviation / sqrt(N)).
    ngal_footprint : float
        Mean number of galaxies per pixel over all selected pixels.
    """
    sp_map_vals = np.asarray(sp_map_vals)
    ngal_vals = ngal_map[mask]
    ngal_footprint = np.average(ngal_vals)

    binedges1d = equal_area_bin_edges(sp_map_vals,nbins=nbins1d)
    nbins = len(binedges1d)-1

    # Bins are half-open, [edge_k, edge_k+1), and the last one also includes
    # the maximum. The edges are themselves map values, so strict
    # inequalities on both sides would drop those pixels (always including
    # the minimum and the maximum) from the averages.
    bin_index = np.digitize(sp_map_vals, binedges1d[1:-1])

    sp_in_bin1d = np.empty(nbins)
    ngal_in_bin1d = np.empty(nbins)
    err_in_bin1d = np.empty(nbins)
    for ibin in range(nbins):
        in_bin = bin_index==ibin
        ngal_bin = ngal_vals[in_bin]

        sp_in_bin1d[ibin] = np.average(sp_map_vals[in_bin])
        ngal_in_bin1d[ibin] = np.average(ngal_bin)
        err_in_bin1d[ibin] = np.std(ngal_bin)/np.sqrt(len(ngal_bin))

    return sp_in_bin1d, ngal_in_bin1d, err_in_bin1d, ngal_footprint
    

In [None]:
def fun_fit(x,a,b,c):
    """Evaluate the quadratic a*x^2 + b*x + c at x (scalar or array)."""
    quadratic_term = a*x**2.
    linear_term = b*x
    return quadratic_term+linear_term+c

Let's plot an example 

In [None]:
nbins1d = 10

In [None]:
sp_x,ngal_y,ngal_err,ngal_mean = bin1d_coords(ngal_vals,mask,masked_map_dict[sp_names[0]]['map_values'],nbins1d=nbins1d)

In [None]:
# Relative galaxy density as a function of the first SP map
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
relative_density = ngal_y/ngal_mean
# Reference line: no dependence on the survey property
ax.axhline(y=1.0, ls='--', color='b')
ax.plot(sp_x, relative_density, color='r')
ax.errorbar(sp_x, relative_density, yerr=ngal_err/ngal_mean, fmt='.', color='r')
ax.grid()
ax.set_xlabel(sp_names[0])
ax.set_ylabel(r'$n_{gal}/\langle n_{gal} \rangle$')
plt.show()

Now let's do this for all SP maps and fit each 1D relation with a quadratic function 

In [None]:
# 1D relation and quadratic fit for every SP map
dict_1d = {}
for name in sp_names:
    sp_x_, ngal_y_, ngal_err_, ngal_mean_ = bin1d_coords(
        ngal_vals, mask, masked_map_dict[name]['map_values'], nbins1d=nbins1d)

    # Fit the relative density, weighting the points by their errors
    best_fit_, best_cov_ = curve_fit(
        fun_fit, sp_x_, ngal_y_/ngal_mean_, sigma=ngal_err_/ngal_mean_)

    dict_1d[name] = {
        'x': sp_x_,
        'y': ngal_y_,
        'err': ngal_err_,
        'ngal_mean': ngal_mean_,
        # Best-fit curve evaluated at the bin centres
        'yfit': fun_fit(sp_x_, best_fit_[0], best_fit_[1], best_fit_[2]),
    }

In [None]:
dict_1d.keys()

In [None]:
sp_names

In [None]:
# One panel per SP map, laid out on a grid with `num_columns` columns.
# The number of rows follows from the number of maps, so the cell keeps
# working when maps are added to or removed from `sp_names`.
num_columns = 2
num_rows = max(1, -(-len(sp_names) // num_columns))  # ceiling division

# squeeze=False keeps `axs` two-dimensional even for a single row
fig, axs = plt.subplots(num_rows, num_columns, figsize=(8,6), squeeze=False)
panels = axs.ravel()  # row-major order: left to right, top to bottom

index_list = []
for panel, sp_name in zip(panels, sp_names):
    index_list.append(sp_name)

    ngal_mean_ = dict_1d[sp_name]['ngal_mean']
    x_ = dict_1d[sp_name]['x']
    y_ = dict_1d[sp_name]['y']/ngal_mean_
    err_ = dict_1d[sp_name]['err']/ngal_mean_
    # The fit was made on the relative density, so it needs no rescaling
    yfit_ = dict_1d[sp_name]['yfit']

    panel.axhline(y=1.0,ls='--',color='b')
    panel.plot(x_,y_,color='r')
    panel.errorbar(x_,y_,yerr=err_,fmt='.',color='r')
    panel.plot(x_,yfit_,ls='--',color='purple')
    panel.set_title(sp_name, fontsize='8')

# Hide the panels left over when the maps do not fill the grid
for panel in panels[len(sp_names):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()
del fig, axs
print(index_list)