In [None]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats
from sklearn import preprocessing
from sklearn.cluster import KMeans, MiniBatchKMeans

In [None]:
import lib

In [None]:
# Single seeded generator shared by every rng.choice call below; because it is
# stateful, the drawn subsamples depend on the order in which cells are run.
rng = np.random.default_rng(0)

In [None]:
# Names of the two simulation configurations; they differ only in the sign of
# g1_slice. They are used below as keys into lib.const.SIM_SHEAR_CATALOGS /
# lib.const.SIM_REDSHIFT_CATALOGS and to build the neighbor-catalog file names.
shear_step_plus = "g1_slice=0.02__g2_slice=0.00__g1_other=0.00__g2_other=0.00__zlow=0.0__zhigh=6.0"
shear_step_minus = "g1_slice=-0.02__g2_slice=0.00__g1_other=0.00__g2_other=0.00__zlow=0.0__zhigh=6.0"

In [None]:
shear_sim_plus = h5py.File(
    lib.const.SIM_SHEAR_CATALOGS[shear_step_plus],
)
redshift_sim_plus = h5py.File(
    lib.const.SIM_REDSHIFT_CATALOGS[shear_step_plus],
)
neighbors_sim_plus = h5py.File(
    f"/pscratch/sd/s/smau/fiducial-neighbors/neighbors_{shear_step_plus}.hdf5",
)

In [None]:
shear_sim_minus = h5py.File(
    lib.const.SIM_SHEAR_CATALOGS[shear_step_minus],
)
redshift_sim_minus = h5py.File(
    lib.const.SIM_REDSHIFT_CATALOGS[shear_step_minus],
)
neighbors_sim_minus = h5py.File(
    f"/pscratch/sd/s/smau/fiducial-neighbors/neighbors_{shear_step_minus}.hdf5",
)

In [None]:
# Per-object quantities for the +g1 simulation, all taken from the "noshear" catalog:
#   bhat_*   - compared against a tomographic bin value below to select objects
#   cell_*   - output of lib.tomography.get_assignment
#              (presumably a cell index per object; confirm in lib)
#   weight_* - shear weights, used below as histogram and k-means sample weights
bhat_sim_plus = lib.tomography.get_tomography(shear_sim_plus, redshift_sim_plus, "noshear")
cell_sim_plus = lib.tomography.get_assignment(shear_sim_plus, redshift_sim_plus, "noshear")
weight_sim_plus = lib.weight.get_shear_weights(shear_sim_plus["mdet/noshear"])

In [None]:
# Same three quantities for the -g1 simulation.
bhat_sim_minus = lib.tomography.get_tomography(shear_sim_minus, redshift_sim_minus, "noshear")
cell_sim_minus = lib.tomography.get_assignment(shear_sim_minus, redshift_sim_minus, "noshear")
weight_sim_minus = lib.weight.get_shear_weights(shear_sim_minus["mdet/noshear"])

In [None]:
shear_y6 = h5py.File(lib.const.Y6_SHEAR_CATALOG)
redshift_y6 = h5py.File(lib.const.Y6_REDSHIFT_CATALOG)
neighbors_y6 = h5py.File(f"/pscratch/sd/s/smau/fiducial-neighbors/neighbors_y6.hdf5")

In [None]:
bhat_y6 = lib.tomography.get_tomography(shear_y6, redshift_y6, "noshear")
cell_y6 = lib.tomography.get_assignment(shear_y6, redshift_y6, "noshear")
weight_y6 = lib.weight.get_shear_weights(shear_y6["mdet/noshear"])

In [None]:
# Disabled: derive the cells belonging to each tomographic bin from the Y6
# catalog and assert that no cell is shared by all bins. Kept for reference.
# NOTE(review): re-enabling this requires `import functools`, which is not among
# the imports at the top of this notebook.
# cell_assignments = {}
# for tomographic_bin in lib.const.TOMOGRAPHIC_BINS:
#     cell_assignments[tomographic_bin] = np.unique(
#         cell_y6[bhat_y6 == tomographic_bin]
#     ).astype(int)

# assert len(
#     functools.reduce(
#         np.intersect1d,
#         [
#             cells
#             for cells in cell_assignments.values()
#         ],
#     )
# ) == 0
# The mapping shipped in lib.const is used instead of the derivation above.
cell_assignments = lib.const.CELL_ASSIGNMENTS

---

In [None]:
# Everything below is restricted to the first entry of lib.const.TOMOGRAPHIC_BINS.
tomographic_bin = lib.const.TOMOGRAPHIC_BINS[0]

In [None]:
# Selection of +g1 simulation objects whose tomographic label equals the chosen
# bin; used below as a mask on catalog columns and weights.
sel_sim_plus = (bhat_sim_plus == tomographic_bin)

In [None]:
# Same selection for the -g1 simulation.
sel_sim_minus = (bhat_sim_minus == tomographic_bin)

In [None]:
# Same selection for the Y6 catalog.
sel_y6 = (bhat_y6 == tomographic_bin)

In [None]:
# _weight_sim_plus = weight_sim_plus[sel_sim_plus]

In [None]:
# _weight_sim_minus = weight_sim_minus[sel_sim_minus]

In [None]:
# _weight_y6 = weight_y6[sel_y6]

In [None]:
N_SUBSAMPLE = 400_000

In [None]:
_sim_subsample_plus = rng.choice(sum(sel_sim_plus), N_SUBSAMPLE, replace=False)
sim_subsample_plus = np.isin(
    np.arange(sum(sel_sim_plus)),
    _sim_subsample_plus,
)

In [None]:
_sim_subsample_minus = rng.choice(sum(sel_sim_minus), N_SUBSAMPLE, replace=False)
sim_subsample_minus = np.isin(
    np.arange(sum(sel_sim_minus)),
    _sim_subsample_minus,
)

In [None]:
_y6_subsample = rng.choice(sum(sel_y6), N_SUBSAMPLE, replace=False)
y6_subsample = np.isin(
    np.arange(sum(sel_y6)),
    _y6_subsample,
)

In [None]:
# Preview of the three neighbor columns for the subsampled sim (+) objects.
# The expressions are wrapped in a single tuple so that all three arrays are
# displayed; as separate statements with trailing commas only the last one was.
(
    neighbors_sim_plus["mdet"]["noshear"]["mag"][sel_sim_plus][sim_subsample_plus],
    neighbors_sim_plus["mdet"]["noshear"]["neighbor_mag"][sel_sim_plus][sim_subsample_plus],
    neighbors_sim_plus["mdet"]["noshear"]["neighbor_distance"][sel_sim_plus][sim_subsample_plus],
)

In [None]:
# Weighted histograms of the nearest-neighbor distance for the Y6, sim (+) and
# sim (-) subsamples on shared logarithmic bins.
NBINS = 100
bins = np.geomspace(1e-6, 1e-2, NBINS + 1)

fig, axs = plt.subplots(1, 1)

for _label, _neighbors, _sel, _subsample, _weight in [
    ("Y6", neighbors_y6, sel_y6, y6_subsample, weight_y6),
    ("sim (+)", neighbors_sim_plus, sel_sim_plus, sim_subsample_plus, weight_sim_plus),
    ("sim (-)", neighbors_sim_minus, sel_sim_minus, sim_subsample_minus, weight_sim_minus),
]:
    axs.hist(
        _neighbors["mdet"]["noshear"]["neighbor_distance"][_sel][_subsample],
        weights=_weight[_sel][_subsample],
        bins=bins,
        histtype="step",
        label=_label,
    )

axs.set_xscale("log")

axs.set_xlabel("nearest neighbor distance")
axs.set_ylabel("counts")

# The histograms carry labels, so draw the legend that makes them visible.
axs.legend()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
X_sim_plus = np.stack(
    [
        np.log10(neighbors_sim_plus["mdet"]["noshear"]["neighbor_mag"][sel_sim_plus][sim_subsample_plus]),
        neighbors_sim_plus["mdet"]["noshear"]["mag"][sel_sim_plus][sim_subsample_plus],
        neighbors_sim_plus["mdet"]["noshear"]["neighbor_distance"][sel_sim_plus][sim_subsample_plus],
    ],
    axis=-1,
)

In [None]:
weights_sim_plus = weight_sim_plus[sel_sim_plus][sim_subsample_plus]

In [None]:
X_sim_minus = np.stack(
    [
        np.log10(neighbors_sim_minus["mdet"]["noshear"]["neighbor_mag"][sel_sim_minus][sim_subsample_minus]),
        neighbors_sim_minus["mdet"]["noshear"]["mag"][sel_sim_minus][sim_subsample_minus],
        neighbors_sim_minus["mdet"]["noshear"]["neighbor_distance"][sel_sim_minus][sim_subsample_minus],
    ],
    axis=-1,
)

In [None]:
weights_sim_minus = weight_sim_minus[sel_sim_minus][sim_subsample_minus]

In [None]:
X_sim = np.concatenate([X_sim_plus, X_sim_minus])
W_sim = np.concatenate([weights_sim_plus, weights_sim_minus])

In [None]:
X_y6 = np.stack(
    [
        np.log10(neighbors_y6["mdet"]["noshear"]["neighbor_mag"][sel_y6][y6_subsample]),
        neighbors_y6["mdet"]["noshear"]["mag"][sel_y6][y6_subsample],
        neighbors_y6["mdet"]["noshear"]["neighbor_distance"][sel_y6][y6_subsample],
    ],
    axis=-1,
)

In [None]:
W_y6 = weight_y6[sel_y6][y6_subsample]

In [None]:
X = np.concatenate([X_sim, X_y6])
W = np.concatenate([W_sim, W_y6])

In [None]:
scaler = preprocessing.StandardScaler().fit(X)

In [None]:
ns = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
inertias = []
inertias_batch = []
for n in ns:
    kmeans = KMeans(
        n_clusters=n,
        random_state=0,
    ).fit(
        scaler.transform(X),
        sample_weight=W,
    )
    inertias.append(kmeans.inertia_)

    batch_kmeans = MiniBatchKMeans(
        n_clusters=n,
        random_state=0,
    ).fit(
        scaler.transform(X),
        sample_weight=W,
    )
    inertias_batch.append(batch_kmeans.inertia_)

In [None]:
# Inertia versus cluster count for both estimators on log-log axes; the dotted
# vertical line marks 200 clusters. The curves are labelled so the legend tells
# the full and mini-batch fits apart.
plt.axvline(200, ls=":")
plt.plot(
    ns,
    inertias,
    marker="o",
    markersize=2,
    c="k",
    label="KMeans",
)
plt.plot(
    ns,
    inertias_batch,
    marker="o",
    markersize=2,
    c="r",
    ls="--",
    label="MiniBatchKMeans",
)
plt.xlabel("N Clusters")
plt.ylabel("Inertia")
plt.loglog()
plt.legend()
plt.show()

In [None]:
N_CLUSTERS = 200

In [None]:
kmeans = KMeans(
    n_clusters=N_CLUSTERS,
    random_state=0,
).fit(
    scaler.transform(X),
    sample_weight=W,
)

In [None]:
# y_sim = kmeans.predict(scaler.transform(X_sim))
y_sim_plus = kmeans.predict(scaler.transform(X_sim_plus))
y_sim_minus = kmeans.predict(scaler.transform(X_sim_minus))

In [None]:
y_y6 = kmeans.predict(scaler.transform(X_y6))

In [None]:
# Discrete colour scale with one boundary per fitted cluster label plus a
# closing upper edge.
cmap = mpl.cm.cubehelix
_cluster_labels = np.unique(kmeans.labels_)
bounds = np.append(_cluster_labels, _cluster_labels[-1] + 1)
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

# Points shown per panel. The draws are made in the order sim (+), sim (-), Y6
# so that the shared generator advances exactly as before.
N_PLOT = 1_000
_sim_subsample_plus = rng.choice(len(X_sim_plus), N_PLOT, replace=False)
_sim_subsample_minus = rng.choice(len(X_sim_minus), N_PLOT, replace=False)
_y6_subsample = rng.choice(len(X_y6), N_PLOT, replace=False)

fig, axs = plt.subplots(2, 3, sharex="row", sharey="row")

for ax in axs.ravel():
    ax.set_facecolor("grey")

# One column of panels per dataset: (title, plotted features, plotted labels).
_panels = [
    ("sim (+)", X_sim_plus[_sim_subsample_plus], y_sim_plus[_sim_subsample_plus]),
    ("sim (-)", X_sim_minus[_sim_subsample_minus], y_sim_minus[_sim_subsample_minus]),
    ("Y6", X_y6[_y6_subsample], y_y6[_y6_subsample]),
]
_scatter_style = dict(cmap=cmap, norm=norm, s=12)

for _col, (_title, _feats, _labels) in enumerate(_panels):
    _top = axs[0, _col]
    _bottom = axs[1, _col]

    # Top row: 10**(column 0) against the difference of columns 1 and 2.
    _top.scatter(
        10**_feats[:, 0],
        _feats[:, 1] - _feats[:, 2],
        c=_labels,
        **_scatter_style,
    )
    _top.set_xlabel("distance")
    _top.set_title(_title)

    # Bottom row: column 1 against column 2.
    _bottom.scatter(
        _feats[:, 1],
        _feats[:, 2],
        c=_labels,
        **_scatter_style,
    )
    _bottom.set_xlabel("$mag$")

# Rows share their y axis, so only the left-most panels carry a y label.
axs[0, 0].set_ylabel("$mag - mag_{neighbor}$")
axs[1, 0].set_ylabel("$mag_{neighbor}$")

# fig.supxlabel("nearest neighbor distance [arcsec]")
# fig.supylabel("$mag - mag_{neighbor}$")

cb = fig.colorbar(
    mpl.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axs.ravel(),
    label="K-Means Cluster Label",
    spacing="proportional",
)
cb.locator = mpl.ticker.MaxNLocator(nbins="auto", integer=True)
cb.minor_locator = mpl.ticker.NullLocator()

plt.show()

---

In [None]:
w_bins = np.arange(N_CLUSTERS)

In [None]:
w_plus = np.bincount(y_y6) / np.bincount(y_sim_plus)
w_plus = np.ma.masked_invalid(w_plus)
w_plus /= np.mean(w_plus)

In [None]:
w_minus = np.bincount(y_y6) / np.bincount(y_sim_minus)
w_minus = np.ma.masked_invalid(w_minus)
w_minus /= np.nanmean(w_minus)

In [None]:
# Distribution of the per-cluster weights for both simulation branches.
# Only the unmasked values are passed on, so masked (non-finite) weights
# cannot enter the automatic bin range.
plt.hist(
    [np.ma.compressed(w_plus), np.ma.compressed(w_minus)],
    label=["sim (+)", "sim (-)"],
    bins=100,
)
plt.xlabel("weight")
plt.legend()
plt.show()

In [None]:
_w_ind_plus = np.digitize(
    y_sim_plus,
    w_bins,
    right=True,
)

_w_ind_minus = np.digitize(
    y_sim_minus,
    w_bins,
    right=True,
)

In [None]:
w_plus_low = np.quantile(w_plus, 0.1)
w_plus_high = np.quantile(w_plus, 0.9)

w_minus_low = np.quantile(w_minus, 0.1)
w_minus_high = np.quantile(w_minus, 0.9)

In [None]:
# Normalised distributions of mag, neighbor mag and neighbor distance for Y6,
# for the full sim (+) subsample, and for the sim (+) objects with a high
# (> 1.5) or low (< 0.5) cluster weight.
NBINS = 100

_catalog_y6 = neighbors_y6["mdet"]["noshear"]
_catalog_sim = neighbors_sim_plus["mdet"]["noshear"]

# Weight of each subsampled sim (+) object, looked up through its cluster index.
# Masked weights become nan, which fails both comparisons below.
_w_object = np.ma.filled(w_plus[_w_ind_plus], np.nan)
_is_high = _w_object > 1.5
_is_low = _w_object < 0.5

fig, axs = plt.subplots(1, 3)

# (catalog column, x label, bin edges) for each panel.
_panels = [
    ("mag", "mag", np.linspace(16, 26, NBINS + 1)),
    ("neighbor_mag", "neighbor mag", np.linspace(16, 26, NBINS + 1)),
    ("neighbor_distance", "neighbor distance", np.geomspace(1e-6, 1e-2, NBINS + 1)),
]

for ax, (_column, _xlabel, bins) in zip(axs, _panels):
    # Read each subsampled column once and reuse it for all curves of the panel.
    _values_y6 = _catalog_y6[_column][sel_y6][y6_subsample]
    _values_sim = _catalog_sim[_column][sel_sim_plus][sim_subsample_plus]

    # The same 1.5 / 0.5 thresholds apply to every panel; the distance panel
    # previously drew its "w > 1.5" curve with a threshold of 0.5.
    for _values, _colour, _label in [
        (_values_y6, "gray", "Y6"),
        (_values_sim, "k", "sim"),
        (_values_sim[_is_high], "r", "sim (w > 1.5)"),
        (_values_sim[_is_low], "b", "sim (w < 0.5)"),
    ]:
        ax.hist(
            _values,
            bins=bins,
            density=True,
            ec=_colour,
            histtype="step",
            label=_label,
        )

    ax.set_xlabel(_xlabel)
    ax.set_yticks([])

# Distances are binned logarithmically, so show that panel on a log axis.
axs[2].set_xscale("log")

axs[1].legend()

plt.show()

---