## Preparation

The code below enables IPython's `autoreload` extension, so that local packages are automatically re-imported whenever their source changes while we work in the Jupyter notebook.

In [None]:
%load_ext autoreload
%autoreload 2
%aimport

import matplotlib.pyplot as plt
import torch
import numpy as np
from inverse_optim import gen_data
from inverse_optim import research_plot
from inverse_optim import sancho
import tadasets
import powerbox as pbox

# Circle and the Figure Eight

First, choose which figure you would like to generate. You can comment out the figure you are not interested in.

In [None]:
# Synthetic data set that we want to "approximate": N noisy sample points
N = 300

# Pick ONE target shape and leave the other one commented out.
goal_pts = tadasets.dsphere(n=N, d=1, noise=0.1)  # circle
# goal_pts = tadasets.infty_sign(n=N, noise=0.1)  # figure eight

# Stochastic Gradient Descent (SGD) operates on tensors, so convert the samples
goal_pts = torch.tensor(goal_pts)

# Plot the initial/goal data set (first two coordinates of every point)
P = goal_pts.detach().numpy()
xs, ys = P[:, 0], P[:, 1]
plt.scatter(xs, ys)
plt.show()

## Alpha Filtration

## Ripser Filtration

## Alpha DTM Filtration

## Alpha-Ripser Hybrid Filtration

### Creation of the new dataset

In [5]:
# Creation of the PD that we want to get to: the target diagram is computed
# from the goal point cloud by gen_data.create_alpha_pd (presumably an
# alpha-filtration persistence diagram, judging by the helper's name).
# NOTE(review): the generation cell further down passes
# filtr="alpha_rips_hybrid" -- confirm that an alpha PD is the intended target
# for the hybrid filtration.
goal_pd = gen_data.create_alpha_pd(goal_pts)

First, we need to find out what the optimal learning rate is. If you would like to make the computation quicker (at the cost of accuracy), pass the option `sliced=True` to use the sliced Wasserstein distance as the metric.

In [None]:
# Candidate learning rates: 6 evenly spaced values from 0.001 to 0.2 (inclusive)
lr_list = np.linspace(0.001, 0.2, num=6)

# Try every candidate against the goal PD (see research_plot.research_lr);
# set sliced=True for a faster, less accurate run.
research_plot.research_lr(
    lr_list=lr_list,
    goal_pd=goal_pd,
    amount=N,
    dim=2,
    epochs=300,
    decay_speed=30,
    sliced=False,
)

After running the above code, plug in the best learning rate:

In [None]:
# Generation of the new dataset.
# `lr` is the learning rate chosen from the learning-rate study above;
# `amount` and `dim` match the size and dimension of the goal point cloud.
final_pts = gen_data.generate_data(
    goal_pd=goal_pd,
    amount=N,
    dim=2,
    lr=0.08,
    epochs=600,
    decay_speed=30,
    investigate=False,
    sliced=False,
    filtr="alpha_rips_hybrid",
)

### The power spectrum

In [None]:
# Detach both point clouds from the autograd graph and convert them to NumPy
# arrays, in the order (goal, generated).
original_pts, produced_pts = (
    pts.detach().numpy() for pts in (goal_pts, final_pts)
)

In [None]:
# Power spectra of the original and the generated point clouds.
# When passing samples, the number of grid points (N=...) is also required.
# NOTE(review): the two calls use different box lengths (2.0 vs 8.0) -- confirm
# this is intentional, since it affects how directly the spectra compare.
# NOTE(review): N is the number of sample points and is reused here as the
# grid resolution -- confirm that is the intended value.
p_k_samples, bins_samples = pbox.get_power(
    original_pts,
    2.0,
    N=N,
)
p_k_samples_new, bins_samples_new = pbox.get_power(
    produced_pts,
    8.0,
    N=N,
)

In [None]:
# Compare the power spectrum of the original point cloud with the one of the
# generated point cloud on log-log axes.
# The legend labels are shape-neutral because the goal data set may be either
# the circle or the figure eight.
fig_ps, ax_ps = plt.subplots()
ax_ps.plot(bins_samples, p_k_samples, label="Original power spectrum")
ax_ps.plot(bins_samples_new, p_k_samples_new, label="Generated power spectrum")

ax_ps.set_xscale('log')
ax_ps.set_yscale('log')

# Label the axes and title the figure so it can be read on its own
ax_ps.set_xlabel("k")
ax_ps.set_ylabel("P(k)")
ax_ps.set_title("Power spectrum: original vs. generated point cloud")
ax_ps.legend()
plt.show()

# Sancho

In [None]:
# Load dataset
# NOTE(review): machine-specific absolute path -- adjust it to wherever the
# Sancho catalogue is stored locally.
sancho_catalog_path = '/Users/sliemela/Downloads/Sancho/fiducial_HOD_fid_NFW_sample0_1Gpc_z0.50_RSD3_run0.npz'

# np.load on an .npz archive keeps the file open; the context manager closes it
# once the arrays we need have been read into memory.
with np.load(sancho_catalog_path) as cat:
    pos = cat['pos']        # shape: (N_galaxies, 3) --> X,Y,Z position of each galaxy in Mpc/h
    vel = cat['vel']        # shape: (N_galaxies, 3) --> Vx, Vy, Vz velocity of the galaxy in km/s
    gtype = cat['gtype']

# Split up the dataset into bins of galaxy positions
split = (2, 2, 2)
bins = sancho.bin(pos, split)

# Plotting the bins: one 3D scatter per non-empty bin
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

for bin_pts in bins:
    # Empty bins have no rows to index, so skip them
    if len(bin_pts) != 0:
        ax.scatter3D(bin_pts[:, 0], bin_pts[:, 1], bin_pts[:, 2])

ax.set_xlabel('X [Mpc/h]')
ax.set_ylabel('Y [Mpc/h]')
ax.set_zlabel('Z [Mpc/h]')
plt.show()

In [None]:
# Wasserstein distances between the Sancho bins, as returned by
# sancho.compare_wasser_alpha
list_of_wasser_dist = sancho.compare_wasser_alpha(bins)

# Summary statistics of those distances: mean and standard deviation
wasser_mean, wasser_std = (
    np.mean(list_of_wasser_dist),
    np.std(list_of_wasser_dist),
)

# One value per line: mean first, then standard deviation
print(wasser_mean, wasser_std, sep="\n")

# NOTE: the code of compare_wasser_alpha has been changed to only consider the first 3000. In the future, we may consider all pairs. 