In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from numpy.linalg import norm as norm 
from numpy.linalg import pinv
from collections import Counter

 
from sklearn.preprocessing import PolynomialFeatures
from numpy.polynomial import chebyshev as cheb

from tqdm import tqdm 
import time

In [None]:

# --- Grid and Chebyshev design matrix ----------------------------------------
X = np.linspace(-1., 1., 1001)      # equispaced evaluation grid on [-1, 1]
k = 100                             # highest degree of the modeled basis
nk = 2*k + 1                        # highest degree of the full basis
M = cheb.chebvander(X, deg=nk)      # column j holds T_j evaluated on X
n_coef = nk + 1                     # total number of Chebyshev coefficients

# Column index sets: degrees 0..k are fitted ("modeled"); the remaining
# degrees k+1..nk only enter through the true function ("unmodeled").
obs_set = np.arange(k + 1)
unobs_set = np.setdiff1d(np.arange(n_coef), obs_set)
rand_state = np.random.RandomState(10)

# --- Candidate true-coefficient vectors ---------------------------------------
# Every draw below consumes the same random stream, so the order of the
# `rand_state` calls must stay exactly as written.

# 1. Sparse in the modeled block, zero in the unmodeled block.
ctrue1 = np.zeros(n_coef)
idx = rand_state.choice(obs_set, size=5, replace=False)
ctrue1[idx] = rand_state.normal(0, 1, size=5)

# 2. Zero in the modeled block, sparse in the unmodeled block.
ctrue2 = np.zeros(n_coef)
idx = rand_state.choice(unobs_set, size=10, replace=False)
ctrue2[idx] = rand_state.normal(0, 0.5, size=10)

# 3. Sparse in the modeled block, dense small-variance values in the unmodeled block.
ctrue3 = np.zeros(n_coef)
idx = rand_state.choice(obs_set, size=5, replace=False)
ctrue3[idx] = rand_state.normal(0, 5, size=5)
ctrue3[unobs_set] = rand_state.normal(0, 0.1, size=unobs_set.size)

# 4. Dense everywhere.
ctrue4 = rand_state.normal(0, 1, size=n_coef)

# 5. Sparse everywhere, with a few larger unmodeled coefficients.
ctrue5 = np.zeros(n_coef)
idx_m = rand_state.choice(obs_set, size=3, replace=False)
idx_u = rand_state.choice(unobs_set, size=5, replace=False)
ctrue5[idx_m] = rand_state.normal(0, 1, size=3)
ctrue5[idx_u] = rand_state.normal(0, 3, size=5)

# 6. Dense in the modeled block, sparse in the unmodeled block.
ctrue6 = np.zeros(n_coef)
ctrue6[obs_set] = rand_state.normal(0, 1, size=obs_set.size)
idx_u = rand_state.choice(unobs_set, size=5, replace=False)
ctrue6[idx_u] = rand_state.normal(0, 2, size=5)

# 7. One fixed coefficient on each side (index 1 and index 2k), plus
#    standard-normal noise added to every coefficient.
ctrue7 = np.zeros(n_coef)
ctrue7[1] = 0.5
ctrue7[2*k] = 1.5
ctrue7 += rand_state.randn(ctrue7.size)

# Registered in the same order as they were generated.
configs = {
    "sparse_modeled_only": ctrue1,
    "sparse_unmodeled_only": ctrue2,
    "signal_modeled_noise_unmodeled": ctrue3,
    "dense_everywhere": ctrue4,
    "sparse_with_unmodeled_spikes": ctrue5,
    "dense_modeled_sparse_unmodeled": ctrue6,
    "one_signal_each_side_rest_noise": ctrue7,
}

# --- Configuration selected for this run --------------------------------------
test_name = "one_signal_each_side_rest_noise"
ctrue = configs[test_name]

nstart = 5           # size of the initial training set
max_samples = 300    # size of the largest training set
M_modeled = M[:, obs_set]

# True function values on the full grid.
f = np.array(M @ ctrue)

# Reference fit: least squares on the whole grid using only the modeled
# columns, stored in a full-length vector that is zero on the unmodeled block.
sol = np.linalg.lstsq(M_modeled, f, rcond=None)
chat_true = np.zeros_like(ctrue)
chat_true[obs_set] = sol[0]

In [None]:
# Index of every coefficient in the true coefficient vector (ctrue is 1-D).
basis_idx = np.arange(ctrue.shape[0])

def run_simulation(samples, probs=None, nstart=5):
    """Fit the modeled coefficients on growing prefixes of `samples`.

    For every training-set size ``n = nstart, ..., samples.size`` the first
    ``n`` entries of `samples` select rows of the module-level design matrix
    ``M``.  A least-squares problem restricted to the columns in ``obs_set``
    is solved against the values ``M[T, :] @ ctrue``, with row ``i`` of the
    system scaled by ``1 / sqrt(probs[i] * n)``.

    Parameters
    ----------
    samples : 1-D integer array
        Row indices into ``M``, in the order in which they are added.
    probs : 1-D array, optional
        One value per entry of `samples`, used for the row scaling above.
        Defaults to all ones.
    nstart : int
        Size of the first training set.

    Returns
    -------
    dict of lists, one entry per training-set size:
        'chat'        fitted coefficients for the ``obs_set`` columns,
        'errs'        2-norm over the full grid of
                      ``M @ ctrue - M[:, obs_set] @ chat``,
        'MTMinv_norm' spectral norm of the pseudo-inverse of the (unweighted)
                      modeled training block, NaN if it cannot be computed,
        'A_norm'      spectral norm of that pseudo-inverse times the unmodeled
                      training block, NaN if it cannot be computed,
        'runtime'     seconds elapsed since the call started.

    Notes
    -----
    Reads the module-level ``M``, ``obs_set`` and ``ctrue``.  The true function
    values are recomputed here from ``M @ ctrue`` rather than read from the
    global ``f``, because other cells rebind ``f`` to a file handle.
    """
    start_time = time.time()
    if probs is None:
        probs = np.ones_like(samples, dtype=float)

    assert probs.size == samples.size

    res = {'chat':[], 'errs':[], 'A_norm':[], 'MTMinv_norm':[], 'runtime':[]}

    # Loop-invariant pieces.
    unobs_cols = np.delete(np.arange(M.shape[1]), obs_set)
    M_obs = M[:, obs_set]
    f_true = M @ ctrue

    for n_train in range(nstart, samples.size + 1):
        T = samples[:n_train]
        MTM = M[np.ix_(T, obs_set)]
        MTU = M[np.ix_(T, unobs_cols)]
        probsi = np.sqrt(probs[:n_train] * n_train)

        y = M[T, :] @ ctrue
        sol = np.linalg.lstsq(MTM/probsi.reshape(-1, 1), y/probsi, rcond=None)
        chat = sol[0]

        res['chat'].append(chat)
        res['errs'].append(np.linalg.norm(f_true - M_obs @ chat))

        # Compute both diagnostics before appending either one, so a failure
        # part-way through cannot leave the two lists with different lengths.
        try:
            Minv = pinv(MTM)
            minv_norm = norm(Minv, ord=2)
            a_norm = norm(Minv @ MTU, ord=2)
        except np.linalg.LinAlgError:
            minv_norm = a_norm = np.nan
        res['MTMinv_norm'].append(minv_norm)
        res['A_norm'].append(a_norm)

        res['runtime'].append(time.time() - start_time)
    return res


def run_simulation_nodes(ntotal, nstart=5, snap_to_grid=False):
    """Fit the modeled coefficients on Chebyshev points of growing count.

    For every node count ``n = nstart, ..., ntotal - 1`` the ``n`` Chebyshev
    points of the second kind are generated afresh, the design matrix is
    evaluated at those points, and the columns in ``obs_set`` are fitted by
    least squares against the values of the true function there.

    Parameters
    ----------
    ntotal : int
        Exclusive upper bound on the node count.  NOTE(review): unlike
        ``run_simulation`` the design with exactly ``ntotal`` nodes is not
        evaluated; the original iteration count is kept so the length of the
        returned lists does not change.
    nstart : int
        First node count.
    snap_to_grid : bool
        If true, each node is replaced by the nearest point of an equispaced
        grid on [-1, 1] with ``int(1 / (1 - cos(pi / n)) + 1)`` points, where
        ``n`` is the current node count.

    Returns
    -------
    dict of lists, one entry per node count:
        'chat'        full-length coefficient vector, zero outside ``obs_set``,
        'errs'        2-norm over the full grid of ``M @ ctrue - M @ chat``
                      (the same quantity ``run_simulation`` reports),
        'MTMinv_norm' spectral norm of the pseudo-inverse of the modeled
                      block, NaN if it cannot be computed,
        'A_norm'      spectral norm of that pseudo-inverse times the unmodeled
                      block, NaN if it cannot be computed,
        'runtime'     seconds elapsed since the call started.

    Notes
    -----
    Reads the module-level ``M``, ``nk``, ``obs_set`` and ``ctrue``.
    """
    start_time = time.time()
    res = {'chat':[], 'errs':[], 'A_norm':[], 'MTMinv_norm':[], 'runtime':[]}

    # Loop-invariant pieces.
    unobs_cols = np.delete(np.arange(nk + 1), obs_set)
    f_true = M @ ctrue

    for npts in range(nstart, ntotal):
        Tx = cheb.chebpts2(npts)
        if snap_to_grid:
            # Grid resolution follows the current node count; the original
            # loop kept using `nstart`, so the grid never refined.
            N = int(1 / (1-np.cos(np.pi / npts)) + 1)
            grid = np.linspace(-1, 1, N)
            Tx = np.array([grid[np.argmin(np.abs(grid - x))] for x in Tx])

        M_ = cheb.chebvander(Tx, deg=nk)
        MTM = M_[:, obs_set]
        MTU = M_[:, unobs_cols]

        y = M_ @ ctrue
        sol = np.linalg.lstsq(MTM, y, rcond=None)
        chat = np.zeros_like(ctrue, dtype=float)
        chat[obs_set] = sol[0]
        res['chat'].append(chat)
        res['errs'].append(np.linalg.norm(f_true - M @ chat))

        # Compute both diagnostics before appending either one, so a failure
        # part-way through cannot leave the two lists with different lengths.
        try:
            Minv = pinv(MTM)
            minv_norm = norm(Minv, ord=2)
            a_norm = norm(Minv @ MTU, ord=2)
        except np.linalg.LinAlgError:
            minv_norm = a_norm = np.nan
        res['MTMinv_norm'].append(minv_norm)
        res['A_norm'].append(a_norm)

        res['runtime'].append(time.time() - start_time)
    return res

import numpy as np
from numpy.linalg import norm, pinv
import numpy.polynomial.chebyshev as cheb
    

In [None]:
def run_simulation_nodes_incremental(ntotal, nstart=5):
    """Fit the modeled coefficients on growing prefixes of a Chebyshev-based sample.

    The ``ntotal`` Chebyshev points of the second kind are snapped to the
    nearest point of an equispaced grid on [-1, 1] with
    ``int(1 / (1 - cos(pi / ntotal)) + 1)`` points.  ``ntotal`` locations are
    then drawn from the snapped points *with replacement* using a fixed seed
    (19), and the first ``n`` of them are used as the training set for
    ``n = nstart, ..., ntotal``.

    Parameters
    ----------
    ntotal : int
        Number of Chebyshev points generated and size of the largest
        training set.
    nstart : int
        Size of the first training set.

    Returns
    -------
    res : dict of lists, one entry per training-set size
        'chat'        full-length coefficient vector, zero outside ``obs_set``,
        'errs'        2-norm over the full grid of ``M @ ctrue - M @ chat``
                      (the same quantity ``run_simulation`` reports),
        'MTMinv_norm' spectral norm of the pseudo-inverse of the modeled
                      block, NaN if it cannot be computed,
        'A_norm'      spectral norm of that pseudo-inverse times the unmodeled
                      block, NaN if it cannot be computed,
        'runtime'     seconds elapsed since the call started.
    sample : 1-D array
        The ``ntotal`` sampled locations, in the order they are added.

    Notes
    -----
    Reads the module-level ``M``, ``nk``, ``obs_set`` and ``ctrue``.
    """
    start_time = time.time()
    pts = cheb.chebpts2(ntotal)

    # Snap the nodes to an equispaced grid.
    N = int(1 / (1-np.cos(np.pi / ntotal)) + 1)
    grid = np.linspace(-1, 1, N)
    chebyshev_grid = np.array([grid[np.argmin(np.abs(grid - x))] for x in pts])

    random_state = np.random.RandomState(19)
    sample = random_state.choice(chebyshev_grid, ntotal, replace=True)

    res = {'chat':[], 'errs':[], 'A_norm':[], 'MTMinv_norm':[], 'runtime':[]}

    # Loop-invariant pieces.
    unobs_cols = np.delete(np.arange(nk + 1), obs_set)
    f_true = M @ ctrue

    for n_train in range(nstart, ntotal + 1):
        Tx = sample[:n_train]
        M_ = cheb.chebvander(Tx, deg=nk)
        MTM = M_[:, obs_set]
        MTU = M_[:, unobs_cols]

        y = M_ @ ctrue
        sol = np.linalg.lstsq(MTM, y, rcond=None)
        chat = np.zeros_like(ctrue, dtype=float)
        chat[obs_set] = sol[0]
        res['chat'].append(chat)
        res['errs'].append(np.linalg.norm(f_true - M @ chat))

        # Compute both diagnostics before appending either one, so a failure
        # part-way through cannot leave the two lists with different lengths.
        try:
            Minv = pinv(MTM)
            minv_norm = norm(Minv, ord=2)
            a_norm = norm(Minv @ MTU, ord=2)
        except np.linalg.LinAlgError:
            minv_norm = a_norm = np.nan
        res['MTMinv_norm'].append(minv_norm)
        res['A_norm'].append(a_norm)

        res['runtime'].append(time.time() - start_time)
    return res, sample

In [None]:
# Modeled block of the design matrix: the columns for degrees 0..k.
# Written in terms of k (instead of a literal 101) so it follows the setup cell.
M_modeled = M[:, :k + 1]

In [None]:
# Chebyshev nodes: one run, wrapped in a single-element list so it has the
# same list-of-runs layout as the repeated sampling methods.
output = run_simulation_nodes_incremental(max_samples, nstart=nstart)
cheb_run, points_cheb = output[0], output[1]
RESULTS_CHEB = [cheb_run]


In [None]:
from optimal_design_sub_mod import run_simulation_greedy_sub_mod
# Greedy selection with the "A" option of the external helper
# (presumably the A-optimality criterion -- confirm in optimal_design_sub_mod).
output = run_simulation_greedy_sub_mod(
    M_modeled, f, obs_set, ctrue, nstart, max_samples, "A"
)
od_run_A, points_od_sub_mod_A = output[0], output[1]
RESULTS_OD_SUB_MOD_A = [od_run_A]

In [None]:
# Greedy selection with the "V" option of the external helper
# (presumably the V-optimality criterion -- confirm in optimal_design_sub_mod).
output = run_simulation_greedy_sub_mod(
    M_modeled, f, obs_set, ctrue, nstart, max_samples, "V"
)
od_run_V, points_od_sub_mod_V = output[0], output[1]
RESULTS_OD_SUB_MOD_V = [od_run_V]


In [None]:
# Number of independently seeded repetitions run for each randomized sampling
# method (uniform random and leverage score) in the cells below.
nsims = 10

In [None]:
# Uniform random sampling without replacement: one run per seed 45, 46, ...
RESULTS_RAND = []
for j in tqdm(range(nsims), total=nsims):
    rand_state = np.random.RandomState(45 + j)
    shuffled_rows = np.arange(X.size)
    rand_state.shuffle(shuffled_rows)
    samples = shuffled_rows[:max_samples]
    run = run_simulation(samples, nstart=nstart)
    RESULTS_RAND.append(run)

# Column 1 of M evaluated at the sampled rows of the *last* repetition.
points_rand = M[samples, 1]

In [None]:
# Leverage scores of the modeled block: squared row norms of the Q factor.
Qs, _ = np.linalg.qr(M[:, obs_set])
poly_ls_probs_s = np.linalg.norm(Qs, axis=1)**2.
print(poly_ls_probs_s.max(), poly_ls_probs_s.min(), poly_ls_probs_s.sum())
# Normalise the scores into a sampling distribution over the grid rows.
poly_ls_probs_s /= poly_ls_probs_s.sum()

RESULTS_LS = []
RESULTS_LS_UNWEIGHTED = []
for j in tqdm(range(nsims), total=nsims):
    rand_state = np.random.RandomState(45 + j)
    # Rows drawn (with replacement) according to the leverage-score distribution.
    poly_ls_samples_s = rand_state.choice(X.size, max_samples, p=poly_ls_probs_s)
    poly_ls_probs_s_on_samples = poly_ls_probs_s[poly_ls_samples_s]

    # Same sample fitted twice: with and without the probability weighting.
    weighted_run = run_simulation(
        poly_ls_samples_s, poly_ls_probs_s_on_samples, nstart=nstart
    )
    RESULTS_LS.append(weighted_run)
    unweighted_run = run_simulation(poly_ls_samples_s, nstart=nstart)
    RESULTS_LS_UNWEIGHTED.append(unweighted_run)

# Column 1 of M evaluated at the sampled rows of the *last* repetition.
points_ls = M[poly_ls_samples_s, 1]

In [None]:
import pickle

# Everything needed to redo the plots for this configuration later.
results = {
    'ctrue': ctrue,
    'Random': RESULTS_RAND,
    'Leverage Score': RESULTS_LS,
    'Chebyshev Nodes': RESULTS_CHEB,
    'Greedy Sub Modular OD V': RESULTS_OD_SUB_MOD_V,
    'Greedy Sub Modular OD A': RESULTS_OD_SUB_MOD_A
}

# The handle is deliberately not called `f`: `f` is the array of true function
# values used by the simulation cells, and rebinding it here would break them
# when they are re-run.
with open(f"poly_regr_results_{test_name}.pkl", "wb") as fh:
    pickle.dump(results, fh)

In [None]:
import os
save = True

# Y-axis label for every metric the simulations record.
metric_labels = {
    'errs': r"Error, $\|\hat{f}_{\mathcal{T}} - f\|_2$",
    'A_norm': r"$\|A\|$",
    'MTMinv_norm': r"$\|M_{\mathcal{TM}}^\dagger\|$",
    'runtime': 'Time (s)',
}

keys = ['errs', 'MTMinv_norm', 'runtime']
for key in keys:
    ylabel = metric_labels[key]

    fig, ax = plt.subplots()
    # `runs` (not `results`) so the dict saved by the pickle cell is not overwritten.
    for runs, name in zip([RESULTS_RAND, RESULTS_LS, RESULTS_CHEB, RESULTS_OD_SUB_MOD_V, RESULTS_OD_SUB_MOD_A],
                          ['Random', 'Leverage Score', 'Chebyshev Nodes', 'Greedy Sub Modular OD V', 'Greedy Sub Modular OD A']):
        # One row per repetition, one column per training-set size.
        metric = np.array([res[key] for res in runs])
        if key != 'runtime':
            # NOTE(review): every non-runtime metric, including the
            # pseudo-inverse norm, is divided by the grid size -- confirm
            # this scaling is intended for metrics other than the error.
            metric = metric / X.size

        mean = metric.mean(axis=0)

        # Assumes every method produced one entry per size nstart..max_samples.
        ax.loglog(np.arange(nstart, max_samples+1), mean, label=name)

    ax.legend(title='Sampling Method', fontsize=11)
    ax.set_xlabel(r"Size of Training Set, $\mathcal{T}$", fontsize=15)
    ax.set_ylabel(ylabel, fontsize=15)
    if save:
        outdir = f"figures/{test_name}"
        os.makedirs(outdir, exist_ok=True)
        savename = f"{outdir}/{key.lower()}.png"
        # Save through the figure object rather than pyplot's current-figure state.
        fig.savefig(savename, dpi=250, format='png')

In [None]:
# chat's over time, regenerated from the saved results of every configuration
from matplotlib.animation import FuncAnimation, PillowWriter
import os

# Number of modeled coefficients; only these are animated.
n_modeled = len(obs_set)

for config in configs.keys():
    print(config)
    # pickle.load can execute arbitrary code: only open result files that this
    # notebook wrote itself.  The handle is not called `f` because `f` is the
    # array of true function values used by the simulation cells.
    with open(f"poly_regr_results_{config}.pkl", "rb") as fh:
        ALL_RESULTS = pickle.load(fh)

    # NOTE(review): these assignments replace the in-memory results and `ctrue`
    # of the current run; after the loop they hold the last configuration.
    RESULTS_RAND         = ALL_RESULTS['Random']
    ctrue                = ALL_RESULTS['ctrue']
    RESULTS_LS           = ALL_RESULTS['Leverage Score']
    RESULTS_CHEB         = ALL_RESULTS['Chebyshev Nodes']
    RESULTS_OD_SUB_MOD_V = ALL_RESULTS['Greedy Sub Modular OD V']
    RESULTS_OD_SUB_MOD_A = ALL_RESULTS['Greedy Sub Modular OD A']

    ctrue_observed = ctrue[:n_modeled]
    for runs, name in zip([RESULTS_RAND, RESULTS_LS, RESULTS_OD_SUB_MOD_A, RESULTS_OD_SUB_MOD_V], ['Random', 'Lev_score', 'OD_Sub_Mod_A', 'OD_Sub_Mod_V']):
        # First repetition only; one row per training-set size, restricted to
        # the modeled coefficients.
        data = np.array(runs[0]['chat'])[:, :n_modeled]

        n_frames, n_points = data.shape
        x = np.arange(n_points)
        fig, ax = plt.subplots(figsize=(6, 4))
        line_approx, = ax.plot([], [], lw=2, label=f"{name} sampling")
        line_ground_truth, = ax.plot([],[],lw=2, label="ground truth")

        ax.set_xlim(0, n_points - 1)
        ax.set_ylim(-25, 25)

        ax.set_title(f"{name} chat")

        ax.legend()

        def init():
            line_approx.set_data([], [])
            line_ground_truth.set_data([], [])
            return line_approx, line_ground_truth

        def update(frame):
            line_approx.set_data(x, data[frame])
            line_ground_truth.set_data(x, ctrue_observed)
            ax.set_title(f"{name} chat frame {frame+1}/{n_frames}")
            return line_approx, line_ground_truth

        ani = FuncAnimation(fig, update, frames=n_frames,
                            init_func=init, blit=False, interval=10)

        writer = PillowWriter(fps=30)
        outdir = f"figures/polynomial_regression/{config}"
        os.makedirs(outdir, exist_ok=True)
        ani.save(f"{outdir}/chat_evolution_{name.lower()}.gif", writer=writer)
        # One figure is created per (configuration, method) pair; release it
        # once the GIF is written so open figures do not pile up.
        plt.close(fig)

In [None]:
# chat's over time
from matplotlib.animation import FuncAnimation, PillowWriter
import os
# The modeled block has len(obs_set) coefficients.  Both the ground truth and
# the fitted coefficients are cut to that length so the two animated lines
# share the same x values (the original slice `[:100]` was one short).
n_modeled = len(obs_set)
ctrue_observed = ctrue[:n_modeled]
for runs, name in zip([RESULTS_RAND, RESULTS_LS, RESULTS_OD_SUB_MOD_A, RESULTS_OD_SUB_MOD_V], ['Random', 'Lev_score', 'OD_Sub_Mod_A', 'OD_Sub_Mod_V']):
    # First repetition only; one row per training-set size.
    data = np.array(runs[0]['chat'])[:, :n_modeled]

    n_frames, n_points = data.shape
    x = np.arange(n_points)

    fig, ax = plt.subplots(figsize=(6, 4))
    line_approx, = ax.plot([], [], lw=2, label=f"{name} sampling")
    line_ground_truth, = ax.plot([],[],lw=2, label="ground truth")

    ax.set_xlim(0, n_points - 1)
    ax.set_ylim(-1, 1)
    ax.set_title(f"{name} chat")

    ax.legend()

    def init():
        line_approx.set_data([], [])
        line_ground_truth.set_data([], [])
        return line_approx, line_ground_truth

    def update(frame):
        line_approx.set_data(x, data[frame])
        line_ground_truth.set_data(x, ctrue_observed)
        ax.set_title(f"{name} chat frame {frame+1}/{n_frames}")
        return line_approx, line_ground_truth

    ani = FuncAnimation(fig, update, frames=n_frames,
                        init_func=init, blit=False, interval=10)

    writer = PillowWriter(fps=30)
    outdir = f"figures/polynomial_regression/{name.lower()}"
    os.makedirs(outdir, exist_ok=True)
    ani.save(f"{outdir}/chat_evolution_{name.lower()}.gif", writer=writer)
    # Release the figure once its GIF has been written.
    plt.close(fig)

In [None]:
# colors = ['red', 'green', 'purple', 'orange','pink', 'black']
# methods = [RESULTS_RAND, RESULTS_LS, RESULTS_CHEB, RESULTS_OD_SUB_MOD_V, RESULTS_OD_SUB_MOD_A]
# names = ['Random', 'Leverage Score', 'Chebyshev Nodes', 'OD Sub Modular V', 'OD Sub Modular A']


# fig, axes = plt.subplots(len(methods), 1, figsize=(8, 15), sharex=True)
# point_idx = 195
# for ax, method, name, color in zip(axes, methods, names, colors):
#     ax.plot(X, M @ method[0]['chat'][point_idx], label=name, color=color, alpha=0.6)
#     ax.plot(X, M @ ctrue, label='True function', color='blue', alpha=0.8)
#     ax.set_title(f"{name} Sampling Induced Function vs. True Function at 200 Sampled Points")
#     ax.set_ylim(-25,25)
#     ax.set_xlim(-1,1)
#     ax.legend()

# plt.tight_layout()
# plt.savefig(f'figures/{test_name}/chat_at_{point_idx}.png')

# # clean up. add training data points in the plot. fix scale
# # do this at 50, 150 training points as well