In [110]:
import numpy as np
from matplotlib import pyplot as plt
from tqdm.autonotebook import tqdm, trange
import math

In [111]:
def generate_problem(m,n):
    rand = np.random.default_rng(seed=1)
    return rand.normal(size=(m,n)), np.zeros((n,))[np.newaxis].T

def gen_f(A):
    m, n = A.shape
    return lambda xk: -np.sum(np.log(1 - (A @ xk))) \
                      -np.sum(np.log(1 + xk)) \
                      -np.sum(np.log(1 - xk))

def gen_df(A):
    m, n = A.shape
    return lambda xk: np.sum(A / (1 - A @ xk), axis=0)[np.newaxis].T \
                     -1 / (1 + xk) \
                     +1 / (1 - xk)
    
def gen_hf(A):
    m, n = A.shape
    def hf(xk):
        # The sum of outer products of rows is equivalent to matrix multiplication
        # (i.e, the sum of the outer product of column i of A and row i of B)
        # Therefore, A^TA can be written as the sum of the outer products of its rows with themselves
        
        # The numerator of the fractional part of the sum in the Hessian formula is exactly the outer
        # product of the rows of A. However, the division makes this a bit more complicated. 
        
        # However, it is exactly this division that makes the hessian function so slow. If we can optimize
        # this to one matrix-wide division and a matrix multiplication, it should be must faster.
        
        # If we take a copy of A, name it B and divide it each row j by (1-A[j] @ xk), then perform 
        # B^TA, that should give the same output. 
        
        # Get the denominators into a vector
        denom = 1 - A @ xk
        # Divide each row of A by the denominators. Because of numpy behaviour, dividing an m x n 
        # matrix by an m x 1 vector has the effect of dividing each row in the matrix by the 
        # corresponding vector element
        B = A.copy() / denom
        # Perform the sum of outer products by converting to the corresponding matrix multiply
        return B.T @ A
    # return lambda xk: sum_func(m, lambda j: np.outer(A[j],A[j]) / ((1 - A[j] @ xk) ** 2)) \
    #                  +np.diag(1 / ((1 + xk) ** 2).flatten()) \
    #                  +np.diag(1 / ((1 - xk) ** 2).flatten())

In [113]:
def gen_stepsize(A, sigma, beta, f, df):
    def helper(dk, xk):
        s = 1
        alpha = s 
        while np.any(A @ (xk + alpha * dk) >= 1) or \
        np.any(np.abs(xk + alpha * dk) >= 1) or \
        f(xk + alpha * dk) - f(xk) >= sigma * alpha * df(xk).T @ dk:
            alpha *= beta
        return alpha
    return helper
epsilon = 1e-3
stopcond = lambda xk, df: np.linalg.norm(df(xk),ord=2) <= epsilon

sigma = 1/10
beta = 1/2
m=20000
n=10000

In [114]:

# Run each stepsize, stopcond pair, storing data necessary for plotting
np.seterr(all='raise')
plotdata_steepest = {}
k = 0
A, x0 = generate_problem(m, n)
f = gen_f(A)
df = gen_df(A)
hf = gen_hf(A)
stepsize = gen_stepsize(A, sigma, beta, f, df)
curalg = (sigma, beta)
plotdata_steepest.update({curalg: []})
xk = x0
while True:
    plotdata_steepest[curalg].append(f(xk))
    grad = df(xk)
    dk = -np.linalg.inv(hf(xk)) @ grad
    #dk = -grad
    step = stepsize(dk, xk)
    if stopcond(xk, df):
        break
    print(f"{k} - {step} - {np.linalg.norm(df(xk),ord=2)}", end="\r")
    k += 1
    xk = xk + step * dk

ValueError: operands could not be broadcast together with shapes (20000,1) (10000,1) 

In [None]:
colors = ['r', 'g', 'b', 'y']
def data_graph(plotname, sigma):
    fig, axs = plt.subplots(4,4, figsize=(10,10))
    for i, m in enumerate(ms):
        for j, n in enumerate(ns):
            # select xstar as average over all values of this 
            xstar = 0
            for beta in betas:
                for sigma in sigmas:
                    alg = (sigma, beta, m, n)
                    xstar += plotdata_steepest[alg][-1]
            xstar /= 16
            for k, beta in enumerate(betas):
                alg = (sigma, beta, m, n)
                y = abs(np.array(plotdata_steepest[alg]) - xstar)
                x = range(len(plotdata_steepest[alg]))
                axs[i,j].scatter(x, y, c=colors[k], label=f"beta={beta}",s=plt.rcParams['lines.markersize'] ** 2 * 2, marker = '.', alpha=0.3)
            if j == 0:
                axs[i,j].set(ylabel=f"m={m}")
            if i == 3:
                axs[i,j].set(xlabel=f"n={n}")
            axs[i,j].set_yscale("log")
    handles, labels = fig.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    fig.legend(by_label.values(), by_label.keys(), loc="upper right")
    fig.suptitle(plotname)
for sigma in sigmas:
    data_graph(f"Error vs iteration, sigma={sigma}", sigma)