# Here we compare the optimal potentials obtained from different algorithms, namely Sinkhorn, damped Newton and semi-dual damped Newton, with the optimal potentials obtained from log-domain Sinkhorn.

In [None]:
from __future__ import division
import os
import numpy as np
import time
import matplotlib.pyplot as plt
import scipy as scp
import pylab as pyl
import warnings
warnings.filterwarnings('ignore')
np.random.seed(1234)

%matplotlib inline 
%load_ext autoreload                                                                                                                                                                                                
%autoreload 

In [None]:
%load_ext autoreload
%autoreload 2
    

In [None]:
# Folders that receive the figures saved by the plotting cells.
relative_path_to_new_folder = "../Images"
# A single makedirs call with exist_ok=True creates the parent "../Images" as
# well, is safe to re-run, and avoids the check-then-create race of a separate
# os.path.isdir() test.
os.makedirs('../Images/Correctness_images', exist_ok=True)


In [None]:
"""To compute distance matrix"""
def distmat(x,y):
    """Return the squared Euclidean distances between the columns of x and y.

    For 2-D arrays x of shape (d, n) and y of shape (d, m), entry (i, j) of
    the (n, m) result is ||x[:, i]||^2 + ||y[:, j]||^2 - 2 <x[:, i], y[:, j]>.
    """
    sq_norms_x = (x**2).sum(axis=0)
    sq_norms_y = (y**2).sum(axis=0)
    cross_terms = x.T.dot(y)
    return sq_norms_x[:, np.newaxis] + sq_norms_y[np.newaxis, :] - 2*cross_terms

"""To Normalise a vector"""
def normalize(a):
    """Return a divided by the sum of its entries, so the result sums to one."""
    total = np.sum( a )
    return a/total

"""To Compute P"""
def GetP(u,K,v):
    """Scale the rows of K by u and its columns by v.

    For 1-D u and v this is diag(u) K diag(v), i.e. entry (i, j) equals
    u[i] * K[i, j] * v[j].
    """
    row_scaling = u[:, np.newaxis]
    col_scaling = v[np.newaxis, :]
    return row_scaling*K*col_scaling

def plotp(x, col,plt, scale=200, edgecolors="k"):
    """Scatter the points stored as columns of x and return the scatter handle.

    x[0, :] and x[1, :] are used as the two plot coordinates, col as the
    colour argument, and plt is the object whose scatter method is called.
    """
    first_coord, second_coord = x[0,:], x[1,:]
    options = dict( s=scale, edgecolors=edgecolors, c=col, cmap='plasma', linewidths=2 )
    return plt.scatter( first_coord, second_coord, **options )

In [None]:
def generate_data(N):
    """
    Draw the two random point clouds used in the experiments.

    N is a list of the size of the data on x and y.
    Returns x of shape (2, N[0]) with entries uniform in [-0.5, 0.5), and
    y of shape (2, N[1]) with points at a uniform angle and a radius
    uniform in [0.8, 1.0).
    """
    size_x, size_y = N[0], N[1]
    # The three draws are kept in this order so a fixed seed gives the same data.
    x = np.random.rand( 2,size_x ) - 0.5
    angles = 2*np.pi*np.random.rand( 1,size_y )
    radii = 0.8 + .2*np.random.rand( 1,size_y )
    y = np.vstack( ( radii*np.cos( angles ), radii*np.sin( angles ) ) )
    return x,y

In [None]:
import computational_OT

### Make potentials independent of any shift by constants

In [None]:
def make_unique_potentials(f, g):
    """Remove the additive-constant ambiguity of a pair of potentials.

    Both arrays are flattened, then a common constant is subtracted from f
    and added to g so that the returned vectors satisfy sum(f) == sum(g).
    Returns the tuple (f_new, g_new) of 1-D arrays.
    """
    # Drop any extra singleton dimensions
    flat_f = f.flatten()
    flat_g = g.flatten()
    # Constant shift that balances the two sums
    shift = (flat_f.sum() - flat_g.sum())/(flat_f.size + flat_g.size)
    return (flat_f - shift, flat_g + shift)

In [None]:
# Problem sizes: N[0] points in the cloud x and N[1] points in the cloud y.
N = [ 500,600 ]
x,y = generate_data(N)
# Values of the entropic regularization parameter, from largest to smallest.
epsilons  = [ 1.0, 0.5, 0.1,  0.05, 0.03 ]

## Entropy regularized formulation

The primal entropy regularized formulation of OT is given by:
$$
OT_{\varepsilon}(\alpha,\beta) = \min_{\pi \in \mathcal{U}(\alpha,\beta)} \langle C,\pi \rangle +\varepsilon KL(\pi\|\alpha \otimes \beta)\ ,
$$
where $KL(\pi\|\alpha \otimes \beta)$ is the KL-divergence and $\mathcal{U}(\alpha,\beta)=\{\pi: \pi\mathbb{1}=\alpha,\ \pi^{T}\mathbb{1}=\beta\}$.

## Sinkhorn  
The optimal coupling $\pi^{*}$ has the following form:
$$
\pi^{*} = \mathrm{diag}(\alpha \odot u)\, K\, \mathrm{diag}(v\odot \beta)\ ,
$$
and we know that $\pi^{*}\mathbb{1}=\alpha$ and $(\pi^{*})^{T}\mathbb{1}=\beta$.
###
Therefore, the Sinkhorn updates are given by the following alternating projections (the divisions are entrywise)
$$
u^{t+1}  \leftarrow \frac{1}{K(v^{t}\odot \beta)}\ , \\
v^{t+1}  \leftarrow \frac{1}{K^{T}(u^{t+1}\odot \alpha)}\ , 
$$
where 
$K = e^{-\frac{C}{\varepsilon}}\in M_{n\times m}(\mathbb{R}),\ \alpha \in \mathbb{R}^{n},\ \beta \in \mathbb{R}^{m}\ ,\ u\in \mathbb{R}^{n},\ v\in \mathbb{R}^{m}\ \text{and} \ (u^{0},v^{0})=(u,v)\ .$



## Entropy regularized dual-formulation
The dual formulation of the entropy regularized OT is given by:
$$
OT_{\varepsilon}(\alpha,\beta) = \max_{f\in \mathbb{R}^{n}, g\in\mathbb{R}^{m}} \langle f, \alpha \rangle + \langle g, \beta \rangle - \varepsilon\left(\langle\alpha \otimes \beta, e^{\frac{f}{\varepsilon}}\odot K \odot e^{\frac{g}{\varepsilon}}  \rangle-1\right)\ ,
$$
where
$$
\alpha \in \mathcal{M}_{1}(\mathcal{X}),\ \beta \in \mathcal{M}_{1}(\mathcal{Y}),\ \varepsilon>0,\ f\in\mathbb{R}^{n},\ g\in \mathbb{R}^{m}\ .
$$

## log-domain Sinkhorn
The exp-log stabilized updates of the Sinkhorn algorithm are as follows:
$$
m_{i}(g)\leftarrow \min_{j}(C_{ij}-g_{j}^{(t)}),\ \forall\  i = 1,\dots,n\ ,
$$
$$
f^{(t+1)}_{i}\leftarrow -\varepsilon \log\left(\sum_{j=1}^{m}\exp\left(\frac{-\left(C_{ij}-g_{j}^{(t)}-m_{i}(g)\right)}{\varepsilon}\right)\beta_{j}\right)+m_{i}(g),\ \forall\  i=1,\dots,n\ ,
$$
$$
m_{j}(f)\leftarrow \min_{i}(C_{ij}-f_{i}^{(t+1)}),\ \forall\   j=1,\dots,m\ ,
$$
$$
g^{(t+1)}_{j}\leftarrow -\varepsilon \log\left(\sum_{i=1}^{n}\exp\left(\frac{-\left(C_{ij}-f_{i}^{(t+1)}-m_{j}(f)\right)}{\varepsilon}\right)\alpha_{i}\right)+m_{j}(f),\ \forall\  j=1,\dots,m\ ,
$$
where 
$K=e^{-C/\varepsilon} \in M_{n \times m}(\mathbb{R})$, $\varepsilon >0$, $\alpha \in \mathbb{R}^{n}$, $\beta \in \mathbb{R}^{m}$, $f \in \mathbb{R}^{n}$, $g \in \mathbb{R}^{m}$, and the iteration is initialized with $(f^{(0)},g^{(0)})=(f,g)$.


## I. Log-domain Sinkhorn


In [None]:
# Log domain Sinkhorn: run the solver once per epsilon and keep its outputs,
# the elapsed time and the matrix built by GetP.
print("Log domain Sinkhorn.... ")
results_logSinkhorn = []
times_logSinkhorn   = []
logsinkhornP        = []
# Uniform weights on the two point clouds
a = normalize(np.ones(N[0]))
b = normalize(np.ones(N[1]))

#Cost matrix
C = distmat(x,y)
for eps in epsilons:
  print( "Sinkhorn for epsilon = "+str(eps)+":" )
  print( "Doing for (",N[0],N[1],")." )
  print( " |- Iterating" )
  start = time.time()
  logsinkhorn = computational_OT.Log_domainSinkhorn(a,b,C,eps)
  output = logsinkhorn.update( niter = 500 )
  results_logSinkhorn.append( output )
  end = time.time()
  # time.time() is in seconds; store milliseconds. The previous factor 1e-3
  # produced kiloseconds, unlike the 1e3 factor used by the semi-dual cells.
  times_logSinkhorn.append( 1e3*(end-start) )
  # NOTE(review): the potentials are passed to GetP divided by eps but without
  # np.exp, whereas the damped Newton cells pass np.exp(potential/eps).
  # Confirm which convention is intended before relying on logsinkhornP.
  logsinkhornP.append(GetP(output['potential_f']/eps, np.exp(-C/eps),output['potential_g']/eps))

In [None]:
# Error curves of log-domain Sinkhorn along the iterations, one per epsilon.
plt.figure( figsize = (20,7) )
plt.subplot(2,1,1)
# Raw strings keep the LaTeX backslashes without relying on the invalid "\e"
# escape. The second marginal error is measured on P^T 1 (P1 and b have
# different lengths), matching the title used in the damped Newton plots.
plt.title( r"$||P1 -a||_1+||P^T 1 -b||_1$" )
for eps_i, res_i in zip( epsilons, results_logSinkhorn ):
  error = np.asarray( res_i['error'] )
  plt.plot( error, label = r'log-sinkhorn for $\epsilon=$'+ str(eps_i) , linewidth = 2 )
plt.yscale( 'log' )
plt.legend()
plt.xlabel("Iterations")
plt.ylabel("Error in log-scale")
plt.savefig("../Images/Correctness_images/Error_plot_log_domain_Sinkhorn.png")
plt.show()


In [None]:
# Potentials stored by log-domain Sinkhorn, one entry per epsilon.
flogsinkhorn = [ res['potential_f'] for res in results_logSinkhorn ]
glogsinkhorn = [ res['potential_g'] for res in results_logSinkhorn ]

## II. Sinkhorn




In [None]:
# Sinkhorn: run the solver once per epsilon and keep its outputs, the elapsed
# time and the matrix built by GetP.
print("Sinkhorn.... ")
SinkhornP = []
results_Sinkhorn = []
times_Sinkhorn = []
Pmatrix_dist_linVSsinkhorn = []
#Cost matrix
C = distmat(x,y)
# a and b: uniform weights, reshaped into column vectors of shape (N[0],1) and (N[1],1)
a = normalize(np.ones(N[0]))
a = a.reshape(a.shape[0],-1)
b = normalize(np.ones(N[1]))
b = b.reshape(b.shape[0],-1)

for eps in epsilons:
  #Kernel
  K = np.exp(-C/eps)
  print("Doing for (",N[0],N[1],").")
  print( " |- Iterating")
  # Starting point: u and v are the same array objects as a and b (no copy is made)
  u = a
  v = b
  start = time.time()
  Optimizer = computational_OT.Sinkhorn(K,a,b,u,v,eps)
  out = Optimizer._update()
  results_Sinkhorn.append(out)
  end = time.time()
  # Elapsed wall-clock time in seconds
  times_Sinkhorn.append(end-start)
  print( " |- Computing P")
  print( "" )
  # NOTE(review): the potentials are divided by eps but not exponentiated here,
  # unlike the damped Newton cells which pass np.exp(potential/eps) — confirm.
  SinkhornP.append(GetP(out['potential_f']/eps,K,out['potential_g']/eps))
  

In [None]:
# Error curves of Sinkhorn along the iterations, one per epsilon.
plt.figure( figsize = (20,7) )

plt.subplot(2,1,1)
# Raw strings keep the LaTeX backslashes without relying on the invalid "\e"
# escape. The second marginal error is measured on P^T 1 (P1 and b have
# different lengths), matching the title used in the damped Newton plots.
plt.title( r"$||P1 -a||_1+||P^T 1 -b||_1$" )
for eps_i, res_i in zip( epsilons, results_Sinkhorn ):
  # Total error is the sum of the two recorded marginal errors
  error = np.asarray( res_i['error_a'] )+np.asarray( res_i['error_b'] )
  plt.plot( error,label = r'Sinkhorn for $\epsilon=$'+ str(eps_i), linewidth = 2 )
plt.yscale( 'log' )
plt.legend()
plt.xlabel("Iterations")
plt.ylabel("Error in log-scale")
plt.savefig("../Images/Correctness_images/Error_plot_Sinkhorn.png")
plt.show()

In [None]:
# Potentials stored by Sinkhorn, one entry per epsilon.
fsinkhorn = [ res['potential_f'] for res in results_Sinkhorn ]
gsinkhorn = [ res['potential_g'] for res in results_Sinkhorn ]

In [None]:
# Inspect the output formats of the two solvers: dictionary keys first,
# then the shape of the first stored potential f.
for _title, _result in ( ("Sinkhorn keys:", out), ("Sinkhorn log domain keys", output) ):
    print( _title )
    print( _result.keys() )
# TODO: Make same keys
print( "")
for _potentials in ( fsinkhorn, flogsinkhorn ):
    print( _potentials[0].shape )
# TODO: Make outputs have same formats

##### Reality checks

In [None]:
# Reality checks: for every epsilon, compare the matrices built by GetP and the
# sums f_i + g_j obtained from Sinkhorn and from log-domain Sinkhorn.
for i in range(len(results_Sinkhorn)):
    print( f'''i : {i}''')
    # Couplings, compared in Frobenius norm
    P_logSK =   logsinkhornP[i]
    P_SK    = SinkhornP[i]
    error   = np.linalg.norm(P_SK-P_logSK, ord='fro')
    print( "Error of couplings : ", error )
    # Sums of potentials f_i + g_j; these do not change when a constant is
    # added to f and subtracted from g, so no normalization is needed here.
    sum_SK    = fsinkhorn[i][:,None] + gsinkhorn[i][None,:]
    sum_logSK = flogsinkhorn[i][:,None] + glogsinkhorn[i][None,:]
    print(sum_SK.shape,sum_logSK.shape)
    print(np.mean(sum_SK),np.mean(sum_logSK))
    # Drop singleton axes so both arrays can be subtracted
    # (presumably the two solvers return potentials of different shapes — confirm)
    sum_SK    = sum_SK.squeeze()
    sum_logSK = sum_logSK.squeeze()
    print(sum_SK.shape,sum_logSK.shape)
    # NOTE(review): for a 2-D array, ord=np.inf is the maximum absolute row sum,
    # not the largest absolute entry — confirm this is the intended measure.
    error     = np.linalg.norm(sum_SK-sum_logSK, ord=np.inf)
    print( "Error of sums of potentials : ", error )
    print( "")

##### Comparing the final potentials of Sinkhorn and log-domain Sinkhorn

In [None]:
# Normalize the potentials of both solvers and measure how far apart they are.
unique_logSK, unique_SK = [], []
errors_f, errors_g = [], []
for i in range(len(results_Sinkhorn)):
    print( f'''i : {i}''')
    pair_logSK = make_unique_potentials( flogsinkhorn[i], glogsinkhorn[i] )
    pair_SK = make_unique_potentials( fsinkhorn[i], gsinkhorn[i] )
    unique_logSK.append( pair_logSK )
    unique_SK.append( pair_SK )
    print(pair_logSK[0].shape,pair_SK[0].shape)
    # Euclidean distance between the normalized potentials
    err_f = np.linalg.norm( pair_logSK[0] - pair_SK[0] )
    err_g = np.linalg.norm( pair_logSK[1] - pair_SK[1] )
    errors_f.append( err_f )
    errors_g.append( err_g )
    print("norm of err_f: ", err_f )
    print("norm of err_g: ", err_g )
    print("")

In [None]:
# Distance between the normalized potentials of Sinkhorn and log-domain
# Sinkhorn, plotted in reversed order so that epsilon increases left to right.
plt.figure( figsize = (20,7) )
plt.title( "Difference between potentials." )
plt.plot(list(range(len(epsilons))), np.array(errors_f[::-1]) + np.array(errors_g[::-1]), label = 'difference for potentials (f,g) between log-domain Sinkhorn and Sinkhorn ', linewidth = 2, marker= 'o' )
# Raw string: "\e" is not a valid escape sequence in an ordinary literal.
plt.xlabel(r"$\epsilon$")
plt.ylabel( "difference in log-scale" )
plt.legend()
plt.yscale( 'log' )
plt.xticks(list(range(len(epsilons))),epsilons[::-1])
plt.savefig("../Images/Correctness_images/Correctness_comparison_plot_Sinkhorn.png")
plt.show()


## III. Damped Newton using direct inversion method without preconditioning 


### 
The Hessian of the dual formulation of the entropy regularized OT is given by 
$$
\nabla^{2}Q_{\varepsilon}(f,g)=\frac{-1}{\varepsilon}
\begin{pmatrix}
\Delta(\alpha) && \pi_{\varepsilon}\\
\pi^{T}_{\varepsilon} && \Delta(\beta) 
\end{pmatrix}
\ , 
$$ 
where $\pi\mathbb{1}_{m} = \alpha,\ \pi^{T}\mathbb{1}_{n}=\beta,\ $ and $\Delta = diag: \mathbb{R}^{n} \rightarrow M_{n}(\mathbb{R})$ is the linear operator mapping a vector  to a diagonal matrix  containing  this vector.
#####

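This implies 
$$
\begin{pmatrix}
\Delta(\alpha) && \pi_{\varepsilon}\\
\pi^{T}_{\varepsilon} && \Delta(\beta) 
\end{pmatrix}
\begin{pmatrix}
\mathbb{1}_{n}\\
-\mathbb{1}_{m}
\end{pmatrix} = 0\ ,
$$
since $\Delta(\alpha)\mathbb{1}_{n}-\pi_{\varepsilon}\mathbb{1}_{m}=\alpha-\alpha=0$ and $\pi_{\varepsilon}^{T}\mathbb{1}_{n}-\Delta(\beta)\mathbb{1}_{m}=\beta-\beta=0$, that is,
$$
\begin{pmatrix}
\mathbb{1}_{n}\\
-\mathbb{1}_{m}
\end{pmatrix}\in Ker(\nabla^{2}Q_{\epsilon}(f,g))\ .
$$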
Hence, $\nabla^{2}Q_{\varepsilon}(f,g)$ is singular. Therefore, on regularization we have the following Hessian
$
H_{reg} := \nabla^{2}Q_{\varepsilon}(f,g)+\lambda cc^{T}\ ,
$ 
where $c= \begin{pmatrix}\frac{\mathbb{1}}{\sqrt{n+m}}\\-\frac{\mathbb{1}}{\sqrt{n+m}}\end{pmatrix}\in M_{(n+m),1}(\mathbb{R})$.
###
Now, at the $k^{th}$ iteration we solve
$\nabla^{2}Q_{\varepsilon}(f,g)p_{k} = \nabla Q_{\varepsilon}(f,g)$ to obtain the update direction $p_{k}$, and then use the Armijo condition to choose the step size $\alpha_{k}$, which gives the update
$$
(f,g) \leftarrow (f,g) + \alpha_{k} p_{k}\ .
$$





In [None]:
# Damped Newton with a direct solver: run the optimizer once per epsilon and
# keep its outputs, the elapsed time, the coupling and a rescaled Hessian.
# rho and c are handed to the optimizer
# (presumably the backtracking factor and the Armijo constant — confirm against computational_OT)
rho = 0.95
c = 0.05
DampedNewtonP = []
results_DampedNewton  = []
times_DampedNewton    = []
Hessians_DampedNewton = []
#Cost matrix
C = distmat(x,y)
# a and b: uniform weights, reshaped into column vectors of shape (N[0],1) and (N[1],1)
a = normalize(np.ones(N[0]))
a = a.reshape(a.shape[0],-1)
b = normalize(np.ones(N[1]))
b = b.reshape(b.shape[0],-1)
for eps in epsilons:
    # Line Search
    print("Damped Newton for epsilon="+str(eps)+":")   
    #Kernel
    K = np.exp(-C/eps)
    # Starting point: f and g are the same array objects as a and b (no copy is made)
    f,g = a,b
    print("Doing for (",N[0],N[1],").")
    print( " |- Iterating")  
    start = time.time()
    Optimizer = computational_OT.DampedNewton(K,a,b,f,g,eps,rho,c)
    out = Optimizer._update(maxiter=50)
    results_DampedNewton.append(out)
    end = time.time()
    # Elapsed wall-clock time in seconds
    times_DampedNewton.append(end-start)
    print( " |- Computing P")

    DampedNewtonP.append(GetP(np.exp(out['potential_f']/eps),K,np.exp(out['potential_g']/eps)))
    print( " |- Recording (unstabilized) Hessian \n")

    # Store -eps*Hessian rescaled symmetrically: entry (i,j) is divided by
    # sqrt(w_i*w_j), where w is the stacked weight vector (a, b).
    mat  = -eps*Optimizer.Hessian
    diag = 1/np.sqrt( np.vstack( (a,b) ) ).flatten()
    mat = diag[:,None]*mat*diag[None,:]
    Hessians_DampedNewton.append( mat )


In [None]:
# Error curves of damped Newton along the iterations, one per epsilon.
# (The placeholder title "$$" that was set and immediately overwritten is gone.)
plt.figure( figsize = (20,7) )
plt.title( r"$||P1 -a||_1+||P^T 1 -b||_1$" )
for eps_i, res_i in zip( epsilons, results_DampedNewton ):
  # Total error is the sum of the two recorded marginal errors
  error = np.asarray( res_i['error_a'] )+np.asarray( res_i['error_b'] )
  # Raw string: "\e" is not a valid escape sequence in an ordinary literal.
  plt.plot( error,label=r'Damped Newton for $\epsilon=$'+str(eps_i), linewidth = 2 )
plt.xlabel( "Number of iterations" )
plt.ylabel( "Error in log-scale" )
plt.legend()
plt.yscale( 'log' )
plt.savefig("../Images/Correctness_images/Error_plot_dampedNewton.png")
plt.show()
print( "\n Error plots can increase! The error is not the objective function!" )


##### Comparing the final potentials of log-domain Sinkhorn and damped Newton 

In [None]:
# Potentials stored by damped Newton, one entry per epsilon.
fdampednewton = [ res['potential_f'] for res in results_DampedNewton ]
gdampednewton = [ res['potential_g'] for res in results_DampedNewton ]

# Normalized potentials, comparable with those of log-domain Sinkhorn.
unique_dampednewton = [
    make_unique_potentials( f_i, g_i )
    for f_i, g_i in zip( fdampednewton, gdampednewton )
]
errors_f, errors_g = [], []
for i, pair_dn in enumerate( unique_dampednewton ):
    print( f'''i : {i}''')
    pair_ref = unique_logSK[i]
    # Euclidean distance to the log-domain Sinkhorn reference
    err_f = np.linalg.norm( pair_ref[0] - pair_dn[0] )
    err_g = np.linalg.norm( pair_ref[1] - pair_dn[1] )
    errors_f.append(err_f)
    errors_g.append(err_g)
    print( "norm of err_f: ", err_f )
    print( "norm of err_g: ", err_g )
    print( "")

In [None]:
# Distance between the normalized potentials of damped Newton and log-domain
# Sinkhorn, plotted in reversed order so that epsilon increases left to right.
plt.figure( figsize = (20,7) )
plt.title( "Difference between potentials." )
plt.plot(list(range(len(epsilons))), np.array(errors_f[::-1]) + np.array(errors_g[::-1]), label = 'difference for potentials (f,g) between log-domain Sinkhorn and damped Newton using direct solver', linewidth = 2, marker= 'o' )
# Raw string: "\e" is not a valid escape sequence in an ordinary literal.
plt.xlabel(r"$\epsilon$")
plt.ylabel( "difference in log-scale" )
plt.legend()
plt.yscale( 'log' )
plt.xticks(list(range(len(epsilons))),epsilons[::-1])
plt.savefig("../Images/Correctness_images/Correctness_comparison_plot_dampedNewton.png")
plt.show()


## IV. Damped Newton with Preconditioning
Here we proceed similarly to the damped Newton algorithm but with preconditioning. We consider $t$ eigenvalues $\lambda_{i}$ of the Hessian, with eigenvectors $y_{i}$, and form the following preconditioning matrix:
$$
P = \left(I_{n+m}-\sum_{i=1}^{t}\left(1 - \frac{1}{\sqrt{\lambda_{i}}}\right)y_{i}y_{i}^{T}\right)\ .
$$
Now, at the $k^{th}$ iteration we solve the following equation:
$$
(P\nabla^{2}Q_{\varepsilon}(f,g)P)(Pp_{k})=P\nabla Q_{\varepsilon}(f,g)\ ,
$$
using iterative inversion methods such as "Conjugate gradient" and "GMRES" to get the update direction $p_{k}$, following which we use the Armijo condition to obtain the step size $\alpha_{k}$.


In [None]:
def build_preconditioners( num_eigs,modified_Hessian, ansatz=True ):
    """Pick a null vector and `num_eigs` eigenvectors of a symmetric matrix.

    Parameters
    ----------
    num_eigs : int
        Number of preconditioning eigenvectors to return.
    modified_Hessian : square symmetric array
        Matrix that is diagonalized with np.linalg.eigh.
    ansatz : bool
        If True, the null vector is the normalized vector (1_n, -1_m) built
        from the notebook-level list N (N[0] = n, N[1] = m); if False, it is
        the eigenvector of the smallest eigenvalue.

    Returns
    -------
    (null_vector, precond_vectors), where precond_vectors is a list of
    exactly `num_eigs` eigenvectors. They alternate between the largest
    eigenvalues and the smallest ones after the first (index 0 is skipped);
    when `num_eigs` is odd the extra vector is taken from the large end.
    """
    # Diagonalize and order the eigenvectors by increasing eigenvalue
    eigenvalues, eigenvectors = np.linalg.eigh( modified_Hessian )
    sorting_indices = np.argsort( eigenvalues )
    eigenvectors = eigenvectors[:, sorting_indices]
    # Form null vector
    if ansatz:
        null_vector = np.hstack( (np.ones(N[0]), -np.ones(N[1])) )
        null_vector = null_vector/np.sqrt( N[0] + N[1] )
    else:
        null_vector = eigenvectors[:, 0]
    # Form other vectors
    m = eigenvectors.shape[1]
    half = num_eigs//2
    indices = []
    for i in range(half):
        indices.append(m-i-1)
        indices.append(i+1)
    # Parity test: the former `num_eigs//2 != 0` added a vector for even
    # counts as well and returned nothing for num_eigs == 1.
    if num_eigs % 2 != 0:
        indices.append(m-1-half)
    precond_vectors = [ eigenvectors[:, index] for index in indices ]
    return null_vector, precond_vectors

In [None]:
# Number of eigenvectors used to build the preconditioner.
num_eigs = 13
# Built from the rescaled Hessian recorded for the last epsilon of the damped
# Newton run; with ansatz=False the null vector is the eigenvector of the
# smallest eigenvalue rather than the analytic (1, -1) vector.
null_vector, precond_vectors = build_preconditioners( num_eigs, Hessians_DampedNewton[-1], ansatz=False )

In [None]:
# Damped Newton with preconditioning: run the optimizer once per epsilon and
# keep its outputs, the elapsed time, the coupling and the final modified Hessian.
# rho and c are handed to the optimizer
# (presumably the backtracking factor and the Armijo constant — confirm against computational_OT)
rho = 0.95
c = 0.05
# When False, each epsilon starts from the point reached for the previous one
reset_starting_point = True
final_modified_Hessians = []
# NOTE: this rebinds DampedNewtonP, discarding the couplings stored by the
# non-preconditioned damped Newton cell.
DampedNewtonP = []
results_DampedNewton_With_Preconditioner  = []
times_DampedNewton_With_Preconditioner    = []

f, g = None, None
# Cost matrix
C = distmat(x,y)    
# a and b: uniform weights, reshaped into column vectors of shape (N[0],1) and (N[1],1)
a = normalize( np.ones(N[0]) )
a = a.reshape( a.shape[0],-1 )
b = normalize( np.ones(N[1]) )
b = b.reshape( b.shape[0],-1 )

for eps in epsilons:
    # Line Search
    print( "Damped Newton for epsilon="+str(eps)+":" )    
    #Kernel
    K = np.exp(-C/eps)

    # Zero starting point, created only once (on the first epsilon).
    # NOTE(review): with reset_starting_point=True the same f and g objects are
    # reused for every epsilon; that is a true reset only if the optimizer
    # does not modify them in place — confirm.
    if (f is None) or (g is None): 
        f,g = 0*a,0*b
    print( "Doing for (",N[0],N[1],")." )
    print( " |- Iterating" )  
    start = time.time()
    Optimizer = computational_OT.DampedNewton_With_Preconditioner( K,a,b,f,g,eps,rho,c,null_vector,precond_vectors[:] )
    out = Optimizer._update( maxiter=50, iterative_inversion=30, version=None,debug=False,optType='cg' )
    results_DampedNewton_With_Preconditioner.append( out )
    end = time.time()
    # Elapsed wall-clock time in seconds
    times_DampedNewton_With_Preconditioner.append(end-start)
    print( " |- Computing P" )

    # Warm start: split the optimizer's final point into its f part (first
    # a.shape[0] entries) and its g part (the rest)
    if not reset_starting_point:
        f = Optimizer.x[:a.shape[0]]
        g = Optimizer.x[a.shape[0]:]
        # f = f.reshape( f.shape[0], -1)
        # g = g.reshape( g.shape[0], -1)

    DampedNewtonP.append( GetP(np.exp(out['potential_f']/eps),K,np.exp(out['potential_g']/eps)) )
    final_modified_Hessians.append( Optimizer.modified_Hessian )


In [None]:
# Error curves of preconditioned damped Newton along the iterations, one per epsilon.
# (The placeholder title "$$" that was set and immediately overwritten is gone.)
plt.figure( figsize = (20,7) )
plt.title( r"$||P1 -a||_1+||P^T 1 -b||_1$" )
for eps_i, res_i in zip( epsilons, results_DampedNewton_With_Preconditioner ):
  # Total error is the sum of the two recorded marginal errors
  error = np.asarray( res_i['error_a'] )+np.asarray( res_i['error_b'] )
  # Raw string: "\e" is not a valid escape sequence in an ordinary literal.
  plt.plot( error,label=r'Damped Newton for $\epsilon=$'+str(eps_i), linewidth = 2 )
plt.xlabel( "Number of iterations" )
plt.ylabel( "Error in log-scale" )
plt.legend()
plt.yscale( 'log' )
plt.savefig("../Images/Correctness_images/Error_plot_dampedNewton_with_preconditioning.png")
plt.show()
print( "\n Error plots can increase! The error is not the objective function!" )


##### Comparing the final potentials of log-domain Sinkhorn and damped Newton with preconditioning

In [None]:
# Potentials stored by preconditioned damped Newton, one entry per epsilon.
fdampednewtonwithprecond = [ res['potential_f'] for res in results_DampedNewton_With_Preconditioner ]
gdampednewtonwithprecond = [ res['potential_g'] for res in results_DampedNewton_With_Preconditioner ]
# Normalized potentials, comparable with those of log-domain Sinkhorn.
unique_dampednewtonwithprecond = [
    make_unique_potentials( f_i, g_i )
    for f_i, g_i in zip( fdampednewtonwithprecond, gdampednewtonwithprecond )
]
errors_f, errors_g = [], []
for i, pair_precond in enumerate( unique_dampednewtonwithprecond ):
    print( f'''i : {i}''')
    pair_ref = unique_logSK[i]
    # Euclidean distance to the log-domain Sinkhorn reference
    err_f = np.linalg.norm( pair_ref[0] - pair_precond[0] )
    err_g = np.linalg.norm( pair_ref[1] - pair_precond[1] )
    errors_f.append(err_f)
    errors_g.append(err_g)
    print( "norm of err_f: ", err_f )
    print( "norm of err_g: ", err_g )
    print( "" )

##### Comparing the final potentials of damped Newton and damped Newton with preconditioning

In [None]:
# Compare damped Newton with and without preconditioning through the sums
# f_i + g_j, which do not change when a constant is added to f and subtracted from g.
print("For damped Newton with and without precodnitioning")
for i in range(len(results_DampedNewton_With_Preconditioner)):
    print( f'''i : {i}''')
    # Sums of potentials f_i + g_j
    sum_dampedNewton    = fdampednewton[i][:,None] + gdampednewton[i][None,:]
    sum_dampedNewtonprecond = fdampednewtonwithprecond[i][:,None] + gdampednewtonwithprecond[i][None,:]
    # Drop singleton axes before subtracting
    sum_dampedNewton    = sum_dampedNewton.squeeze()
    sum_dampedNewtonprecond = sum_dampedNewtonprecond.squeeze()
    # NOTE(review): for a 2-D array, ord=np.inf is the maximum absolute row sum,
    # not the largest absolute entry — confirm this is the intended measure.
    error     = np.linalg.norm(sum_dampedNewton-sum_dampedNewtonprecond, ord=np.inf)
    print( "Error of sums of potentials : ", error )
    print( "")

In [None]:
# Distance between the normalized potentials of preconditioned damped Newton
# and log-domain Sinkhorn, plotted in reversed order so that epsilon increases
# left to right.
plt.figure( figsize = (20,7) )
plt.title( "Difference between potentials." )
plt.plot(list(range(len(epsilons))), np.array(errors_f[::-1])+np.array(errors_g[::-1]), label = 'difference for potentials (f,g) between log-domain Sinkhorn  and damped Newton with preconditioning', linewidth = 2, marker= 'o' )
# Raw string: "\e" is not a valid escape sequence in an ordinary literal.
plt.xlabel(r"$\epsilon$")
plt.ylabel( "difference in log-scale" )
plt.legend()
plt.yscale( 'log' )
plt.xticks(list(range(len(epsilons))),epsilons[::-1])
plt.savefig("../Images/Correctness_images/Correctness_comparison_plot_dampedNewton_with_preconditioning.png")
plt.show()


## V. Semi-dual damped Newton using direct solver without any preconditioning

Using the Schrödinger-bridge equations between the potentials, that is, $g_{j} = -\varepsilon\log\left(\sum_{i}\exp\left(\frac{f_{i}-C_{ij}}{\varepsilon}\right)\alpha_{i}\right)\ , \ \forall j = 1,\dots,m$, we obtain the semi-dual formulation of the objective function, that is,
$$
Q_{\varepsilon}^{semi}(f) = \langle f, \alpha \rangle + \langle g(f,C,\varepsilon), \beta \rangle\ , 
$$
where
$g(f,C,\varepsilon)_{j} = -\varepsilon\log\left(\sum_{i}\exp\left(\frac{f_{i}-C_{ij}}{\varepsilon}\right)\alpha_{i}\right)$.

In this setup, the gradient and the Hessian are as follows:
$$
\nabla_{f}Q_{\varepsilon}^{semi}(f)_{i} = \alpha_{i}\left(1-\sum_{s=1}^{m}\frac{e^{\frac{f_{i}-C_{is}}{\varepsilon}}\beta_{s}}{\left(\sum_{t=1}^{n}\alpha_{t}e^{\frac{f_{t}-C_{ts}}{\varepsilon}}\right)}\right)\ ,\ \forall i = 1,\dots,n 
$$
and
$$
\nabla^{2}_{f}Q_{\varepsilon}^{semi}(f)_{ii} = \frac{-1}{\varepsilon}\sum_{s=1}^{m}\left(\alpha_{i}\exp\left(\frac{f_{i}+g(f,C,\varepsilon)_{s}-C_{is}}{\varepsilon}\right)\right)\left(1 - \alpha_{i}\left(\exp\left(\frac{f_{i}+g(f,C,\varepsilon)_{s}-C_{is}}{\varepsilon}\right)\right)\right)\beta_{s}\ ,\ \forall i =1,\dots,n,
$$
and
$$
\nabla^{2}_{f}Q_{\varepsilon}^{semi}(f)_{ij} = \frac{1}{\varepsilon}\sum_{s=1}^{m}\alpha_{i}\alpha_{j}\left(\exp\left(\frac{f_{i}+g(f,C,\varepsilon)_{s}-C_{is}}{\varepsilon}\right)\right)\left(\exp\left(\frac{f_{j}+g(f,C,\varepsilon)_{s}-C_{js}}{\varepsilon}\right)\right)\beta_{s}\ ,\ \forall i \neq j = 1,\dots,n\ .
$$
Now we plug-in these gradients and Hessian in damped Newton algorithm as we did before.

Here we also use the exp-log stabilization to stabilize $g$, the gradients as well as the Hessian as below
$$
m(f)_{j} \leftarrow \min_{i}(C_{ij}-f_{i})\ ,  \ \forall j = 1,\dots,m\\
g_{j} = -\varepsilon\log\left(\sum_{i}\exp\left(\frac{f_{i}-C_{ij}+m(f)_{j}}{\varepsilon}\right)\alpha_{i}\right)+m(f)_{j}\ ,  \ \forall j = 1,\dots,m\ ,\\
\nabla_{f}Q_{\varepsilon}^{semi}(f)_{i} = \alpha_{i}\left(1-\sum_{s=1}^{m}\frac{e^{\frac{f_{i}-C_{is}+m(f)_{s}}{\varepsilon}}\beta_{s}}{\left(\sum_{t=1}^{n}\alpha_{t}e^{\frac{f_{t}-C_{ts}+m(f)_{s}}{\varepsilon}}\right)}\right)\ ,\ \forall i = 1,\dots,n\ , \\
\nabla^{2}_{f}Q_{\varepsilon}^{semi}(f)_{ii} = \frac{-1}{\varepsilon}\sum_{s=1}^{m}\left(\alpha_{i}\exp\left(\frac{f_{i}+g(f,C,\varepsilon)_{s}-C_{is}}{\varepsilon}\right)\right)\left(1 - \alpha_{i}\left(\exp\left(\frac{f_{i}+g(f,C,\varepsilon)_{s}-C_{is}}{\varepsilon}\right)\right)\right)\beta_{s}\ ,\ \forall i =1,\dots,n\ , \\
\nabla^{2}_{f}Q_{\varepsilon}^{semi}(f)_{ij} = \frac{1}{\varepsilon}\sum_{s=1}^{m}\alpha_{i}\alpha_{j}\left(\exp\left(\frac{f_{i}+g(f,C,\varepsilon)_{s}-C_{is}}{\varepsilon}\right)\right)\left(\exp\left(\frac{f_{j}+g(f,C,\varepsilon)_{s}-C_{js}}{\varepsilon}\right)\right)\beta_{s}\ ,\ \forall i \neq j = 1,\dots,n\ .
$$

In [None]:
# Semi-dual damped Newton with a direct solver: run the optimizer once per
# epsilon and keep its outputs, the elapsed time, the coupling and a rescaled Hessian.
# rho and c are handed to the optimizer
# (presumably the backtracking factor and the Armijo constant — confirm against computational_OT)
rho = 0.95
c = 0.5
Semi_dual_dampedNewtonP  = []
results_DampedNewtonsemidual  = []
times_DampedNewtonsemidual     = []
Hessians_DampedNewtonsemidual  = []

#Cost matrix
C = distmat(x,y)
# a and b: uniform weights, kept as 1-D vectors here
a = normalize(np.ones(N[0]))
b = normalize(np.ones(N[1]))
for eps in epsilons:
    K = np.exp(-C/eps)
    # Line Search
    print("Semi-dual damped Newton for epsilon="+str(eps)+":")
    # Starting point: f is the same array object as a (no copy is made)
    f = a
    print("Doing for (",N[0],N[1],").")
    print( " |- Iterating" )
    start = time.time()
    Optimizer = computational_OT.DampedNewton_SemiDual_np(C,a,b,f,eps,rho,c)
    out = Optimizer._update(maxiter = 50 )
    results_DampedNewtonsemidual.append(out)
    end = time.time()
    # Elapsed wall-clock time in milliseconds
    times_DampedNewtonsemidual.append(1e3*(end-start))
    print( " |- Computing P")
    Semi_dual_dampedNewtonP.append(GetP(np.exp(out['potential_f']/eps),K,np.exp(out['potential_g']/eps)))
    print( " |- Recording (unstabilized) Hessian \n")
    mat  = -eps*Optimizer.Hessian
    # Symmetric rescaling: entry (i,j) is divided by sqrt(a_i*a_j), as in the
    # dual damped Newton cell. The former `diag*mat*diag` broadcast both
    # factors along the columns, giving mat[i,j]/a_j, which is not symmetric
    # for non-uniform a (the two coincide for the uniform a used here).
    diag = 1/np.sqrt( a )
    mat = diag[:,None]*mat*diag[None,:]
    Hessians_DampedNewtonsemidual.append( mat )

In [None]:
# Error curves of semi-dual damped Newton along the iterations, one per epsilon.
# (The placeholder title "$$" that was set and immediately overwritten is gone.)
plt.figure(figsize = (12,5))
plt.title(r"$||P1 -a||_1+||P^T 1 -b||_1$")
for eps_i, res_i in zip( epsilons, results_DampedNewtonsemidual ):
  error = np.asarray(res_i['error'])
  # Raw string: "\e" is not a valid escape sequence in an ordinary literal.
  plt.plot( error, label = r'Semi-dual damped Newton for $\epsilon=$'+ str(eps_i), linewidth = 2)
plt.xlabel("Number of iterations")
plt.ylabel("Error in log-scale")
plt.legend(loc = "upper right")
plt.yscale( 'log' )
plt.tight_layout()
plt.savefig("../Images/Correctness_images/Error_plot_semi_dual_dampedNewton.png")
plt.show()
print("\n Error plots can increase! The error is not the objective function!")
print("\n Error plots can increase! The error is not the objective function!")

##### Comparing the final potentials of log-domain Sinkhorn and damped Newton in the semi-dual setup

In [None]:
# Potentials stored by semi-dual damped Newton, one entry per epsilon.
fdampednewtonSemiDual = [ res['potential_f'] for res in results_DampedNewtonsemidual ]
gdampednewtonSemiDual = [ res['potential_g'] for res in results_DampedNewtonsemidual ]
# Normalized potentials, comparable with those of log-domain Sinkhorn.
unique_dampednewtonSemiDual = [
    make_unique_potentials( f_i, g_i )
    for f_i, g_i in zip( fdampednewtonSemiDual, gdampednewtonSemiDual )
]
errors_f, errors_g = [], []
for i, pair_semidual in enumerate( unique_dampednewtonSemiDual ):
    print( f'''i : {i}''')
    pair_ref = unique_logSK[i]
    # Euclidean distance to the log-domain Sinkhorn reference
    err_f = np.linalg.norm( pair_ref[0] - pair_semidual[0] )
    err_g = np.linalg.norm( pair_ref[1] - pair_semidual[1] )
    errors_f.append(err_f)
    errors_g.append(err_g)
    print( "norm of err_f: ", err_f )
    print( "norm of err_g: ", err_g )
    print( "")

In [None]:
# Distance between the normalized potentials of semi-dual damped Newton and
# log-domain Sinkhorn, plotted in reversed order so that epsilon increases
# left to right.
plt.figure( figsize = (20,7) )
plt.title( "Difference between potentials." )
plt.plot(list(range(len(epsilons))), np.array(errors_f[::-1])+np.array(errors_g[::-1]), label = 'difference for potentials (f,g) between log-domain Sinkhorn and damped Newton  in the semi dual setup', linewidth = 2, marker= 'o' )
# Raw string: "\e" is not a valid escape sequence in an ordinary literal.
plt.xlabel(r"$\epsilon$")
plt.ylabel( "difference in log-scale" )
plt.legend()
plt.yscale( 'log' )
plt.xticks(list(range(len(epsilons))),epsilons[::-1])
plt.savefig("../Images/Correctness_images/Correctness_comparison_plot_semi_dual_dampedNewton.png")
plt.show()

## VI. Semi-dual damped Newton with preconditioning
Here we proceed similarly to semi-dual damped Newton but with preconditioning. We consider $t$ eigenvalues $\lambda_{i}$ of the semi-dual Hessian, with eigenvectors $y_{i}$, and form the following preconditioning matrix:
$$
P = \left(I_{n}-\sum_{i=1}^{t}\left(1 - \frac{1}{\sqrt{\lambda_{i}}}\right)y_{i}y_{i}^{T}\right)\ .
$$
Now, at the $k^{th}$ iteration we solve the following equation:
$$
(P\nabla^{2}Q^{semi}_{\varepsilon}(f)P)(Pp_{k})=P\nabla Q^{semi}_{\varepsilon}(f)\ ,
$$
using iterative inversion methods such as "Conjugate gradient" and "GMRES" to get the update direction $p_{k}$, following which we use the Armijo condition to obtain the step size $\alpha_{k}$.

In [None]:
def build_preconditioners( num_eigs,modified_Hessian, N,ansatz=True ):
    """Extract a null vector and preconditioning eigenvectors from a Hessian.

    Args:
        num_eigs: Number of eigenvectors to return for preconditioning.
        modified_Hessian: Symmetric matrix to diagonalise with
            ``np.linalg.eigh``.
        N: Sequence of problem sizes; only ``N[0]`` is read, as the length of
            the constant null vector used when ``ansatz`` is true.
        ansatz: If true, the null vector is the normalised all-ones vector of
            length ``N[0]``. If false, it is the eigenvector associated with
            the smallest eigenvalue of ``modified_Hessian``.

    Returns:
        A pair ``(null_vector, precond_vectors)`` where ``precond_vectors`` is
        a list of the ``num_eigs`` eigenvectors that follow the smallest one
        in ascending eigenvalue order.

    Raises:
        IndexError: If ``num_eigs + 1`` exceeds the number of eigenvectors.
    """
    # Diagonalise and order the eigenpairs by ascending eigenvalue.
    eigenvalues, eigenvectors = np.linalg.eigh( modified_Hessian )
    sorting_indices = np.argsort( eigenvalues )
    eigenvectors = eigenvectors[:, sorting_indices]
    # Form null vector
    if ansatz:
        null_vector = np.ones( N[0] )/np.sqrt( N[0] )
    else:
        null_vector = eigenvectors[:, 0]
    # Column 0 is the null direction, so the preconditioning vectors are
    # columns 1 .. num_eigs.
    precond_vectors = [ eigenvectors[:, index] for index in range(1, num_eigs + 1) ]
    return null_vector, precond_vectors

In [None]:
# Number of eigenvectors kept for the preconditioner.
num_eigs = 13
# Diagonalise the last Hessian stored by the semi-dual damped Newton runs;
# ansatz=False takes the null vector from its lowest eigenvector instead of
# the constant-vector ansatz.
null_vector, precond_vectors = build_preconditioners(
    num_eigs=num_eigs,
    modified_Hessian=Hessians_DampedNewtonsemidual[-1],
    N=N,
    ansatz=False,
)

In [None]:
# Line-search parameters forwarded to the optimizer.
# NOTE(review): presumably the backtracking factor (rho) and the Armijo
# constant (c) -- confirm against computational_OT.
rho = 0.95
c = 0.5
# When False, the potentials of one epsilon seed the next run (see the end of
# the loop body).
reset_starting_point = True

# One entry per epsilon is appended to each of these lists.
final_modified_Hessians = []
Semi_dual_dampedNewtonP = []
results_DampedNewton_with_precodonditioner_SemiDual = []
times_DampedNewton_with_precodonditioner_SemiDual = []

f, g = None, None
# Cost matrix
C = distmat(x, y)
# Uniform marginals a and b
a = normalize(np.ones(N[0]))
b = normalize(np.ones(N[1]))

for eps in epsilons:
    print(f"Semi-dual damped Newton for epsilon={eps!s}:")
    if f is None:
        # Start from the zero potential, shaped like a.
        f = np.zeros_like(a)
    print("Doing for (", N[0], N[1], ").")
    print(" |- Iterating")

    # The timed region covers construction, the solve and the append below.
    start = time.time()
    Optimizer = computational_OT.DampedNewton_with_precodonditioner_SemiDual_np(
        C, a, b, f, eps, rho, c, null_vector, precond_vectors[:]
    )
    out = Optimizer._update(
        maxiter=50,
        iterative_inversion=30,
        version=None,
        debug=False,
        optType='cg',
    )
    results_DampedNewton_with_precodonditioner_SemiDual.append(out)
    end = time.time()
    # Elapsed wall-clock time in milliseconds.
    times_DampedNewton_with_precodonditioner_SemiDual.append((end - start) * 1e3)

    print(" |- Computing P")
    if not reset_starting_point:
        # Split the optimizer state at len(a) into the two potentials.
        f = Optimizer.x[:a.shape[0]]
        g = Optimizer.x[a.shape[0]:]
    # Coupling exp(f/eps)[:, None] * exp(-C/eps) * exp(g/eps)[None, :].
    Semi_dual_dampedNewtonP.append(
        GetP(
            np.exp(out['potential_f'] / eps),
            np.exp(-C / eps),
            np.exp(out['potential_g'] / eps),
        )
    )
    final_modified_Hessians.append(Optimizer.modified_Hessian)

In [None]:
# Plot the per-iteration 'error' history of every preconditioned semi-dual
# damped Newton run, one curve per epsilon.
plt.figure(figsize=(20, 7))
plt.title("$||P1 -a||_1+||P^T1 -b||_1$")
for i, precond_run in enumerate(results_DampedNewton_with_precodonditioner_SemiDual):
    error = np.asarray(precond_run['error'])
    # Raw string: "\e" is not a valid Python escape sequence.
    plt.plot(error, label=r'Semi-dual damped Newton for $\epsilon=$' + str(epsilons[i]), linewidth=2)
plt.xlabel("Number of iterations")
plt.ylabel("Error in log-scale")
plt.legend()
plt.yscale('log')
plt.savefig("../Images/Correctness_images/Error_plot_semi_dual_dampedNewton_with_preconditioning.png")
plt.show()
print("\n Error plots can increase! The error is not the objective function!")

#### Comparing the final potentials of log-domain Sinkhorn and damped Newton with preconditioning in the semi-dual setup

In [None]:
# Collect the final potentials of every preconditioned semi-dual run.
fdampednewtonSemiDualwithprecond = [
    run['potential_f'] for run in results_DampedNewton_with_precodonditioner_SemiDual
]
gdampednewtonSemiDuawithprecond = [
    run['potential_g'] for run in results_DampedNewton_with_precodonditioner_SemiDual
]

# Remove the constant-shift ambiguity from each pair (f, g).
unique_dampednewtonSemiDualwithprecond = [
    make_unique_potentials(fdampednewtonSemiDualwithprecond[k], gdampednewtonSemiDuawithprecond[k])
    for k in range(len(results_DampedNewton_with_precodonditioner_SemiDual))
]

# Euclidean distance to the log-domain Sinkhorn reference, separately for f
# (slot 0) and g (slot 1) of each normalised pair.
errors_f, errors_g = [], []
for i, newton_pair in enumerate(unique_dampednewtonSemiDualwithprecond):
    print(f"i : {i}")
    err_f = np.linalg.norm(unique_logSK[i][0] - newton_pair[0])
    err_g = np.linalg.norm(unique_logSK[i][1] - newton_pair[1])
    errors_f.append(err_f)
    errors_g.append(err_g)
    print("norm of err_f: ", err_f)
    print("norm of err_g: ", err_g)
    print()

#### Comparing the final potentials of semi-dual damped Newton and semi-dual damped Newton with preconditioning

In [None]:

# Reality checks: the matrix of sums f_i + g_j does not change when a constant
# is moved between f and g, so it is compared between the plain and the
# preconditioned semi-dual runs.
print("For semi-dual damped Newton with and without precodnitioning")
for i in range(len(results_DampedNewton_with_precodonditioner_SemiDual)):
    print(f"i : {i}")
    # Sums of potentials f_i + g_j, with any singleton axes squeezed out.
    sum_dampedNewtonSemiDual = (
        fdampednewtonSemiDual[i][:, np.newaxis] + gdampednewtonSemiDual[i][np.newaxis, :]
    ).squeeze()
    sum_dampedNewtonSemiDualprecond = (
        fdampednewtonSemiDualwithprecond[i][:, np.newaxis]
        + gdampednewtonSemiDuawithprecond[i][np.newaxis, :]
    ).squeeze()
    # NOTE(review): for a 2-D array, ord=np.inf is the maximum absolute row
    # sum, not the largest absolute entry -- confirm this is the intended norm.
    error = np.linalg.norm(
        sum_dampedNewtonSemiDual - sum_dampedNewtonSemiDualprecond, ord=np.inf
    )
    print("Error of sums of potentials : ", error)
    print()

In [None]:
# Plot, for every epsilon, the summed distance err_f + err_g between the
# log-domain Sinkhorn potentials and the preconditioned semi-dual damped
# Newton potentials.
plt.figure(figsize=(20, 7))
# A single title call: a preceding placeholder title would just be overwritten.
plt.title("Difference between potentials.")
# errors_f / errors_g are reversed for display, and the tick labels below are
# reversed the same way so each point stays paired with its epsilon.
plt.plot(
    list(range(len(epsilons))),
    np.array(errors_f[::-1]) + np.array(errors_g[::-1]),
    label='difference for potentials (f,g) between log-domain Sinkhorn  and damped Newton with preconditioning in the semi dual setup',
    linewidth=2,
    marker='o',
)
# Raw string: "\e" is not a valid Python escape sequence.
plt.xlabel(r"$\epsilon$")
plt.ylabel("difference in log-scale")
plt.legend()
plt.yscale('log')
plt.xticks(list(range(len(epsilons))), epsilons[::-1])
plt.savefig("../Images/Correctness_images/Correctness_comparison_plot_semi_dual_dampedNewton_with_preconditioning.png")
plt.show()

# Comparison plot of the Kantorovich potentials of the various algorithms used above against the ground truth: log-domain Sinkhorn


In [None]:
def _potential_gaps(reference, candidate):
    """Return two lists: ||f_ref - f|| and ||g_ref - g|| for each paired entry.

    Both arguments are sequences of (f, g) pairs, consumed in lockstep.
    """
    gaps_f = [np.linalg.norm(ref[0] - cand[0]) for ref, cand in zip(reference, candidate)]
    gaps_g = [np.linalg.norm(ref[1] - cand[1]) for ref, cand in zip(reference, candidate)]
    return gaps_f, gaps_g


plt.figure(figsize=(20, 7))
plt.title("Difference between potentials with and without regularization.")

# Each algorithm's normalised potentials, paired with its legend label, are
# compared against the log-domain Sinkhorn reference (unique_logSK).
comparisons = [
    (unique_SK,
     'difference for potential (f,g) between log-domain Sinkhorn and Sinkhorn'),
    (unique_dampednewton,
     'difference for potential (f,g) between log-domain Sinkhorn and  damped Newton'),
    (unique_dampednewtonwithprecond,
     'difference for potential (f,g) between log-domain Sinkhorn and damped Newton with preconditioning'),
    (unique_dampednewtonSemiDual,
     'difference for potential (f,g) between log-domain Sinkhorn and  semi-dual damped Newton'),
    (unique_dampednewtonSemiDualwithprecond,
     'difference for potential (f,g) between log-domain Sinkhorn and  semi-dual damped Newton with preconditioning'),
]

for candidate_potentials, curve_label in comparisons:
    difference_f, difference_g = _potential_gaps(unique_logSK, candidate_potentials)
    # Curves are reversed for display; the tick labels below are reversed too.
    plt.plot(
        list(range(len(epsilons))),
        np.array(difference_f[::-1]) + np.array(difference_g[::-1]),
        label=curve_label,
        linewidth=2,
        marker='o',
    )

# Raw string: "\e" is not a valid Python escape sequence.
plt.xlabel(r"$\epsilon$")
plt.ylabel("difference in log-scale")
plt.legend()
plt.yscale('log')
plt.xticks(list(range(len(epsilons))), epsilons[::-1])
plt.savefig("../Images/Correctness_images/Correctness_comparison_plot_all_algorithms.png")
plt.show()
