# In this notebook we compare the performance of several optimization algorithms for optimal transport, namely: linear optimization, Sinkhorn, gradient ascent, line search and L-BFGS.

In [None]:
from __future__ import division
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy as sc
import pylab as pyl
import time
import cvxpy as cp
from numpy import linalg as Lin
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import style
from sklearn import datasets
import computational_OT
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Create the folders that receive the figures saved by this notebook.
# `exist_ok = True` makes each call idempotent, so no separate (and racy)
# `os.path.isdir` pre-check is needed.
# NOTE: the sub-folder name is kept verbatim (including its spelling) because
# every `plt.savefig` call in this notebook writes to exactly this path.
relative_path_to_new_folder = "../Images"
os.makedirs( relative_path_to_new_folder, exist_ok = True )
os.makedirs( '../Images/Performance_comparison_exisitng_algorithms_images', exist_ok = True )

## Helper Functions

In [None]:
def distmat( x, y ):
    """Return the matrix of pairwise squared Euclidean distances.

    Points are stored column-wise: `x` has one point per column and so has `y`.
    Entry (i, j) of the result is |x[:,i]|^2 + |y[:,j]|^2 - 2 <x[:,i], y[:,j]>.
    """
    sq_norm_x = np.sum( x**2, 0 )
    sq_norm_y = np.sum( y**2, 0 )
    cross_terms = x.transpose().dot( y )
    return sq_norm_x[:,None] + sq_norm_y[None,:] - 2 * cross_terms

def normalize( a ):
    """Rescale `a` so that its entries sum to one."""
    total = np.sum( a )
    return a/total

def GetP( u, K, v ):
    """Return the coupling diag(u) K diag(v).

    `u` scales the rows of `K` and `v` scales its columns.
    """
    row_scaled = u[:,np.newaxis] * K
    return row_scaled * v[np.newaxis,:]

def plotp( plt, x, col, scale = 200, edgecolors = "k" ):
    """Scatter-plot a 2-D point cloud stored column-wise in `x`.

    `plt` is the plotting object whose `scatter` method is called; row 0 of `x`
    supplies the abscissae and row 1 the ordinates. Whatever `scatter` returns
    is handed back to the caller.
    """
    abscissae, ordinates = x[0,:], x[1,:]
    marker_style = dict( s = scale, edgecolors = edgecolors, c = col, cmap = 'plasma', linewidths = 2 )
    return plt.scatter( abscissae, ordinates, **marker_style )


## Generate Data


In [None]:
def randomsampledata( N ):
    """Draw one pair of random 2-D point clouds for every sample size in `N`.

    The sizes are sorted first. For each size n, the source cloud is uniform on
    the square [-0.5, 0.5)^2 and the target cloud lies in the annulus with radii
    in [0.8, 1.0); both are arrays of shape (2, n) with one point per column.

    Returns the list of source clouds, the list of target clouds and the
    sorted sizes.
    """
    N = np.sort( N )
    x, y = [], []
    for n_points in N:
        # Square: shift uniform [0, 1) samples so that they are centred at the origin
        x.append( np.random.rand( 2, n_points ) - 0.5 )
        # Annulus: random angle and random radius, converted to Cartesian coordinates
        theta = 2 * np.pi * np.random.rand( 1, n_points )
        r = 0.8 + 0.2 * np.random.rand( 1, n_points )
        y.append( np.vstack( ( np.cos(theta) * r, np.sin(theta) * r ) ) )
    return x, y, N

In [None]:
# Benchmark problem sizes; randomsampledata returns them sorted as N,
# together with one source cloud (x) and one target cloud (y) per size.
x, y, N = randomsampledata( [ 200, 400, 600, 800, 1000 ] )

## Primal formulation
The primal formulation of the optimal transport problem is given by:
$$
OT(\alpha,\beta) = \min_{\pi \in \mathcal{U}(\alpha,\beta)} \langle C, \pi \rangle\ ,
$$
where 
$\ \mathcal{U}(\alpha,\beta)=\{\pi: \pi\mathbb{1} = \alpha,\ \pi^{T}\mathbb{1}=\beta\}\ .$

## Linear Optimization
This primal formulation can be viewed as a linear optimization problem; we therefore look at the performance of a linear optimization solver in finding the optimal coupling $P$.

In [None]:
# Linear optimization
print("Linear optimization...")
# Per-problem outputs: coupling, reported error and wall-clock time
LinearP, Error, times_linearOpt = [], [], []
for i, ( xi, yi ) in enumerate( zip( x, y ) ):
    # Show the two point clouds (source in blue, target in red),
    # with the axes framed around the target cloud
    plt.figure( figsize = ( 5, 5 ) )
    plotp( plt, xi, col = 'b')
    plotp( plt, yi, col = 'r' )
    plt.axis( "off" )
    plt.xlim( yi[0,:].min() - .1, yi[0,:].max() + .1 )
    plt.ylim( yi[1,:].min() - .1, yi[1,:].max() + .1 )
    plt.show()
    # Problem data: cost matrix and uniform marginals
    C = distmat( xi, yi )
    a = normalize( np.ones( N[i] ) )
    b = normalize( np.ones( N[i] ) )
    # Solve; the timer covers both building the problem and solving it
    print( "Doing for ",N[i] )
    print( " |- Iterating" )
    start = time.time()
    Optimizer = computational_OT.linear_optimization( N[i], N[i], a, b, C )
    print( " |- Computing P" )
    print( "" )
    out = Optimizer.solve()
    end = time.time()
    # `out` is indexed by 'Optimal coupling' (an object exposing `.value`) and 'Error'
    LinearP.append( out['Optimal coupling'].value )
    Error.append( out['Error'] )
    times_linearOpt.append( end - start )
#end for

## Entropy regularized formulation

The primal entropy regularized formulation of OT is given by:
$$
OT_{\epsilon}(\alpha,\beta) = \min_{\pi \in \mathcal{U}(\alpha,\beta)} \langle C,\pi \rangle +\epsilon KL(\pi\|\alpha \otimes \beta)\ ,
$$
where
$\ 
KL(\pi\|\alpha \otimes \beta) 
\ $ is the KL-divergence. 

## Entropy regularized dual-formulation
The dual formulation of OT is given by:
$$
OT_{\epsilon}(\alpha,\beta) = \max_{f\in \mathbb{R}^{n}, g\in\mathbb{R}^{m}} \langle f, \alpha \rangle + \langle g, \beta \rangle - \epsilon\left(\langle\alpha \otimes \beta, e^{\frac{f}{\epsilon}}\odot K \odot e^{\frac{g}{\epsilon}}  \rangle-1\right)\ ,
$$
where
$$
\alpha \in \mathcal{M}_{1}(\mathcal{X}),\ \beta \in \mathcal{M}_{1}(\mathcal{Y}),\ \epsilon>0,\ f\in\mathbb{R}^{n},\ g\in \mathbb{R}^{m}\ .
$$

## Entropy regularized formulation

The primal entropy regularized formulation of OT is given by:
$$
OT_{\epsilon}(\alpha,\beta) = \min_{\pi \in \mathcal{U}(\alpha,\beta)} \langle C,\pi \rangle +\epsilon KL(\pi\|\alpha \otimes \beta)\ ,
$$
where
$\ 
KL(\pi\|\alpha \otimes \beta) 
\ $ is the KL-divergence and $\ \mathcal{U}(\alpha,\beta)=\{\pi: \pi\mathbb{1}=\alpha,\ \pi^{T}\mathbb{1}=\beta\}$. 

### I. Sinkhorn
The optimal coupling $\pi^{*}$ has the following form :
$$
\pi^{*} = \alpha \odot diag(u)K diag(v)\odot \beta
$$
and we know that $\pi^{*}\mathbb{1}=\alpha$ and $(\pi^{*})^{T}\mathbb{1}=\beta$.

Therefore, the Sinkhorn updates are given by the following alternating projections
$$
u^{t+1}  \leftarrow \frac{1}{K(v^{t}\odot \beta)}\ ,\ 
v^{t+1}  \leftarrow \frac{1}{K^{T}(u^{t+1}\odot \alpha)}\ , 
$$
where 
$K = e^{-\frac{C}{\epsilon}}\in M_{n\times m}(\mathbb{R}),\ \alpha \in \mathbb{R}^{n},\ \beta \in \mathbb{R}^{m}\ ,\ u\in \mathbb{R}^{n},\ v\in \mathbb{R}^{m}\ \text{and}\ (u^{0},v^{0})=(u,v)\ .$


In [None]:
# Sinkhorn
print("Sinkhorn... ")
# Regularization strength shared by all problem sizes
epsilon = .06
# Per-problem outputs: coupling, raw solver output and wall-clock time
SinkhornP, results_Sinkhorn, times_Sinkhorn = [], [], []
for i, ( xi, yi ) in enumerate( zip( x, y ) ):
    # Uniform marginals
    a = normalize( np.ones( N[i] ) )
    b = normalize( np.ones( N[i] ) )
    # Cost matrix and the Gibbs kernel built from it
    C = distmat( xi, yi )
    K = np.exp( - C/epsilon )
    print( "Doing for ",N[i] )
    print( " |- Iterating" )
    # The scalings are initialised with the marginals themselves
    u, v = a, b
    start = time.time()
    Optimizer = computational_OT.sinkhorn( K, a, b, u, v, epsilon )
    out = Optimizer._update()
    results_Sinkhorn.append( out )
    end = time.time()
    times_Sinkhorn.append( end - start )
    print( " |- Computing P" )
    print( "" )
    # Rebuild the kernel, exponentiate the returned potentials into
    # scalings and assemble the coupling
    K = np.exp( - C/epsilon )
    u_opt = np.exp( out['potential_f']/epsilon )
    v_opt = np.exp( out['potential_g']/epsilon )
    P_opt = GetP( u_opt, K, v_opt )
    SinkhornP.append( P_opt )
# end for  

##### Error plot

In [None]:
# Marginal-constraint violation of Sinkhorn per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 2 )
plt.title( "$||P 1 -a||_1+||P^T 1 -b||_1$" )
for result in results_Sinkhorn:
    # Total error = error on the first marginal + error on the second marginal
    error_a, error_b = np.asarray( result['error_a'] ), np.asarray( result['error_b'] )
    plt.plot( error_a + error_b, linewidth = 2 )
# end for
plt.yscale( 'log' )
size_labels = [ "N = " + label for label in map( str, N ) ]
plt.legend( size_labels, loc = "upper right" )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/ConvergenceSinkhorn.pdf", format = 'pdf' )
plt.show()

##### Objective function plot

In [None]:
# Objective value of Sinkhorn per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 1 )
plt.title( "Objective Function" )
for result in results_Sinkhorn:
    objective = np.asarray(result['objective_values'])
    plt.plot( objective, linewidth = 2 )
# end for
size_labels = [ "N = " + label for label in map( str, N ) ]
plt.legend( size_labels, loc = "upper right" )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/ObjectiveSinkhorn.pdf", format = 'pdf' )
plt.show()

### II. Gradient Ascent
Here we perform the gradient ascent algorithm on the dual entropy regularized setup as mentioned before. The gradients of the objective function $Q$ in this setup are as follows:
$$
\nabla_{f} Q(f,g) = \alpha -\alpha\odot e^{\frac{f}{\epsilon}}\odot\left( K \left( e^{\frac{g}{\epsilon}}\odot \beta \right) \right)\ , 
$$
$$
\nabla_{g} Q(f,g) = \beta - \beta\odot e^{\frac{g}{\epsilon}}\odot \left( K^{T} \left( e^{\frac{f}{\epsilon}}\odot \alpha \right) \right)\ ,
$$
where
$$
f \in \mathbb{R}^{n},\ g \in \mathbb{R}^{m}\ .
$$

In [None]:
# Gradient ascent
print("Gradient ascent...")
# Fixed step size of the ascent
learning_rate = 0.01
# Regularization strength shared by all problem sizes
epsilon = .06
# Per-problem outputs: coupling, raw solver output and wall-clock time
# (each list is initialised exactly once)
GradientAscentP = []
results_Gradient_ascent = []
times_Gradient_ascent = []
for i in range(len(N)):
    xi, yi = x[i], y[i]
    # Cost matrix
    C = distmat( xi, yi )
    # Uniform marginals a and b
    a = normalize( np.ones( N[i] ) )
    b = normalize( np.ones( N[i] ) )
    # Gibbs kernel
    K = np.exp( - C/epsilon )
    # The potentials are initialised with the marginals themselves
    f, g = a, b
    print( "Doing for ",N[i] )
    print( " |- Iterating" )
    start = time.time()
    Optimizer = computational_OT.gradient_ascent( K,
                                                  a,
                                                  b,
                                                  f,
                                                  g,
                                                  epsilon,
                                                  learning_rate )
    out = Optimizer._update()
    end = time.time()
    results_Gradient_ascent.append( out )
    times_Gradient_ascent.append( end - start )
    print( " |- Computing P" )
    print( "" )
    # Exponentiate the returned potentials into scalings and assemble the coupling
    u_opt = np.exp( out['potential_f']/epsilon )
    K = np.exp( - C/epsilon )
    v_opt = np.exp( out['potential_g']/epsilon )
    P_opt = GetP( u_opt, K, v_opt )
    GradientAscentP.append( P_opt )
# end for

##### Error plot

In [None]:
# Marginal-constraint violation of gradient ascent per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 2 )
plt.title( "$||P 1 -a||_1+||P^T 1 -b||_1$" )
for result in results_Gradient_ascent:
    # Total error = error on the first marginal + error on the second marginal
    error_a, error_b = np.asarray( result['error_a'] ), np.asarray( result['error_b'] )
    plt.plot( error_a + error_b, linewidth = 2 )
# end for
plt.yscale( 'log' )
size_labels = [ "N = " + label for label in map( str, N ) ]
plt.legend( size_labels, loc = "upper right" )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/ConvergenceGradient_ascent.pdf", format = 'pdf' )
plt.show()

##### Objective function plot

In [None]:
# Objective value of gradient ascent per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 1 )
plt.title( "Objective Function" )
for result in results_Gradient_ascent:
    # The stored values are flattened to a 1-D sequence before plotting
    objective = np.asarray(result['objective_values']).flatten()
    plt.plot( objective, linewidth = 2 )
# end for
size_labels = [ "N = " + label for label in map( str, N ) ]
plt.legend( size_labels, loc = "upper right" )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/ObjectiveGradient_ascent.pdf", format = 'pdf' )
plt.show()

### III. Line Search
Here we use the Armijo condition to choose the step size with which the current iterate is moved along the ascent direction given by the gradient.


In [None]:
# Line Search
print( "Line Search..." )
# Step-size controls handed to the solver:
#   rho           - damping factor for the ascent step-size
#   rho_inc       - factor used to increase the ascent step-size
#   c1            - damping factor for the slope in the Wolfe condition
#   initial_alpha - starting step-size
rho, rho_inc, c1 = 0.95, 1.5, 0.05
# Regularization strength shared by all problem sizes
epsilon = .06
initial_alpha = 1
# Per-problem outputs: coupling, raw solver output and wall-clock time
LineSearchP, results_LineSearch, times_LineSearch = [], [], []
for i, ( xi, yi ) in enumerate( zip( x, y ) ):
    # Uniform marginals
    a = normalize( np.ones( N[i] ) )
    b = normalize( np.ones( N[i] ) )
    # Cost matrix and the Gibbs kernel built from it
    C = distmat( xi, yi )
    K = np.exp( - C/epsilon )
    # The potentials are initialised with the marginals themselves
    f, g = a, b
    print( "Doing for ", N[i] )
    print( " |- Iterating" )
    start = time.time()
    Optimizer = computational_OT.linesearch( K, a, b, f, g, epsilon, rho, rho_inc, c1, initial_alpha )
    out = Optimizer._update()
    results_LineSearch.append( out )
    end = time.time()
    times_LineSearch.append( end - start )
    print( " |- Computing P" )
    print( "" )
    # Rebuild the kernel, exponentiate the returned potentials into
    # scalings and assemble the coupling
    K = np.exp( - C/epsilon )
    u_opt = np.exp( out['potential_f']/epsilon )
    v_opt = np.exp( out['potential_g']/epsilon )
    P_opt = GetP( u_opt, K, v_opt )
    LineSearchP.append( P_opt )
# end for

##### Error plot

In [None]:
# Marginal-constraint violation of line search per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 2 )
plt.title( "$||P 1 -a||_1+||P^T 1 -b||_1$" )
for result in results_LineSearch:
    # Total error = error on the first marginal + error on the second marginal
    error_a, error_b = np.asarray( result['error_a'] ), np.asarray( result['error_b'] )
    plt.plot( error_a + error_b, linewidth = 2 )
# end for
plt.yscale( 'log' )
size_labels = [ "N = " + label for label in map( str, N ) ]
plt.legend( size_labels, loc = "upper right" )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/ConvergenceLineSearch.pdf", format = 'pdf'  )
plt.show()

##### Objective function plot

In [None]:
# Objective value of line search per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 1 )
plt.title( "Objective Function" )
for result in results_LineSearch:
    plt.plot( np.asarray( result['objective_values'] ), linewidth = 2 )
# end for
plt.legend( [ "N = "+str(i) for i in N ], loc = "upper right" )
# Saved under its own name: writing to "ConvergenceLineSearch.pdf" here would
# overwrite the error plot that the line-search error cell stores in that file.
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/ObjectiveLineSearch.pdf", format = 'pdf'  )
plt.show()

##### Ascent step-size plot

In [None]:
# Ascent step-size chosen by the line search per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 1 )
plt.title( "Alpha" )
for result in results_LineSearch:
    plt.plot( np.asarray( result['linesearch_steps'] ), linewidth = 2 )
# end for
# A single legend call: a second call replaces the legend drawn by the first one.
plt.legend( [ "N = "+str(i) for i in N ], loc = "upper right" )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/Alphaplot_LineSearch.pdf", format = 'pdf'  )
plt.show()

#### For varying epsilons

In [None]:
# Line Search for several values of epsilon on one fixed problem
print( "Line Search..." )
# The problem solved below is the second data set: N[1] source and N[1] target points.
print( "Doing for (",N[1], N[1],")." )
# Damping factor for ascent step-size
rho = 0.95
# Damping factor to increase ascent step-size
rho_inc = 1.5
# Damping factor for the slope in Wolfe condition
c1 = 0.05
initial_alpha = 1
# Outputs, one entry per epsilon: coupling and raw solver output
LineSearchP = []
results_LineSearch = []
# Cost matrix
C = distmat( x[1], y[1] )
# Uniform marginals a and b
a = normalize( np.ones( N[1] ) )
b = normalize( np.ones( N[1] ) )
epsilons = [  0.1,  0.08, 0.05, 0.01 ]
for eps in epsilons:
    print( "For epsilon = "+str(eps)+":" )
    # Gibbs kernel for the current epsilon
    K = np.exp( - C/eps )
    # The potentials are initialised with the marginals themselves
    f, g = a, b
    print( " |- Iterating")
    start = time.time()
    # Same solver entry point as the fixed-epsilon line-search cell
    # (`computational_OT.linesearch`), so both cells use one consistent name.
    Optimizer = computational_OT.linesearch(    K,
                                                a,
                                                b,
                                                f,
                                                g,
                                                eps,
                                                rho,
                                                rho_inc,
                                                c1,
                                                initial_alpha )
    out = Optimizer._update( max_iterations = 1000 )
    results_LineSearch.append( out )
    end = time.time()
    print( " |- Computing P" )
    print( "" )
    # Exponentiate the returned potentials into scalings and assemble the coupling
    u_opt = np.exp( out['potential_f']/eps )
    K = np.exp( - C/eps )
    v_opt = np.exp( out['potential_g']/eps )
    P_opt = GetP( u_opt, K, v_opt )
    LineSearchP.append( P_opt )
# end for

##### Error plot

In [None]:
# Marginal-constraint violation of line search per iteration, one curve per epsilon
plt.figure( figsize = ( 15, 6 ) )
n = len( results_LineSearch )
plt.title( "$||P1 -a||_1 + ||P^T 1 -b||_1$" )
for i in range(n):
    # Total error = error on the first marginal + error on the second marginal
    error = np.asarray( results_LineSearch[i]['error_a'] ) + np.asarray( results_LineSearch[i]['error_b'] )
    # Raw string: "\e" is not a valid escape sequence in an ordinary string literal
    plt.plot( error, label = r'LineSearch for $\epsilon = $'+ str(epsilons[i]), linewidth = 2)
# end for
plt.xlabel( "Number of iterations" )
plt.ylabel( "Error in log-scale" )
plt.legend( loc = 'upper right' )
plt.yscale( 'log' )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/Linesearchvaryingepsilon.pdf", format = 'pdf'  )
plt.show()

##### Ascent step-size plot

In [None]:
# Ascent step-size chosen by the line search per iteration, one curve per epsilon
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 1 )
plt.title( "Alpha" )
for i in range(len(results_LineSearch)):
    # The curves come from `results_LineSearch`, so they are labelled
    # "LineSearch"; the raw string avoids the invalid "\e" escape sequence.
    plt.plot( np.asarray( results_LineSearch[i]['linesearch_steps']), label = r'LineSearch for $\epsilon = $'+ str(epsilons[i]), linewidth = 2 )
# end for
plt.xlabel( "Number of iterations" )
plt.legend( loc = 'upper right' )
# NOTE(review): the file name still mentions "LineSearchNewton" although the data
# is from plain line search; kept unchanged in case other documents reference it.
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/AlphaLineSearchNewton.pdf", format = 'pdf'  )
plt.show()

### IV. L_BFGS_B
Here we perform the L-BFGS algorithm on the dual entropy regularized problem.

In [None]:
# BFGS
print("BFGS...")
# Regularization strength shared by all problem sizes
epsilon = .06
# Per-problem outputs: coupling, raw solver output and wall-clock time
BFGSP, results_BFGS, times_BFGS = [], [], []
for i, ( xi, yi ) in enumerate( zip( x, y ) ):
    # Uniform marginals, reshaped into 2-D arrays with N[i] rows
    a = normalize( np.ones( N[i] ) )
    a = a.reshape( len( a ), - 1 )
    b = normalize( np.ones( N[i] ) )
    b = b.reshape( len( b ), - 1 )
    # Cost matrix and the Gibbs kernel built from it
    C = distmat( xi, yi )
    K = np.exp( - C/epsilon )
    # The potentials are initialised with the marginals themselves
    f, g = a, b
    print( "Doing for ", N[i] )
    print( " |- Iterating" )
    start = time.time()
    Optimizer = computational_OT.l_bfgs_b( K, a, b, f, g, epsilon )
    out = Optimizer._update()
    results_BFGS.append( out )
    end = time.time()
    times_BFGS.append( end - start )
    print( " |- Computing P" )
    print( "" )
    # Rebuild the kernel, exponentiate the returned potentials into
    # scalings and assemble the coupling
    K = np.exp( - C/epsilon )
    u_opt = np.exp( out['potential_f']/epsilon )
    v_opt = np.exp( out['potential_g']/epsilon )
    P_opt = GetP( u_opt, K, v_opt )
    BFGSP.append( P_opt )
# end for

##### Error plot

In [None]:
# Marginal-constraint violation of L-BFGS per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.title( "$||P1 -a||_1 + ||P^T 1 -b||_1$" )
for result in results_BFGS:
    # Total error = error on the first marginal + error on the second marginal
    error_a, error_b = np.asarray( result['error_a'] ), np.asarray( result['error_b'] )
    plt.plot( error_a + error_b, linewidth = 2 )
plt.yscale( 'log' )
size_labels = [ "N = " + label for label in map( str, N ) ]
plt.legend( size_labels, loc = "upper right" )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/Convergence_LBGFS.pdf", format = 'pdf'  )
plt.show()
# Reminder for the reader: this curve need not be monotone
print( "\n Error plots can increase! The error is not the objective function!" )

##### Objective function plot

In [None]:
# Objective value of L-BFGS per iteration, one curve per problem size
plt.figure( figsize = ( 20, 7 ) )
plt.subplot( 2, 1, 1 )
plt.title( "Objective Function" )
for result in results_BFGS:
    objective = np.asarray(result['objective_values'])
    plt.plot( objective, linewidth = 2 )
size_labels = [ "N = " + label for label in map( str, N ) ]
plt.legend( size_labels, loc = "upper right" )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/Objective_LBGFS.pdf", format = 'pdf'  )
plt.show()

#### For varying epsilons

In [None]:
# BFGS for several values of epsilon on one fixed problem
print("BFGS...")
# Outputs, one entry per epsilon: coupling and raw solver output
BFGSP = []
results_BFGS = []
epsilons = [  0.1, 0.08, 0.05, 0.01 ]
# The problem is the second data set. Its size is read off the data instead of
# rebinding the global `N`, which the legend cells still need as the list of
# all sample sizes.
n_points = int( x[1].shape[1] )
# Cost matrix
C = distmat( x[1], y[1] )
# Uniform marginals a and b, reshaped into 2-D arrays with n_points rows
a = normalize( np.ones( n_points ) )
a = a.reshape( a.shape[0], - 1 )
b = normalize( np.ones( n_points ) )
b = b.reshape( b.shape[0], - 1 )
print( "Doing for ",(n_points, n_points) )
for eps in epsilons:
    # Gibbs kernel for the current epsilon
    K = np.exp( - C/eps )
    # The potentials are initialised with the marginals themselves
    f, g = a, b
    print("\n For epsilon = " +str(eps) )
    print( " |- Iterating" )
    start = time.time()
    Optimizer = computational_OT.l_bfgs_b(    K,
                                              a,
                                              b,
                                              f,
                                              g,
                                              eps )
    out = Optimizer._update()
    results_BFGS.append( out )
    end = time.time()
    print( " |- Computing P" )
    print( "" )
    # The coupling must be built with the loop's `eps`, i.e. the value the
    # solver was run with, not with the global `epsilon` left by an earlier cell.
    u_opt = np.exp( out['potential_f']/eps )
    K = np.exp( - C/eps )
    v_opt = np.exp( out['potential_g']/eps )
    P_opt = GetP( u_opt, K, v_opt )
    BFGSP.append( P_opt )
# end for

##### Error plot

In [None]:
# Marginal-constraint violation of L-BFGS per iteration, one curve per epsilon
plt.figure( figsize = ( 20, 7 ) )
plt.title( "$||P1 -a||_1+||P^T 1 -b||_1$" )
for i in range( len( results_BFGS ) ):
    # Total error = error on the first marginal + error on the second marginal
    error = np.asarray(results_BFGS[i]['error_a']) + np.asarray(results_BFGS[i]['error_b'] )
    # Raw string: "\e" is not a valid escape sequence in an ordinary string literal
    plt.plot( error, label = r'LBGFS for $\epsilon = $'+ str(epsilons[i]), linewidth = 2 )
# end for
plt.legend( loc = 'upper right' )
plt.yscale( 'log' )
plt.savefig( "../Images/Performance_comparison_exisitng_algorithms_images/LBGFSconvergencevaryepsilon.pdf", format = 'pdf'  )
plt.show()