# <div style="text-align: center">18.335/6.337 Final Project - The L-BFGS algorithm</div>
##  <div style="text-align: center">Strong Wolfe Line-Search</div>
### <div style="text-align: center">Created by Yusu Liu and Simon Batzner</div>

## Import test functions and set up environment

In [242]:
include("testfns.jl")
m = 2

2

In [243]:
function lbfgs!(F, x0,maxIt,m,τgrad=1e-5, verbose = 0)
    #INPUT
    # F: function to be optimized
    # x0: initial guess
    # maxIt: maximum Iteration
    # m: last m input differences and gradient differences are stored
    # τgrad: tolerance for norm of the slope


    #OUTPUT
    #x1: optimized variable
    #f1: function value at x1
    #k iteration number

    k=0
    n=length(x0)
    Sm=zeros(n,m) #S_k=x_k+1-x_k
    Ym=zeros(n,m) #Y_k=g_k+1-g_k
    f0,g0=F(x0)
    #use the simplest line search to find step size
    α, f1, g1=strongwolfe(F,-g0,x0,f0,g0)
    x1 = x0 - α.*g0
    k=1

    while true
        if k>maxIt
            break
        end
        gnorm=norm(g0)
        if gnorm < τgrad
            break
        end
        s0=x1-x0
        y0=g1-g0
        #println("y0=$y0")
        H0=s0'*y0/(y0'*y0) #hessian diagonal satisfying secant condition

        #update Sm and Ym
        if k<=m
            Sm[:,k]=s0
            Ym[:,k]=y0
            p=-approxInvHess(g1,Sm[:,1:k],Ym[:,1:k],H0)
        # only keep m entries in Sm and Ym so purge the old ones
        elseif (k>m)
            Sm[:,1:(m-1)]=Sm[:,2:m]
            Ym[:,1:(m-1)]=Sm[:,2:m]
            Sm[:,m]=s0
            Ym[:,m]=y0
            p=-approxInvHess(g1,Sm,Ym,H0)
        end
        # new direction=p, find new step size
        α, fs, gs=strongwolfe(F,p,x1,f1,g1)
        #update for next iteration
        x0=x1
        g0=g1
        x1=x1+α.*p
        f1=fs
        g1=gs
        k=k+1
        
        if verbose == 1
            println("It=$k,x=$x1")
        end
    end
    k=k-1
    return x1, f1, k
end

lbfgs! (generic function with 3 methods)

In [244]:
function strongwolfe(F,d,x0,fx0,gx0,maxIt=5)
# strong wolfe
    α_m=20
    α_p=0
    c1=1e-4
    c2=0.9
    α_x=1
    gx0=copy(gx0'*d)
    fxp=copy(fx0)
    gxp=copy(gx0)
    i=1
    α_s=0
    fs=copy(fx0)
    gs=copy(gx0)
    while true
        xx=x0+α_x*d
        fxx,gxx=F(xx)
        fs=copy(fxx)
        gs=copy(gxx)
        gxx=copy(gxx'*d)

        if (fxx>(fx0+c1*α_x*gx0)[1]) || (i>1) & (fxx>=fxp)
            α_s,fs,gs=zoom(F,x0,d,α_p,α_x,fx0,gx0)
            return α_s,fs,gs
        end
        if abs(gxx)<=-c2*(gx0)
            α_s=copy(α_x)
            return α_s,fs,gs
        end
        if gxx>=0
        #if abs.(gxx)[1]>=0 && abs.(gxx)[2]>=0
            α_s,fs,gs=zoom(F,x0,d,α_x,α_p,fx0,gx0)
            return α_s,fs,gs
        end
        α_p=copy(α_x)
        fxp=copy(fxx)
        gxp=copy(gxx)

        if i>maxIt
            α_s=α_x
            return α_s,fs,gs

        end
        r=0.8
        #r=0.8
        α_x=α_x+(α_m-α_x)*r
        i=i+1

    end
    return α_s,fs,gs
end

strongwolfe (generic function with 2 methods)

In [245]:
function zoom(F,x0,d,α_l,α_h,fx0,gx0,maxIt=5)
    c1=1e-4
    c2=0.9
    i=0
    α_s=0
    fs=copy(fx0)
    gs=copy(gx0)
    while true
        α_x=0.5*(α_l+α_h)
        α_s=copy(α_x)
        xx=x0+α_x*d
        fxx,gxx=F(xx)
        fs=copy(fxx)
        gs=copy(gxx)
        gxx=gxx'*d
        xl=x0+α_l*d
        fxl,gxl=F(xl)
        if (fxx>(fx0+c1*α_x*gx0)[1]) || fxx>=fxl
            α_h=copy(α_x)
        else
            if abs(gxx)[1]<=-c2*(gx0)
                α_s=copy(α_x)
                return α_s,fs,gs
            end
            if gxx*(α_h-α_l)[1]>=0
                α_h=copy(α_l)
            end
            α_l=copy(α_x)
        end
        i=i+1
        if i>maxIt
            α_s=copy(α_x)
            return α_s,fs,gs
        end
    end
    return α_s,fs,gs
end

zoom (generic function with 2 methods)

In [246]:
function approxInvHess(g,S,Y,H0)
    #INPUT

    #g: gradient nx1 vector
    #S: nxk matrixs storing S[i]=x[i+1]-x[i]
    #Y: nxk matrixs storing Y[i]=g[i+1]-g[i]
    #H0: initial hessian diagnol scalar

    #OUTPUT
    # p:  the approximate inverse hessian multiplied by the gradient g
    #     which is the new direction
    #notation follows:
    #https://en.wikipedia.org/wiki/Limited-memory_BFGS

    n,k=size(S)
    rho=zeros(k)
    for i=1:k
        rho[i]=1/(Y[:,i]'*S[:,i])
    end

    q=zeros(n,k+1)
    r=zeros(n,1)
    α=zeros(k,1)
    β=zeros(k,1)

    q[:,k+1]=g

    for i=k:-1:1
        α[i]=rho[i]*S[:,i]'*q[:,i+1]
        q[:,i]=q[:,i+1]-α[i]*Y[:,i]
    end

    z=H0*q[:,1]


    for i=1:k
        β[i]=rho[i]*Y[:,i]'*z
        z=z+S[:,i]*(α[i]-β[i])
    end

    p=z

    return p
end

approxInvHess (generic function with 1 method)

## First Test

In [258]:
x0 = [1,3]
x1, f1, k=lbfgs!(rosenbrock, x0, 100, m)

([1.0, 1.0], 1.0290199155525137e-13, 48)

## Define Functions and intervals to scan over

In [259]:
function test_range(fun, step, tol)
    
    if fun == rosenbrock
        x1_opt = [1.0, 1.0]
        a0 = -2.048
        a1 = 2.048
        b0 = -2.048
        b1 = 2.048
        println("Scanning from $(a0) to $(a1) and from $(b0) to $(b1)")
    elseif fun == himmelblau
        x1_opt_arry = [[3,2], [-2.805118, 3.131312], [-3.779310, -3.283186], [3.584428, -1.848126]]
        a0 = -10
        a1 = 10
        b0 = -10
        b1 = 10
        println("Scanning from $(a0) to $(a1) and from $(b0) to $(b1)")
    elseif fun == booth
        x1_opt = [1,3]
        a0 = -10
        a1 = 10
        b0 = -10
        b1 = 10
        println("Scanning from $(a0) to $(a1) and from $(b0) to $(b1)")
    elseif fun == bohachevsky1
        x1_opt = [0,0]
        a0 = -100
        a1 = 100
        b0 = -100
        b1 = 100
        println("Scanning from $(a0) to $(a1) and from $(b0) to $(b1)")
    else 
        println("ERORR: provide proper test function.")
    end
    
    for a in a0:step:a1
        for b in b0:step:b1
            
            x0 = [a, b]
            x1, f1, k=lbfgs!(fun, x0, 100, 2); 
            
            if fun == himmelblau
                println("\nERROR: Himmelblau not properly implemented yet")
                return
            else
                if (norm(x1 - x1_opt)) < tol
                    break;
                else
                    println("======\nfailed for: x0 = [$a, $b]")
                    println("x1 found was: $(x1)\n")
                end
            end
            
        end
    end
end

test_range (generic function with 3 methods)

## Scan over range

In [None]:
x1_opt = [1.0, 1.0]
step = 0.1
tol = 1e-6
test_range(rosenbrock, step, tol)

Scanning from -2.048 to 2.048 and from -2.048 to 2.048
failed for: x0 = [-1.948, -2.048]
x1 found was: [78783.3, 6.20681e9]

failed for: x0 = [-1.948, -1.948]
x1 found was: [67176.1, 4.51263e9]

failed for: x0 = [-1.948, -1.848]
x1 found was: [-36719.1, 1.34829e9]

failed for: x0 = [-1.948, -1.748]
x1 found was: [NaN, NaN]

failed for: x0 = [-1.948, -1.648]
x1 found was: [NaN, NaN]

failed for: x0 = [-1.948, -1.548]
x1 found was: [7867.7, 6.19006e7]

failed for: x0 = [-1.948, -1.448]
x1 found was: [-7441.78, 5.538e7]

failed for: x0 = [-1.948, -1.348]
x1 found was: [NaN, NaN]

failed for: x0 = [-1.948, -1.248]
x1 found was: [NaN, NaN]

failed for: x0 = [-1.948, -1.148]
x1 found was: [-4.70142e5, 2.21033e11]

failed for: x0 = [-1.948, -1.048]
x1 found was: [-223.554, 49975.0]

failed for: x0 = [-1.948, -0.948]
x1 found was: [69039.7, 4.76648e9]

failed for: x0 = [-1.948, -0.848]
x1 found was: [1.0, 1.0]

failed for: x0 = [-1.848, -2.048]
x1 found was: [-1.59672e47, -9.51814e92]

failed 

# Try different optimization test functions

In [263]:
lbfgs!(himmelblau, x0, 100, 2, 1e-6, 0)

([3.0, 2.0], 1.0395264331275901e-15, 34)

In [264]:
lbfgs!(booth, x0, 100, 2, 1e-6, 0)

([1, 3], 0, 0)

In [265]:
x0 = [50, -50]
lbfgs!(bohachevsky1, x0, 100, 2, 1e-6, 0)

([-1.8274e-8, -3.65543e-9], 5.10702591327572e-15, 45)

# Timing

In [266]:
maxIt = 100
m = 2
tol = 1e-6
verbose = 0
range = 1000
funs = [rosenbrock, himmelblau, booth, bohachevsky1]

for fun in funs
    t = 0.0
    
    for i in 1:range
        x0 = [rand(-1:0.1:1), rand(-1:0.1:1)]
        t += @elapsed lbfgs!(fun, x0, maxIt, m, tol, verbose)
    end
    
    println("\nTime elapsed for $(range) evaluations of $(fun): $t")
end


Time elapsed for 1000 evaluations of rosenbrock: 1.4275954839999971

Time elapsed for 1000 evaluations of himmelblau: 0.7864017630000006

Time elapsed for 1000 evaluations of booth: 0.7812254140000015

Time elapsed for 1000 evaluations of bohachevsky1: 0.865397372
