In [None]:
import numpy as np
from sklearn . datasets import load_digits
digits = load_digits()

# Report the dimensions of the feature matrix and of the label vector.
print(digits.data.shape)
print(digits.target.shape)

# Feature matrix: one row per sample.
A = digits.data

# Build the N x 1 float column vector of target labels in one vectorized step
# (cast to float64, then reshape) instead of copying element by element.
# NOTE(review): y is a column of shape (N, 1) here, while the solver cells use a
# 1-D x; a later cell rebinds y to the 1-D digits.target.
y = digits.target.astype(np.float64).reshape(-1, 1)


(1797, 64)
(1797,)


**1. Use Newton’s method to solve problem (1), which is direct OLSLR. Use the starting point x = 0. Report
the difficulties you encounter. Check if these difficulties remain if you use Newton’s method to solve
problem (2), the regularized OLSLR with λ = 0.001 and the same starting point. Explain the reasons for your
observation. Report the values of $x^*_f$ and $x^*_{f_\lambda}$.**

$\min_x f(x) = \frac{1}{2}\|Ax - y\|_2^2 \qquad (1)$

$\min_x f_\lambda(x) = \frac{\lambda}{2}x^Tx + \frac{1}{2}\|Ax - y\|_2^2 \qquad (2)$

# Direct OLSLR

In [None]:
def f(x):
    """Objective of the direct OLSLR problem: ``0.5 * ||A x - y||_2^2``.

    Reads the module-level ``A`` and ``y``. ``y`` is flattened before the
    subtraction so that a column-shaped ``(N, 1)`` target behaves like a 1-D
    ``(N,)`` one; without this, ``A @ x - y`` with a 1-D ``x`` broadcasts to
    an ``N x N`` matrix and the returned value is meaningless.

    Parameters
    ----------
    x : 1-D array with one coefficient per column of ``A``.

    Returns
    -------
    Scalar objective value.
    """
    residual = A @ x - np.ravel(y)
    return 0.5 * np.linalg.norm(residual)**2

def gradient(x):
    """Gradient of the direct OLSLR objective: ``A^T (A x - y)``.

    Reads the module-level ``A`` and ``y``. ``y`` is flattened first so that
    a column-shaped ``(N, 1)`` target yields a 1-D gradient rather than a
    broadcast ``d x d`` matrix. Working from the residual also avoids forming
    ``A^T A`` on every call.

    Parameters
    ----------
    x : 1-D array with one coefficient per column of ``A``.

    Returns
    -------
    1-D array with the same length as ``x``.
    """
    residual = A @ x - np.ravel(y)
    return A.T @ residual

def hessian(x):
    """Hessian of the direct OLSLR objective, ``A^T A``.

    The result is built from the module-level ``A`` only; ``x`` is accepted
    but not used.
    """
    return A.T @ A

def dk_f(x):
  """Return the Newton scaling matrix D_k, the explicit inverse of ``hessian(x)``.

  The inversion is done with ``np.linalg.inv``, so a singular Hessian makes
  this call fail with ``LinAlgError``.
  """
  H = hessian(x)
  return np.linalg.inv(H)



def get_alpha(x, alpha0, rho, gamma):
  """Backtracking line search along the Newton direction ``-D_k g``.

  Starting from ``alpha0``, the step length is multiplied by ``rho`` until the
  sufficient-decrease test
  ``f(x + alpha d) <= f(x) + gamma * alpha * g^T d`` holds, where
  ``g = gradient(x)`` and ``d = dk_f(x) @ (-g)``.

  Everything that does not depend on ``alpha`` (the direction, ``f(x)`` and
  the slope ``g^T d``) is evaluated once before the loop instead of on every
  trial step.

  Parameters
  ----------
  x : current iterate.
  alpha0 : initial trial step length.
  rho : shrink factor applied to the step length after a failed trial.
  gamma : sufficient-decrease constant.

  Returns
  -------
  The accepted step length.
  """
  Dk = dk_f(x)
  g = gradient(x)
  pk = -g
  direction = Dk @ pk
  f0 = f(x)
  slope = g @ direction
  alpha = alpha0
  while f(x + alpha * direction) > f0 + gamma * alpha * slope:
    alpha = rho * alpha
  return alpha

def newton_method_with_backtracking(x0,tol,alpha0, rho, gamma):
  """Newton's method with a backtracking line search.

  Iterates ``x <- x - alpha * D_k @ g`` with ``D_k = dk_f(x)``,
  ``g = gradient(x)`` and ``alpha`` from ``get_alpha``, until the gradient
  norm is no longer greater than ``tol``.

  Parameters
  ----------
  x0 : starting point (copied, not modified).
  tol : threshold on the gradient norm.
  alpha0, rho, gamma : line-search parameters forwarded to ``get_alpha``.

  Returns
  -------
  (number of iterations, final iterate, objective at the final iterate,
   list of all iterates including the starting point)
  """
  iterate = np.copy(x0)
  history = [iterate]
  g = gradient(iterate)
  n_steps = 0

  while np.linalg.norm(g) > tol:
    Dk = dk_f(iterate)
    step_size = get_alpha(iterate, alpha0, rho, gamma)
    iterate = iterate - step_size * Dk @ g
    history.append(iterate)
    g = gradient(iterate)
    n_steps += 1

  return n_steps, iterate, f(iterate), history

In [None]:
# NOTE(review): this prints the whole dataset object (data, target, feature
# names, ...), which floods the cell output; a compact summary such as
# digits.keys() may be preferable.
print(digits)

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]]), 'target': array([0, 1, 2, ..., 8, 9, 8]), 'frame': None, 'feature_names': ['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', '

In [None]:
x0 = np.zeros(64)
alpha0 = 0.99
rho = 0.5
gamma = 0.5
tol = 1e-4

# On the direct problem the matrix inversion inside Newton's method fails with
# "Singular matrix". That failure is the point of this cell, so catch exactly
# that error and print it; the notebook then still runs from top to bottom.
try:
  iterations,minimizer,final_value,xks = newton_method_with_backtracking(x0,tol,alpha0,rho, gamma)
except np.linalg.LinAlgError as err:
  print(f'LinAlgError: {err}')

LinAlgError: Singular matrix

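## OBSERVATION
**Newton's method fails here with `LinAlgError: Singular matrix`: the Hessian $A^TA$ is singular, so the matrix $D_k = (A^TA)^{-1}$ cannot be computed. ($A$ itself is $1797 \times 64$, so it is not square; the singularity means $A$ does not have full column rank.) Therefore the direct OLSLR problem cannot be solved with this implementation of Newton's method.**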

In [None]:
# Rebind y to the 1-D label array (shape (1797,), as printed by the first cell)
# in place of the N x 1 column built earlier; the cells below use a 1-D x.
y=digits.target

In [None]:
def f(x,lmbda):
    """Regularized OLSLR objective: ``(lmbda/2) x^T x + 0.5 ||A x - y||_2^2``.

    Reads the module-level ``A`` and ``y``.
    """
    penalty = 0.5 * lmbda * np.dot(x, x)
    residual = np.dot(A, x) - y
    return penalty + 0.5 * np.linalg.norm(residual)**2

def grad(x,lmbda):
    """Gradient of the regularized objective: ``lmbda * x + A^T (A x - y)``.

    Reads the module-level ``A`` and ``y``.
    """
    residual = np.dot(A, x) - y
    data_term = np.dot(A.T, residual)
    return lmbda * x + data_term

def hessian(lmbda):
    """Hessian of the regularized objective: ``lmbda * I + A^T A``.

    Built from the module-level ``A``; the identity has one row per column
    of ``A``.
    """
    n_features = A.shape[1]
    gram = np.dot(A.T, A)
    return lmbda * np.eye(n_features) + gram



def dk_f(x0,lmbda):
  """Return D_k for the regularized problem: the inverse of ``hessian(lmbda)``.

  ``x0`` is accepted but not used.
  """
  H = hessian(lmbda)
  return np.linalg.inv(H)

def get_alpha(x, alpha0, rho, gamma,lmbda):
  """Backtracking line search along the Newton direction (regularized problem).

  Starting from ``alpha0``, the step length is multiplied by ``rho`` until
  ``f(x + alpha d, lmbda) <= f(x, lmbda) + gamma * alpha * g^T d`` holds,
  where ``g = grad(x, lmbda)`` and ``d = dk_f(x, lmbda) @ (-g)``.

  The direction, ``f(x, lmbda)`` and the slope ``g^T d`` do not depend on
  ``alpha``, so they are evaluated once before the loop.

  Parameters
  ----------
  x : current iterate.
  alpha0 : initial trial step length.
  rho : shrink factor applied after a failed trial.
  gamma : sufficient-decrease constant.
  lmbda : regularization weight forwarded to ``f``, ``grad`` and ``dk_f``.

  Returns
  -------
  The accepted step length.
  """
  Dk = dk_f(x,lmbda)
  g = grad(x,lmbda)
  pk = -g
  direction = Dk @ pk
  f0 = f(x,lmbda)
  slope = g @ direction
  alpha = alpha0
  while f(x + alpha * direction,lmbda) > f0 + gamma * alpha * slope:
    alpha = rho * alpha
  return alpha

def newton_method_with_backtracking(x0,tol,alpha0, rho, gamma,lmbda):
  """Newton's method with backtracking for the regularized objective.

  Iterates ``x <- x - alpha * D_k @ g`` with ``D_k = dk_f(x, lmbda)``,
  ``g = grad(x, lmbda)`` and ``alpha`` from ``get_alpha``, until the gradient
  norm is no longer greater than ``tol``.

  Parameters
  ----------
  x0 : starting point (copied, not modified).
  tol : threshold on the gradient norm.
  alpha0, rho, gamma : line-search parameters forwarded to ``get_alpha``.
  lmbda : regularization weight.

  Returns
  -------
  (number of iterations, final iterate, objective at the final iterate,
   list of all iterates including the starting point)
  """
  iterate = np.copy(x0)
  history = [iterate]
  g = grad(iterate,lmbda)
  n_steps = 0

  while np.linalg.norm(g) > tol:
    Dk = dk_f(iterate,lmbda)
    step_size = get_alpha(iterate, alpha0, rho, gamma,lmbda)
    iterate = iterate - step_size * Dk @ g
    history.append(iterate)
    g = grad(iterate,lmbda)
    n_steps += 1

  return n_steps, iterate, f(iterate,lmbda), history

In [None]:
# Setup for the regularized run: regularization weight, starting point,
# line-search parameters and gradient-norm tolerance.
lmbda = 1e-3
x0 = np.zeros(64)
alpha0, rho, gamma = 0.99, 0.5, 0.5
tol = 1e-4

# Run Newton's method with backtracking on the regularized objective.
iterations, minimizer, final_value, xks = newton_method_with_backtracking(
    x0, tol, alpha0, rho, gamma, lmbda
)

# Report the iteration count, the minimizer and the final objective value.
print('Iterations = ', iterations)
print('Minimizer = ', minimizer)
print('Final_value = ', final_value)


Iterations =  5
Minimizer =  [ 0.00000000e+00  9.69076882e-02 -4.32192762e-03 -7.75916338e-03
  7.49591987e-02  1.13946582e-02 -2.71293921e-02 -7.34410664e-03
  9.98267907e-01 -2.88089376e-02  1.18688356e-01  6.60922719e-02
 -5.57075915e-02 -6.97056149e-02  9.65844014e-02  2.55196013e-01
 -7.29830665e-01  2.42711745e-02  7.73241544e-02 -2.33008498e-02
 -5.64077619e-02  5.72413971e-02 -4.88675248e-02 -2.62555998e-01
 -9.06071169e-01 -1.49769638e-01  5.64022790e-02  8.96665319e-02
  8.39315938e-02  9.85410047e-02  1.69269848e-03 -2.96649688e+00
  0.00000000e+00 -1.54361470e-01 -9.32404639e-03  1.39497978e-01
 -3.69237437e-02  5.46111773e-02 -9.20425586e-03  0.00000000e+00
  1.03326506e-01  1.23983484e-01 -1.37635230e-02  5.40029021e-03
  1.31185700e-01  5.49577815e-02  2.24935899e-02  7.48046265e-03
  6.17507773e-01  2.44100619e-02  1.42356436e-03 -6.21114801e-02
 -2.07024631e-01 -3.38510517e-02  1.05486785e-01 -1.40336482e-01
 -9.82347827e-01 -1.14473639e-01  2.10485229e-02 -4.36083412e

## OBSERVATION
**Here, in contrast to the direct OLSLR, the $D_k$ matrix can be computed for the regularized OLSLR: the Hessian becomes $A^TA + \lambda I$, which is positive definite (and hence invertible) for any $\lambda > 0$.**

**2. Use the BFGS method with starting point x = 0 to solve problem (1), and describe any difficulty you
observe. Check if solving the regularized problem (2) helps (use λ = 0.001 and starting point x = 0). Explain
your observations. Report the values of $x^*_f$ and $x^*_{f_\lambda}$.**


In [None]:
def f(x):
    """Objective of the direct OLSLR problem: ``0.5 * ||A x - y||_2^2``.

    Reads the module-level ``A`` and ``y``. ``y`` is flattened before the
    subtraction, so the value is the same whether ``y`` has shape ``(N,)`` or
    ``(N, 1)``; an ``(N, 1)`` target would otherwise broadcast ``A @ x - y``
    to an ``N x N`` matrix for a 1-D ``x``.

    NOTE: this definition rebinds the name ``f`` previously used for the
    two-argument regularized objective, so the cells must be run in order.

    Parameters
    ----------
    x : 1-D array with one coefficient per column of ``A``.

    Returns
    -------
    Scalar objective value.
    """
    residual = A @ x - np.ravel(y)
    return 0.5 * np.linalg.norm(residual)**2

def gradient(x):
    """Gradient of the direct OLSLR objective: ``A^T (A x - y)``.

    Reads the module-level ``A`` and ``y``. ``y`` is flattened first, so the
    result is a 1-D vector whether ``y`` has shape ``(N,)`` or ``(N, 1)``.
    Working from the residual avoids forming ``A^T A`` on every call.

    Parameters
    ----------
    x : 1-D array with one coefficient per column of ``A``.

    Returns
    -------
    1-D array with the same length as ``x``.
    """
    residual = A @ x - np.ravel(y)
    return A.T @ residual

def hessian(x):
    """Hessian of the direct OLSLR objective, ``A^T A``.

    Computed from the module-level ``A`` alone; the argument ``x`` is not
    used.
    """
    A_t = A.T
    return A_t @ A

def dk_f(x):
  """Explicit inverse of ``hessian(x)`` via ``np.linalg.inv``.

  The BFGS routines defined in this cell do not call this helper.
  """
  H = hessian(x)
  return np.linalg.inv(H)



def get_alpha_bfgs(x, alpha0, rho, gamma, Bk):
  """Backtracking line search along the quasi-Newton direction ``-Bk g``.

  Starting from ``alpha0``, the step length is multiplied by ``rho`` until
  ``f(x + alpha d) <= f(x) + gamma * alpha * g^T d`` holds, where
  ``g = gradient(x)`` and ``d = Bk @ (-g)``.

  The direction, ``f(x)`` and the slope ``g^T d`` do not depend on ``alpha``,
  so they are evaluated once before the loop.

  Parameters
  ----------
  x : current iterate.
  alpha0 : initial trial step length.
  rho : shrink factor applied after a failed trial.
  gamma : sufficient-decrease constant.
  Bk : current inverse-Hessian approximation.

  Returns
  -------
  The accepted step length.
  """
  g = gradient(x)
  pk = -g
  direction = Bk @ pk
  f0 = f(x)
  slope = g @ direction
  alpha = alpha0
  while f(x + alpha * direction) > f0 + gamma * alpha * slope:
    alpha = rho * alpha
  return alpha


def bfgs(x0, tol, alpha0, rho, gamma, max_iter=500):
  """BFGS with a backtracking line search for the direct OLSLR objective.

  ``Bk`` approximates the inverse Hessian; it starts as the identity and is
  updated after each step with
  ``Bk <- (I - s y^T / y^T s) Bk (I - y s^T / y^T s) + s s^T / y^T s``,
  where ``s`` is the step taken and ``y`` the change in gradient.

  Changes compared with the previous version of this function:
  * the loop now performs at most ``max_iter`` iterations (the old
    ``count > max_iter`` check allowed one extra);
  * the update of ``Bk`` is skipped when ``y^T s`` is not strictly positive,
    which avoids a division by zero that would fill ``Bk`` with NaN;
  * the gradient is evaluated once per iteration and reused.

  Parameters
  ----------
  x0 : starting point (copied, not modified).
  tol : threshold on the gradient norm.
  alpha0, rho, gamma : line-search parameters forwarded to ``get_alpha_bfgs``.
  max_iter : maximum number of iterations.

  Returns
  -------
  (number of iterations, final iterate, objective at the final iterate,
   list of all iterates including the starting point)
  """
  x = np.copy(x0)
  identity = np.eye(len(x0))
  Bk = np.eye(len(x0))
  count = 0
  pk = gradient(x)
  xs = [x]

  while np.linalg.norm(pk) > tol and count < max_iter:
    alpha = get_alpha_bfgs(x, alpha0, rho, gamma, Bk)
    xnext = x - alpha*(Bk@pk)
    g_next = gradient(xnext)

    sk = xnext - x
    yk = g_next - pk
    ys = np.dot(yk, sk)
    # Only update when the curvature y^T s is strictly positive; a zero,
    # negative or NaN value keeps the previous Bk.
    if ys > 0:
      V = identity - np.outer(sk, yk) / ys
      Bk = np.dot(V, np.dot(Bk, V.T)) + np.outer(sk, sk) / ys

    x = xnext
    pk = g_next
    xs.append(x)
    count += 1

  return count, x, f(x), xs

In [None]:
# Setup for the direct (unregularized) BFGS run: starting point, line-search
# parameters and gradient-norm tolerance.
x0 = np.zeros(64)
alpha0, rho, gamma = 0.99, 0.5, 0.5
tol = 1e-5

# Solve the problem
iterations1, minimizer1, final_value1, xks1 = bfgs(
    x0, tol, alpha0, rho, gamma, max_iter=500
)

# Report the iteration count, the minimizer and the final objective value.
print('Iterations = ', iterations1)
print('Minimizer = ', minimizer1)
print('Final_value = ', final_value1)

Iterations =  74
Minimizer =  [ 0.00000000e+00  9.69033568e-02 -4.32277231e-03 -7.76028319e-03
  7.49594380e-02  1.13947198e-02 -2.71328245e-02 -7.33176337e-03
  9.98337968e-01 -2.88095538e-02  1.18688288e-01  6.60916265e-02
 -5.57069862e-02 -6.97063705e-02  9.65876439e-02  2.55182251e-01
 -7.29828609e-01  2.42709916e-02  7.73249596e-02 -2.33000278e-02
 -5.64086144e-02  5.72426822e-02 -4.88717684e-02 -2.62467763e-01
 -9.06562828e-01 -1.49767791e-01  5.64019538e-02  8.96663590e-02
  8.39318159e-02  9.85411936e-02  1.69317613e-03 -2.96805758e+00
  0.00000000e+00 -1.54362338e-01 -9.32361206e-03  1.39497628e-01
 -3.69234835e-02  5.46111776e-02 -9.20505070e-03  0.00000000e+00
  1.03279535e-01  1.23983258e-01 -1.37639605e-02  5.40087816e-03
  1.31185107e-01  5.49570758e-02  2.24938237e-02  7.47977908e-03
  6.17755029e-01  2.44122357e-02  1.42333037e-03 -6.21110760e-02
 -2.07025036e-01 -3.38506003e-02  1.05486736e-01 -1.40335958e-01
 -9.84169004e-01 -1.14467153e-01  2.10494893e-02 -4.36076105

# Regularized OLSLR

In [None]:
def f(x,lmbda):
    """Regularized OLSLR objective.

    Returns ``0.5 * lmbda * x.x + 0.5 * ||A x - y||_2^2`` using the
    module-level ``A`` and ``y``.
    """
    ridge_term = 0.5 * lmbda * np.dot(x, x)
    misfit = np.dot(A, x) - y
    return ridge_term + 0.5 * np.linalg.norm(misfit)**2

def grad(x,lmbda):
    """Gradient of the regularized objective.

    Returns ``lmbda * x + A^T (A x - y)`` using the module-level ``A`` and
    ``y``.
    """
    misfit = np.dot(A, x) - y
    back_projected = np.dot(A.T, misfit)
    return lmbda * x + back_projected

def hessian(lmbda):
    """Hessian of the regularized objective, ``lmbda * I + A^T A``.

    The identity is sized by the number of columns of the module-level ``A``.
    """
    scaled_identity = lmbda * np.eye(A.shape[1])
    return scaled_identity + np.dot(A.T, A)



def dk_f(x0,lmbda):
  """Explicit inverse of ``hessian(lmbda)`` via ``np.linalg.inv``.

  ``x0`` is accepted but not used. The BFGS routines defined in this cell do
  not call this helper.
  """
  H = hessian(lmbda)
  return np.linalg.inv(H)

def get_alpha_bfgs(x, alpha0, rho, gamma, Bk,lmbda):
  """Backtracking line search along ``-Bk g`` for the regularized objective.

  Starting from ``alpha0``, the step length is multiplied by ``rho`` until
  ``f(x + alpha d, lmbda) <= f(x, lmbda) + gamma * alpha * g^T d`` holds,
  where ``g = grad(x, lmbda)`` and ``d = Bk @ (-g)``.

  The direction, ``f(x, lmbda)`` and the slope ``g^T d`` do not depend on
  ``alpha``, so they are evaluated once before the loop.

  Parameters
  ----------
  x : current iterate.
  alpha0 : initial trial step length.
  rho : shrink factor applied after a failed trial.
  gamma : sufficient-decrease constant.
  Bk : current inverse-Hessian approximation.
  lmbda : regularization weight forwarded to ``f`` and ``grad``.

  Returns
  -------
  The accepted step length.
  """
  g = grad(x,lmbda)
  pk = -g
  direction = Bk @ pk
  f0 = f(x,lmbda)
  slope = g @ direction
  alpha = alpha0
  while f(x + alpha * direction,lmbda) > f0 + gamma * alpha * slope:
    alpha = rho * alpha
  return alpha

def bfgs(x0, tol, alpha0, rho, gamma,lmbda,max_iter=500):
  """BFGS with a backtracking line search for the regularized objective.

  ``Bk`` approximates the inverse Hessian; it starts as the identity and is
  updated after each step with
  ``Bk <- (I - s y^T / y^T s) Bk (I - y s^T / y^T s) + s s^T / y^T s``,
  where ``s`` is the step taken and ``y`` the change in gradient.

  Changes compared with the previous version of this function:
  * the loop now performs at most ``max_iter`` iterations (the old
    ``count > max_iter`` check allowed one extra);
  * the update of ``Bk`` is skipped when ``y^T s`` is not strictly positive,
    which avoids a division by zero that would fill ``Bk`` with NaN;
  * the gradient is evaluated once per iteration and reused.

  Parameters
  ----------
  x0 : starting point (copied, not modified).
  tol : threshold on the gradient norm.
  alpha0, rho, gamma : line-search parameters forwarded to ``get_alpha_bfgs``.
  lmbda : regularization weight.
  max_iter : maximum number of iterations.

  Returns
  -------
  (number of iterations, final iterate, objective at the final iterate,
   list of all iterates including the starting point)
  """
  x = np.copy(x0)
  identity = np.eye(len(x0))
  Bk = np.eye(len(x0))
  count = 0
  pk = grad(x,lmbda)
  xs = [x]

  while np.linalg.norm(pk) > tol and count < max_iter:
    alpha = get_alpha_bfgs(x, alpha0, rho, gamma, Bk,lmbda)
    xnext = x - alpha*(Bk@pk)
    g_next = grad(xnext,lmbda)

    sk = xnext - x
    yk = g_next - pk
    ys = np.dot(yk, sk)
    # Only update when the curvature y^T s is strictly positive; a zero,
    # negative or NaN value keeps the previous Bk.
    if ys > 0:
      V = identity - np.outer(sk, yk) / ys
      Bk = np.dot(V, np.dot(Bk, V.T)) + np.outer(sk, sk) / ys

    x = xnext
    pk = g_next
    xs.append(x)
    count += 1

  return count, x, f(x,lmbda), xs

In [None]:
# Regularization weight. The previous spelling `lamda` left the `lmbda` passed
# to bfgs() below undefined in this cell, so the run silently depended on the
# value assigned in the earlier Newton cell.
lmbda = 1e-3
x0 = np.zeros(64)
alpha0 = 0.99
rho = 0.5
gamma = 0.5
tol = 1e-5

# Solve the problem

iterations2,minimizer2,final_value2,xks2 = bfgs(x0, tol, alpha0, rho, gamma,lmbda, max_iter=500)
print('----------------------------------------------------------------------------------------')
print('Iterations',iterations2)
print('Minimizer',minimizer2)
print('Final value',final_value2)

----------------------------------------------------------------------------------------
Iterations 71
Minimizer [ 0.00000000e+00  9.69076882e-02 -4.32192762e-03 -7.75916338e-03
  7.49591987e-02  1.13946582e-02 -2.71293921e-02 -7.34410664e-03
  9.98267907e-01 -2.88089376e-02  1.18688356e-01  6.60922719e-02
 -5.57075915e-02 -6.97056149e-02  9.65844014e-02  2.55196013e-01
 -7.29830665e-01  2.42711745e-02  7.73241544e-02 -2.33008498e-02
 -5.64077619e-02  5.72413971e-02 -4.88675248e-02 -2.62555998e-01
 -9.06071170e-01 -1.49769638e-01  5.64022790e-02  8.96665319e-02
  8.39315938e-02  9.85410047e-02  1.69269848e-03 -2.96649687e+00
  0.00000000e+00 -1.54361470e-01 -9.32404640e-03  1.39497978e-01
 -3.69237437e-02  5.46111773e-02 -9.20425586e-03  0.00000000e+00
  1.03326506e-01  1.23983484e-01 -1.37635230e-02  5.40029021e-03
  1.31185700e-01  5.49577815e-02  2.24935899e-02  7.48046265e-03
  6.17507773e-01  2.44100619e-02  1.42356436e-03 -6.21114801e-02
 -2.07024631e-01 -3.38510517e-02  1.054867

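## OBSERVATION

BFGS runs on the direct problem without difficulty: it never inverts the singular Hessian $A^TA$ and only maintains an approximation of the inverse Hessian, whereas Newton's method failed on that problem. In terms of iterations, however, Newton's method on the regularized problem needed far fewer (5) than BFGS (74 on the direct problem and 71 on the regularized one). For the regularized problem the minimizers reported by Newton's method and BFGS agree to about eight significant digits, so the minimum values reached by the two methods are essentially the same.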