In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to local .py files take effect
# without restarting the kernel)
%load_ext autoreload
%autoreload 2

In [10]:
import random
import time as tm

import numpy as np
from matplotlib import pyplot as plt

from cs771 import genSyntheticData as gsd
from cs771 import optLib as opt
from cs771 import plotData as pd

In [7]:
def solver( X, y, timeout, spacing ):
    """Skeleton of a time-budgeted solver: loop until `timeout` seconds are used up.

    No model update is implemented yet, so the returned model is the all-zeros
    initialisation; the update step belongs inside the while loop.

    Parameters
    ----------
    X : 2-D array of shape (n, d); only its shape is used by the skeleton.
    y : targets; accepted but not used by the skeleton.
    timeout : time budget in seconds.
    spacing : the clock is consulted once every `spacing` iterations.

    Returns
    -------
    (w, totTime) : the model vector of shape (d,) and the accumulated time,
        returned at the first clock check where totTime exceeds timeout.

    Raises
    ------
    ZeroDivisionError if spacing is 0 (from `t % spacing`).
    """
    # n is not needed by the skeleton, but the unpacking insists on a 2-D X
    (n, d) = X.shape
    t = 0
    totTime = 0

    # w is the model vector and will get returned once timeout happens
    w = np.zeros( (d,) )
    tic = tm.perf_counter()

    while True:
        t = t + 1
        if t % spacing == 0:
            # Only the time between tic and toc is charged to the budget
            toc = tm.perf_counter()
            totTime = totTime + (toc - tic)
            if totTime > timeout:
                return (w, totTime)
            else:
                tic = tm.perf_counter()

        # The per-iteration model update goes here

In [8]:
# Read the whitespace-delimited file "train"; the last column becomes the
# target vector y and every other column becomes the feature matrix X
traindata = np.loadtxt( "train" )
X, y = traindata[:, :-1], traindata[:, -1]


In [9]:
# Vanilla Gradient Descent
def getCSVMGrad( theta, t ):
    """Return a subgradient of the C-SVM objective computed on all of X, y.

    theta packs the weight vector followed by the bias as its last entry; the
    result uses the same layout. t (the iteration counter) is accepted but not
    used. Reads the module-level X, y and C.
    """
    weights, bias = theta[:-1], theta[-1]
    margins = (X.dot( weights ) + bias) * y
    # Points whose margin is below 1 contribute -1, every other point contributes 0
    active = np.zeros( y.size )
    active[margins < 1] = -1
    gradWeights = weights + C * (X.T * active).dot( y )
    gradBias = C * active.dot( y )
    return np.append( gradWeights, gradBias )

# Stochastic Gradient Descent
def getCSVMSGrad( theta, t ):
    """Return a stochastic subgradient of the C-SVM objective from one random point.

    theta packs the weight vector followed by the bias as its last entry; the
    result uses the same layout. t is accepted but not used. One index is drawn
    uniformly with random.randint and that point's hinge term is scaled by
    C * n. Reads the module-level X, y and C.
    """
    weights, bias = theta[:-1], theta[-1]
    n = y.size
    i = random.randint( 0, n-1 )
    point, label = X[i,:], y[i]
    margin = (point.dot( weights ) + bias) * label
    # -1 when the chosen point has margin below 1, otherwise 0
    g = -1 if margin < 1 else 0
    scale = C * n
    gradBias = scale * g * label
    gradWeights = weights + scale * (point * g) * label
    return np.append( gradWeights, gradBias )

# Mini-batch Stochastic Gradient Descent
def getCSVMMBGrad( theta, t ):
    """Return a mini-batch stochastic subgradient of the C-SVM objective.

    theta packs the weight vector followed by the bias as its last entry; the
    result uses the same layout. t is accepted but not used. A batch of
    min(B, n) distinct points is drawn with random.sample and the hinge term is
    scaled by C * n / batch size. Reads the module-level X, y, C and B.
    """
    weights, bias = theta[:-1], theta[-1]
    n = y.size
    # sample() raises if asked for more points than exist, so cap the batch at n
    batchSize = min( B, n )
    picked = random.sample( range(0, n), batchSize )
    XBatch, yBatch = X[picked,:], y[picked]
    margins = (XBatch.dot( weights ) + bias) * yBatch
    # -1 for batch points with margin below 1, 0 for the rest
    active = np.zeros( batchSize )
    active[margins < 1] = -1
    scale = C * n/batchSize
    gradBias = scale * active.dot( yBatch )
    gradWeights = weights + scale * (XBatch.T * active).dot( yBatch )
    return np.append( gradWeights, gradBias )

# Variable Batch-size Stochastic Gradient Descent
def getCSVMVarMBGrad( theta, t ):
    """Return a mini-batch subgradient of the C-SVM objective with a growing batch.

    theta packs the weight vector followed by the bias as its last entry; the
    result uses the same layout. The batch size is B * int(1.2 ** (t // 40)),
    capped at the number of training points n. Reads the module-level X, y, C
    and B.

    Parameters
    ----------
    theta : 1-D array, weights followed by bias.
    t : integer iteration counter that drives the batch-size schedule.

    Returns
    -------
    1-D array: subgradient w.r.t. the weights followed by that w.r.t. the bias.
    """
    w = theta[0:-1]
    b = theta[-1]
    n = y.size
    # Increase the batch size every few iterations -- there are two tuneable hyperparameters here
    # How frequently to update the batch size and how much to increase it at each update
    try:
        growth = int( pow(1.2, t//40) )
    except OverflowError:
        # For very large t, 1.2 ** (t//40) no longer fits in a float. The batch
        # would be capped at n long before that point, so ask for a full batch
        growth = n
    B_eff = min( B * growth, n )
    samples = random.sample( range(0, n), B_eff )
    X_ = X[samples,:]
    y_ = y[samples]
    discriminant = np.multiply( (X_.dot( w ) + b), y_ )
    # -1 for batch points with margin below 1, 0 for the rest
    g = np.zeros( (B_eff,) )
    g[discriminant < 1] = -1
    delb = C * n/B_eff * g.dot( y_ )
    delw = w + C * n/B_eff * (X_.T * g).dot( y_ )
    return np.append( delw, delb )

# Get the CSVM objective value in order to plot convergence curves
def getLassoObjVal( theta ):
    """Return the C-SVM objective value at theta (weights followed by bias).

    Despite the name, this is not a LASSO objective: it evaluates
    0.5 * ||w||^2 + C * (sum of hinge losses) over the module-level X and y.
    """
    weights, bias = theta[:-1], theta[-1]
    margins = np.multiply( X.dot( weights ) + bias, y )
    hinge = np.maximum( 1 - margins, 0 )
    regulariser = 0.5 * weights.dot( weights )
    return regulariser + C * np.sum( hinge )

In [6]:
# Sum of the L1 norm of w and the (unsquared) L2 norm of the residual X.dot(w) - y.
# NOTE(review): this statement needs w, X and y to exist at module level; when
# this cell was run w had not been defined, hence the NameError shown below.
objFun = np.linalg.norm(w,ord=1)+np.linalg.norm(X.dot(w)-y,ord=2)

NameError: name 'w' is not defined

In [6]:
# NOTE(review): this cell cannot run on its own -- both `return` statements sit
# outside any function, which is the SyntaxError reported below. The names it
# uses (t, spacing, tic, totTime, timeout, w) match the parameters and locals of
# solver(), so it looks like the tail of that function's body -- confirm and
# move it there.
# Every `spacing` iterations the elapsed time since `tic` is added to totTime;
# once totTime exceeds timeout, (w, totTime) is returned.
while True:
		t = t + 1
		if t % spacing == 0:
			toc = tm.perf_counter()
			totTime = totTime + (toc - tic)
			if totTime > timeout:
				return (w, totTime)
			else:
				tic = tm.perf_counter()
                
return (w, totTime)

SyntaxError: 'return' outside function (<ipython-input-6-4c66e4e3ee3e>, line 7)

In [None]:
# Get the MAE and LASSO objective on original data by translating the model
def getLASSOObjNorm( model ):
    """Evaluate a model learnt on normalized features against the original data.

    model packs the weight vector followed by the bias as its last entry. The
    model is first translated using the module-level mu and sg (w / sg, and
    b - w.dot(mu / sg)) and then scored on X_extend and y.

    Returns
    -------
    (objVal, MAEVal) : objVal is alpha * ||w||_1 + ||res||_2^2 / (2 * n) and
        MAEVal is the mean absolute residual, where res = X_extend.dot(w) + b - y.

    Reads the module-level mu, sg, X_extend, y, alpha and n.
    """
    w = model[:-1]
    b = model[-1]
    # Translate the model to work with original data features
    b = b - w.dot(mu / sg)
    w = w / sg
    res = X_extend.dot(w) + b - y
    # np.linalg is used directly; no separate alias for numpy.linalg is required
    objVal = alpha * np.linalg.norm( w, 1 ) + 1/(2*n) * ( np.linalg.norm( res ) ** 2 )
    MAEVal = np.mean( np.abs( res ) )
    return (objVal, MAEVal)

# Get the gradient to the loss function in LASSO for normalized data
def getLASSOGradNorm( model, t ):
    """Return a mini-batch gradient of the squared-error loss on normalized data.

    model packs the weight vector followed by the bias as its last entry; the
    result uses the same layout. t is accepted but not used. Only the smooth
    least-squares part is differentiated here; no L1 term appears in this
    function. Reads the module-level XNorm, y, n and B.

    Returns
    -------
    1-D array: (X_.T.dot(res), sum(res)) divided by the batch size, where
    res = X_.dot(w) + b - y_ on the sampled batch.
    """
    w = model[:-1]
    b = model[-1]
    # random.sample() raises ValueError when asked for more items than the
    # population holds, so never request more than n points
    B_eff = min( B, n )
    samples = random.sample( range(0, n), B_eff )
    X_ = XNorm[samples,:]
    y_ = y[samples]
    res = X_.dot(w) + b - y_
    grad = np.append( X_.T.dot(res), np.sum(res) )
    return grad/B_eff

# Hyperparameters and starting point, set up as before.
# The normalized data is better conditioned, so a much larger step length
# can be used, which leads to faster progress
alpha = 1
B = 10
eta = 1e-2
init = np.zeros( (d+1,) )
modelPrev = np.zeros( (d+1,) )

# A constant step length is adequate for this problem
stepFunc = opt.stepLengthGenerator( "constant", eta )
# ProxGD is run for far fewer iterations here (1000) than the 50000 that were
# needed on the badly conditioned data
(modelProxGD, objProxGD, timeProxGD) = opt.doGD(
    getLASSOGradNorm,
    stepFunc,
    getLASSOObjNorm,
    init,
    horizon = 1000,
    doModelAveraging = True,
    postGradFunc = doSoftThresholding,
)
# Entry 0 of each record is plotted as the objective and entry 1 as the MAE --
# presumably the (objVal, MAEVal) pairs returned by getLASSOObjNorm
objVals = [ record[0] for record in objProxGD ]
MAEVals = [ record[1] for record in objProxGD ]

# Objective on the left axis (red), MAE on a twin right axis (blue)
fig8 = pd.getFigure( 7, 7 )
ax = plt.gca()
ax.set_title( "The Accelerated ProxGD Solver on Normalized Data" )
ax.set_xlabel( "Elapsed time (sec)" )
ax.set_ylabel( "Objective Value for LASSO", color = "r" )
ax.plot( timeProxGD, objVals, linestyle = ':', color = 'r' )
ax2 = ax.twinx()
ax2.set_ylabel( "MAE Value for LASSO", color = "b" )
ax2.plot( timeProxGD, MAEVals, linestyle = '--', color = 'b' )
# NOTE(review): plt.ylim acts on the current axes -- presumably ax2, the one
# created last -- confirm
plt.ylim( 2, 10 )

In [None]:
# Sort the entries of modelProxGD by absolute value, reverse to get the largest
# first, and keep at most 20 indices
idxTop = np.abs( modelProxGD ).argsort()[::-1][:20]
print( "The top 20 coordinates in terms of magnitude are \n ", idxTop )