In [2]:
import math
import numpy as np
import pandas as pd
import scipy.linalg as lin

In [3]:
# load data
def loaddata(filename):
    """Load a whitespace-separated data file into a design matrix and labels.

    Every column except the last is treated as a feature and the last
    column as the target.

    Parameters
    ----------
    filename : str
        Path of a header-less, whitespace-delimited text file.

    Returns
    -------
    X : ndarray, shape (row, col)
        The feature columns with a column of ones prepended (bias term).
    Y : ndarray, shape (row, 1)
        The last column of the file, kept two-dimensional.
    """
    # Raw string: '\s' is not a valid Python string escape.
    frame = pd.read_csv(filename, sep=r'\s+', header=None)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on both
    # old and new pandas releases.
    data = frame.values
    row = data.shape[0]
    X = np.c_[np.ones((row, 1)), data[:, :-1]]
    Y = data[:, -1:]
    return X, Y

In [8]:
# calc mistake
def mistake(X, Y, theta):
    """Return the 0/1 classification error of the linear classifier sign(X.theta).

    Parameters
    ----------
    X : ndarray, shape (n, d)
        Design matrix.
    Y : array-like, shape (n, 1) or (n,)
        Target labels, compared for equality against the predicted signs.
    theta : ndarray, shape (d, 1) or (d,)
        Weight vector.

    Returns
    -------
    float
        Number of mismatching predictions divided by ``len(Y)``.
    """
    Y = np.asarray(Y)
    yhat = np.sign(X.dot(theta))  # attention: yhat holds the sign of the outputs
    yhat[yhat == 0] = -1  # a score of exactly zero is counted as class -1
    # Align predictions with Y before comparing: a 1-D yhat against an (n, 1)
    # Y would otherwise broadcast to an (n, n) matrix and silently produce a
    # wrong error rate.
    if yhat.shape != Y.shape and yhat.size == Y.size:
        yhat = yhat.reshape(Y.shape)
    error = float(np.sum(yhat != Y)) / len(Y)
    return error

In [9]:
#Q13: regularized linear regression with lambda = 10
X, Y = loaddata('hw4_train.dat')
Xtest, Ytest = loaddata('hw4_test.dat')
lamb = 10
# Take the dimensions from X: Xtrain is only assigned by a later cell, so
# reading Xtrain.shape here fails with NameError on a fresh kernel.
row, col = X.shape
# Closed-form solution: wreg = (lamb*I + X^T X)^+ X^T Y
wreg = lin.pinv(lamb*np.eye(col)+X.T.dot(X)).dot(X.T).dot(Y)
ein = mistake(X, Y, wreg)
eout = mistake(Xtest, Ytest, wreg)
# Parenthesised single-argument print is valid on both Python 2 and 3.
print('ein is %s, eout is %s' % (ein, eout))

ein is 0.05, eout is 0.045


In [11]:
#Q14 and Q15. Selection among lambs
# Sweep lambda over 10^-10 ... 10^2 and record the training / test error
# of the regularized solution fitted on the full training set.
arr = np.arange(-10,3,1)
num = len(arr)
lamb = 10.0**arr
# Derive the feature count from X here instead of relying on a value left
# behind by an earlier cell.
col = X.shape[1]
ein = np.zeros((num,)); eout = np.zeros((num,))
for i in range(num):
    # wreg = (lamb*I + X^T X)^+ X^T Y
    wreg = lin.pinv(lamb[i]*np.eye(col)+ X.T.dot(X)).dot(X.T).dot(Y)
    ein[i] = mistake(X, Y, wreg)
    eout[i] = mistake(Xtest, Ytest, wreg)
# One row per lambda: [lambda, Ein, Eout]
out = np.c_[lamb, ein, eout]
# Parenthesised single-argument print is valid on both Python 2 and 3.
print('\tlambda\t\t Ein\t\t Eout')
print(out)

	lambda		 Ein		 Eout
[[  1.00000000e-10   1.50000000e-02   2.00000000e-02]
 [  1.00000000e-09   1.50000000e-02   2.00000000e-02]
 [  1.00000000e-08   1.50000000e-02   2.00000000e-02]
 [  1.00000000e-07   3.00000000e-02   1.50000000e-02]
 [  1.00000000e-06   3.50000000e-02   1.60000000e-02]
 [  1.00000000e-05   3.00000000e-02   1.60000000e-02]
 [  1.00000000e-04   3.00000000e-02   1.60000000e-02]
 [  1.00000000e-03   3.00000000e-02   1.60000000e-02]
 [  1.00000000e-02   3.00000000e-02   1.60000000e-02]
 [  1.00000000e-01   3.50000000e-02   1.60000000e-02]
 [  1.00000000e+00   3.50000000e-02   2.00000000e-02]
 [  1.00000000e+01   5.00000000e-02   4.50000000e-02]
 [  1.00000000e+02   2.40000000e-01   2.61000000e-01]]


In [13]:
#Q16 and Q17. Selection among lambs with validation
X, Y = loaddata('hw4_train.dat')
Xtest, Ytest = loaddata('hw4_test.dat')
# The first n_train rows are used for fitting, the remaining rows for validation.
n_train = 120
Xtrain = X[:n_train, :]; Ytrain = Y[:n_train, :]
Xval = X[n_train:, :]; Yval = Y[n_train:, :]
row, col = X.shape
arr = np.arange(-10,3,1); num = len(arr)
lamb = 10.0 ** arr
ein = np.zeros((num,)); eout = np.zeros((num,)); evali = np.zeros((num,))
for i in range(num):
    # Fit on the training split only: wreg = (lamb*I + Xtrain^T Xtrain)^+ Xtrain^T Ytrain
    wreg = lin.pinv(lamb[i]*np.eye(col) + Xtrain.T.dot(Xtrain)).dot(Xtrain.T).dot(Ytrain)
    ein[i] = mistake(Xtrain, Ytrain, wreg)
    evali[i] = mistake(Xval, Yval, wreg)
    eout[i] = mistake(Xtest, Ytest, wreg)
# One row per lambda: [lambda, Ein, Eval, Eout]
out = np.c_[lamb, ein, evali, eout]
# Parenthesised single-argument print is valid on both Python 2 and 3.
print('\tlambda\t\t Ein\t\t Eval\t\t Eout')
print(out)

	lambda		 Ein		 Eval		 Eout
[[  1.00000000e-10   8.33333333e-03   1.25000000e-01   4.00000000e-02]
 [  1.00000000e-09   0.00000000e+00   1.00000000e-01   3.80000000e-02]
 [  1.00000000e-08   0.00000000e+00   5.00000000e-02   2.50000000e-02]
 [  1.00000000e-07   3.33333333e-02   3.75000000e-02   2.10000000e-02]
 [  1.00000000e-06   3.33333333e-02   3.75000000e-02   2.10000000e-02]
 [  1.00000000e-05   3.33333333e-02   3.75000000e-02   2.10000000e-02]
 [  1.00000000e-04   3.33333333e-02   3.75000000e-02   2.10000000e-02]
 [  1.00000000e-03   3.33333333e-02   3.75000000e-02   2.10000000e-02]
 [  1.00000000e-02   3.33333333e-02   3.75000000e-02   2.10000000e-02]
 [  1.00000000e-01   3.33333333e-02   3.75000000e-02   2.20000000e-02]
 [  1.00000000e+00   3.33333333e-02   3.75000000e-02   2.80000000e-02]
 [  1.00000000e+01   7.50000000e-02   1.25000000e-01   8.00000000e-02]
 [  1.00000000e+02   3.41666667e-01   4.12500000e-01   4.14000000e-01]]


In [15]:
#Q18 use the optimal lambda
# Pick the lambda with the smallest validation error; [-1] takes the last
# matching index, i.e. the largest lambda on ties since lamb is ascending.
lambmin = lamb[np.where(evali == evali.min())[0][-1]]
# Refit on the full training set (X, Y) with the selected lambda.
wreg = lin.pinv(lambmin*np.eye(col) + X.T.dot(X)).dot(X.T).dot(Y)
ein = mistake(X, Y, wreg)
eout = mistake(Xtest, Ytest, wreg)
# Parenthesised single-argument print is valid on both Python 2 and 3.
print('ein is %s, eout is %s' % (ein, eout))

ein is 0.035, eout is 0.02


In [21]:
#Q19 5 fold validation
n_folds = 5
# Rows per fold, derived from the data. The original hard-coded 40, which
# corresponds to a 200-row training set -- TODO confirm against hw4_train.dat.
fold_size = X.shape[0] // n_folds
evali = np.zeros((num,))
for m in range(num):
    for i in range(n_folds):
        lo = fold_size*i
        hi = fold_size*(i+1)
        # Hold out rows [lo, hi) for validation and train on everything else.
        Xval = X[lo:hi, :]
        Yval = Y[lo:hi, :]
        Xtrain = np.r_[X[0:lo, :], X[hi:, :]]
        Ytrain = np.r_[Y[0:lo, :], Y[hi:, :]]
        wreg = lin.pinv(lamb[m]*np.eye(col)+Xtrain.T.dot(Xtrain)).dot(Xtrain.T).dot(Ytrain)
        evali[m] += mistake(Xval, Yval, wreg)
    # Average the validation error over the folds.
    evali[m] /= n_folds
# One row per lambda: [lambda, cross-validation error]
out = np.c_[lamb, evali]
# Parenthesised single-argument print is valid on both Python 2 and 3.
print('\tlambda\t\t eval')
print(out)

	lambda		 eval
[[  1.00000000e-10   5.00000000e-02]
 [  1.00000000e-09   5.00000000e-02]
 [  1.00000000e-08   3.00000000e-02]
 [  1.00000000e-07   3.50000000e-02]
 [  1.00000000e-06   3.50000000e-02]
 [  1.00000000e-05   3.50000000e-02]
 [  1.00000000e-04   3.50000000e-02]
 [  1.00000000e-03   3.50000000e-02]
 [  1.00000000e-02   3.50000000e-02]
 [  1.00000000e-01   3.50000000e-02]
 [  1.00000000e+00   3.50000000e-02]
 [  1.00000000e+01   6.00000000e-02]
 [  1.00000000e+02   2.90000000e-01]]


In [23]:
#Q20 calculate ein and eout using optimal lambda
# Pick the lambda with the smallest cross-validation error; [-1] takes the
# last matching index, i.e. the largest lambda on ties since lamb is ascending.
lambmin = lamb[np.where(evali == np.min(evali))[0][-1]]
# Refit on the full training set (X, Y) with the selected lambda.
wreg = lin.pinv(lambmin*np.eye(col) + X.T.dot(X)).dot(X.T).dot(Y)
ein = mistake(X, Y, wreg)
eout = mistake(Xtest, Ytest, wreg)
# Parenthesised single-argument print is valid on both Python 2 and 3.
print('ein: %s, eout: %s' % (ein, eout))

ein: 0.015, eout: 0.02
