### 问题6

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Data loading helper
def loadData(filename):
    """Load a whitespace-delimited numeric file into a design matrix and labels.

    The file is read without a header row. Every column except the last is
    used as a feature and the last column is used as the label. A leading
    column of ones is prepended to the features so that the bias can be
    carried as the first component of the weight vector.

    Parameters
    ----------
    filename : str or path-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, n_columns)
        Column of ones followed by all but the last column of the file.
    Y : numpy.ndarray, shape (n_samples, 1)
        Last column of the file, kept two-dimensional.
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    data = pd.read_csv(filename, sep=r'\s+', header=None)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # on old and new pandas alike.
    data = data.values
    n_samples, n_columns = data.shape
    X = np.c_[np.ones((n_samples, 1)), data[:, 0:n_columns - 1]]
    Y = data[:, n_columns - 1:n_columns]
    return X, Y

In [6]:
# Load the training data used by Q15-Q17
X, Y = loadData('hw1_15_train.dat')
# NOTE: `col` receives X.shape[0] (one entry per data row) and `row` receives
# X.shape[1] (length of the weight vector, bias column included).
col, row = X.shape
theta = np.zeros((X.shape[1], 1))
print('X的前五项：\n', X[:5])
print('Y的前五项: \n', Y[:5].T)

X的前五项：
 [[ 1.        0.97681   0.10723   0.64385   0.29556 ]
 [ 1.        0.67194   0.2418    0.83075   0.42741 ]
 [ 1.        0.20619   0.23321   0.81004   0.98691 ]
 [ 1.        0.51583   0.055814  0.92274   0.75797 ]
 [ 1.        0.70893   0.10836   0.33951   0.77058 ]]
Y的前五项: 
 [[ 1.  1.  1.  1.  1.]]


In [7]:
# Perceptron learning algorithm (PLA)
# Implementation note: `prevpos` remembers where the last update happened so
# that the next misclassified sample is searched for from that position
# onwards (wrapping around to the start), instead of always rescanning from
# the first sample. This is faster than restarting from index 0 every time.
def perceptron(X, Y, theta, eta=1):
    """Run the perceptron update rule until no sample is misclassified.

    Predictions are ``sign(X @ theta)`` with a score of exactly 0 counted as
    -1. On every pass the first misclassified sample at or after the
    previously corrected position is chosen (wrapping to the beginning when
    none remains further on) and ``theta`` is moved by ``eta * y * x``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_weights)
        Design matrix.
    Y : numpy.ndarray, shape (n_samples, 1)
        Labels compared element-wise against the predictions
        (presumably -1/+1 -- confirm against the data files).
    theta : numpy.ndarray, shape (n_weights, 1)
        Initial weights. Updated IN PLACE; the same array is returned.
    eta : float, optional
        Step size applied to every update.

    Returns
    -------
    theta : numpy.ndarray
        The weights after the last update.
    num : int
        Number of updates performed.

    Notes
    -----
    The loop only exits once every sample is classified correctly, so the
    call never returns if that state is never reached.
    """
    num = 0
    prevpos = 0
    while True:
        yhat = np.sign(X.dot(theta))
        yhat[yhat == 0] = -1
        index = np.where(yhat != Y)[0]
        # Test for emptiness with .size: index.any() is False for the array
        # [0], which would wrongly stop when only sample 0 is misclassified.
        if index.size == 0:
            break
        remaining = index[index >= prevpos]
        # Nothing left at or after prevpos -> wrap around to the first mistake.
        pos = remaining[0] if remaining.size else index[0]
        prevpos = pos
        theta += eta * Y[pos, 0] * X[pos:pos + 1, :].T
        num += 1
    return theta, num

In [8]:
# Q15: number of updates needed on the data in file order.
# Start from a fresh zero vector: perceptron() updates the array it is given
# in place, so passing the global `theta` again on a re-run of this cell would
# start from already-trained weights and report 0 updates.
theta, num = perceptron(X, Y, np.zeros((X.shape[1], 1)))
print('总共更新theta的次数：',num)

总共更新theta的次数： 39


In [10]:
# Q16: average number of updates over 2000 runs, each on a fresh random
# ordering of the samples and starting from zero weights.
total = 0
for i in range(2000):
    randpos = np.random.permutation(col)
    Xrnd, Yrnd = X[randpos], Y[randpos, 0:1]
    theta = np.zeros((row, 1))
    _, num = perceptron(Xrnd, Yrnd, theta)
    total = total + num
print('2000次平均每次更新theta的次数：',total/2000)

2000次平均每次更新theta的次数： 39.806


In [11]:
# Q17: same experiment as Q16 but with step size eta = 0.5.
total = 0
for i in range(2000):
    randpos = np.random.permutation(col)
    Xrnd, Yrnd = X[randpos], Y[randpos, 0:1]
    theta = np.zeros((row, 1))
    _, num = perceptron(Xrnd, Yrnd, theta, 0.5)
    total = total + num
print('2000次平均每次更新theta的次数：',total/2000)

2000次平均每次更新theta的次数： 39.758


In [15]:
# Load the training and test sets used by Q18-Q20
X, Y = loadData('hw1_18_train.dat')
Xtest, Ytest = loadData('hw1_18_test.dat')
# Zero initial weights, one per column of X (bias column included).
theta = np.zeros((X.shape[1], 1))
# NOTE: `col` receives X.shape[0] and `row` receives X.shape[1].
col, row = X.shape

In [16]:
# Error-rate helper, defined ahead of the Pocket algorithm that uses it
def mistake(yhat, y):
    """Return the number of mismatching entries divided by the row count of `y`.

    `y` must be two-dimensional; `yhat` is compared with it element-wise.
    """
    n_samples, _ = y.shape
    mismatches = (yhat != y)
    return mismatches.sum() / n_samples

In [17]:
# Pocket algorithm
def pocket(X, Y, theta, iternum, eta=1):
    """Run at most `iternum` perceptron updates and keep the best weights seen.

    Each iteration picks one currently misclassified sample uniformly at
    random, applies the update ``theta += eta * y * x``, and measures the
    error rate on (X, Y) with ``mistake``. Whenever the error rate is strictly
    lower than the best so far, a copy of the weights is stored as the
    "pocket" weights.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_weights)
        Design matrix.
    Y : numpy.ndarray, shape (n_samples, 1)
        Labels compared element-wise against the predictions
        (presumably -1/+1 -- confirm against the data files).
    theta : numpy.ndarray, shape (n_weights, 1)
        Initial weights. Updated IN PLACE.
    iternum : int
        Maximum number of updates.
    eta : float, optional
        Step size applied to every update.

    Returns
    -------
    thetabest : numpy.ndarray
        Weights with the lowest error rate observed, the initial weights
        included.
    theta : numpy.ndarray
        Weights after the last update performed.
    """
    def _predict(weights):
        # Sign of the score, with a score of exactly 0 counted as -1.
        pred = np.sign(X.dot(weights))
        pred[pred == 0] = -1
        return pred

    yhat = _predict(theta)
    errold = mistake(yhat, Y)
    # errold belongs to the initial theta, so the pocket must start as a copy
    # of it (not zeros); otherwise a non-zero start that is never beaten
    # would be reported as the zero vector.
    thetabest = theta.copy()
    for _ in range(iternum):
        index = np.where(yhat != Y)[0]
        # Test for emptiness with .size: index.any() is False for the array
        # [0], which would wrongly stop when only sample 0 is misclassified.
        if index.size == 0:
            break
        pos = np.random.choice(index)
        theta += eta * Y[pos, 0] * X[pos:pos + 1, :].T
        yhat = _predict(theta)
        errnow = mistake(yhat, Y)
        if errnow < errold:
            # Must be a copy: a plain assignment would alias `theta`, and the
            # in-place updates above would then overwrite the pocket weights.
            thetabest = theta.copy()
            errold = errnow
    return thetabest, theta

In [19]:
# Q18: test-set error rate of the pocket weights (first value returned by
# pocket) after at most 50 updates, averaged over 2000 random orderings.
total = 0
for i in range(2000):
    randpos = np.random.permutation(col)
    Xrnd, Yrnd = X[randpos], Y[randpos, 0:1]
    theta = np.zeros((row, 1))
    theta, thetabad = pocket(Xrnd, Yrnd, theta, 50)
    yhat = np.sign(Xtest.dot(theta))
    yhat[yhat == 0] = -1  # a score of exactly 0 is counted as -1
    err = mistake(yhat, Ytest)
    total = total + err
print('迭代次数为50时，theta_pocket情况下的测试集错误率：',total/2000)

迭代次数为50时，theta_pocket情况下的测试集错误率： 0.132035


In [20]:
# Q19: test-set error rate of the last-updated weights (second value returned
# by pocket) after at most 50 updates, averaged over 2000 random orderings.
total = 0
for i in range(2000):
    randpos = np.random.permutation(col)
    Xrnd, Yrnd = X[randpos], Y[randpos, 0:1]
    theta = np.zeros((row, 1))
    theta, thetabad = pocket(Xrnd, Yrnd, theta, 50)
    yhat = np.sign(Xtest.dot(thetabad))
    yhat[yhat == 0] = -1  # a score of exactly 0 is counted as -1
    err = mistake(yhat, Ytest)
    total = total + err
print('迭代次数为50时，theta_50情况下的测试集错误率：',total/2000)

迭代次数为50时，theta_50情况下的测试集错误率： 0.354342


In [21]:
# Q20: test-set error rate of the pocket weights (first value returned by
# pocket) after at most 100 updates, averaged over 2000 random orderings.
total = 0
for i in range(2000):
    randpos = np.random.permutation(col)
    Xrnd, Yrnd = X[randpos], Y[randpos, 0:1]
    theta = np.zeros((row, 1))
    theta, thetabad = pocket(Xrnd, Yrnd, theta, 100)
    yhat = np.sign(Xtest.dot(theta))
    yhat[yhat == 0] = -1  # a score of exactly 0 is counted as -1
    err = mistake(yhat, Ytest)
    total = total + err
print('迭代次数为100时，theta_pocket情况下的测试集错误率：',total/2000)

迭代次数为100时，theta_pocket情况下的测试集错误率： 0.11616
