In [1]:
import numpy as np

In [2]:
def generate_data(n):
    """Draw ``n`` noisy samples of the 1-D target ``sign(x)``.

    Parameters
    ----------
    n : int
        Number of samples to generate.

    Returns
    -------
    X : numpy.ndarray, shape (n,)
        Inputs drawn uniformly from [-1, 1).
    y : numpy.ndarray, shape (n,)
        Labels in {-1, +1}: ``sign(X)`` with ``sign(0)`` taken as -1, each
        label then flipped independently with probability 0.2.
    """
    X = np.random.uniform(-1, 1, n)
    y = np.sign(X)
    y[y == 0] = -1
    # One noise draw per sample. The length must follow n (it used to be a
    # literal 20), otherwise the boolean mask below does not match y.
    noise = np.random.uniform(0, 1, n)
    y[noise <= 0.2] *= -1
    return X, y

In [3]:
# Sample size for the demo draw; `n` is also passed to train_5000 further down.
n = 20
X, y = generate_data(n)

In [4]:
# Show the sampled inputs and their (noisy) labels, one array per line.
print(X, y, sep="\n")

[ 0.64861376  0.19209039  0.88617672  0.65745168  0.16263194  0.69747568
 -0.24027285 -0.82851049  0.89709643 -0.14140749 -0.06953078 -0.63502654
 -0.01689799 -0.20845739 -0.74415017 -0.08900498 -0.44459655 -0.36137167
  0.94130754 -0.58257041]
[ 1.  1.  1.  1.  1.  1. -1.  1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
  1. -1.]


In [5]:
def decision_stump(X, y):
    """Fit the best 1-D decision stump ``h(x) = s * sign(x - theta)``.

    Every sample value ``X[i]`` is tried as a threshold. ``sign(0)`` is
    treated as -1, so a point equal to the threshold is predicted ``-s``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n,)
        Feature values.
    y : numpy.ndarray, shape (n,)
        Labels in {-1, +1}.

    Returns
    -------
    s : int
        Direction of the stump, +1 or -1.
    theta : float
        Chosen threshold (one of the values in ``X``).
    error : float
        In-sample error rate of the chosen stump.
    """
    n = y.shape[0]
    grid = np.tile(X, (n, 1))
    # Row i holds X[j] - X[i] for every j, i.e. all points measured against
    # the candidate threshold X[i].
    offsets = grid - grid.T
    signs = np.sign(offsets)
    guesses = np.where(signs == 0, -1, signs)
    # Mistakes made by the s = +1 stump at each candidate threshold.
    mistakes = (guesses != y).sum(axis=1)

    # The s = -1 stump at the same threshold errs on exactly the
    # complementary points, so its error count is n - mistakes.
    fewest_pos = mistakes.min()
    fewest_neg = n - mistakes.max()
    if fewest_pos < fewest_neg:
        return 1, X[mistakes.argmin()], fewest_pos / n
    return -1, X[mistakes.argmax()], fewest_neg / n

In [6]:
def train_5000(n):
    """Average stump performance over 5000 seeded experiments.

    Each trial seeds NumPy's global RNG with the trial index, draws ``n``
    samples with ``generate_data`` and fits ``decision_stump`` to them.

    Parameters
    ----------
    n : int
        Number of samples drawn per trial.

    Returns
    -------
    tuple of float
        ``(mean E_in, mean E_out)`` across the 5000 trials.
    """
    trials = 5000
    sum_in = 0
    sum_out = 0
    for seed in range(trials):
        np.random.seed(seed)
        X, y = generate_data(n)
        s, theta, e = decision_stump(X, y)
        sum_in = sum_in + e
        # Closed-form E_out of s*sign(x - theta) for x uniform on [-1, 1]
        # with labels flipped with probability 0.2 (as in generate_data).
        sum_out = sum_out + (0.5 + 0.3 * s * (abs(theta) - 1))
    return sum_in / trials, sum_out / trials

In [7]:
# Mean (E_in, E_out) of the decision stump over 5000 seeded draws of n points each.
train_5000(n)

(0.16926000000000044, 0.26562358904000427)

In [8]:
import pandas as pd
# The .dat files are whitespace-separated with no header row: nine feature
# columns followed by the label column `y`.
columns = [
    'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9',
    'y',
]
data_train = pd.read_csv(
    "hw2_train.dat", sep=r"\s+", header=None, names=columns
)
data_test = pd.read_csv(
    "hw2_test.dat", sep=r"\s+", header=None, names=columns
)
data_train.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y
0,8.105,-3.5,4.769,4.541,-9.829,5.252,3.838,-3.408,-4.824,-1
1,-6.273,-2.097,9.404,1.143,3.487,-5.206,0.061,5.024,-6.687,1
2,1.624,-1.173,4.26,-3.607,-6.632,4.431,-8.355,7.206,-8.977,1
3,-10.0,7.758,-2.67,-8.88,-1.099,-9.183,-4.086,8.962,5.841,1
4,8.464,1.762,2.729,2.724,8.155,6.096,-2.844,9.8,3.302,-1


In [20]:
def multi_decision_stump(data):
    """Fit a decision stump on every feature and return the best one.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by the label in the last column.

    Returns
    -------
    indx : int
        Positional index of the feature whose stump has the lowest
        in-sample error.
    s : float
        Direction (+1.0 or -1.0) of that stump.
    theta : float
        Threshold of that stump.
    e_in : float
        In-sample error rate of that stump.
    """
    features = data.iloc[:, :-1].values
    labels = data.iloc[:, -1].values
    n_samples, n_features = features.shape

    directions = np.zeros(n_features)
    thresholds = np.zeros(n_features)
    errors = np.zeros(n_features)
    # Iterating over the transpose yields one feature column at a time.
    for j, column in enumerate(features.T):
        directions[j], thresholds[j], errors[j] = decision_stump(column, labels)

    best = np.argmin(errors)
    return best, directions[best], thresholds[best], errors[best]

# Fit one stump per feature on the training set, keep the one with the lowest
# in-sample error, and display that error.
indx, s, theta, e_in = multi_decision_stump(data_train)
e_in

0.25

In [22]:
# Evaluate the selected stump on the test set.
# sign(0) must be mapped to -1 *before* multiplying by s. decision_stump
# scores the s = -1 stump as the exact complement of the s = +1 stump, so a
# point with x == theta is predicted -s. Mapping zeros after the
# multiplication would predict -1 for such points even when s = -1.
pred = np.sign(data_test.iloc[:, indx] - theta)
pred[pred == 0] = -1
pred = s * pred
y_test = data_test.iloc[:, -1]
# Fraction of test points the stump misclassifies.
e_out = (pred != y_test).astype(int).sum()/y_test.shape[0]
e_out

0.355