In [1]:
import numpy as np
import sklearn.metrics

In [2]:
def get_metrics(y_true, y_hat):
    """Score predictions against the reference labels.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_hat : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, balanced_accuracy)`` as computed by
        ``sklearn.metrics.accuracy_score`` and
        ``sklearn.metrics.balanced_accuracy_score``.
    """
    scorers = (
        sklearn.metrics.accuracy_score,
        sklearn.metrics.balanced_accuracy_score,
    )
    return tuple(scorer(y_true, y_hat) for scorer in scorers)

In [3]:
class DecisionStump:
    """Single-feature threshold classifier for labels in {-1, +1}.

    A fitted stump predicts -1 for every sample whose selected feature value
    ``v`` satisfies ``v * bestd < bestp * bestd`` and +1 otherwise.

    Parameters
    ----------
    steps : int, default=400
        Number of evenly spaced candidate thresholds tried per feature.
    """

    def __init__(self, steps=400):
        self.features = None  # number of feature columns seen by train()
        self.bestn = None     # index of the selected feature column
        self.bestd = None     # direction of the split (+1 or -1)
        self.bestp = None     # threshold on the selected feature
        self.steps = steps

    def train(self, X, y, W):
        """Fit the stump by picking the feature/direction/threshold with the
        lowest weighted classification error.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The inputs of the training samples.
        y : array-like of shape = [n_samples] or [n_samples, 1]
            The class labels of the training samples.
            Currently only supports class -1 and 1.
        W : array-like of shape = [n_samples] or [n_samples, 1]
            The weight of each sample. Weights do not have to sum to 1.

        Returns
        -------
        float
            Weighted error (sum of the weights of misclassified training
            samples) of the selected stump.
        """
        X = np.asarray(X)
        # Flatten labels and weights so that 1-D vectors and (n, 1) columns are
        # handled identically; mixing the two would otherwise broadcast the
        # element-wise comparison to an (n, n) matrix.
        y = np.asarray(y).ravel()
        W = np.asarray(W).ravel()

        n_samples, n_features = X.shape
        assert n_samples == y.size, "X and y must have the same number of samples"
        assert n_samples == W.size, "X and W must have the same number of samples"

        bestn, bestd, bestp = 0, 1, 0
        # Start from +inf rather than a fixed constant so the search also works
        # when the weights are not normalised (errors may then exceed 1).
        minerr = np.inf
        for n in range(n_features):
            err, d, p = self._optimize(X[:, n], y, W, self.steps)
            if err < minerr:
                minerr, bestn, bestd, bestp = err, n, d, p

        self.features = n_features
        self.bestn = bestn
        self.bestd = bestd
        self.bestp = bestp

        return minerr

    def _optimize(self, x, y, W, steps):
        """Get the optimal direction and position to divide one feature.

        Parameters
        ----------
        x : np.array of shape = [n_samples]
            The values of a single feature for the training samples.
        y : np.array of shape = [n_samples]
            The class labels of the training samples.
        W : np.array of shape = [n_samples]
            The weight of each sample.
        steps : int
            Number of candidate thresholds, evenly spaced over
            ``[min(x), max(x))``.

        Returns
        -------
        (minerr, bestd, bestp)
            Lowest weighted error found, and the direction / threshold that
            achieve it. Ties keep the first candidate encountered.
        """
        x = np.asarray(x)
        y = np.asarray(y).ravel()
        W = np.asarray(W).ravel()

        min_x, max_x = x.min(), x.max()
        if max_x > min_x:
            # linspace gives exactly `steps` candidates; arange with a float
            # step can gain or lose one through rounding.
            thresholds = np.linspace(min_x, max_x, max(int(steps), 1), endpoint=False)
        else:
            # Constant feature: only the trivial split can be evaluated.
            thresholds = np.array([min_x])

        bestd = 1
        bestp = min_x
        minerr = np.inf

        for p in thresholds:
            for d in (-1, 1):
                # Same decision rule as predict(): -1 on the "low" side of the
                # threshold for d = 1, on the "high" side for d = -1.
                gy = np.where(x * d < p * d, -1, 1)
                err = np.sum((gy != y) * W)
                if err < minerr:
                    minerr = err
                    bestd = d
                    bestp = p

        return minerr, bestd, bestp

    def predict(self, X_test):
        """Predict the classes of input samples.

        Parameters
        ----------
        X_test : array-like of shape = [n_samples, n_features]
            The inputs of the testing samples. Must have the same number of
            features as the data passed to train().

        Returns
        -------
        np.array of shape = [n_samples]
            Predicted labels, each -1.0 or 1.0.
        """
        X_test = np.asarray(X_test)
        n_samples, n_features = X_test.shape

        assert n_features == self.features

        single_feature = X_test[:, self.bestn]
        below = single_feature * self.bestd < self.bestp * self.bestd
        return np.where(below, -1.0, 1.0)

In [4]:
# Training split: drop row 0 (presumably the CSV header, which genfromtxt
# cannot parse as numbers -- confirm against the file), then separate the
# first two columns (inputs) from the remaining column(s), which are used
# as labels downstream.
train_data = np.genfromtxt("datasets/Syndata-train.csv", delimiter=",")
train_data = train_data[1:]
X_train, y_train = np.split(train_data, [2], axis=1)

# Test split: same layout and handling as the training split.
test_data = np.genfromtxt("datasets/Syndata-test.csv", delimiter=",")
test_data = test_data[1:]
X_test, y_test = np.split(test_data, [2], axis=1)

In [5]:
# Start from a uniform sample distribution: every one of the N training
# samples gets weight 1/N, stored as an (N, 1) column.
sample_dist = np.ones((len(X_train), 1)) / len(X_train)

# Fit a single stump on the weighted training set; train() returns the
# weighted error of the stump it selected.
dt = DecisionStump()
train_err = dt.train(X_train, y_train, sample_dist)
print(f'train error = {train_err}')

# Score the fitted stump on the test split (only plain accuracy is reported;
# the balanced accuracy returned by get_metrics is discarded).
y_test_hat = dt.predict(X_test)
acc, _ = get_metrics(y_test, y_test_hat)
print(f'test acc = {acc}')

train error = 0.36375
test acc = 0.63
