In [1]:
import numpy as np
import sklearn.metrics

In [2]:
def get_metrics(y_true, y_hat):
    """Score predictions against ground-truth labels.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_hat : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(accuracy, balanced_accuracy)`` as computed by
        ``sklearn.metrics.accuracy_score`` and
        ``sklearn.metrics.balanced_accuracy_score``.
    """
    scorers = (
        sklearn.metrics.accuracy_score,
        sklearn.metrics.balanced_accuracy_score,
    )
    return tuple(scorer(y_true, y_hat) for scorer in scorers)

In [3]:
class DecisionStump:
    """Single-feature threshold classifier for labels in {-1, +1}.

    A fitted stump is described by three attributes:

    * ``best_feature``   - column index of the feature that is thresholded
    * ``best_position``  - the threshold value
    * ``best_direction`` - ``1``: samples strictly below the threshold are
      labelled -1; ``-1``: samples strictly above the threshold are labelled -1.

    In both directions a sample exactly equal to the threshold is labelled +1.
    """

    def __init__(self, granularity=100):
        """
        granularity: each feature's [min, max) range is scanned in steps of
                     (max - min) / granularity when searching for a threshold
        """
        # Fitted parameters; they stay None until train() selects a stump.
        self.best_feature = None
        self.best_direction = None
        self.best_position = None
        self.granularity = granularity

    def train(self, X, y, dist):
        """
        Fit the stump that minimises the weighted misclassification error.

        X: (n, d) training inputs
        y: n labels with values 1 or -1, given as (n,) or (n, 1)
        dist: n sample weights, given as (n,) or (n, 1)

        Returns the weighted error of the selected stump (``np.inf`` if no
        stump could be selected, e.g. X has no columns).

        Raises ValueError if X, y and dist do not describe the same number
        of samples.
        """

        X = np.asarray(X)
        # Flatten labels and weights so that (n,) and (n, 1) inputs behave the
        # same; mixing the two shapes would otherwise broadcast to (n, n).
        y = np.asarray(y).reshape(-1)
        dist = np.asarray(dist).reshape(-1)

        if not (X.shape[0] == y.size == dist.size):
            raise ValueError(
                "X, y and dist must describe the same number of samples, got "
                f"{X.shape[0]}, {y.size} and {dist.size}"
            )

        # Start from +inf rather than 1 so that a stump is still selected when
        # the weights are not normalised and every candidate error is >= 1.
        min_err = np.inf
        for feature in range(X.shape[1]):
            err, direction, position = self.__find_best_separator(
                X[:, feature], y, dist
            )
            # Strict '<' keeps the earliest feature on ties.
            if err < min_err:
                min_err = err
                self.best_feature = feature
                self.best_direction = direction
                self.best_position = position

        return min_err

    # for a given feature, finds the best separator
    def __find_best_separator(self, x, y, dist):
        """
        x: (n,) values of one feature
        y: (n,) labels with values 1 or -1
        dist: (n,) sample weights

        Returns (min_err, best_dir, best_pos) for this feature.
        """

        lo, hi = x.min(), x.max()
        span = hi - lo
        if span > 0:
            thresholds = np.arange(lo, hi, span / self.granularity)
        else:
            # Constant feature: a zero step would make np.arange raise, so
            # evaluate the single possible threshold instead.
            thresholds = np.array([lo])

        best_dir = 1
        best_pos = lo
        min_err = np.inf
        for p in thresholds:
            below = x < p
            above = x > p
            # Direction order (-1 before 1) and the strict '<' below determine
            # which candidate wins on ties.
            for d, negative in ((-1, above), (1, below)):
                y_hat = np.where(negative, -1, 1)
                err = np.sum((y_hat != y) * dist)
                if err < min_err:
                    min_err = err
                    best_dir = d
                    best_pos = p

        return min_err, best_dir, best_pos

    def predict(self, X_test):
        """
        X_test: (n, d) test inputs

        Returns an (n,) float array with values 1 or -1.

        Raises RuntimeError if called before train() has selected a stump.
        """

        if self.best_feature is None:
            raise RuntimeError("DecisionStump.predict() called before train()")

        X_test = np.asarray(X_test)
        column = X_test[:, self.best_feature]

        if self.best_direction == 1:
            negative = column < self.best_position
        else:
            negative = column > self.best_position

        return np.where(negative, -1.0, 1.0)

In [4]:
# Load the synthetic train/test splits.
# The first row of each CSV is skipped at read time (it was previously parsed
# and then sliced away; presumably it is a header row - confirm against the files).
# The first N_FEATURES columns are used as inputs, the remaining column(s) as
# labels; slicing with `N_FEATURES:` keeps the labels 2-D.
N_FEATURES = 2

train_data = np.genfromtxt("datasets/Syndata-train.csv", delimiter=",", skip_header=1)
X_train = train_data[:, :N_FEATURES]
y_train = train_data[:, N_FEATURES:]

test_data = np.genfromtxt("datasets/Syndata-test.csv", delimiter=",", skip_header=1)
X_test = test_data[:, :N_FEATURES]
y_test = test_data[:, N_FEATURES:]

In [5]:
# Uniform sample weights: every training sample starts with weight 1/N,
# stored as an (N, 1) column.
n_train = X_train.shape[0]
sample_dist = np.ones((n_train, 1)) / n_train

# Fit a single decision stump on the weighted training set.
dt = DecisionStump()
train_err = dt.train(X_train, y_train, sample_dist)
print(f'train error = {train_err}')

# Evaluate on the held-out split; only plain accuracy is reported here.
y_test_hat = dt.predict(X_test)
acc, _ = get_metrics(y_test, y_test_hat)
print(f'test acc = {acc}')

train error = 0.36625
test acc = 0.62
