In [3]:
class LogisticRegressor:
    """
    Binary logistic regression trained with full-batch gradient descent.

    Relies on ``np`` (NumPy) and a ``sigmoid`` function being available in the
    enclosing scope; ``sigmoid`` is assumed to apply elementwise to arrays.
    """

    def __init__(self, eta, runs):
        """
        :param eta: learning rate (step size) applied to every gradient update
        :param runs: number of gradient descent iterations performed by fit
        """
        self.eta = eta
        self.runs = runs
        # Weight column vector; stays None until fit() is called
        self.W = None

    def fit(self, x, y, w_init):
        """
        Optimize the weights W to minimize the negative log-likelihood by using gradient descent

        The gradient is averaged over the n data points, so each update is
        W <- W - eta * x^T (predict(x) - y) / n.

        :param x: a 2D numpy array of transformed feature values. Shape is (n x 2), (n x 3), or (n x 6)
        :param y: a 2D numpy array of output values. Shape is (n x 1)
        :param w_init: a 2D numpy array that initializes the weights. Shape is (2 x 1), (3 x 1), or (6 x 1)
        :return: None
        """
        # Keep this for the autograder
        self.W = w_init
        N = x.shape[0]
        for _ in range(self.runs):
            # (d x n) . (n x 1) -> (d x 1), the same shape as W
            residual = self.predict(x) - y
            grad = np.dot(x.T, residual) / N
            # Rebind rather than update in place so that w_init is never mutated
            self.W = self.W - self.eta * grad

    def predict(self, x):
        """
        Predict classification probability of transformed input x

        :param x: a 2D numpy array of transformed feature values. Shape is (n x 2), (n x 3), or (n x 6)
        :return: a 2D numpy array of predicted probabilities given current weights. Shape should be (n x 1)
        """
        # x is (n x d) and W is (d x 1), so x must be the left operand to get
        # one score per data point: (n x d) . (d x 1) -> (n x 1)
        return sigmoid(np.dot(x, self.W))

In [None]:
# Function to create a one-hot vector from class value
def one_hot(class_val, num_classes):
    """
    Build a one-hot encoding of a class label

    :param class_val: index of the entry to set to 1
    :param num_classes: length of the returned vector
    :return: a 1D integer numpy array of zeros with a 1 at index class_val
    """
    encoded = np.full(num_classes, 0, dtype=int)
    encoded[class_val] = 1
    return encoded

# Softmax classifier with L_2 regularization
class SoftmaxRegression:
    """
    Multiclass softmax classifier with L2 regularization, trained with
    full-batch gradient descent.

    Relies on ``np`` (NumPy), ``softmax`` and ``one_hot`` being available in
    the enclosing scope; ``softmax`` is assumed to map a 1D vector of class
    scores to a 1D vector of class probabilities.
    """

    def __init__(self, eta, lam, runs=200000):
        """
        :param eta: learning rate (step size) applied to every gradient update
        :param lam: L2 regularization strength (lambda)
        :param runs: number of gradient descent iterations performed by fit
        """
        self.eta = eta
        self.lam = lam
        # Weight matrix of shape (num_classes x (1 + num_features)); set by fit()
        self.W = None
        self.runs = runs

    @staticmethod
    def _add_bias(X):
        """Return X with a column of ones prepended as the bias feature."""
        return np.hstack([np.ones((X.shape[0], 1)), X])

    def _proba_with_bias(self, X_aug):
        """Return class probabilities for rows that already contain the bias column."""
        return np.array([softmax(np.dot(self.W, x)) for x in X_aug])

    def fit(self, X, y):
        """
        Fit the weights W of softmax regression using gradient descent with L2 regularization
        in the form (lambda/2) * norm(w)^2

        :param X: a 2D numpy array of (transformed) feature values. Shape is (n x 2)
        :param y: a 1D numpy array of target values (Dwarf=0, Giant=1, Supergiant=2).
        :return: None
        """
        # Add bias column to features X
        X = self._add_bias(X)
        # Initializing the weights (do not change!)
        # The number of classes is 1 + (the highest numbered class)
        num_classes = 1 + y.max()
        num_features = X.shape[1]
        self.W = np.ones((num_classes, num_features))
        # Convert y into array of one-hot vectors
        y = np.array([one_hot(y_n, num_classes) for y_n in y])
        for _ in range(self.runs):
            # X already carries the bias column, so bypass the public
            # predict_proba (which would add a second one)
            preds = self._proba_with_bias(X)
            # Row j is the gradient for class j:
            # sum_n (p_nj - y_nj) * x_n  +  lambda * w_j
            grad = np.dot((preds - y).T, X) + self.lam * self.W
            self.W -= self.eta * grad

    def predict(self, X_pred):
        """
        Predict classes of points given feature values in X_pred

        :param X_pred: a 2D numpy array of (transformed) feature values. Shape is (n x 2)
        :return: a 1D numpy array of predicted classes (Dwarf=0, Giant=1, Supergiant=2).
                 Shape should be (n,)
        """
        # The most probable class is the index of the largest probability in each row
        return np.array([probs.argmax() for probs in self.predict_proba(X_pred)])

    def predict_proba(self, X_pred):
        """
        Predict classification probabilities of points given feature values in X_pred

        :param X_pred: a 2D numpy array of (transformed) feature values. Shape is (n x 2)
        :return: a 2D numpy array of predicted class probabilities (Dwarf=index 0, Giant=index 1, Supergiant=index 2).
                 Shape should be (n x 3)
        """
        # Add column of ones at the beginning of X_pred matrix so its width
        # matches the bias-augmented weights learned in fit
        return self._proba_with_bias(self._add_bias(X_pred))