In [None]:
from typing import Tuple, Optional

import numpy
import numpy as np

class SoftmaxRegression:
    r"""Multinomial logistic (softmax) regression trained by batch gradient descent.

    The model computes logits ``Z = X W + b`` with ``W`` of shape
    ``(n_features, n_classes)`` and ``b`` of shape ``(1, n_classes)``, maps
    them to class probabilities with a row-wise softmax, and minimises

        L = -\frac{1}{N} \sum_i \log P_{i, y_i} + \frac{\lambda}{2} \|W\|_2^2

    where ``\lambda`` is ``reg_lambda`` (the bias is not penalised).

    Parameters
    ----------
    n_features:
        Number of input columns expected by ``fit``.
    n_classes:
        Number of classes; labels must be integers in ``[0, n_classes)``.
    learning_rate:
        Gradient-descent step size.
    num_epochs:
        Number of full-batch updates performed by ``fit``.
    reg_lambda:
        L2 penalty strength applied to ``W``.
    """

    def __init__(
        self,
        n_features: int = 10,
        n_classes: int = 5,
        learning_rate: float = 0.1,
        num_epochs: int = 100,
        reg_lambda: float = 0.0
    ):
        self.n_features = n_features
        self.n_classes = n_classes
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.reg_lambda = reg_lambda

        # Model parameters; both stay None until fit() has been called.
        self.W: Optional[np.ndarray] = None
        self.b: Optional[np.ndarray] = None

        # Loss recorded after every epoch of the most recent fit() call.
        self.loss_history = []

    # ------------------------------------------------------------------ #
    # Private helpers
    # ------------------------------------------------------------------ #
    def _as_labels(self, y: np.ndarray) -> np.ndarray:
        """Return ``y`` flattened to a 1-D integer array.

        Raises
        ------
        ValueError
            If any label is not an integral value.
        """
        labels = np.asarray(y).reshape(-1)
        if not np.issubdtype(labels.dtype, np.integer):
            as_int = labels.astype(np.int64)
            # Reject e.g. 1.5 instead of silently truncating it to class 1.
            if not np.array_equal(as_int, labels):
                raise ValueError("Class labels must be integers.")
            labels = as_int
        return labels

    def _logits(self, X: np.ndarray) -> np.ndarray:
        """Return the raw class scores ``X W + b``, shape ``(N, n_classes)``.

        Raises
        ------
        RuntimeError
            If the model parameters have not been set (``fit`` not called).
        ValueError
            If ``X`` is not 2-D with as many columns as ``W`` has rows.
        """
        if self.W is None or self.b is None:
            raise RuntimeError(
                "SoftmaxRegression is not fitted yet; call fit() first."
            )
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.W.shape[0]:
            raise ValueError(
                f"X must have shape (n_samples, {self.W.shape[0]}); "
                f"got {X.shape}."
            )
        return X @ self.W + self.b

    def _softmax(self, Z: np.ndarray) -> np.ndarray:
        r"""Return the softmax of ``Z`` taken along its last axis.

        For logits ``Z \in \mathbb{R}^{N \times K}`` the result is

            P_{i,k} = \frac{e^{Z_{i,k}}}{\sum_j e^{Z_{i,j}}}

        so every row of the output sums to 1. ``Z`` itself is not modified.
        """
        Z = np.asarray(Z, dtype=float)
        # Subtracting the row maximum leaves the result unchanged but keeps
        # exp() from overflowing on large logits.
        shifted = Z - Z.max(axis=-1, keepdims=True)
        exp_Z = np.exp(shifted)
        return exp_Z / exp_Z.sum(axis=-1, keepdims=True)

    def _one_hot(self, y: np.ndarray) -> np.ndarray:
        """Return the one-hot encoding of ``y``, shape ``(N, n_classes)``.

        Example with ``n_classes = 3``::

            y = [1, 2, 0]  ->  [[0, 1, 0],
                                [0, 0, 1],
                                [1, 0, 0]]

        Raises
        ------
        ValueError
            If a label is not an integer in ``[0, n_classes)``.
        """
        labels = self._as_labels(y)
        if labels.size and (labels.min() < 0 or labels.max() >= self.n_classes):
            raise ValueError(
                f"Class labels must lie in [0, {self.n_classes}); "
                f"got range [{labels.min()}, {labels.max()}]."
            )
        Y = np.zeros((labels.shape[0], self.n_classes), dtype=np.float32)
        # Set exactly one entry per row: column y[i] of row i.
        Y[np.arange(labels.shape[0]), labels] = 1.0
        return Y

    def _compute_loss(self, X: np.ndarray, y: np.ndarray) -> float:
        r"""Return the mean cross-entropy of the model on ``(X, y)``.

        The L2 penalty ``(reg_lambda / 2) * sum(W ** 2)`` is added to the
        cross-entropy; with the default ``reg_lambda = 0`` the result is the
        plain cross-entropy worked through below.

        Worked example (d = 2 features, K = 3 classes, N = 4 samples)

        shapes:
            X: N x d (4 x 2)
            W: d x K (2 x 3)
            b: 1 x K (bias)
            y: N     (true class of each sample)

        compute logits:
            X = \begin{bmatrix}
                    1 & 1 \\
                    1 & 2 \\
                    2 & 1 \\
                    2 & 2
                \end{bmatrix}

            W = \begin{bmatrix}
                    0.1 & 0.2 & -0.1 \\
                    0.0 & 0.1 &  0.2
                \end{bmatrix}

            b = \begin{bmatrix} 0 & 0 & 0 \end{bmatrix}

            y = \begin{bmatrix} 1 & 2 & 0 & 1 \end{bmatrix}

            Z = XW + b
              = \begin{bmatrix}
                    0.1 & 0.3 & 0.1 \\
                    0.1 & 0.4 & 0.3 \\
                    0.2 & 0.5 & 0.0 \\
                    0.2 & 0.6 & 0.2
                \end{bmatrix}

        compute softmax row by row:
            P_{i,k} = \frac{e^{Z_{i,k}}}{\sum_j e^{Z_{i,j}}}

            row 0: exp(0.1) = 1.105, exp(0.3) = 1.350, exp(0.1) = 1.105
                   sum = 3.560  ->  P(0,:) = 0.31, 0.38, 0.31

            P = \begin{bmatrix}
                    0.31 & 0.38 & 0.31 \\
                    0.28 & 0.38 & 0.34 \\
                    0.32 & 0.43 & 0.26 \\
                    0.29 & 0.43 & 0.29
                \end{bmatrix}

        pick the probability of the true class via Y = _one_hot(y) (4 x 3):
            log(0.379), log(0.342), log(0.316), log(0.427)

        loss = -(sum of those logs) / N = 4.046 / 4 = 1.0116

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is empty or malformed, or ``y`` does not match ``X``.
        """
        Z = self._logits(X)
        Y = self._one_hot(y)
        n_samples = Z.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot compute the loss of an empty batch.")
        if Y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {Y.shape[0]} labels."
            )

        # Log-softmax computed directly: avoids log(0) when a probability
        # underflows, which log(softmax(Z)) would hit.
        shifted = Z - Z.max(axis=1, keepdims=True)
        log_P = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))

        cross_entropy = -np.sum(Y * log_P) / n_samples
        penalty = 0.5 * self.reg_lambda * np.sum(self.W ** 2)
        return float(cross_entropy + penalty)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Train the model on ``(X, y)`` with full-batch gradient descent.

        ``W`` and ``b`` are re-initialised to zeros and ``loss_history`` is
        reset, so repeated calls train from scratch. After each of the
        ``num_epochs`` updates the current loss is appended to
        ``loss_history``.

        Raises
        ------
        ValueError
            If ``X`` is not ``(n_samples, n_features)`` with at least one
            sample, or ``y`` does not provide one valid label per sample.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.n_features:
            raise ValueError(
                f"X must have shape (n_samples, {self.n_features}); "
                f"got {X.shape}."
            )
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty dataset.")

        Y = self._one_hot(y)
        if Y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {Y.shape[0]} labels."
            )

        # Zero initialisation is deterministic and fine here because the
        # objective is convex in (W, b).
        self.W = np.zeros((self.n_features, self.n_classes))
        self.b = np.zeros((1, self.n_classes))
        self.loss_history = []

        for _ in range(self.num_epochs):
            P = self._softmax(X @ self.W + self.b)
            # Gradient of the mean cross-entropy w.r.t. the logits.
            residual = (P - Y) / n_samples
            grad_W = X.T @ residual + self.reg_lambda * self.W
            grad_b = residual.sum(axis=0, keepdims=True)

            self.W -= self.learning_rate * grad_W
            self.b -= self.learning_rate * grad_b

            self.loss_history.append(self._compute_loss(X, y))

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return class probabilities for ``X``, shape ``(N, n_classes)``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` does not have the expected 2-D shape.
        """
        return self._softmax(self._logits(X))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the most probable class index for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` does not have the expected 2-D shape.
        """
        # Softmax is monotonic per row, so argmax of the logits suffices.
        return np.argmax(self._logits(X), axis=1)

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Return the classification accuracy of the model on ``(X, y)``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is empty or malformed, or ``y`` does not match ``X``.
        """
        predictions = self.predict(X)
        labels = self._as_labels(y)
        if labels.shape[0] != predictions.shape[0]:
            raise ValueError(
                f"X has {predictions.shape[0]} samples but y has "
                f"{labels.shape[0]} labels."
            )
        if predictions.size == 0:
            raise ValueError("Cannot score an empty dataset.")
        return float(np.mean(predictions == labels))