From 45ae36714cb85676255d28a58499cbbdb8170895 Mon Sep 17 00:00:00 2001
From: YuhanLiin
Date: Sat, 14 Aug 2021 16:02:44 -0400
Subject: [PATCH 01/18] Generalized argmin param across dimensions

---
 algorithms/linfa-logistic/src/argmin_param.rs | 34 +++++++++----------
 algorithms/linfa-logistic/src/float.rs        | 12 +++----
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/algorithms/linfa-logistic/src/argmin_param.rs b/algorithms/linfa-logistic/src/argmin_param.rs
index b726c48c5..5e180f4ea 100644
--- a/algorithms/linfa-logistic/src/argmin_param.rs
+++ b/algorithms/linfa-logistic/src/argmin_param.rs
@@ -1,4 +1,4 @@
-//! This module defines newtypes for ndarray's Array1.
+//! This module defines newtypes for ndarray's Array.
 //!
 //! This is necessary to be able to abstract over floats (f32 and f64) so that
 //! the logistic regression code can be abstract in the float type it works
@@ -8,51 +8,51 @@
 use crate::float::Float;
 use argmin::prelude::*;
-use ndarray::Array1;
+use ndarray::{Array, Dimension};
 use serde::{Deserialize, Serialize};
 
 #[derive(Serialize, Clone, Deserialize, Debug, Default)]
-pub struct ArgminParam<F>(pub Array1<F>);
+pub struct ArgminParam<F, D: Dimension>(pub Array<F, D>);
 
-impl<F: Float> ArgminParam<F> {
+impl<F, D: Dimension> ArgminParam<F, D> {
     #[inline]
-    pub fn as_array(&self) -> &Array1<F> {
+    pub fn as_array(&self) -> &Array<F, D> {
         &self.0
     }
 }
 
-impl<F: Float> ArgminSub<ArgminParam<F>, ArgminParam<F>> for ArgminParam<F> {
-    fn sub(&self, other: &ArgminParam<F>) -> ArgminParam<F> {
+impl<F: Float, D: Dimension> ArgminSub<ArgminParam<F, D>, ArgminParam<F, D>> for ArgminParam<F, D> {
+    fn sub(&self, other: &ArgminParam<F, D>) -> ArgminParam<F, D> {
         ArgminParam(&self.0 - &other.0)
     }
 }
 
-impl<F: Float> ArgminAdd<ArgminParam<F>, ArgminParam<F>> for ArgminParam<F> {
-    fn add(&self, other: &ArgminParam<F>) -> ArgminParam<F> {
-        ArgminParam(&self.0 + &other.0)
+impl<F: Float, D: Dimension> ArgminAdd<ArgminParam<F, D>, ArgminParam<F, D>> for ArgminParam<F, D> {
+    fn add(&self, other: &ArgminParam<F, D>) -> ArgminParam<F, D> {
+        ArgminParam(&self.0 - &other.0)
     }
 }
 
-impl<F: Float> ArgminDot<ArgminParam<F>, F> for ArgminParam<F> {
-    fn dot(&self, other: &ArgminParam<F>) -> F {
+impl<F: Float, D: Dimension> ArgminDot<ArgminParam<F, D>, F> for ArgminParam<F, D> {
+    fn dot(&self, other: &ArgminParam<F, D>) -> F {
         self.0.dot(&other.0)
     }
 }
 
-impl<F: Float> ArgminNorm<F> for ArgminParam<F> {
+impl<F: Float, D: Dimension> ArgminNorm<F> for ArgminParam<F, D> {
     fn norm(&self) -> F {
         self.0.dot(&self.0)
     }
 }
 
-impl<F: Float> ArgminMul<F, ArgminParam<F>> for ArgminParam<F> {
-    fn mul(&self, other: &F) -> ArgminParam<F> {
+impl<F: Float, D: Dimension> ArgminMul<F, ArgminParam<F, D>> for ArgminParam<F, D> {
+    fn mul(&self, other: &F) -> ArgminParam<F, D> {
         ArgminParam(&self.0 * *other)
     }
 }
 
-impl<F: Float> ArgminMul<ArgminParam<F>, ArgminParam<F>> for ArgminParam<F> {
-    fn mul(&self, other: &ArgminParam<F>) -> ArgminParam<F> {
+impl<F: Float, D: Dimension> ArgminMul<ArgminParam<F, D>, ArgminParam<F, D>> for ArgminParam<F, D> {
+    fn mul(&self, other: &ArgminParam<F, D>) -> ArgminParam<F, D> {
         ArgminParam(&self.0 * &other.0)
     }
 }

diff --git a/algorithms/linfa-logistic/src/float.rs b/algorithms/linfa-logistic/src/float.rs
index 3eb697e37..612a81027 100644
--- a/algorithms/linfa-logistic/src/float.rs
+++ b/algorithms/linfa-logistic/src/float.rs
@@ -1,6 +1,6 @@
 use crate::argmin_param::ArgminParam;
 use argmin::prelude::{ArgminFloat, ArgminMul};
-use ndarray::NdFloat;
+use ndarray::{Dimension, NdFloat};
 use ndarray_linalg::Lapack;
 use num_traits::FromPrimitive;
 
@@ -13,21 +13,21 @@ pub trait Float:
     + Default
     + Clone
     + FromPrimitive
-    + ArgminMul<ArgminParam<Self>, ArgminParam<Self>>
+    //+ ArgminMul<ArgminParam<Self>, ArgminParam<Self>>
     + linfa::Float
 {
     const POSITIVE_LABEL: Self;
     const NEGATIVE_LABEL: Self;
 }
 
-impl ArgminMul<ArgminParam<f64>, ArgminParam<f64>> for f64 {
-    fn mul(&self, other: &ArgminParam<f64>) -> ArgminParam<f64> {
+impl<D: Dimension> ArgminMul<ArgminParam<f64, D>, ArgminParam<f64, D>> for f64 {
+    fn mul(&self, other: &ArgminParam<f64, D>) -> ArgminParam<f64, D> {
         ArgminParam(&other.0 * *self)
     }
 }
 
-impl ArgminMul<ArgminParam<f32>, ArgminParam<f32>> for f32 {
-    fn mul(&self, other: &ArgminParam<f32>) -> ArgminParam<f32> {
+impl<D: Dimension> ArgminMul<ArgminParam<f32, D>, ArgminParam<f32, D>> for f32 {
+    fn mul(&self, other: &ArgminParam<f32, D>) -> ArgminParam<f32, D> {
         ArgminParam(&other.0 * *self)
     }
 }

From f0209bb830ad5b15b46449f4914787296e428c02 Mon Sep 17 00:00:00 2001
From: YuhanLiin
Date: Sun, 15 Aug 2021 02:48:46 -0400
Subject: [PATCH 02/18] Wrote multinomial loss and gradient

---
 algorithms/linfa-logistic/src/argmin_param.rs |  21 ++-
 algorithms/linfa-logistic/src/float.rs        |   5 +-
 algorithms/linfa-logistic/src/lib.rs          | 163 ++++++++++++++++--
 3 files changed, 163 insertions(+), 26 deletions(-)

diff --git a/algorithms/linfa-logistic/src/argmin_param.rs b/algorithms/linfa-logistic/src/argmin_param.rs
index 5e180f4ea..34f9b8b7e 100644
--- a/algorithms/linfa-logistic/src/argmin_param.rs
+++ b/algorithms/linfa-logistic/src/argmin_param.rs
@@ -8,13 +8,22 @@
 use crate::float::Float;
 use argmin::prelude::*;
-use ndarray::{Array, Dimension};
+use ndarray::{Array, ArrayBase, Data, Dimension, Zip};
 use serde::{Deserialize, Serialize};
 
+pub fn elem_dot<F: Float, A1: Data<Elem = F>, A2: Data<Elem = F>, D: Dimension>(
+    a: &ArrayBase<A1, D>,
+    b: &ArrayBase<A2, D>,
+) -> F {
+    Zip::from(a)
+        .and(b)
+        .fold(F::zero(), |acc, &a, &b| acc + a * b)
+}
+
 #[derive(Serialize, Clone, Deserialize, Debug, Default)]
-pub struct ArgminParam<F, D: Dimension>(pub Array<F, D>);
+pub struct ArgminParam<F: Float, D: Dimension>(pub Array<F, D>);
 
-impl<F, D: Dimension> ArgminParam<F, D> {
+impl<F: Float, D: Dimension> ArgminParam<F, D> {
     #[inline]
     pub fn as_array(&self) -> &Array<F, D> {
         &self.0
@@ -35,13 +44,15 @@ impl<F: Float, D: Dimension> ArgminAdd<ArgminParam<F, D>, ArgminParam<F, D>> for
 
 impl<F: Float, D: Dimension> ArgminDot<ArgminParam<F, D>, F> for ArgminParam<F, D> {
     fn dot(&self, other: &ArgminParam<F, D>) -> F {
-        self.0.dot(&other.0)
+        //self.0.dot(&other.0)
+        todo!()
     }
 }
 
 impl<F: Float, D: Dimension> ArgminNorm<F> for ArgminParam<F, D> {
     fn norm(&self) -> F {
-        self.0.dot(&self.0)
+        //self.0.dot(&self.0)
+        todo!()
     }
 }
 
diff --git a/algorithms/linfa-logistic/src/float.rs b/algorithms/linfa-logistic/src/float.rs
index 612a81027..49d0f6846 100644
--- a/algorithms/linfa-logistic/src/float.rs
+++ b/algorithms/linfa-logistic/src/float.rs
@@ -1,6 +1,6 @@
 use crate::argmin_param::ArgminParam;
 use argmin::prelude::{ArgminFloat, ArgminMul};
-use ndarray::{Dimension, NdFloat};
+use ndarray::{Dimension, Ix1, Ix2, NdFloat};
 use ndarray_linalg::Lapack;
 use num_traits::FromPrimitive;
 
@@ -13,7 +13,8 @@ pub trait Float:
     + Default
     + Clone
     + FromPrimitive
-    //+ ArgminMul<ArgminParam<Self>, ArgminParam<Self>>
+    + ArgminMul<ArgminParam<Self, Ix1>, ArgminParam<Self, Ix1>>
+    + ArgminMul<ArgminParam<Self, Ix2>, ArgminParam<Self, Ix2>>
     + linfa::Float
 {
     const POSITIVE_LABEL: Self;
diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs
index 6c7c5906e..c6a6e6063 100644
--- a/algorithms/linfa-logistic/src/lib.rs
+++ b/algorithms/linfa-logistic/src/lib.rs
@@ -24,13 +24,15 @@
 use argmin::solver::linesearch::MoreThuenteLineSearch;
 use argmin::solver::quasinewton::lbfgs::LBFGS;
 use linfa::prelude::{AsTargets, DatasetBase};
 use linfa::traits::{Fit, PredictInplace};
-use ndarray::{s, Array, Array1, ArrayBase, Data, Ix1, Ix2};
+use ndarray::{
+    s, Array, Array1, Array2, ArrayBase, Axis, Data, Dimension, Ix1, Ix2, RemoveAxis, Slice,
+};
 use std::default::Default;
 
 mod argmin_param;
 mod float;
 
-use argmin_param::ArgminParam;
+use argmin_param::{elem_dot, ArgminParam};
 use float::Float;
 
 /// A two-class logistic regression model.
@@ -77,7 +79,8 @@ impl<F: Float> Default for LogisticRegression<F> {
     }
 }
 
-type LBFGSType<F> = LBFGS<MoreThuenteLineSearch<ArgminParam<F>, F>, ArgminParam<F>, F>;
+type LBFGSType1<F> = LBFGS<MoreThuenteLineSearch<ArgminParam<F, Ix1>, F>, ArgminParam<F, Ix1>, F>;
+type LBFGSType2<F> = LBFGS<MoreThuenteLineSearch<ArgminParam<F, Ix2>, F>, ArgminParam<F, Ix2>, F>;
 
 impl<F: Float> LogisticRegression<F> {
     /// Creates a new LogisticRegression with default configuration.
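A standalone sketch of what the `elem_dot` helper above computes: for 1-D arrays
it matches `Array1::dot`, and for 2-D arrays it is the Frobenius inner product,
which is why a single implementation can serve both the vector (two-class) and
matrix (multinomial) parameter shapes. The harness below is illustrative and not
part of the patch; it uses the plain `num_traits::Float` bound in place of the
crate-local `Float` trait.

    use ndarray::{array, ArrayBase, Data, Dimension, Zip};
    use num_traits::Float;

    // Same shape as the patch's `elem_dot`: fold a Zip over both arrays,
    // accumulating the products of corresponding elements.
    fn elem_dot<F: Float, A1: Data<Elem = F>, A2: Data<Elem = F>, D: Dimension>(
        a: &ArrayBase<A1, D>,
        b: &ArrayBase<A2, D>,
    ) -> F {
        Zip::from(a).and(b).fold(F::zero(), |acc, &x, &y| acc + x * y)
    }

    fn main() {
        let v = array![1.0, 2.0, 3.0];
        // 1-D case agrees with the built-in dot product: 1 + 4 + 9 = 14.
        assert_eq!(elem_dot(&v, &v), v.dot(&v));
        let m = array![[1.0, 2.0], [3.0, 4.0]];
        // 2-D case is the Frobenius inner product: 1 + 4 + 9 + 16 = 30.
        assert_eq!(elem_dot(&m, &m), 30.0);
    }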
@@ -201,8 +204,8 @@ impl<F: Float> LogisticRegression<F> {
         &self,
         x: &'a ArrayBase<A, Ix2>,
         target: Array1<F>,
-    ) -> LogisticRegressionProblem<'a, F, A> {
-        LogisticRegressionProblem {
+    ) -> LogisticRegressionProblem1<'a, F, A> {
+        LogisticRegressionProblem1 {
             x,
             target,
             alpha: self.alpha,
@@ -236,7 +239,7 @@ impl<F: Float> LogisticRegression<F> {
 
     /// Create the LBFGS solver using MoreThuenteLineSearch and set gradient
     /// tolerance.
-    fn setup_solver(&self) -> LBFGSType<F> {
+    fn setup_solver(&self) -> LBFGSType1<F> {
         let linesearch = MoreThuenteLineSearch::new();
         LBFGS::new(linesearch, 10).with_tol_grad(self.gradient_tolerance)
     }
@@ -244,10 +247,10 @@ impl<F: Float> LogisticRegression<F> {
     /// Run the LBFGS solver until it converges or runs out of iterations.
     fn run_solver<'a, A>(
         &self,
-        problem: LogisticRegressionProblem<'a, F, A>,
-        solver: LBFGSType<F>,
+        problem: LogisticRegressionProblem1<'a, F, A>,
+        solver: LBFGSType1<F>,
         init_params: Array1<F>,
-    ) -> Result<ArgminResult<LogisticRegressionProblem<'a, F, A>>>
+    ) -> Result<ArgminResult<LogisticRegressionProblem1<'a, F, A>>>
     where
         A: Data<Elem = F>,
     {
@@ -261,7 +264,7 @@ impl<F: Float> LogisticRegression<F> {
     fn convert_result<A, C>(
         &self,
         labels: ClassLabels<F, C>,
-        result: &ArgminResult<LogisticRegressionProblem<'_, F, A>>,
+        result: &ArgminResult<LogisticRegressionProblem1<'_, F, A>>,
     ) -> Result<FittedLogisticRegression<F, C>>
     where
         A: Data<Elem = F>,
@@ -364,17 +367,25 @@ where
 
 /// Conditionally split the feature vector `w` into parameter vector and
 /// intercept parameter.
-fn convert_params<F: Float>(n_features: usize, w: &Array1<F>) -> (Array1<F>, F) {
-    if n_features == w.len() {
-        (w.to_owned(), F::zero())
-    } else if n_features + 1 == w.len() {
-        (w.slice(s![..w.len() - 1]).to_owned(), w[w.len() - 1])
+/// Dimensions of `w` are either (f) or (f, n_classes)
+fn convert_params<F: Float, D: Dimension + RemoveAxis>(
+    n_features: usize,
+    w: &Array<F, D>,
+) -> (Array<F, D>, Array<F, D::Smaller>) {
+    let nrows = w.shape()[0];
+    if n_features == nrows {
+        (w.to_owned(), Array::zeros(w.raw_dim().remove_axis(Axis(0))))
+    } else if n_features + 1 == nrows {
+        (
+            w.slice_axis(Axis(0), Slice::from(..n_features)).to_owned(),
+            w.index_axis(Axis(0), n_features).to_owned(),
+        )
     } else {
         panic!(
             "Unexpected length of parameter vector `w`, expected {} or {}, found {}",
             n_features,
             n_features + 1,
-            w.len()
+            nrows
         );
     }
 }
@@ -399,6 +410,22 @@ fn log_logistic<F: linfa::Float>(x: F) -> F {
     }
 }
 
+/// Finds the log of the sum of exponents across a specific axis in a numerically stable way. More
+/// specifically, computes `ln(exp(x1) + exp(x2) + exp(x3) + ...)` across an axis.
+///
+/// Based off this implementation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.logsumexp.html
+fn log_sum_exp<F: linfa::Float, A: Data<Elem = F>>(
+    m: &ArrayBase<A, Ix2>,
+    axis: Axis,
+) -> Array<F, Ix1> {
+    // Find max value of the array
+    let max = m.iter().copied().reduce(F::max).unwrap();
+    // Computes `max + ln(exp(x1-max) + exp(x2-max) + exp(x3-max) + ...)`, which is equal to the
+    // log_sum_exp formula
+    let reduced = m.fold_axis(axis, F::zero(), |acc, elem| *acc + (*elem - max).exp());
+    reduced.mapv_into(|e| e.ln() + max)
+}
+
 /// Computes the logistic loss assuming the training labels $y \in {-1, 1}$
 ///
 /// Because the logistic function fulfills $\sigma(-z) = 1 - \sigma(z)$
@@ -421,6 +448,38 @@ fn logistic_loss<F: Float, A: Data<Elem = F>>(
     -yz.sum() + F::cast(0.5) * alpha * params.dot(&params)
 }
 
+/// Computes loss function of `-sum(Y * log(softmax(H))) + alpha/2 * norm(W)`, where H is `X . W + b`.
+/// Also returns the log of the probabilities, which is `log(softmax(H))`, along with `W`.
+/// `Y` is the output (n_samples * n_classes), `X` is the input (n_samples * n_features), `W` is the
+/// params (n_features * n_classes), `b` is the intercept vector (n_classes).
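+///
+/// As a small worked case (illustrative numbers, not taken from the crate):
+/// for a single sample with `h = [0, ln(3)]`, `softmax(h) = [0.25, 0.75]`; if
+/// the one-hot target row is `y = [0, 1]`, that sample's unregularized loss
+/// contribution is `-ln(0.75) ≈ 0.288`, and the `alpha/2 * norm(W)` penalty is
+/// added on top of the sum over samples.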
+fn multi_logistic_loss_prob_params>( + x: &ArrayBase, + y: &Array2, + alpha: F, + w: &Array2, // This parameter includes `W` and `b` +) -> (F, Array2, Array2) { + let n_features = x.shape()[1]; + let (params, intercept) = convert_params(n_features, w); + // Compute H + let h = x.dot(¶ms) + intercept; + // This computes `H - log(sum(exp(H)))`, which is equal to + // `log(softmax(H)) = log(exp(H) / sum(exp(H)))` + let log_prob = h - log_sum_exp(&h, Axis(1)); + // Calculate loss + // XXX Should we divide cost by n_samples??? + let loss = -elem_dot(&log_prob, y) + F::cast(0.5) * alpha * elem_dot(¶ms, ¶ms); + (loss, log_prob, params) +} + +fn multi_logistic_loss>( + x: &ArrayBase, + y: &Array2, + alpha: F, + w: &Array2, +) -> F { + multi_logistic_loss_prob_params(x, y, alpha, w).0 +} + /// Computes the gradient of the logistic loss function fn logistic_grad>( x: &ArrayBase, @@ -445,6 +504,36 @@ fn logistic_grad>( } } +/// Computes multinomial gradients for `W` and `b`, combine them, and flatten the result. +/// Gradient for `W` is `Xt . (softmax(H) - Y) + alpha * W`. +/// Gradient for `b` is `sum(softmax(H) - Y)`. +fn multi_logistic_grad>( + x: &ArrayBase, + y: &Array2, + alpha: F, + w: &Array2, +) -> Array1 { + let (loss, log_prob, params) = multi_logistic_loss_prob_params(x, y, alpha, w); + let (n_features, n_classes) = params.dim(); + let intercept = x.ncols() > n_features; + let mut grad = Array::zeros((n_features + intercept as usize, n_classes)); + + // This value is `softmax(H)` + let prob = log_prob.mapv_into(num_traits::Float::exp); + let diff = prob - y; + // Compute gradient for `W` and place it at start of the grad matrix + let mut dw = x.t().dot(&diff); + params *= alpha; + dw += ¶ms; + grad.slice_mut(s![..n_features, ..]).assign(&dw); + // Compute gradient for `b` and place it at end of grad matrix + if intercept { + grad.row_mut(n_features).assign(&diff.sum_axis(Axis(0))); + } + // XXX should we divide by n_samples?? + grad.into_shape(grad.len()).unwrap() +} + /// A fitted logistic regression which can make predictions #[derive(PartialEq, Debug)] pub struct FittedLogisticRegression { @@ -549,15 +638,15 @@ fn class_from_label(labels: &[ClassLabel] /// Internal representation of a logistic regression problem. /// This data structure exists to be handed to Argmin. -struct LogisticRegressionProblem<'a, F: Float, A: Data> { +struct LogisticRegressionProblem1<'a, F: Float, A: Data> { x: &'a ArrayBase, target: Array1, alpha: F, } -impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem<'a, F, A> { +impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem1<'a, F, A> { /// Type of the parameter vector - type Param = ArgminParam; + type Param = ArgminParam; /// Type of the return value computed by the cost function type Output = F; /// Type of the Hessian. Can be `()` if not needed. @@ -585,6 +674,42 @@ impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem<'a, } } +struct LogisticRegressionProblem2<'a, F: Float, A: Data> { + x: &'a ArrayBase, + target: Array2, + alpha: F, +} + +impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem2<'a, F, A> { + /// Type of the parameter vector + type Param = ArgminParam; + /// Type of the return value computed by the cost function + type Output = F; + /// Type of the Hessian. Can be `()` if not needed. + type Hessian = (); + /// Type of the Jacobian. Can be `()` if not needed. 
+ type Jacobian = Array1; + /// Floating point precision + type Float = F; + + /// Apply the cost function to a parameter `p` + fn apply(&self, p: &Self::Param) -> std::result::Result { + let w = p.as_array(); + Ok(multi_logistic_loss(self.x, &self.target, self.alpha, w)) + } + + /// Compute the gradient at parameter `p`. + fn gradient(&self, p: &Self::Param) -> std::result::Result { + let w = p.as_array(); + Ok(ArgminParam(multi_logistic_grad( + self.x, + &self.target, + self.alpha, + w, + ))) + } +} + #[cfg(test)] mod test { extern crate linfa; From 4a64b9a322411bf31672ab2af22c26716eb6a731 Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 13:26:45 -0400 Subject: [PATCH 03/18] Partially done --- algorithms/linfa-logistic/src/lib.rs | 287 ++++++++++++++------------- 1 file changed, 152 insertions(+), 135 deletions(-) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index c6a6e6063..44d0a8c9a 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -65,26 +65,98 @@ use float::Float; /// let model = LogisticRegression::default().fit(&dataset).unwrap(); /// let prediction = model.predict(&dataset); /// ``` -pub struct LogisticRegression { +pub struct LogisticRegression { alpha: F, fit_intercept: bool, max_iterations: u64, gradient_tolerance: F, - initial_params: Option<(Array1, F)>, + initial_params: Option>, } -impl Default for LogisticRegression { - fn default() -> LogisticRegression { +impl Default for LogisticRegression { + fn default() -> Self { LogisticRegression::new() } } -type LBFGSType1 = LBFGS, F>, ArgminParam, F>; -type LBFGSType2 = LBFGS, F>, ArgminParam, F>; +type LBFGSType = LBFGS, F>, ArgminParam, F>; +type LBFGSType1 = LBFGSType; +type LBFGSType2 = LBFGSType; -impl LogisticRegression { +impl LogisticRegression { + /// Given a 2-dimensional feature matrix array `x` with shape + /// (n_samples, n_features) and an iterable of target classes to predict, + /// create a `FittedLinearRegression` object which allows making + /// predictions. + /// + /// The iterable of target classes `y` must have exactly two distinct + /// values, (e.g. 0.0 and 1.0, 0 and 1, "cat" and "dog", ...), which + /// represent the two different classes the model is supposed to predict. + /// + /// The iterable `y` must also produces exactly `n_samples` items, i.e. + /// exactly as many items as there are rows in the feature matrix `x`. + /// + /// This method returns an error if any of the preconditions are violated, + /// i.e. any values are `Inf` or `NaN`, `y` doesn't have as many items as + /// `x` has rows, or if other parameters (gradient_tolerance, alpha) have + /// been set to inalid values. + fn fit(&self, x: &ArrayBase, y: T) -> Result> + where + A: Data, + T: AsTargets, + C: PartialOrd + Clone, + { + let (labels, target) = label_classes(y)?; + self.validate_data(x, &target)?; + let problem = self.setup_problem(x, target); + let solver = self.setup_solver(); + let init_params = self.setup_init_params(x.ncols()); + let result = self.run_solver(problem, solver, ArgminParam(init_params))?; + self.convert_result(labels, &result) + } + + /// Create the initial parameters, either from a user supplied guess + /// or a 1-d array of `0`s. 
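+    ///
+    /// For example, with `fit_intercept == true` and three features this
+    /// produces `[0.0, 0.0, 0.0, 0.0]`: one zero per feature plus a trailing
+    /// entry reserved for the intercept, matching the layout `convert_params`
+    /// splits apart later.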
+ fn setup_init_params(&self, n_features: usize) -> Array1 { + if let Some(params) = self.initial_params.as_ref() { + params.clone() + } else { + Array::zeros(n_features + self.fit_intercept as usize) + } + } +} + +impl LogisticRegression { + fn fit(&self, x: &ArrayBase, y: T) -> Result> + where + A: Data, + T: AsTargets, + C: PartialOrd + Clone, + { + let (labels, target) = label_classes_multi(y)?; + self.validate_data(x, &target)?; + let problem = self.setup_problem(x, target); + let solver = self.setup_solver(); + let init_params = self.setup_init_params(x.ncols(), target.ncols()); + let result = self.run_solver(problem, solver, ArgminParam(init_params))?; + self.convert_result(labels, &result) + } + + /// Create the initial parameters, either from a user supplied guess + /// or a 1-d array of `0`s. + fn setup_init_params(&self, n_features: usize, n_classes: usize) -> Array2 { + if let Some(params) = self.initial_params.as_ref() { + params.clone() + } else { + n_features + self.fit_intercept as usize; + Array::zeros((n_features + self.fit_intercept as usize, n_classes)) + } + } +} + +impl LogisticRegression { /// Creates a new LogisticRegression with default configuration. - pub fn new() -> LogisticRegression { + pub fn new() -> Self { LogisticRegression { alpha: F::cast(1.0), fit_intercept: true, @@ -96,79 +168,48 @@ impl LogisticRegression { /// Set the normalization parameter `alpha` used for L2 normalization, /// defaults to `1.0`. - pub fn alpha(mut self, alpha: F) -> LogisticRegression { + pub fn alpha(mut self, alpha: F) -> Self { self.alpha = alpha; self } /// Configure if an intercept should be fitted, defaults to `true`. - pub fn with_intercept(mut self, fit_intercept: bool) -> LogisticRegression { + pub fn with_intercept(mut self, fit_intercept: bool) -> Self { self.fit_intercept = fit_intercept; self } /// Configure the maximum number of iterations that the solver should perform, /// defaults to `100`. - pub fn max_iterations(mut self, max_iterations: u64) -> LogisticRegression { + pub fn max_iterations(mut self, max_iterations: u64) -> Self { self.max_iterations = max_iterations; self } /// Configure the minimum change to the gradient to continue the solver, /// defaults to `1e-4`. - pub fn gradient_tolerance(mut self, gradient_tolerance: F) -> LogisticRegression { + pub fn gradient_tolerance(mut self, gradient_tolerance: F) -> Self { self.gradient_tolerance = gradient_tolerance; self } /// Configure the initial parameters from where the optimization starts. - /// The `params` array must have the same size as the number of columns of - /// the feature matrix `x` passed to the `fit` method - pub fn initial_params(mut self, params: Array1, intercept: F) -> LogisticRegression { - self.initial_params = Some((params, intercept)); + /// The `params` array must have at least the same number of rows as there are columns on the + /// feature matrix `x` passed to the `fit` method + pub fn initial_params(mut self, params: Array) -> Self { + self.initial_params = Some(params); self } - /// Given a 2-dimensional feature matrix array `x` with shape - /// (n_samples, n_features) and an iterable of target classes to predict, - /// create a `FittedLinearRegression` object which allows making - /// predictions. - /// - /// The iterable of target classes `y` must have exactly two distinct - /// values, (e.g. 0.0 and 1.0, 0 and 1, "cat" and "dog", ...), which - /// represent the two different classes the model is supposed to predict. 
- /// - /// The iterable `y` must also produces exactly `n_samples` items, i.e. - /// exactly as many items as there are rows in the feature matrix `x`. - /// - /// This method returns an error if any of the preconditions are violated, - /// i.e. any values are `Inf` or `NaN`, `y` doesn't have as many items as - /// `x` has rows, or if other parameters (gradient_tolerance, alpha) have - /// been set to inalid values. - fn fit(&self, x: &ArrayBase, y: T) -> Result> - where - A: Data, - T: AsTargets, - C: PartialOrd + Clone, - { - let (labels, target) = label_classes(y)?; - self.validate_data(x, &target)?; - let problem = self.setup_problem(x, target); - let solver = self.setup_solver(); - let init_params = self.setup_init_params(x); - let result = self.run_solver(problem, solver, init_params)?; - self.convert_result(labels, &result) - } - /// Ensure that `x` and `y` have the right shape and that all data and /// configuration parameters are finite. - fn validate_data(&self, x: &ArrayBase, y: &ArrayBase) -> Result<()> - where - A: Data, - B: Data, - { - if x.shape()[0] != y.len() { - return Err(Error::MismatchedShapes(x.shape()[0], y.len())); + fn validate_data, B: Data>( + &self, + x: &ArrayBase, + y: &ArrayBase, + ) -> Result<()> { + if x.shape()[0] != y.shape()[0] { + return Err(Error::MismatchedShapes(x.shape()[0], y.shape()[0])); } if x.iter().any(|x| !x.is_finite()) || y.iter().any(|y| !y.is_finite()) @@ -179,20 +220,18 @@ impl LogisticRegression { if !self.gradient_tolerance.is_finite() || self.gradient_tolerance <= F::zero() { return Err(Error::InvalidGradientTolerance); } - self.validate_init_params(x)?; + self.validate_init_params(x.shape()[1], y.shape().get(1).copied())?; Ok(()) } - fn validate_init_params(&self, x: &ArrayBase) -> Result<()> - where - A: Data, - { - if let Some((params, intercept)) = self.initial_params.as_ref() { - let (_, n_features) = x.dim(); - if n_features != params.dim() { + fn validate_init_params(&self, mut n_features: usize, n_classes: Option) -> Result<()> { + if let Some(params) = self.initial_params.as_ref() { + let shape = params.shape(); + n_features += self.fit_intercept as usize; + if n_features != shape[0] || n_classes != shape.get(1).copied() { return Err(Error::InvalidInitialParametersGuessSize); } - if params.iter().any(|p| !p.is_finite()) || !intercept.is_finite() { + if params.iter().any(|p| !p.is_finite()) { return Err(Error::InvalidInitialParametersGuess); } } @@ -203,58 +242,30 @@ impl LogisticRegression { fn setup_problem<'a, A: Data>( &self, x: &'a ArrayBase, - target: Array1, - ) -> LogisticRegressionProblem1<'a, F, A> { - LogisticRegressionProblem1 { + target: Array, + ) -> LogisticRegressionProblem<'a, F, A, D> { + LogisticRegressionProblem { x, target, alpha: self.alpha, } } - /// Create the initial parameters, either from a user supplied guess - /// or a 1-d array of `0`s. - fn setup_init_params(&self, x: &ArrayBase) -> Array1 - where - A: Data, - { - let n_features = x.shape()[1]; - let param_len = if self.fit_intercept { - n_features + 1 - } else { - n_features - }; - - let mut init_parmas = Array1::zeros(param_len); - - if let Some((params, intercept)) = self.initial_params.as_ref() { - init_parmas.slice_mut(s![..n_features]).assign(params); - if param_len == n_features + 1 { - init_parmas[n_features] = *intercept; - } - } - - init_parmas - } - /// Create the LBFGS solver using MoreThuenteLineSearch and set gradient /// tolerance. 
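    ///
    /// Schematically, the configured solver is then driven by `run_solver`
    /// below through argmin's executor (a sketch of the call shape used in
    /// this module, not additional API):
    ///
    /// ```ignore
    /// Executor::new(problem, solver, init_params)
    ///     .max_iters(max_iterations)
    ///     .run()
    /// ```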
- fn setup_solver(&self) -> LBFGSType1 { + fn setup_solver(&self) -> LBFGSType { let linesearch = MoreThuenteLineSearch::new(); LBFGS::new(linesearch, 10).with_tol_grad(self.gradient_tolerance) } /// Run the LBFGS solver until it converges or runs out of iterations. - fn run_solver<'a, A>( + fn run_solver( &self, - problem: LogisticRegressionProblem1<'a, F, A>, - solver: LBFGSType1, - init_params: Array1, - ) -> Result>> - where - A: Data, - { - Executor::new(problem, solver, ArgminParam(init_params)) + problem: P, + solver: P::Solver, + init_params: P::Param, + ) -> Result> { + Executor::new(problem, solver, init_params) .max_iters(self.max_iterations) .run() .map_err(|err| err.into()) @@ -365,6 +376,15 @@ where )) } +fn label_classes_multi(y: T) -> Result<(ClassLabels, Array2)> +where + F: Float, + T: AsTargets, + C: PartialOrd + Clone, +{ + todo!() +} + /// Conditionally split the feature vector `w` into parameter vector and /// intercept parameter. /// Dimensions of `w` are either (f) or (f, n_classes) @@ -448,36 +468,34 @@ fn logistic_loss>( -yz.sum() + F::cast(0.5) * alpha * params.dot(¶ms) } -/// Computes loss function of `-sum(Y * log(softmax(H))) + alpha/2 * norm(W)`, where H is `X . W + b`. -/// Also returns the log of the probabilities, which is `log(softmax(H))`, along with `W`. +/// Compute the log of probabilities, which is `log(softmax(H))`, along with `W`. /// `Y` is the output (n_samples * n_classes), `X` is the input (n_samples * n_features), `W` is the /// params (n_features * n_classes), `b` is the intercept vector (n_classes). -fn multi_logistic_loss_prob_params>( +fn multi_logistic_prob_params>( x: &ArrayBase, - y: &Array2, - alpha: F, w: &Array2, // This parameter includes `W` and `b` -) -> (F, Array2, Array2) { +) -> (Array2, Array2) { let n_features = x.shape()[1]; let (params, intercept) = convert_params(n_features, w); // Compute H let h = x.dot(¶ms) + intercept; // This computes `H - log(sum(exp(H)))`, which is equal to // `log(softmax(H)) = log(exp(H) / sum(exp(H)))` - let log_prob = h - log_sum_exp(&h, Axis(1)); - // Calculate loss - // XXX Should we divide cost by n_samples??? - let loss = -elem_dot(&log_prob, y) + F::cast(0.5) * alpha * elem_dot(¶ms, ¶ms); - (loss, log_prob, params) + let log_prob = &h - log_sum_exp(&h, Axis(1)); + (log_prob, params) } +/// Computes loss function of `-sum(Y * log(softmax(H))) + alpha/2 * norm(W)`, where H is `X . W + b`. fn multi_logistic_loss>( x: &ArrayBase, y: &Array2, alpha: F, w: &Array2, ) -> F { - multi_logistic_loss_prob_params(x, y, alpha, w).0 + let (log_prob, params) = multi_logistic_prob_params(x, w); + // Calculate loss + // XXX Should we divide cost by n_samples??? + -elem_dot(&log_prob, y) + F::cast(0.5) * alpha * elem_dot(¶ms, ¶ms) } /// Computes the gradient of the logistic loss function @@ -504,7 +522,7 @@ fn logistic_grad>( } } -/// Computes multinomial gradients for `W` and `b`, combine them, and flatten the result. +/// Computes multinomial gradients for `W` and `b` and combine them. /// Gradient for `W` is `Xt . (softmax(H) - Y) + alpha * W`. /// Gradient for `b` is `sum(softmax(H) - Y)`. 
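///
/// These formulas follow from the softmax/cross-entropy identity: with
/// `L = -sum(Y * log(softmax(H)))` the derivative is `dL/dH = softmax(H) - Y`,
/// so the chain rule through `H = X . W + b` gives
/// `dL/dW = Xt . (softmax(H) - Y)` and `dL/db = sum(softmax(H) - Y)` over
/// samples; the `alpha * W` term is the gradient of the `alpha/2 * norm(W)`
/// penalty.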
fn multi_logistic_grad>( @@ -512,8 +530,8 @@ fn multi_logistic_grad>( y: &Array2, alpha: F, w: &Array2, -) -> Array1 { - let (loss, log_prob, params) = multi_logistic_loss_prob_params(x, y, alpha, w); +) -> Array2 { + let (log_prob, mut params) = multi_logistic_prob_params(x, w); let (n_features, n_classes) = params.dim(); let intercept = x.ncols() > n_features; let mut grad = Array::zeros((n_features + intercept as usize, n_classes)); @@ -531,7 +549,7 @@ fn multi_logistic_grad>( grad.row_mut(n_features).assign(&diff.sum_axis(Axis(0))); } // XXX should we divide by n_samples?? - grad.into_shape(grad.len()).unwrap() + grad } /// A fitted logistic regression which can make predictions @@ -638,22 +656,20 @@ fn class_from_label(labels: &[ClassLabel] /// Internal representation of a logistic regression problem. /// This data structure exists to be handed to Argmin. -struct LogisticRegressionProblem1<'a, F: Float, A: Data> { +struct LogisticRegressionProblem<'a, F: Float, A: Data, D: Dimension> { x: &'a ArrayBase, - target: Array1, + target: Array, alpha: F, } +type LogisticRegressionProblem1<'a, F, A> = LogisticRegressionProblem<'a, F, A, Ix1>; +type LogisticRegressionProblem2<'a, F, A> = LogisticRegressionProblem<'a, F, A, Ix2>; + impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem1<'a, F, A> { - /// Type of the parameter vector type Param = ArgminParam; - /// Type of the return value computed by the cost function type Output = F; - /// Type of the Hessian. Can be `()` if not needed. type Hessian = (); - /// Type of the Jacobian. Can be `()` if not needed. type Jacobian = Array1; - /// Floating point precision type Float = F; /// Apply the cost function to a parameter `p` @@ -674,22 +690,11 @@ impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem1<'a } } -struct LogisticRegressionProblem2<'a, F: Float, A: Data> { - x: &'a ArrayBase, - target: Array2, - alpha: F, -} - impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem2<'a, F, A> { - /// Type of the parameter vector type Param = ArgminParam; - /// Type of the return value computed by the cost function type Output = F; - /// Type of the Hessian. Can be `()` if not needed. type Hessian = (); - /// Type of the Jacobian. Can be `()` if not needed. 
type Jacobian = Array1; - /// Floating point precision type Float = F; /// Apply the cost function to a parameter `p` @@ -710,6 +715,18 @@ impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem2<'a } } +trait SolvableProblem: ArgminOp + Sized { + type Solver: Solver; +} + +impl<'a, F: Float, A: Data> SolvableProblem for LogisticRegressionProblem1<'a, F, A> { + type Solver = LBFGSType1; +} + +impl<'a, F: Float, A: Data> SolvableProblem for LogisticRegressionProblem2<'a, F, A> { + type Solver = LBFGSType2; +} + #[cfg(test)] mod test { extern crate linfa; From b289afe9e4e44c2f7a4ce58dc88a9196097f80c8 Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 15:59:45 -0400 Subject: [PATCH 04/18] Finished multi fitted model --- algorithms/linfa-logistic/Cargo.toml | 1 + algorithms/linfa-logistic/src/lib.rs | 146 +++++++++++++++++++++++---- 2 files changed, 127 insertions(+), 20 deletions(-) diff --git a/algorithms/linfa-logistic/Cargo.toml b/algorithms/linfa-logistic/Cargo.toml index 9342faf64..c7454605e 100644 --- a/algorithms/linfa-logistic/Cargo.toml +++ b/algorithms/linfa-logistic/Cargo.toml @@ -16,6 +16,7 @@ categories = ["algorithms", "mathematics", "science"] [dependencies] ndarray = { version = "0.15", features = ["approx", "blas"] } ndarray-linalg = "0.14" +ndarray-stats = "0.5.0" num-traits = "0.2" argmin = { version = "0.4.6", features = ["ndarrayl"] } serde = "1.0" diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 44d0a8c9a..cc09e441c 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -25,8 +25,10 @@ use argmin::solver::quasinewton::lbfgs::LBFGS; use linfa::prelude::{AsTargets, DatasetBase}; use linfa::traits::{Fit, PredictInplace}; use ndarray::{ - s, Array, Array1, Array2, ArrayBase, Axis, Data, Dimension, Ix1, Ix2, RemoveAxis, Slice, + s, Array, Array1, Array2, ArrayBase, Axis, Data, DataMut, Dimension, Ix1, Ix2, RemoveAxis, + Slice, Zip, }; +use ndarray_stats::QuantileExt; use std::default::Default; mod argmin_param; @@ -272,27 +274,29 @@ impl LogisticRegression { } /// Take an ArgminResult and return a FittedLogisticRegression. - fn convert_result( + fn convert_result<'a, A, C>( &self, labels: ClassLabels, - result: &ArgminResult>, + result: &ArgminResult>, ) -> Result> where A: Data, C: PartialOrd + Clone, + LogisticRegressionProblem<'a, F, A, D>: SolvableProblem, { - let mut intercept = F::cast(0.0); - let mut params = result.state().best_param.as_array().clone(); - if self.fit_intercept { - intercept = params[params.len() - 1]; - params = params.slice(s![..params.len() - 1]).to_owned(); - } + let params = result.state().best_param.as_array(); + let n_features = if self.fit_intercept { + params.shape()[0] - 1 + } else { + params.shape()[0] + }; + let (w, intercept) = convert_params(n_features, params); Ok(FittedLogisticRegression::new(intercept, params, labels)) } } impl<'a, C: 'a + PartialOrd + Clone, F: Float, D: Data, T: AsTargets> - Fit, T, Error> for LogisticRegression + Fit, T, Error> for LogisticRegression { type Object = FittedLogisticRegression; @@ -317,6 +321,16 @@ impl<'a, C: 'a + PartialOrd + Clone, F: Float, D: Data, T: AsTargets, T: AsTargets> + Fit, T, Error> for LogisticRegression +{ + type Object = MultiFittedLogisticRegression; + + fn fit(&self, dataset: &DatasetBase, T>) -> Result { + self.fit(dataset.records(), dataset.targets()) + } +} + /// Identify the distinct values of the classes `y` and associate /// the target labels `-1.0` and `1.0` to it. 
-1.0 always labels the /// smaller class (by PartialOrd) and 1.0 always labels the larger @@ -446,6 +460,14 @@ fn log_sum_exp>( reduced.mapv_into(|e| e.ln() + max) } +/// Computes `exp(n - max) / sum(exp(n- max))`, which is a numerically stable version of softmax +fn softmax_inplace>(v: &mut ArrayBase) { + let max = v.iter().copied().reduce(F::max).unwrap(); + v.mapv_inplace(|n| (n - max).exp()); + let sum = v.sum(); + v.mapv_inplace(|n| n / max); +} + /// Computes the logistic loss assuming the training labels $y \in {-1, 1}$ /// /// Because the logistic function fullfills $\sigma(-z) = 1 - \sigma(z)$ @@ -601,24 +623,100 @@ impl FittedLogisticRegression { probs.mapv_inplace(logistic); probs } +} +impl> + PredictInplace, Array1> for FittedLogisticRegression +{ /// Given a feature matrix, predict the classes learned when the model was /// fitted. - fn predict>(&self, x: &ArrayBase) -> Array1 { + fn predict_inplace(&self, x: &ArrayBase, y: &mut Array1) { + assert_eq!( + x.nrows(), + y.len(), + "The number of data points must match the number of output targets." + ); + assert_eq!( + x.ncols(), + self.params.len(), + "Number of data features must match the number of features the model was trained with." + ); + let pos_class = class_from_label(&self.labels, F::POSITIVE_LABEL); let neg_class = class_from_label(&self.labels, F::NEGATIVE_LABEL); - self.predict_probabilities(x).mapv(|probability| { - if probability >= self.threshold { - pos_class.clone() - } else { - neg_class.clone() - } - }) + Zip::from(&self.predict_probabilities(x)) + .and(y) + .for_each(|prob, out| { + *out = if *prob >= self.threshold { + pos_class.clone() + } else { + neg_class.clone() + } + }); + } + + fn default_target(&self, x: &ArrayBase) -> Array1 { + Array1::default(x.nrows()) + } +} + +/// A fitted logistic regression which can make predictions +#[derive(PartialEq, Debug)] +pub struct MultiFittedLogisticRegression { + threshold: F, + intercept: F, + params: Array2, + classes: Vec, +} + +impl MultiFittedLogisticRegression { + fn new(intercept: F, params: Array2, classes: Vec) -> Self { + Self { + threshold: F::cast(0.5), + intercept, + params, + classes, + } + } + + /// Set the probability threshold for which the 'positive' class will be + /// predicted. Defaults to 0.5. + pub fn set_threshold(mut self, threshold: F) -> Self { + if threshold < F::zero() || threshold > F::one() { + panic!("FittedLogisticRegression::set_threshold: threshold needs to be between 0.0 and 1.0"); + } + self.threshold = threshold; + self + } + + pub fn intercept(&self) -> F { + self.intercept + } + + pub fn params(&self) -> &Array2 { + &self.params + } + + /// Return non-normalized probabilities (n_samples * n_classes) + fn predict_nonorm_probabilities>(&self, x: &ArrayBase) -> Array2 { + let mut probs = x.dot(&self.params) + self.intercept; + probs + } + + /// Return normalized probabilities for each output class. The output dimensions are (n_samples + /// * n_classes). + pub fn predict_probabilities>(&self, x: &ArrayBase) -> Array2 { + let mut probs = self.predict_nonorm_probabilities(x); + probs + .rows_mut() + .into_iter() + .for_each(|row| softmax_inplace(&mut row)); + probs } } impl> - PredictInplace, Array1> for FittedLogisticRegression + PredictInplace, Array1> for MultiFittedLogisticRegression { /// Given a feature matrix, predict the classes learned when the model was /// fitted. @@ -628,8 +726,16 @@ impl> y.len(), "The number of data points must match the number of output targets." 
); + assert_eq!( + x.ncols(), + self.params.nrows(), + "Number of data features must match the number of features the model was trained with." + ); - *y = self.predict(x); + let probs = self.predict_nonorm_probabilities(x); + Zip::from(probs.rows()).and(y).for_each(|prob_row, out| { + *out = prob_row.argmax_skipnan().unwrap(); + }); } fn default_target(&self, x: &ArrayBase) -> Array1 { From 7ba41c1e921604a021ea9fcee7df452a8256ea5e Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 17:04:51 -0400 Subject: [PATCH 05/18] Fixed non-test code --- algorithms/linfa-logistic/src/lib.rs | 179 ++++++++++----------------- 1 file changed, 68 insertions(+), 111 deletions(-) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index cc09e441c..2d32e18d2 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -25,8 +25,8 @@ use argmin::solver::quasinewton::lbfgs::LBFGS; use linfa::prelude::{AsTargets, DatasetBase}; use linfa::traits::{Fit, PredictInplace}; use ndarray::{ - s, Array, Array1, Array2, ArrayBase, Axis, Data, DataMut, Dimension, Ix1, Ix2, RemoveAxis, - Slice, Zip, + s, Array, Array1, Array2, ArrayBase, Axis, Data, DataMut, Dimension, IntoDimension, Ix1, Ix2, + RemoveAxis, Slice, Zip, }; use ndarray_stats::QuantileExt; use std::default::Default; @@ -85,77 +85,6 @@ type LBFGSType = LBFGS, F>, Argmin type LBFGSType1 = LBFGSType; type LBFGSType2 = LBFGSType; -impl LogisticRegression { - /// Given a 2-dimensional feature matrix array `x` with shape - /// (n_samples, n_features) and an iterable of target classes to predict, - /// create a `FittedLinearRegression` object which allows making - /// predictions. - /// - /// The iterable of target classes `y` must have exactly two distinct - /// values, (e.g. 0.0 and 1.0, 0 and 1, "cat" and "dog", ...), which - /// represent the two different classes the model is supposed to predict. - /// - /// The iterable `y` must also produces exactly `n_samples` items, i.e. - /// exactly as many items as there are rows in the feature matrix `x`. - /// - /// This method returns an error if any of the preconditions are violated, - /// i.e. any values are `Inf` or `NaN`, `y` doesn't have as many items as - /// `x` has rows, or if other parameters (gradient_tolerance, alpha) have - /// been set to inalid values. - fn fit(&self, x: &ArrayBase, y: T) -> Result> - where - A: Data, - T: AsTargets, - C: PartialOrd + Clone, - { - let (labels, target) = label_classes(y)?; - self.validate_data(x, &target)?; - let problem = self.setup_problem(x, target); - let solver = self.setup_solver(); - let init_params = self.setup_init_params(x.ncols()); - let result = self.run_solver(problem, solver, ArgminParam(init_params))?; - self.convert_result(labels, &result) - } - - /// Create the initial parameters, either from a user supplied guess - /// or a 1-d array of `0`s. 
- fn setup_init_params(&self, n_features: usize) -> Array1 { - if let Some(params) = self.initial_params.as_ref() { - params.clone() - } else { - Array::zeros(n_features + self.fit_intercept as usize) - } - } -} - -impl LogisticRegression { - fn fit(&self, x: &ArrayBase, y: T) -> Result> - where - A: Data, - T: AsTargets, - C: PartialOrd + Clone, - { - let (labels, target) = label_classes_multi(y)?; - self.validate_data(x, &target)?; - let problem = self.setup_problem(x, target); - let solver = self.setup_solver(); - let init_params = self.setup_init_params(x.ncols(), target.ncols()); - let result = self.run_solver(problem, solver, ArgminParam(init_params))?; - self.convert_result(labels, &result) - } - - /// Create the initial parameters, either from a user supplied guess - /// or a 1-d array of `0`s. - fn setup_init_params(&self, n_features: usize, n_classes: usize) -> Array2 { - if let Some(params) = self.initial_params.as_ref() { - params.clone() - } else { - n_features + self.fit_intercept as usize; - Array::zeros((n_features + self.fit_intercept as usize, n_classes)) - } - } -} - impl LogisticRegression { /// Creates a new LogisticRegression with default configuration. pub fn new() -> Self { @@ -203,6 +132,18 @@ impl LogisticRegression { self } + /// Create the initial parameters, either from a user supplied guess + /// or a 1-d array of `0`s. + fn setup_init_params(&self, dims: D::Pattern) -> Array { + if let Some(params) = self.initial_params.as_ref() { + params.clone() + } else { + let mut dims = dims.into_dimension(); + dims.as_array_view_mut()[0] += self.fit_intercept as usize; + Array::zeros(dims) + } + } + /// Ensure that `x` and `y` have the right shape and that all data and /// configuration parameters are finite. fn validate_data, B: Data>( @@ -272,30 +213,9 @@ impl LogisticRegression { .run() .map_err(|err| err.into()) } - - /// Take an ArgminResult and return a FittedLogisticRegression. 
- fn convert_result<'a, A, C>( - &self, - labels: ClassLabels, - result: &ArgminResult>, - ) -> Result> - where - A: Data, - C: PartialOrd + Clone, - LogisticRegressionProblem<'a, F, A, D>: SolvableProblem, - { - let params = result.state().best_param.as_array(); - let n_features = if self.fit_intercept { - params.shape()[0] - 1 - } else { - params.shape()[0] - }; - let (w, intercept) = convert_params(n_features, params); - Ok(FittedLogisticRegression::new(intercept, params, labels)) - } } -impl<'a, C: 'a + PartialOrd + Clone, F: Float, D: Data, T: AsTargets> +impl<'a, C: 'a + Ord + Clone, F: Float, D: Data, T: AsTargets> Fit, T, Error> for LogisticRegression { type Object = FittedLogisticRegression; @@ -317,17 +237,41 @@ impl<'a, C: 'a + PartialOrd + Clone, F: Float, D: Data, T: AsTargets, T>) -> Result { - self.fit(dataset.records(), dataset.targets()) + let (x, y) = (dataset.records(), dataset.targets()); + let (labels, target) = label_classes(y)?; + self.validate_data(x, &target)?; + let problem = self.setup_problem(x, target); + let solver = self.setup_solver(); + let init_params = self.setup_init_params(x.ncols()); + let result = self.run_solver(problem, solver, ArgminParam(init_params))?; + + let params = result.state().best_param.as_array(); + let (w, intercept) = convert_params(x.nrows(), params); + Ok(FittedLogisticRegression::new( + intercept.into_scalar(), + w, + labels, + )) } } -impl<'a, C: 'a + PartialOrd + Clone, F: Float, D: Data, T: AsTargets> +impl<'a, C: 'a + Ord + Clone, F: Float, D: Data, T: AsTargets> Fit, T, Error> for LogisticRegression { type Object = MultiFittedLogisticRegression; fn fit(&self, dataset: &DatasetBase, T>) -> Result { - self.fit(dataset.records(), dataset.targets()) + let (x, y) = (dataset.records(), dataset.targets()); + let (classes, target) = label_classes_multi(y)?; + self.validate_data(x, &target)?; + let problem = self.setup_problem(x, target); + let solver = self.setup_solver(); + let init_params = self.setup_init_params((x.ncols(), classes.len())); + let result = self.run_solver(problem, solver, ArgminParam(init_params))?; + + let params = result.state().best_param.as_array(); + let (w, intercept) = convert_params(x.nrows(), params); + Ok(MultiFittedLogisticRegression::new(intercept, w, classes)) } } @@ -341,7 +285,7 @@ fn label_classes(y: T) -> Result<(ClassLabels, Array1)> where F: Float, T: AsTargets, - C: PartialOrd + Clone, + C: Ord + Clone, { let y_single_target = y.try_single_target()?; let mut classes: Vec<&C> = vec![]; @@ -390,13 +334,26 @@ where )) } -fn label_classes_multi(y: T) -> Result<(ClassLabels, Array2)> +fn label_classes_multi(y: T) -> Result<(Vec, Array2)> where F: Float, T: AsTargets, - C: PartialOrd + Clone, + C: Ord + Clone, { - todo!() + let y_single_target = y.try_single_target()?; + let mut classes = y_single_target.to_vec(); + // Dedup the list of classes + classes.sort(); + classes.dedup(); + + let mut onehot = Array2::zeros((y_single_target.len(), classes.len())); + Zip::from(onehot.rows_mut()) + .and(&y_single_target) + .for_each(|mut oh_row, cls| { + let idx = classes.binary_search(cls).unwrap(); + oh_row[idx] = F::one(); + }); + Ok((classes, onehot)) } /// Conditionally split the feature vector `w` into parameter vector and @@ -465,7 +422,7 @@ fn softmax_inplace>(v: &mut ArrayBase> #[derive(PartialEq, Debug)] pub struct MultiFittedLogisticRegression { threshold: F, - intercept: F, + intercept: Array1, params: Array2, classes: Vec, } impl MultiFittedLogisticRegression { - fn new(intercept: F, params: Array2, 
classes: Vec) -> Self { + fn new(intercept: Array1, params: Array2, classes: Vec) -> Self { Self { threshold: F::cast(0.5), intercept, @@ -689,8 +646,8 @@ impl MultiFittedLogisticRegression { self } - pub fn intercept(&self) -> F { - self.intercept + pub fn intercept(&self) -> &Array1 { + &self.intercept } pub fn params(&self) -> &Array2 { @@ -699,8 +656,7 @@ impl MultiFittedLogisticRegression { /// Return non-normalized probabilities (n_samples * n_classes) fn predict_nonorm_probabilities>(&self, x: &ArrayBase) -> Array2 { - let mut probs = x.dot(&self.params) + self.intercept; - probs + x.dot(&self.params) + &self.intercept } /// Return normalized probabilities for each output class. The output dimensions are (n_samples @@ -710,7 +666,7 @@ impl MultiFittedLogisticRegression { probs .rows_mut() .into_iter() - .for_each(|row| softmax_inplace(&mut row)); + .for_each(|mut row| softmax_inplace(&mut row)); probs } } @@ -734,7 +690,8 @@ impl> let probs = self.predict_nonorm_probabilities(x); Zip::from(probs.rows()).and(y).for_each(|prob_row, out| { - *out = prob_row.argmax_skipnan().unwrap(); + let idx = prob_row.argmax().unwrap(); + *out = self.classes[idx].clone(); }); } From 023fa481039bed782d1f8bdc826895895cd083d1 Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 17:06:13 -0400 Subject: [PATCH 06/18] Implement dot and norm on ArgminParams --- algorithms/linfa-logistic/src/argmin_param.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/algorithms/linfa-logistic/src/argmin_param.rs b/algorithms/linfa-logistic/src/argmin_param.rs index 34f9b8b7e..be6316294 100644 --- a/algorithms/linfa-logistic/src/argmin_param.rs +++ b/algorithms/linfa-logistic/src/argmin_param.rs @@ -44,15 +44,13 @@ impl ArgminAdd, ArgminParam> for impl ArgminDot, F> for ArgminParam { fn dot(&self, other: &ArgminParam) -> F { - //self.0.dot(&other.0) - todo!() + elem_dot(&self.0, &other.0) } } impl ArgminNorm for ArgminParam { fn norm(&self) -> F { - //self.0.dot(&self.0) - todo!() + elem_dot(&self.0, &self.0) } } From 2e76958d365025708675c22bba6d8af3fab7369d Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 17:36:27 -0400 Subject: [PATCH 07/18] Everything compiles --- algorithms/linfa-logistic/src/lib.rs | 112 +++++++++++++++++---------- 1 file changed, 71 insertions(+), 41 deletions(-) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 2d32e18d2..1e57a6517 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -67,7 +67,7 @@ use float::Float; /// let model = LogisticRegression::default().fit(&dataset).unwrap(); /// let prediction = model.predict(&dataset); /// ``` -pub struct LogisticRegression { +pub struct LogisticRegressionBase { alpha: F, fit_intercept: bool, max_iterations: u64, @@ -75,9 +75,12 @@ pub struct LogisticRegression { initial_params: Option>, } -impl Default for LogisticRegression { +pub type LogisticRegression = LogisticRegressionBase; +pub type MultiLogisticRegression = LogisticRegressionBase; + +impl Default for LogisticRegressionBase { fn default() -> Self { - LogisticRegression::new() + LogisticRegressionBase::new() } } @@ -85,10 +88,10 @@ type LBFGSType = LBFGS, F>, Argmin type LBFGSType1 = LBFGSType; type LBFGSType2 = LBFGSType; -impl LogisticRegression { +impl LogisticRegressionBase { /// Creates a new LogisticRegression with default configuration. 
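    ///
    /// A minimal usage sketch of the builder surface (shown with the two-class
    /// alias; `MultiLogisticRegression` is configured the same way, and the
    /// parameter values here are arbitrary):
    ///
    /// ```ignore
    /// let model = LogisticRegression::default()
    ///     .alpha(0.5)
    ///     .with_intercept(true)
    ///     .max_iterations(200);
    /// ```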
pub fn new() -> Self { - LogisticRegression { + LogisticRegressionBase { alpha: F::cast(1.0), fit_intercept: true, max_iterations: 100, @@ -216,7 +219,7 @@ impl LogisticRegression { } impl<'a, C: 'a + Ord + Clone, F: Float, D: Data, T: AsTargets> - Fit, T, Error> for LogisticRegression + Fit, T, Error> for LogisticRegression { type Object = FittedLogisticRegression; @@ -256,7 +259,7 @@ impl<'a, C: 'a + Ord + Clone, F: Float, D: Data, T: AsTargets, T: AsTargets> - Fit, T, Error> for LogisticRegression + Fit, T, Error> for MultiLogisticRegression { type Object = MultiFittedLogisticRegression; @@ -796,6 +799,7 @@ mod test { use super::*; use approx::{assert_abs_diff_eq, AbsDiffEq}; + use linfa::prelude::*; use ndarray::{array, Array2}; /// Test that the logistic loss function works as expected. @@ -925,11 +929,15 @@ mod test { fn simple_example_1() { let log_reg = LogisticRegression::default(); let x = array![[-1.0], [-0.01], [0.01], [1.0]]; - let y = array![0.0, 0.0, 1.0, 1.0]; - let res = log_reg.fit(&x, &y).unwrap(); + let y = array![0, 0, 1, 1]; + let dataset = Dataset::new(x, y); + let res = log_reg.fit(&dataset).unwrap(); assert_abs_diff_eq!(res.intercept(), 0.0); assert!(res.params().abs_diff_eq(&array![0.681], 1e-3)); - assert_eq!(res.predict(&x), y); + assert_eq!( + &res.predict(dataset.records()), + dataset.targets().try_single_target().unwrap() + ); } #[test] @@ -937,13 +945,17 @@ mod test { let log_reg = LogisticRegression::default(); let x = array![[0.01], [1.0], [-1.0], [-0.01]]; let y = array!["dog", "dog", "cat", "cat"]; - let res = log_reg.fit(&x, &y).unwrap(); + let dataset = Dataset::new(x, y); + let res = log_reg.fit(&dataset).unwrap(); assert_abs_diff_eq!(res.intercept(), 0.0); assert!(res.params().abs_diff_eq(&array![0.681], 1e-3)); assert!(res - .predict_probabilities(&x) + .predict_probabilities(dataset.records()) .abs_diff_eq(&array![0.501, 0.664, 0.335, 0.498], 1e-3)); - assert_eq!(res.predict(&x), y); + assert_eq!( + &res.predict(dataset.records()), + dataset.targets().try_single_target().unwrap() + ); } #[test] @@ -961,11 +973,15 @@ mod test { [8.0], [9.0] ]; - let y = array![0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]; - let res = log_reg.fit(&x, &y).unwrap(); + let y = array![0, 0, 0, 0, 1, 1, 1, 1, 1, 1]; + let dataset = Dataset::new(x, y); + let res = log_reg.fit(&dataset).unwrap(); assert!(res.intercept().abs_diff_eq(&-4.124, 1e-3)); assert!(res.params().abs_diff_eq(&array![1.181], 1e-3)); - assert_eq!(res.predict(&x), y); + assert_eq!( + &res.predict(dataset.records()), + dataset.targets().try_single_target().unwrap() + ); } #[test] @@ -973,7 +989,7 @@ mod test { let log_reg = LogisticRegression::default(); let x = array![[0.01], [1.0], [-1.0], [-0.01]]; let y = array![[0, 0], [0, 0], [0, 0], [0, 0]]; - let res = log_reg.fit(&x, &y); + let res = log_reg.fit(&Dataset::new(x, y)); assert_eq!( res.unwrap_err().to_string(), "multiple targets not supported".to_string() @@ -984,8 +1000,8 @@ mod test { fn rejects_mismatching_x_y() { let log_reg = LogisticRegression::default(); let x = array![[-1.0], [-0.01], [0.01]]; - let y = array![0.0, 0.0, 1.0, 1.0]; - let res = log_reg.fit(&x, &y); + let y = array![0, 0, 1, 1]; + let res = log_reg.fit(&Dataset::new(x, y)); assert_eq!( res.unwrap_err().to_string(), "Expected `x` and `y` to have same number of rows, got 3 != 4".to_string() @@ -998,15 +1014,15 @@ mod test { let inf_xs: Vec<_> = infs.iter().map(|&inf| array![[1.0], [inf]]).collect(); let log_reg = LogisticRegression::default(); let normal_x = array![[-1.0], 
[1.0]]; - let y = array![0.0, 1.0]; + let y = array![0, 1]; let expected = "Values must be finite and not `Inf`, `-Inf` or `NaN`".to_string(); for inf_x in &inf_xs { - let res = log_reg.fit(inf_x, &y); + let res = log_reg.fit(&Dataset::new(inf_x.to_owned(), y.to_owned())); assert_eq!(res.unwrap_err().to_string(), expected); } for inf in &infs { let log_reg = LogisticRegression::default().alpha(*inf); - let res = log_reg.fit(&normal_x, &y); + let res = log_reg.fit(&Dataset::new(normal_x.to_owned(), y.to_owned())); assert_eq!(res.unwrap_err().to_string(), expected); } let mut non_positives = infs; @@ -1014,7 +1030,7 @@ mod test { non_positives.push(0.0); for inf in &non_positives { let log_reg = LogisticRegression::default().gradient_tolerance(*inf); - let res = log_reg.fit(&normal_x, &y); + let res = log_reg.fit(&Dataset::new(normal_x.to_owned(), y.to_owned())); assert_eq!( res.unwrap_err().to_string(), "gradient_tolerance must be a positive, finite number" @@ -1026,30 +1042,36 @@ mod test { fn validates_initial_params() { let infs = vec![std::f64::INFINITY, std::f64::NEG_INFINITY, std::f64::NAN]; let normal_x = array![[-1.0], [1.0]]; - let normal_y = array![0.0, 1.0]; + let normal_y = array![0, 1]; + let dataset = Dataset::new(normal_x, normal_y); let expected = "Initial parameter guess must be finite".to_string(); for inf in &infs { - let log_reg = LogisticRegression::default().initial_params(array![*inf], 0.0); - let res = log_reg.fit(&normal_x, &normal_y); - assert_eq!(res.unwrap_err().to_string(), expected); - } - for inf in &infs { - let log_reg = LogisticRegression::default().initial_params(array![0.0], *inf); - let res = log_reg.fit(&normal_x, &normal_y); + let log_reg = LogisticRegression::default().initial_params(array![*inf]); + let res = log_reg.fit(&dataset); assert_eq!(res.unwrap_err().to_string(), expected); } { - let log_reg = LogisticRegression::default().initial_params(array![0.0, 0.0], 0.0); - let res = log_reg.fit(&normal_x, &normal_y); + let log_reg = LogisticRegression::default().initial_params(array![0.0, 0.0]); + let res = log_reg.fit(&dataset); assert_eq!(res.unwrap_err().to_string(), "Size of initial parameter guess must be the same as the number of columns in the feature matrix `x`".to_string()); } + { + let log_reg = LogisticRegression::default() + .with_intercept(false) + .initial_params(array![0.0, 0.0]); + let res = log_reg.fit(&dataset); + assert_eq!( + res.unwrap_err().to_string(), + "Initial parameter should be smaller when there's no intercept".to_string() + ); + } } #[test] fn uses_initial_params() { - let (params, intercept) = (array![1.2], -4.12); + let params = array![1.2, -4.12]; let log_reg = LogisticRegression::default() - .initial_params(params, intercept) + .initial_params(params) .max_iterations(5); let x = array![ [0.0], @@ -1063,21 +1085,29 @@ mod test { [8.0], [9.0] ]; - let y = array![0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]; - let res = log_reg.fit(&x, &y).unwrap(); + let y = array![0, 0, 0, 0, 1, 1, 1, 1, 1, 1]; + let dataset = Dataset::new(x, y); + let res = log_reg.fit(&dataset).unwrap(); assert!(res.intercept().abs_diff_eq(&-4.124, 1e-3)); assert!(res.params().abs_diff_eq(&array![1.181], 1e-3)); - assert_eq!(res.predict(&x), y); + assert_eq!( + &res.predict(dataset.records()), + dataset.targets().try_single_target().unwrap() + ); } #[test] fn works_with_f32() { let log_reg = LogisticRegression::default(); let x: Array2 = array![[-1.0], [-0.01], [0.01], [1.0]]; - let y: Array1 = array![0.0, 0.0, 1.0, 1.0]; - let res = 
log_reg.fit(&x, &y).unwrap(); + let y = array![0, 0, 1, 1]; + let dataset = Dataset::new(x, y); + let res = log_reg.fit(&dataset).unwrap(); assert_abs_diff_eq!(res.intercept(), 0.0_f32); assert!(res.params().abs_diff_eq(&array![0.682_f32], 1e-3)); - assert_eq!(res.predict(&x), y); + assert_eq!( + &res.predict(dataset.records()), + dataset.targets().try_single_target().unwrap() + ); } } From 2624bfdb1990a8a703c896064eefa158964d4892 Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 17:37:42 -0400 Subject: [PATCH 08/18] Get rid of ndarray-linalg --- algorithms/linfa-logistic/Cargo.toml | 1 - algorithms/linfa-logistic/src/float.rs | 2 -- 2 files changed, 3 deletions(-) diff --git a/algorithms/linfa-logistic/Cargo.toml b/algorithms/linfa-logistic/Cargo.toml index c7454605e..98f9ee528 100644 --- a/algorithms/linfa-logistic/Cargo.toml +++ b/algorithms/linfa-logistic/Cargo.toml @@ -15,7 +15,6 @@ categories = ["algorithms", "mathematics", "science"] [dependencies] ndarray = { version = "0.15", features = ["approx", "blas"] } -ndarray-linalg = "0.14" ndarray-stats = "0.5.0" num-traits = "0.2" argmin = { version = "0.4.6", features = ["ndarrayl"] } diff --git a/algorithms/linfa-logistic/src/float.rs b/algorithms/linfa-logistic/src/float.rs index 49d0f6846..1cac762b2 100644 --- a/algorithms/linfa-logistic/src/float.rs +++ b/algorithms/linfa-logistic/src/float.rs @@ -1,7 +1,6 @@ use crate::argmin_param::ArgminParam; use argmin::prelude::{ArgminFloat, ArgminMul}; use ndarray::{Dimension, Ix1, Ix2, NdFloat}; -use ndarray_linalg::Lapack; use num_traits::FromPrimitive; /// A Float trait that captures the requirements we need for the various @@ -9,7 +8,6 @@ use num_traits::FromPrimitive; pub trait Float: ArgminFloat + NdFloat - + Lapack + Default + Clone + FromPrimitive From 658acb3e181dce85ed62b1f7331e860637fd0d05 Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 17:39:18 -0400 Subject: [PATCH 09/18] Revert "Get rid of ndarray-linalg" This reverts commit 2624bfdb1990a8a703c896064eefa158964d4892. 
--- algorithms/linfa-logistic/Cargo.toml | 1 + algorithms/linfa-logistic/src/float.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/algorithms/linfa-logistic/Cargo.toml b/algorithms/linfa-logistic/Cargo.toml index 98f9ee528..c7454605e 100644 --- a/algorithms/linfa-logistic/Cargo.toml +++ b/algorithms/linfa-logistic/Cargo.toml @@ -15,6 +15,7 @@ categories = ["algorithms", "mathematics", "science"] [dependencies] ndarray = { version = "0.15", features = ["approx", "blas"] } +ndarray-linalg = "0.14" ndarray-stats = "0.5.0" num-traits = "0.2" argmin = { version = "0.4.6", features = ["ndarrayl"] } diff --git a/algorithms/linfa-logistic/src/float.rs b/algorithms/linfa-logistic/src/float.rs index 1cac762b2..49d0f6846 100644 --- a/algorithms/linfa-logistic/src/float.rs +++ b/algorithms/linfa-logistic/src/float.rs @@ -1,6 +1,7 @@ use crate::argmin_param::ArgminParam; use argmin::prelude::{ArgminFloat, ArgminMul}; use ndarray::{Dimension, Ix1, Ix2, NdFloat}; +use ndarray_linalg::Lapack; use num_traits::FromPrimitive; /// A Float trait that captures the requirements we need for the various @@ -8,6 +9,7 @@ use num_traits::FromPrimitive; pub trait Float: ArgminFloat + NdFloat + + Lapack + Default + Clone + FromPrimitive From 596ad8b9da5ed59e48d964d0c4179820d7dbd39c Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 20:12:46 -0400 Subject: [PATCH 10/18] Fixed all tests to pass --- algorithms/linfa-logistic/src/argmin_param.rs | 2 +- algorithms/linfa-logistic/src/lib.rs | 64 +++++++++---------- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/algorithms/linfa-logistic/src/argmin_param.rs b/algorithms/linfa-logistic/src/argmin_param.rs index be6316294..3905263df 100644 --- a/algorithms/linfa-logistic/src/argmin_param.rs +++ b/algorithms/linfa-logistic/src/argmin_param.rs @@ -38,7 +38,7 @@ impl ArgminSub, ArgminParam> for impl ArgminAdd, ArgminParam> for ArgminParam { fn add(&self, other: &ArgminParam) -> ArgminParam { - ArgminParam(&self.0 - &other.0) + ArgminParam(&self.0 + &other.0) } } diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 1e57a6517..7723fa0b2 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -249,7 +249,7 @@ impl<'a, C: 'a + Ord + Clone, F: Float, D: Data, T: AsTargets, T: AsTargets> ArgminOp for LogisticRegressionProblem1<'a /// Apply the cost function to a parameter `p` fn apply(&self, p: &Self::Param) -> std::result::Result { let w = p.as_array(); - Ok(logistic_loss(self.x, &self.target, self.alpha, w)) + let cost = logistic_loss(self.x, &self.target, self.alpha, w); + println!("apply: {} {}", w, cost); + Ok(cost) } /// Compute the gradient at parameter `p`. 
fn gradient(&self, p: &Self::Param) -> std::result::Result { let w = p.as_array(); - Ok(ArgminParam(logistic_grad( - self.x, - &self.target, - self.alpha, - w, - ))) + let grad = ArgminParam(logistic_grad(self.x, &self.target, self.alpha, w)); + println!("grad: {} {}", w, grad.as_array()); + Ok(grad) } } @@ -797,6 +796,7 @@ impl<'a, F: Float, A: Data> SolvableProblem for LogisticRegressionProb mod test { extern crate linfa; + use super::Error; use super::*; use approx::{assert_abs_diff_eq, AbsDiffEq}; use linfa::prelude::*; @@ -990,10 +990,10 @@ mod test { let x = array![[0.01], [1.0], [-1.0], [-0.01]]; let y = array![[0, 0], [0, 0], [0, 0], [0, 0]]; let res = log_reg.fit(&Dataset::new(x, y)); - assert_eq!( - res.unwrap_err().to_string(), - "multiple targets not supported".to_string() - ); + assert!(matches!( + res.unwrap_err(), + Error::LinfaError(linfa::Error::MultipleTargets) + )); } #[test] @@ -1002,10 +1002,7 @@ mod test { let x = array![[-1.0], [-0.01], [0.01]]; let y = array![0, 0, 1, 1]; let res = log_reg.fit(&Dataset::new(x, y)); - assert_eq!( - res.unwrap_err().to_string(), - "Expected `x` and `y` to have same number of rows, got 3 != 4".to_string() - ); + assert!(matches!(res.unwrap_err(), Error::MismatchedShapes(3, 4))); } #[test] @@ -1015,15 +1012,14 @@ mod test { let log_reg = LogisticRegression::default(); let normal_x = array![[-1.0], [1.0]]; let y = array![0, 1]; - let expected = "Values must be finite and not `Inf`, `-Inf` or `NaN`".to_string(); for inf_x in &inf_xs { let res = log_reg.fit(&Dataset::new(inf_x.to_owned(), y.to_owned())); - assert_eq!(res.unwrap_err().to_string(), expected); + assert!(matches!(res.unwrap_err(), Error::InvalidValues)); } for inf in &infs { let log_reg = LogisticRegression::default().alpha(*inf); let res = log_reg.fit(&Dataset::new(normal_x.to_owned(), y.to_owned())); - assert_eq!(res.unwrap_err().to_string(), expected); + assert!(matches!(res.unwrap_err(), Error::InvalidValues)); } let mut non_positives = infs; non_positives.push(-1.0); @@ -1031,10 +1027,7 @@ mod test { for inf in &non_positives { let log_reg = LogisticRegression::default().gradient_tolerance(*inf); let res = log_reg.fit(&Dataset::new(normal_x.to_owned(), y.to_owned())); - assert_eq!( - res.unwrap_err().to_string(), - "gradient_tolerance must be a positive, finite number" - ); + assert!(matches!(res.unwrap_err(), Error::InvalidGradientTolerance)); } } @@ -1044,26 +1037,31 @@ mod test { let normal_x = array![[-1.0], [1.0]]; let normal_y = array![0, 1]; let dataset = Dataset::new(normal_x, normal_y); - let expected = "Initial parameter guess must be finite".to_string(); for inf in &infs { - let log_reg = LogisticRegression::default().initial_params(array![*inf]); + let log_reg = LogisticRegression::default().initial_params(array![*inf, 0.0]); let res = log_reg.fit(&dataset); - assert_eq!(res.unwrap_err().to_string(), expected); + assert!(matches!( + res.unwrap_err(), + Error::InvalidInitialParametersGuess + )); } { - let log_reg = LogisticRegression::default().initial_params(array![0.0, 0.0]); + let log_reg = LogisticRegression::default().initial_params(array![0.0, 0.0, 0.0]); let res = log_reg.fit(&dataset); - assert_eq!(res.unwrap_err().to_string(), "Size of initial parameter guess must be the same as the number of columns in the feature matrix `x`".to_string()); + assert!(matches!( + res.unwrap_err(), + Error::InvalidInitialParametersGuessSize + )); } { let log_reg = LogisticRegression::default() .with_intercept(false) .initial_params(array![0.0, 0.0]); let res = 
log_reg.fit(&dataset); - assert_eq!( - res.unwrap_err().to_string(), - "Initial parameter should be smaller when there's no intercept".to_string() - ); + assert!(matches!( + res.unwrap_err(), + Error::InvalidInitialParametersGuessSize + )); } } From 054bf3df1cf997075b3bfbddf6b52271d5587d80 Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Sun, 15 Aug 2021 22:17:28 -0400 Subject: [PATCH 11/18] Add test for multinomial loss and grad --- algorithms/linfa-logistic/src/lib.rs | 118 +++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 5 deletions(-) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 7723fa0b2..c8f306c75 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -450,7 +450,8 @@ fn logistic_loss>( -yz.sum() + F::cast(0.5) * alpha * params.dot(¶ms) } -/// Compute the log of probabilities, which is `log(softmax(H))`, along with `W`. +/// Compute the log of probabilities, which is `log(softmax(H))`, where H is `X . W + b`. Also +/// returns `W` without the intercept. /// `Y` is the output (n_samples * n_classes), `X` is the input (n_samples * n_features), `W` is the /// params (n_features * n_classes), `b` is the intercept vector (n_classes). fn multi_logistic_prob_params>( @@ -463,11 +464,11 @@ fn multi_logistic_prob_params>( let h = x.dot(¶ms) + intercept; // This computes `H - log(sum(exp(H)))`, which is equal to // `log(softmax(H)) = log(exp(H) / sum(exp(H)))` - let log_prob = &h - log_sum_exp(&h, Axis(1)); + let log_prob = &h - log_sum_exp(&h, Axis(1)).into_shape((h.nrows(), 1)).unwrap(); (log_prob, params) } -/// Computes loss function of `-sum(Y * log(softmax(H))) + alpha/2 * norm(W)`, where H is `X . W + b`. +/// Computes loss function of `-sum(Y * log(softmax(H))) + alpha/2 * norm(W)` fn multi_logistic_loss>( x: &ArrayBase, y: &Array2, @@ -515,7 +516,7 @@ fn multi_logistic_grad>( ) -> Array2 { let (log_prob, mut params) = multi_logistic_prob_params(x, w); let (n_features, n_classes) = params.dim(); - let intercept = x.ncols() > n_features; + let intercept = w.nrows() > n_features; let mut grad = Array::zeros((n_features + intercept as usize, n_classes)); // This value is `softmax(H)` @@ -798,7 +799,7 @@ mod test { use super::Error; use super::*; - use approx::{assert_abs_diff_eq, AbsDiffEq}; + use approx::{assert_abs_diff_eq, assert_relative_eq, AbsDiffEq}; use linfa::prelude::*; use ndarray::{array, Array2}; @@ -1108,4 +1109,111 @@ mod test { dataset.targets().try_single_target().unwrap() ); } + + #[test] + fn test_log_sum_exp() { + let data = array![[3.3, 0.4, -2.1], [0.4, 2.2, -0.1], [1., 0., -1.]]; + let out = log_sum_exp(&data, Axis(1)); + assert_abs_diff_eq!(out, array![3.35783, 2.43551, 1.40761], epsilon = 1e-5); + } + + #[test] + fn test_softmax() { + let mut data = array![3.3, 5.5, 0.1, -4.4, 8.0]; + softmax_inplace(&mut data); + assert_relative_eq!( + data, + array![0.0083324, 0.075200047, 0.000339647, 0.000003773, 0.91612413], + epsilon = 1e-8 + ); + assert_abs_diff_eq!(data.sum(), 1.0); + } + + #[test] + fn test_multi_logistic_loss_grad() { + let x = array![ + [0.0, 0.5], + [1.0, -1.0], + [2.0, -2.0], + [3.0, -3.0], + [4.0, -4.0], + [5.0, -5.0], + [6.0, -6.0], + [7.0, -7.0], + ]; + let y = array![ + [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + ]; + let params1 = array![[4.4, -1.2, 3.3], [3.4, 0.1, 0.0]]; + let params2 = array![[0.001, -3.2, 2.9], [0.1, 4.5, 5.7], [4.5, 
2.2, 1.7]]; + let alpha = 0.6; + + { + let (log_prob, w) = multi_logistic_prob_params(&x, ¶ms1); + assert_abs_diff_eq!( + log_prob, + array![ + [-3.18259845e-01, -1.96825985e+00, -2.01825985e+00], + [-2.40463987e+00, -4.70463987e+00, -1.04639868e-01], + [-4.61010168e+00, -9.21010168e+00, -1.01016809e-02], + [-6.90100829e+00, -1.38010083e+01, -1.00829256e-03], + [-9.20010104e+00, -1.84001010e+01, -1.01044506e-04], + [-1.15000101e+01, -2.30000101e+01, -1.01301449e-05], + [-1.38000010e+01, -2.76000010e+01, -1.01563199e-06], + [-1.61000001e+01, -3.22000001e+01, -1.01826043e-07], + ], + epsilon = 1e-6 + ); + assert_abs_diff_eq!(w, params1); + let loss = multi_logistic_loss(&x, &y, alpha, ¶ms1); + assert_abs_diff_eq!(loss, 57.11212197835295, epsilon = 1e-6); + let grad = multi_logistic_grad(&x, &y, alpha, ¶ms1); + assert_abs_diff_eq!( + grad, + array![ + [1.7536815, -9.71074369, 11.85706219], + [2.79002537, 9.12059357, -9.81061893] + ], + epsilon = 1e-6 + ); + } + + { + let (log_prob, w) = multi_logistic_prob_params(&x, ¶ms2); + assert_abs_diff_eq!( + log_prob, + array![ + [-1.06637742e+00, -1.16637742e+00, -1.06637742e+00], + [-4.12429463e-03, -9.90512429e+00, -5.50512429e+00], + [-2.74092305e-04, -1.75022741e+01, -8.20227409e+00], + [-1.84027855e-05, -2.51030184e+01, -1.09030184e+01], + [-1.23554225e-06, -3.27040012e+01, -1.36040012e+01], + [-8.29523046e-08, -4.03050001e+01, -1.63050001e+01], + [-5.56928016e-09, -4.79060000e+01, -1.90060000e+01], + [-3.73912013e-10, -5.55070000e+01, -2.17070000e+01] + ], + epsilon = 1e-6 + ); + assert_abs_diff_eq!(w, params2.slice(s![..params2.nrows() - 1, ..])); + let loss = multi_logistic_loss(&x, &y, alpha, ¶ms2); + assert_abs_diff_eq!(loss, 154.8177958366479, epsilon = 1e-6); + let grad = multi_logistic_grad(&x, &y, alpha, ¶ms2); + assert_abs_diff_eq!( + grad, + array![ + [26.99587549, -10.91995003, -16.25532546], + [-27.26314882, 11.85569669, 21.58745213], + [5.33984376, -2.68845675, -2.65138701] + ], + epsilon = 1e-6 + ); + } + } } From 1b136284455e4061a52d041c8f5ca36ceadde395 Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Mon, 16 Aug 2021 00:15:42 -0400 Subject: [PATCH 12/18] Fix argmin param norm --- algorithms/linfa-logistic/src/argmin_param.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithms/linfa-logistic/src/argmin_param.rs b/algorithms/linfa-logistic/src/argmin_param.rs index 3905263df..e4867143a 100644 --- a/algorithms/linfa-logistic/src/argmin_param.rs +++ b/algorithms/linfa-logistic/src/argmin_param.rs @@ -50,7 +50,7 @@ impl ArgminDot, F> for ArgminParam ArgminNorm for ArgminParam { fn norm(&self) -> F { - elem_dot(&self.0, &self.0) + num_traits::Float::sqrt(elem_dot(&self.0, &self.0)) } } From 0cf4b440476631efd44f41bb19cf944975f12dde Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Mon, 16 Aug 2021 00:53:19 -0400 Subject: [PATCH 13/18] Add multi logistic regression tests --- algorithms/linfa-logistic/src/lib.rs | 98 ++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 11 deletions(-) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index c8f306c75..14330de7e 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -743,7 +743,6 @@ impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem1<'a fn apply(&self, p: &Self::Param) -> std::result::Result { let w = p.as_array(); let cost = logistic_loss(self.x, &self.target, self.alpha, w); - println!("apply: {} {}", w, cost); Ok(cost) } @@ -751,7 +750,6 @@ impl<'a, F: Float, A: 
Data> ArgminOp for LogisticRegressionProblem1<'a fn gradient(&self, p: &Self::Param) -> std::result::Result { let w = p.as_array(); let grad = ArgminParam(logistic_grad(self.x, &self.target, self.alpha, w)); - println!("grad: {} {}", w, grad.as_array()); Ok(grad) } } @@ -766,18 +764,15 @@ impl<'a, F: Float, A: Data> ArgminOp for LogisticRegressionProblem2<'a /// Apply the cost function to a parameter `p` fn apply(&self, p: &Self::Param) -> std::result::Result { let w = p.as_array(); - Ok(multi_logistic_loss(self.x, &self.target, self.alpha, w)) + let cost = multi_logistic_loss(self.x, &self.target, self.alpha, w); + Ok(cost) } /// Compute the gradient at parameter `p`. fn gradient(&self, p: &Self::Param) -> std::result::Result { let w = p.as_array(); - Ok(ArgminParam(multi_logistic_grad( - self.x, - &self.target, - self.alpha, - w, - ))) + let grad = ArgminParam(multi_logistic_grad(self.x, &self.target, self.alpha, w)); + Ok(grad) } } @@ -977,8 +972,6 @@ mod test { let y = array![0, 0, 0, 0, 1, 1, 1, 1, 1, 1]; let dataset = Dataset::new(x, y); let res = log_reg.fit(&dataset).unwrap(); - assert!(res.intercept().abs_diff_eq(&-4.124, 1e-3)); - assert!(res.params().abs_diff_eq(&array![1.181], 1e-3)); assert_eq!( &res.predict(dataset.records()), dataset.targets().try_single_target().unwrap() @@ -1216,4 +1209,87 @@ mod test { ); } } + + #[test] + fn simple_multi_example() { + let x = array![[-1., 0.], [0., 1.], [1., 1.]]; + let y = array![2, 1, 0]; + let log_reg = MultiLogisticRegression::default() + .alpha(0.1) + .initial_params(Array::zeros((3, 3))); + let dataset = Dataset::new(x, y); + let res = log_reg.fit(&dataset).unwrap(); + assert_eq!(res.params().dim(), (2, 3)); + assert_eq!(res.intercept().dim(), 3); + assert_eq!( + &res.predict(dataset.records()), + dataset.targets().try_single_target().unwrap() + ); + } + + #[test] + fn simple_multi_example_text() { + let log_reg = MultiLogisticRegression::default().alpha(0.1); + let x = array![[0.1], [1.0], [-1.0], [-0.1]]; + let y = array!["dog", "ape", "rocket", "cat"]; + let dataset = Dataset::new(x, y); + let res = log_reg.fit(&dataset).unwrap(); + assert_eq!(res.params().dim(), (1, 4)); + assert_eq!(res.intercept().dim(), 4); + assert_eq!( + &res.predict(dataset.records()), + dataset.targets().try_single_target().unwrap() + ); + } + + #[test] + fn multi_on_binary_problem() { + let log_reg = MultiLogisticRegression::default().alpha(1.0); + let x = array![ + [0.0], + [1.0], + [2.0], + [3.0], + [4.0], + [5.0], + [6.0], + [7.0], + [8.0], + [9.0] + ]; + let y = array![0, 0, 0, 0, 1, 1, 1, 1, 1, 1]; + let dataset = Dataset::new(x, y); + let res = log_reg.fit(&dataset).unwrap(); + assert_eq!(res.params().dim(), (1, 2)); + assert_eq!(res.intercept().dim(), 2); + assert_eq!( + &res.predict(dataset.records()), + dataset.targets().try_single_target().unwrap() + ); + } + + #[test] + fn reject_num_class_mismatch() { + let n_samples = 4; + let n_classes = 3; + let n_features = 1; + let x = Array2::::zeros((n_samples, n_features)); + let y = array![0, 1, 2, 0]; + let dataset = Dataset::new(x, y); + + let log_reg = MultiLogisticRegression::default() + .initial_params(Array::zeros((n_features, n_classes))); + assert!(matches!( + log_reg.fit(&dataset).unwrap_err(), + Error::InvalidInitialParametersGuessSize + )); + + let log_reg = MultiLogisticRegression::default() + .with_intercept(false) + .initial_params(Array::zeros((n_features + 1, n_classes))); + assert!(matches!( + log_reg.fit(&dataset).unwrap_err(), + Error::InvalidInitialParametersGuessSize + )); + } } 
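For readers tracking the multinomial math introduced in the last three patches: the loss and gradient code above leans on the identity `log(softmax(H)) = H - log_sum_exp(H)`, with `log_sum_exp` evaluated against the row maximum for numerical stability. Below is a minimal standalone sketch of those two pieces, assuming ndarray 0.15; the names `log_softmax` and `multinomial_loss` are illustrative rather than the crate's internal API, and the intercept row that the crate folds into `w` is omitted for brevity.

```rust
// Standalone sketch, not crate code: stable log-softmax and the multinomial
// loss -sum(Y * log(softmax(H))) + alpha/2 * ||W||^2, assuming ndarray 0.15.
use ndarray::{Array1, Array2, Axis};

fn log_softmax(h: &Array2<f64>) -> Array2<f64> {
    // Subtract the row-wise max before exponentiating to avoid overflow:
    // log(softmax(H)) = H - (max + log(sum(exp(H - max))))
    let max = h.fold_axis(Axis(1), f64::NEG_INFINITY, |&m, &v| m.max(v));
    let log_sum_exp: Array1<f64> = (h - &max.view().insert_axis(Axis(1)))
        .mapv(f64::exp)
        .sum_axis(Axis(1))
        .mapv(f64::ln)
        + &max;
    h - &log_sum_exp.insert_axis(Axis(1))
}

fn multinomial_loss(x: &Array2<f64>, y: &Array2<f64>, alpha: f64, w: &Array2<f64>) -> f64 {
    // `y` is one-hot (n_samples, n_classes); `w` is (n_features, n_classes)
    let log_prob = log_softmax(&x.dot(w));
    -(y * &log_prob).sum() + 0.5 * alpha * (w * w).sum()
}
```

Substituting `x.dot(w) + b` for `x.dot(w)` recovers the `H = X . W + b` used by `multi_logistic_prob_params` above.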
From 9ea787f2c918ac5744995e19569c70c7866459b9 Mon Sep 17 00:00:00 2001
From: YuhanLiin 
Date: Mon, 23 Aug 2021 16:04:02 -0400
Subject: [PATCH 14/18] Add docs

---
 algorithms/linfa-logistic/src/lib.rs | 64 +++++++++++++++++-----------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs
index 14330de7e..58f34bacf 100644
--- a/algorithms/linfa-logistic/src/lib.rs
+++ b/algorithms/linfa-logistic/src/lib.rs
@@ -5,7 +5,7 @@
 //! `linfa-logistic` is a crate in the [`linfa`](https://crates.io/crates/linfa) ecosystem, an effort to create a toolkit for classical Machine Learning implemented in pure Rust, akin to Python's `scikit-learn`.
 //!
 //! ## Current state
-//! `linfa-logistic` provides a pure Rust implementation of a two class [logistic regression model](struct.LogisticRegression.html).
+//! `linfa-logistic` provides a pure Rust implementation of a [binomial logistic regression model](struct.LogisticRegression.html) and a [multinomial logistic regression model](struct.MultiLogisticRegression).
 //!
 //! ## Examples
 //!
@@ -37,6 +37,16 @@ mod float;
 use argmin_param::{elem_dot, ArgminParam};
 use float::Float;
 
+/// A generalized logistic regression type that specializes as either binomial logistic regression
+/// or multinomial logistic regression.
+pub struct LogisticRegressionBase {
+    alpha: F,
+    fit_intercept: bool,
+    max_iterations: u64,
+    gradient_tolerance: F,
+    initial_params: Option>,
+}
+
 /// A two-class logistic regression model.
 ///
 /// Logistic regression combines linear models with
@@ -67,15 +77,18 @@ use float::Float;
 /// let model = LogisticRegression::default().fit(&dataset).unwrap();
 /// let prediction = model.predict(&dataset);
 /// ```
-pub struct LogisticRegressionBase {
-    alpha: F,
-    fit_intercept: bool,
-    max_iterations: u64,
-    gradient_tolerance: F,
-    initial_params: Option>,
-}
-
 pub type LogisticRegression = LogisticRegressionBase;
+
+/// A multinomial logistic regression model.
+///
+/// The output labels can be drawn from any discrete set of values, since the algorithm calculates the
+/// likelihood of a feature vector corresponding to any given outcome using the softmax function
+/// `softmax(x) = exp(x) / sum(exp(xi))`.
+///
+/// L2 regularization is used by this algorithm and is weighted by parameter `alpha`. Setting `alpha`
+/// close to zero removes regularization and the problem solved minimizes only the
+/// empirical risk. On the other hand, setting `alpha` to a high value increases
+/// the weight of the L2 norm of the linear model coefficients in the cost function.
 pub type MultiLogisticRegression = LogisticRegressionBase;
 
 impl Default for LogisticRegressionBase {
@@ -228,9 +241,9 @@ impl<'a, C: 'a + Ord + Clone, F: Float, D: Data, T: AsTargets, T: AsTargets;
+    /// Given a 2-dimensional feature matrix array `x` with shape
+    /// (n_samples, n_features) and an array of target classes to predict,
+    /// create a `MultiFittedLogisticRegression` object which allows making
+    /// predictions. The target classes can have any number of discrete values.
+    ///
+    /// This method returns an error if any of the preconditions are violated,
+    /// i.e. any values are `Inf` or `NaN`, `y` doesn't have as many items as
+    /// `x` has rows, or if other parameters (gradient_tolerance, alpha) have
+    /// been set to invalid values. The input features are also strongly recommended to be
+    /// normalized to ensure numerical stability.
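+    ///
+    /// A short usage sketch (it mirrors the `simple_multi_example` test; the
+    /// data here is illustrative only):
+    ///
+    /// ```ignore
+    /// use linfa::prelude::*;
+    /// use ndarray::array;
+    ///
+    /// let x = array![[-1.0, 0.0], [0.0, 1.0], [1.0, 1.0]];
+    /// let y = array![2, 1, 0];
+    /// let dataset = Dataset::new(x, y);
+    /// let model = MultiLogisticRegression::default()
+    ///     .alpha(0.1)
+    ///     .fit(&dataset)
+    ///     .unwrap();
+    /// let prediction = model.predict(&dataset);
+    /// ```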
fn fit(&self, dataset: &DatasetBase, T>) -> Result { let (x, y) = (dataset.records(), dataset.targets()); let (classes, target) = label_classes_multi(y)?; @@ -337,6 +360,9 @@ where )) } +/// Identify the distinct values of the classes in `y` and map each value to an integer. Smaller +/// classes (by `PartialOrd`) map to smaller integers. Returns the mapping along with a one-hot +/// encoding of the numerical labels corresponding to `y`. fn label_classes_multi(y: T) -> Result<(Vec, Array2)> where F: Float, @@ -621,10 +647,9 @@ impl> } } -/// A fitted logistic regression which can make predictions +/// A fitted multinomial logistic regression which can make predictions #[derive(PartialEq, Debug)] pub struct MultiFittedLogisticRegression { - threshold: F, intercept: Array1, params: Array2, classes: Vec, @@ -633,23 +658,12 @@ pub struct MultiFittedLogisticRegression { impl MultiFittedLogisticRegression { fn new(intercept: Array1, params: Array2, classes: Vec) -> Self { Self { - threshold: F::cast(0.5), intercept, params, classes, } } - /// Set the probability threshold for which the 'positive' class will be - /// predicted. Defaults to 0.5. - pub fn set_threshold(mut self, threshold: F) -> Self { - if threshold < F::zero() || threshold > F::one() { - panic!("FittedLogisticRegression::set_threshold: threshold needs to be between 0.0 and 1.0"); - } - self.threshold = threshold; - self - } - pub fn intercept(&self) -> &Array1 { &self.intercept } From 84f6011981a1e201ec79935e772106f7bd7dc4b8 Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Tue, 24 Aug 2021 23:35:06 -0400 Subject: [PATCH 15/18] Get rid of some cloning in the tests --- algorithms/linfa-logistic/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 58f34bacf..2d82b4050 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -1021,12 +1021,12 @@ mod test { let normal_x = array![[-1.0], [1.0]]; let y = array![0, 1]; for inf_x in &inf_xs { - let res = log_reg.fit(&Dataset::new(inf_x.to_owned(), y.to_owned())); + let res = log_reg.fit(&DatasetBase::new(inf_x.view(), &y)); assert!(matches!(res.unwrap_err(), Error::InvalidValues)); } for inf in &infs { let log_reg = LogisticRegression::default().alpha(*inf); - let res = log_reg.fit(&Dataset::new(normal_x.to_owned(), y.to_owned())); + let res = log_reg.fit(&DatasetBase::new(normal_x.view(), &y)); assert!(matches!(res.unwrap_err(), Error::InvalidValues)); } let mut non_positives = infs; From 9f2ed3903294fe193bded36a2e51c278cc9c369e Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Thu, 26 Aug 2021 01:02:43 -0400 Subject: [PATCH 16/18] Fix comments and docs --- algorithms/linfa-logistic/src/lib.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 2d82b4050..1845b9fd7 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -113,7 +113,7 @@ impl LogisticRegressionBase { } } - /// Set the normalization parameter `alpha` used for L2 normalization, + /// Set the regularization parameter `alpha` used for L2 regularization, /// defaults to `1.0`. pub fn alpha(mut self, alpha: F) -> Self { self.alpha = alpha; @@ -503,7 +503,6 @@ fn multi_logistic_loss>( ) -> F { let (log_prob, params) = multi_logistic_prob_params(x, w); // Calculate loss - // XXX Should we divide cost by n_samples??? 
-elem_dot(&log_prob, y) + F::cast(0.5) * alpha * elem_dot(¶ms, ¶ms) } @@ -557,7 +556,6 @@ fn multi_logistic_grad>( if intercept { grad.row_mut(n_features).assign(&diff.sum_axis(Axis(0))); } - // XXX should we divide by n_samples?? grad } From 8e7cf9715787a9629984f0f00d8623c1ce6a793c Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Thu, 26 Aug 2021 23:00:29 -0400 Subject: [PATCH 17/18] Use CowArray and ArrayView to prevent copying --- algorithms/linfa-logistic/src/lib.rs | 103 +++++++++++++++------------ 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 1845b9fd7..9db737c4a 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -25,8 +25,8 @@ use argmin::solver::quasinewton::lbfgs::LBFGS; use linfa::prelude::{AsTargets, DatasetBase}; use linfa::traits::{Fit, PredictInplace}; use ndarray::{ - s, Array, Array1, Array2, ArrayBase, Axis, Data, DataMut, Dimension, IntoDimension, Ix1, Ix2, - RemoveAxis, Slice, Zip, + s, Array, Array1, Array2, ArrayBase, ArrayView, ArrayView2, Axis, CowArray, Data, DataMut, + Dimension, IntoDimension, Ix1, Ix2, RemoveAxis, Slice, Zip, }; use ndarray_stats::QuantileExt; use std::default::Default; @@ -140,16 +140,18 @@ impl LogisticRegressionBase { self } - /// Configure the initial parameters from where the optimization starts. - /// The `params` array must have at least the same number of rows as there are columns on the - /// feature matrix `x` passed to the `fit` method + /// Configure the initial parameters from where the optimization starts. The `params` array + /// must have the same number of rows as there are columns on the feature matrix `x` passed to + /// the `fit` method. If `with_intercept` is set, then it needs to have one more row. For + /// multinomial regression, `params` also must have the same number of columns as the number of + /// distinct classes in `y`. pub fn initial_params(mut self, params: Array) -> Self { self.initial_params = Some(params); self } - /// Create the initial parameters, either from a user supplied guess - /// or a 1-d array of `0`s. 
+ /// Create the initial parameters, either from a user supplied array + /// or an array of 0s fn setup_init_params(&self, dims: D::Pattern) -> Array { if let Some(params) = self.initial_params.as_ref() { params.clone() @@ -264,8 +266,8 @@ impl<'a, C: 'a + Ord + Clone, F: Float, D: Data, T: AsTargets, T: AsTargets( n_features: usize, w: &Array, -) -> (Array, Array) { +) -> (ArrayView, CowArray) { let nrows = w.shape()[0]; if n_features == nrows { - (w.to_owned(), Array::zeros(w.raw_dim().remove_axis(Axis(0)))) + ( + w.view(), + Array::zeros(w.raw_dim().remove_axis(Axis(0))).into(), + ) } else if n_features + 1 == nrows { ( - w.slice_axis(Axis(0), Slice::from(..n_features)).to_owned(), - w.index_axis(Axis(0), n_features).to_owned(), + w.slice_axis(Axis(0), Slice::from(..n_features)), + w.index_axis(Axis(0), n_features).into(), ) } else { panic!( @@ -471,19 +480,47 @@ fn logistic_loss>( ) -> F { let n_features = x.shape()[1]; let (params, intercept) = convert_params(n_features, w); - let mut yz = (x.dot(¶ms) + intercept) * y; + let yz = x.dot(¶ms.into_shape((params.len(), 1)).unwrap()) + intercept; + let len = yz.len(); + let mut yz = yz.into_shape(len).unwrap() * y; yz.mapv_inplace(log_logistic); -yz.sum() + F::cast(0.5) * alpha * params.dot(¶ms) } +/// Computes the gradient of the logistic loss function +fn logistic_grad>( + x: &ArrayBase, + y: &Array1, + alpha: F, + w: &Array1, +) -> Array1 { + let n_features = x.shape()[1]; + let (params, intercept) = convert_params(n_features, w); + let yz = x.dot(¶ms.into_shape((params.len(), 1)).unwrap()) + intercept; + let len = yz.len(); + let mut yz = yz.into_shape(len).unwrap() * y; + yz.mapv_inplace(logistic); + yz -= F::one(); + yz *= y; + if w.len() == n_features + 1 { + let mut grad = Array::zeros(w.len()); + grad.slice_mut(s![..n_features]) + .assign(&(x.t().dot(&yz) + (¶ms * alpha))); + grad[n_features] = yz.sum(); + grad + } else { + x.t().dot(&yz) + (¶ms * alpha) + } +} + /// Compute the log of probabilities, which is `log(softmax(H))`, where H is `X . W + b`. Also /// returns `W` without the intercept. /// `Y` is the output (n_samples * n_classes), `X` is the input (n_samples * n_features), `W` is the /// params (n_features * n_classes), `b` is the intercept vector (n_classes). -fn multi_logistic_prob_params>( +fn multi_logistic_prob_params<'a, F: Float, A: Data>( x: &ArrayBase, - w: &Array2, // This parameter includes `W` and `b` -) -> (Array2, Array2) { + w: &'a Array2, // This parameter includes `W` and `b` +) -> (Array2, ArrayView2<'a, F>) { let n_features = x.shape()[1]; let (params, intercept) = convert_params(n_features, w); // Compute H @@ -506,30 +543,6 @@ fn multi_logistic_loss>( -elem_dot(&log_prob, y) + F::cast(0.5) * alpha * elem_dot(¶ms, ¶ms) } -/// Computes the gradient of the logistic loss function -fn logistic_grad>( - x: &ArrayBase, - y: &Array1, - alpha: F, - w: &Array1, -) -> Array1 { - let n_features = x.shape()[1]; - let (params, intercept) = convert_params(n_features, w); - let mut yz = (x.dot(¶ms) + intercept) * y; - yz.mapv_inplace(logistic); - yz -= F::one(); - yz *= y; - if w.len() == n_features + 1 { - let mut grad = Array::zeros(w.len()); - grad.slice_mut(s![..n_features]) - .assign(&(x.t().dot(&yz) + &(params * alpha))); - grad[n_features] = yz.sum(); - grad - } else { - x.t().dot(&yz) + &(params * alpha) - } -} - /// Computes multinomial gradients for `W` and `b` and combine them. /// Gradient for `W` is `Xt . (softmax(H) - Y) + alpha * W`. /// Gradient for `b` is `sum(softmax(H) - Y)`. 
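The two gradient formulas in the doc comment above combine into one short routine. A minimal standalone sketch, again assuming ndarray 0.15 and one-hot targets, with `w` excluding the intercept row (when an intercept is fitted, the crate instead stacks the `b` gradient as an extra row of the `W` gradient):

```rust
// Standalone sketch, not crate code: the multinomial gradients quoted above.
use ndarray::{Array1, Array2, Axis};

fn multinomial_grad(
    x: &Array2<f64>,   // (n_samples, n_features)
    y: &Array2<f64>,   // one-hot targets, (n_samples, n_classes)
    alpha: f64,
    w: &Array2<f64>,   // (n_features, n_classes)
) -> (Array2<f64>, Array1<f64>) {
    // softmax(H) computed row by row, stabilized by the row maximum
    let mut prob = x.dot(w);
    for mut row in prob.rows_mut() {
        let max = row.fold(f64::NEG_INFINITY, |m, &v| m.max(v));
        row.mapv_inplace(|v| (v - max).exp());
        let sum = row.sum();
        row.mapv_inplace(|v| v / sum);
    }
    let diff = prob - y;                          // softmax(H) - Y
    let dw = x.t().dot(&diff) + &(w * alpha);     // Xt . (softmax(H) - Y) + alpha * W
    let db = diff.sum_axis(Axis(0));              // sum(softmax(H) - Y) over samples
    (dw, db)
}
```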
@@ -539,7 +552,7 @@ fn multi_logistic_grad>( alpha: F, w: &Array2, ) -> Array2 { - let (log_prob, mut params) = multi_logistic_prob_params(x, w); + let (log_prob, params) = multi_logistic_prob_params(x, w); let (n_features, n_classes) = params.dim(); let intercept = w.nrows() > n_features; let mut grad = Array::zeros((n_features + intercept as usize, n_classes)); @@ -548,9 +561,7 @@ fn multi_logistic_grad>( let prob = log_prob.mapv_into(num_traits::Float::exp); let diff = prob - y; // Compute gradient for `W` and place it at start of the grad matrix - let mut dw = x.t().dot(&diff); - params *= alpha; - dw += ¶ms; + let dw = x.t().dot(&diff) + (¶ms * alpha); grad.slice_mut(s![..n_features, ..]).assign(&dw); // Compute gradient for `b` and place it at end of grad matrix if intercept { From 65dc9138329399fae1100e2cf783e48d4e0a4bfe Mon Sep 17 00:00:00 2001 From: YuhanLiin Date: Tue, 31 Aug 2021 00:13:52 -0400 Subject: [PATCH 18/18] Add getter for classes for fitted model --- algorithms/linfa-logistic/src/lib.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/algorithms/linfa-logistic/src/lib.rs b/algorithms/linfa-logistic/src/lib.rs index 9db737c4a..00d94a338 100644 --- a/algorithms/linfa-logistic/src/lib.rs +++ b/algorithms/linfa-logistic/src/lib.rs @@ -696,6 +696,11 @@ impl MultiFittedLogisticRegression { .for_each(|mut row| softmax_inplace(&mut row)); probs } + + /// Get the list of class labels, which maps the numerical class indices to the labels + pub fn classes(&self) -> &[C] { + &self.classes + } } impl>