v2 joint regression (#395)

montanalow · web-flow · commit 9eac347d120c · 2022-10-10T18:32:05.000-07:00
diff --git a/pgml-extension/examples/joint_regression.sql b/pgml-extension/examples/joint_regression.sql
@@ -11,10 +11,10 @@ SELECT pgml.load_dataset('linnerud');
 SELECT * FROM pgml.linnerud LIMIT 10;
 
 -- train a simple model on the data
-SELECT * FROM pgml.train_joint('Exercise vs Physiology', 'regression', 'pgml.linnerud', ARRAY['weight', 'waste', 'pulse']);
+SELECT * FROM pgml.train_joint('Exercise vs Physiology', 'regression', 'pgml.linnerud', ARRAY['weight', 'waist', 'pulse']);
 
 -- check out the predictions
-SELECT weight, waste, pulse, pgml.predict_joint('Exercise vs Physiology', ARRAY[chins, situps, jumps]) AS prediction
+SELECT weight, waist, pulse, pgml.predict_joint('Exercise vs Physiology', ARRAY[chins, situps, jumps]) AS prediction
 FROM pgml.linnerud 
 LIMIT 10;
 
@@ -24,7 +24,7 @@ SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'lasso');
 SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'elastic_net');
 SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'least_angle');
 SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'lasso_least_angle');
-SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'orthoganl_matching_pursuit');
+SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'orthogonal_matching_pursuit');
 SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'bayesian_ridge');
 SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'automatic_relevance_determination');
 SELECT * FROM pgml.train_joint('Exercise vs Physiology', algorithm => 'stochastic_gradient_descent');
@@ -77,6 +77,6 @@ SELECT * FROM pgml.deploy('Exercise vs Physiology', 'rollback');
 SELECT * FROM pgml.deploy('Exercise vs Physiology', 'best_score', 'svm');
 
 -- check out the improved predictions
-SELECT weight, waste, pulse, pgml.predict_joint('Exercise vs Physiology', ARRAY[chins, situps, jumps]) AS prediction
+SELECT weight, waist, pulse, pgml.predict_joint('Exercise vs Physiology', ARRAY[chins, situps, jumps]) AS prediction
 FROM pgml.linnerud 
 LIMIT 10;
diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs
@@ -102,7 +102,6 @@ pub fn python_version() -> String {
 fn version() -> String {
     crate::VERSION.to_string()
 }
-
 #[allow(clippy::too_many_arguments)]
 #[pg_extern]
 fn train(
@@ -126,6 +125,50 @@ fn train(
         name!(algorithm, String),
         name!(deployed, bool),
     ),
+> {
+    train_joint(
+        project_name,
+        task,
+        relation_name,
+        match y_column_name {
+            Some(y_column_name) => Some(vec![y_column_name.to_string()]),
+            None => None,
+        },
+        algorithm,
+        hyperparams,
+        search,
+        search_params,
+        search_args,
+        test_size,
+        test_sampling,
+        runtime,
+        automatic_deploy,
+    )
+}
+
+#[allow(clippy::too_many_arguments)]
+#[pg_extern]
+fn train_joint(
+    project_name: &str,
+    task: Option<default!(Task, "NULL")>,
+    relation_name: Option<default!(&str, "NULL")>,
+    y_column_name: Option<default!(Vec<String>, "NULL")>,
+    algorithm: default!(Algorithm, "'linear'"),
+    hyperparams: default!(JsonB, "'{}'"),
+    search: Option<default!(Search, "NULL")>,
+    search_params: default!(JsonB, "'{}'"),
+    search_args: default!(JsonB, "'{}'"),
+    test_size: default!(f32, 0.25),
+    test_sampling: default!(Sampling, "'last'"),
+    runtime: Option<default!(Runtime, "NULL")>,
+    automatic_deploy: Option<default!(bool, true)>,
+) -> impl std::iter::Iterator<
+    Item = (
+        name!(project, String),
+        name!(task, String),
+        name!(algorithm, String),
+        name!(deployed, bool),
+    ),
 > {
     let project = match Project::find_by_name(project_name) {
         Some(project) => project,
@@ -364,6 +407,11 @@ fn deploy(
 
 #[pg_extern]
 fn predict(project_name: &str, features: Vec<f32>) -> f32 {
+    predict_joint(project_name, features)[0]
+}
+
+#[pg_extern]
+fn predict_joint(project_name: &str, features: Vec<f32>) -> Vec<f32> {
     let mut projects = PROJECT_NAME_TO_PROJECT_ID.lock();
     let project_id = match projects.get(project_name) {
         Some(project_id) => *project_id,
@@ -415,7 +463,12 @@ fn snapshot(
     test_size: default!(f32, 0.25),
     test_sampling: default!(Sampling, "'last'"),
 ) -> impl std::iter::Iterator<Item = (name!(relation, String), name!(y_column_name, String))> {
-    Snapshot::create(relation_name, y_column_name, test_size, test_sampling);
+    Snapshot::create(
+        relation_name,
+        vec![y_column_name.to_string()],
+        test_size,
+        test_sampling,
+    );
     vec![(relation_name.to_string(), y_column_name.to_string())].into_iter()
 }
 
@@ -442,7 +495,7 @@ fn load_dataset(
 #[pg_extern]
 fn model_predict(model_id: i64, features: Vec<f32>) -> f32 {
     let estimator = crate::orm::file::find_deployed_estimator_by_model_id(model_id);
-    estimator.predict(&features)
+    estimator.predict(&features)[0]
 }
 
 #[pg_extern]
diff --git a/pgml-extension/src/bindings/lightgbm.rs b/pgml-extension/src/bindings/lightgbm.rs
@@ -78,8 +78,8 @@ fn fit(dataset: &Dataset, hyperparams: &Hyperparams, task: Task) -> Box<dyn Bind
 
 impl Bindings for Estimator {
     /// Predict a novel datapoint.
-    fn predict(&self, features: &[f32]) -> f32 {
-        self.predict_batch(features)[0]
+    fn predict(&self, features: &[f32]) -> Vec<f32> {
+        self.predict_batch(features)
     }
 
     /// Predict a novel datapoint.
diff --git a/pgml-extension/src/bindings/linfa.rs b/pgml-extension/src/bindings/linfa.rs
@@ -52,8 +52,8 @@ impl LinearRegression {
 
 impl Bindings for LinearRegression {
     /// Predict a novel datapoint.
-    fn predict(&self, features: &[f32]) -> f32 {
-        self.predict_batch(features)[0]
+    fn predict(&self, features: &[f32]) -> Vec<f32> {
+        self.predict_batch(features)
     }
 
     /// Predict a novel datapoint.
@@ -182,8 +182,8 @@ impl LogisticRegression {
 
 impl Bindings for LogisticRegression {
     /// Predict a novel datapoint.
-    fn predict(&self, features: &[f32]) -> f32 {
-        self.predict_batch(features)[0]
+    fn predict(&self, features: &[f32]) -> Vec<f32> {
+        self.predict_batch(features)
     }
 
     /// Predict a novel datapoint.
@@ -290,8 +290,8 @@ impl Svm {
 
 impl Bindings for Svm {
     /// Predict a novel datapoint.
-    fn predict(&self, features: &[f32]) -> f32 {
-        self.predict_batch(features)[0]
+    fn predict(&self, features: &[f32]) -> Vec<f32> {
+        self.predict_batch(features)
     }
 
     /// Predict a novel datapoint.
diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs
@@ -22,7 +22,7 @@ pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Box<dyn Bindi
 /// implement serde.
 pub trait Bindings: Send + Sync {
     /// Predict a novel datapoint.
-    fn predict(&self, features: &[f32]) -> f32;
+    fn predict(&self, features: &[f32]) -> Vec<f32>;
 
     /// Predict a set of datapoints.
     fn predict_batch(&self, features: &[f32]) -> Vec<f32>;
diff --git a/pgml-extension/src/bindings/sklearn.py b/pgml-extension/src/bindings/sklearn.py
@@ -67,19 +67,7 @@
 }
 
 
-def estimator(algorithm, num_features, hyperparams):
-    """Returns the correct estimator based on algorithm names
-    we defined internally.
-
-    Parameters:
-        - algorithm: The human-readable name of the algorithm (see dict above).
-        - num_features: The number of features in X.
-        - hyperparams: JSON of hyperparameters.
-    """
-    return estimator_joint(algorithm, num_features, 1, hyperparams)
-
-
-def estimator_joint(algorithm, num_features, num_targets, hyperparams):
+def estimator(algorithm, num_features, num_targets, hyperparams):
     """Returns the correct estimator based on algorithm names we defined
     internally (see dict above).
 
@@ -97,6 +85,22 @@ def estimator_joint(algorithm, num_features, num_targets, hyperparams):
 
     def train(X_train, y_train):
         instance = _ALGORITHM_MAP[algorithm](**hyperparams)
+        if num_targets > 1 and algorithm in [
+            "bayesian_ridge_regression",
+            "automatic_relevance_determination_regression",
+            "stochastic_gradient_descent_regression",
+            "passive_aggressive_regression",
+            "theil_sen_regression",
+            "huber_regression",
+            "quantile_regression",
+            "svm_regression",
+            "nu_svm_regression",
+            "linear_svm_regression",
+            "ada_boost_regression",
+            "gradient_boosting_trees_regression",
+            "lightgbm_regression",
+        ]:
+            instance = sklearn.multioutput.MultiOutputRegressor(instance)
 
         X_train = np.asarray(X_train).reshape((-1, num_features))
 
diff --git a/pgml-extension/src/bindings/sklearn.rs b/pgml-extension/src/bindings/sklearn.rs
@@ -307,6 +307,7 @@ fn fit(
                     &[
                         String::from(algorithm_task).into_py(py),
                         dataset.num_features.into_py(py),
+                        dataset.num_labels.into_py(py),
                         hyperparams.into_py(py),
                     ],
                 ),
@@ -350,8 +351,8 @@ impl std::fmt::Debug for Estimator {
 
 impl Bindings for Estimator {
     /// Predict a novel datapoint.
-    fn predict(&self, features: &[f32]) -> f32 {
-        self.predict_batch(features)[0]
+    fn predict(&self, features: &[f32]) -> Vec<f32> {
+        self.predict_batch(features)
     }
 
     /// Predict a novel datapoint.
diff --git a/pgml-extension/src/bindings/xgboost.rs b/pgml-extension/src/bindings/xgboost.rs
@@ -227,8 +227,8 @@ impl std::fmt::Debug for Estimator {
 
 impl Bindings for Estimator {
     /// Predict a novel datapoint.
-    fn predict(&self, features: &[f32]) -> f32 {
-        self.predict_batch(features)[0]
+    fn predict(&self, features: &[f32]) -> Vec<f32> {
+        self.predict_batch(features)
     }
 
     /// Predict a novel datapoint.
diff --git a/pgml-extension/src/orm/model.rs b/pgml-extension/src/orm/model.rs
@@ -57,13 +57,17 @@ impl Model {
         // Set the runtime to one we recommend, unless the user knows better.
         let runtime = match runtime {
             Some(runtime) => runtime,
-            None => match algorithm {
-                Algorithm::xgboost => Runtime::rust,
-                Algorithm::lightgbm => Runtime::rust,
-                Algorithm::linear => match project.task {
-                    Task::classification => Runtime::python,
-                    Task::regression => Runtime::rust,
+            None => match snapshot.y_column_name.len() {
+                1 => match algorithm {
+                    Algorithm::xgboost => Runtime::rust,
+                    Algorithm::lightgbm => Runtime::rust,
+                    Algorithm::linear => match project.task {
+                        Task::classification => Runtime::python,
+                        Task::regression => Runtime::rust,
+                    },
+                    _ => Runtime::python,
                 },
+                // Joint regression is only supported in Python
                 _ => Runtime::python,
             },
         };
diff --git a/pgml-extension/src/orm/snapshot.rs b/pgml-extension/src/orm/snapshot.rs
@@ -119,7 +119,7 @@ impl Snapshot {
 
     pub fn create(
         relation_name: &str,
-        y_column_name: &str,
+        y_column_name: Vec<String>,
         test_size: f32,
         test_sampling: Sampling,
     ) -> Snapshot {
@@ -130,7 +130,7 @@ impl Snapshot {
                 Some(1),
                 Some(vec![
                     (PgBuiltInOids::TEXTOID.oid(), relation_name.into_datum()),
-                    (PgBuiltInOids::TEXTARRAYOID.oid(), vec![y_column_name].into_datum()),
+                    (PgBuiltInOids::TEXTARRAYOID.oid(), y_column_name.into_datum()),
                     (PgBuiltInOids::FLOAT4OID.oid(), test_size.into_datum()),
                     (PgBuiltInOids::TEXTOID.oid(), test_sampling.to_string().into_datum()),
                     (PgBuiltInOids::TEXTOID.oid(), status.to_string().into_datum()),

Original file line number	Diff line number	Diff line change
`@@ -78,8 +78,8 @@ fn fit(dataset: &Dataset, hyperparams: &Hyperparams, task: Task) -> Box<dyn Bind`
`78`	`78`
`79`	`79`	`impl Bindings for Estimator {`
`80`	`80`	`    /// Predict a novel datapoint.`
`81`		`-    fn predict(&self, features: &[f32]) -> f32 {`
`82`		`-        self.predict_batch(features)[0]`
	`81`	`+    fn predict(&self, features: &[f32]) -> Vec<f32> {`
	`82`	`+        self.predict_batch(features)`
`83`	`83`	`    }`
`84`	`84`
`85`	`85`	`    /// Predict a novel datapoint.`
Original file line number	Diff line number	Diff line change
`@@ -227,8 +227,8 @@ impl std::fmt::Debug for Estimator {`
`227`	`227`
`228`	`228`	`impl Bindings for Estimator {`
`229`	`229`	`    /// Predict a novel datapoint.`
`230`		`-    fn predict(&self, features: &[f32]) -> f32 {`
`231`		`-        self.predict_batch(features)[0]`
	`230`	`+    fn predict(&self, features: &[f32]) -> Vec<f32> {`
	`231`	`+        self.predict_batch(features)`
`232`	`232`	`    }`
`233`	`233`
`234`	`234`	`    /// Predict a novel datapoint.`