complete gross hack but works on cubes

stanford-futuredata · Nov 22, 2017 · 8a24c49 · 8a24c49
1 parent d1eeedf
commit 8a24c49
Show file tree

Hide file tree

Showing 4 changed files with 192 additions and 11 deletions.
diff --git a/core/demo/cli_cube_predicate.yaml b/core/demo/cli_cube_predicate.yaml
@@ -0,0 +1,19 @@
+pipeline: "BasicBatchPipeline"
+
+inputURI: "csv://core/demo/sample_cubed.csv"
+
+classifier: "predicate"
+countColumn: "count"
+meanColumn: "mean"
+stdColumn: "std"
+
+metric: "mean"
+predicate: "<="
+value: 34.0
+
+summarizer: "apriori"
+attributes:
+  - "location"
+  - "version"
+minRatioMetric: 10.0
+minSupport: 0.2
diff --git a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java
@@ -41,15 +41,8 @@ public BasicBatchPipeline (PipelineConfig conf) {
         this.conf = conf;
         inputURI = conf.get("inputURI");
 
-        //classifierType = conf.get("classifier", "percentile");
         metric = conf.get("metric");
-        //cutoff = conf.get("cutoff", 1.0);
-        //pctileHigh = conf.get("includeHi", true);
-        //pctileLow = conf.get("includeLo", true);
-
-        //summarizerType = conf.get("summarizer", "apriori");
         attributes = conf.get("attributes");
-        //ratioMetric = conf.get("ratioMetric", "globalRatio");
         minRiskRatio = conf.get("minRatioMetric", 3.0);
         minSupport = conf.get("minSupport", 0.01);
     }

diff --git a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java
@@ -1,9 +1,6 @@
 package edu.stanford.futuredata.macrobase.pipeline;
 
-import edu.stanford.futuredata.macrobase.analysis.classify.ArithmeticClassifier;
-import edu.stanford.futuredata.macrobase.analysis.classify.CubeClassifier;
-import edu.stanford.futuredata.macrobase.analysis.classify.QuantileClassifier;
-import edu.stanford.futuredata.macrobase.analysis.classify.RawClassifier;
+import edu.stanford.futuredata.macrobase.analysis.classify.*;
 import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
 import edu.stanford.futuredata.macrobase.analysis.summary.apriori.APExplanation;
 import edu.stanford.futuredata.macrobase.analysis.summary.apriori.APrioriSummarizer;
@@ -26,6 +23,8 @@
 public class CubePipeline implements Pipeline {
     Logger log = LoggerFactory.getLogger("CubePipeline");
 
+    private final PipelineConfig conf;
+
     // Ingest
     private String inputURI;
     private Map<String, String> restHeader;
@@ -48,6 +47,8 @@ public class CubePipeline implements Pipeline {
     private boolean debugDump;
 
     public CubePipeline(PipelineConfig conf) {
+        this.conf = conf;
+
         inputURI = conf.get("inputURI");
         restHeader = conf.get("restHeader", null);
         jsonBody = conf.get("jsonBody", null);
@@ -157,6 +158,14 @@ private CubeClassifier getClassifier() throws MacrobaseException {
                 classifier.setIncludeLow(includeLo);
                 return classifier;
             }
+            case "predicate": {
+                // default values for PredicateClassifier:
+                // {predicate: "==", value: 1.0}
+                final String metric = conf.get("metric");
+                final String predicateStr = conf.get("predicate", "==").trim();
+                final double metricValue = conf.get("value", 1.0);
+                return new PredicateCubeClassifier(countColumn, metric, predicateStr, metricValue);
+            }
             case "raw": {
                 return new RawClassifier(
                         countColumn,

diff --git a/...ain/java/edu/stanford/futuredata/macrobase/analysis/classify/PredicateCubeClassifier.java b/...ain/java/edu/stanford/futuredata/macrobase/analysis/classify/PredicateCubeClassifier.java
@@ -0,0 +1,160 @@
+package edu.stanford.futuredata.macrobase.analysis.classify;
+
+import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
+import edu.stanford.futuredata.macrobase.util.MacrobaseException;
+
+import java.util.function.DoublePredicate;
+
+/**
+ * PredicateCubeClassifier classifies an outlier based on a predicate (e.g., equality, less than, greater than)
+ * and a user-supplied sentinel value. Unlike {@link PercentileClassifier}, outlier values are not determined based on a
+ * proportion of the values in the metric column. Instead, the outlier values are defined explicitly by the user in the
+ * conf.yaml file; for example:
+ * <code>
+ *     classifier: "predicate"
+ *     metric: "usage"
+ *     predicate: "=="
+ *     value: 1.0
+ * </code>
+ * This would instantiate a PredicateCubeClassifier that classifies every value in the "usage" column equal to 1.0
+ * as an outlier. Currently, we support six different predicates: "==", "!=", "<", ">", "<=", and ">=".
+ */
+public class PredicateCubeClassifier extends CubeClassifier {
+
+    private final DoublePredicate predicate; // built once in the constructor from predicateStr and sentinel
+    private DataFrame output; // assigned by process(); stays null until process() has run
+    private double lowCutoff; // never assigned in this class, so getLowCutoff() always returns 0.0
+    private double highCutoff; // never assigned in this class, so getHighCutoff() always returns 0.0
+
+    private boolean includeHigh = true; // only stored and exposed through accessors; process() does not read it
+    private boolean includeLow = true; // only stored and exposed through accessors; process() does not read it
+
+    private final String columnName; // name of the metric column read by process() (flagged as a hack by the author)
+
+    private enum PredicateType {
+        EQUALS("=="), NOT_EQUALS("!="), LESS_THAN("<"), GREATER_THAN(">"),
+        LEQ("<="), GEQ(">=");
+
+        // Translates the operator text into its constant; any other string raises a MacrobaseException.
+        private static PredicateType getEnum(final String str) throws MacrobaseException {
+            switch (str) {
+                case "==":
+                    return EQUALS;
+                case "!=":
+                    return NOT_EQUALS;
+                case "<":
+                    return LESS_THAN;
+                case ">":
+                    return GREATER_THAN;
+                case "<=":
+                    return LEQ;
+                case ">=":
+                    return GEQ;
+                default:
+                    throw new MacrobaseException("PredicateClassifier: Predicate string " + str +
+                            " not suppported.");
+            }
+        }
+
+        final String str; // operator text given to the constant's constructor; not read elsewhere in this class
+        PredicateType(String str) {
+            this.str = str;
+        }
+    }
+
+    /**
+     * @param countColumnName Name of the count column; passed unchanged to the {@link CubeClassifier} constructor
+     * @param columnName Column on which to classify outliers
+     * @param predicateStr Predicate used for classification: "==", "!=", "<", ">", "<=", or ">="
+     * @param sentinel Sentinel value used when evaluating the predicate to determine outlier
+     * @throws MacrobaseException if {@code predicateStr} is not one of the six supported predicates
+     */
+    public PredicateCubeClassifier(final String countColumnName, final String columnName, final String predicateStr, final double sentinel)
+            throws MacrobaseException {
+        super(countColumnName);
+        this.columnName = columnName;
+        this.predicate = getPredicate(predicateStr, sentinel);
+    }
+
+    /**
+     * @return Lambda function corresponding to the ``predicateStr''. The Lambda function takes in a single
+     * argument, which will correspond to the value in the metric column. (A closure is created around the ``sentinel''
+     * parameter.)
+     * @throws MacrobaseException if ``predicateStr'' is not one of the strings accepted by PredicateType.getEnum
+     */
+    private DoublePredicate getPredicate(final String predicateStr, final double sentinel) throws MacrobaseException {
+        switch (PredicateType.getEnum(predicateStr)) {
+            default: // shares the EQUALS branch; getEnum throws on unknown strings and all six constants have a case
+            case EQUALS:
+                return (double x) -> x == sentinel; // exact floating-point equality, no tolerance
+            case NOT_EQUALS:
+                return (double x) -> x != sentinel;
+            case LESS_THAN:
+                return (double x) -> x < sentinel;
+            case GREATER_THAN:
+                return (double x) -> x > sentinel;
+            case LEQ:
+                return (double x) -> x <= sentinel;
+            case GEQ:
+                return (double x) -> x >= sentinel;
+        }
+    }
+
+    /**
+     * Scan through the metric column, and evaluate the predicate on every value in the column. Rows that satisfy the
+     * predicate get 1.0 in the added result column and all other rows keep 0.0; the result is written to a copy of ``input''.
+     * @throws Exception declared on the signature; this body contains no explicit throw statement
+     */
+    @Override
+    public void process(DataFrame input) throws Exception {
+        double[] metrics = input.getDoubleColumnByName(columnName);
+        int len = metrics.length;
+        output = input.copy();
+        double[] resultColumn = new double[len]; // entries default to 0.0, i.e. "predicate not satisfied"
+        for (int i = 0; i < len; i++) {
+            final double curVal = metrics[i];
+            if (predicate.test(curVal)) {
+                resultColumn[i] = 1.0;
+            }
+        }
+        output.addDoubleColumn(outputColumnName, resultColumn); // outputColumnName is not declared here; presumably inherited from CubeClassifier - confirm
+    }
+
+    @Override
+    public DataFrame getResults() {
+        return output; // the annotated copy built by process(); null if process() has not been called
+    }
+
+    public double getLowCutoff() {
+        return lowCutoff;
+    }
+    public double getHighCutoff()
+    {
+        return highCutoff;
+    }
+
+    /**
+     * @param includeHigh Whether to count high points as outliers (stored only; process() does not read this flag).
+     * @return this
+     */
+    public PredicateCubeClassifier setIncludeHigh(boolean includeHigh) {
+        this.includeHigh = includeHigh;
+        return this;
+    }
+
+    /**
+     * @param includeLow Whether to count low points as outliers (stored only; process() does not read this flag).
+     * @return this
+     */
+    public PredicateCubeClassifier setIncludeLow(boolean includeLow) {
+        this.includeLow = includeLow;
+        return this;
+    }
+
+    public boolean isIncludeHigh() {
+        return includeHigh;
+    }
+    public boolean isIncludeLow() {
+        return includeLow;
+    }
+
+}