Skip to content

Commit

Permalink
complete gross hack but works on cubes
Browse files Browse the repository at this point in the history
  • Loading branch information
sahaana committed Nov 22, 2017
1 parent d1eeedf commit 8a24c49
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 11 deletions.
19 changes: 19 additions & 0 deletions core/demo/cli_cube_predicate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Demo configuration for the "predicate" classifier, run against the sample
# CSV below (presumably pre-aggregated / cubed data, going by the file name).
pipeline: "BasicBatchPipeline"

inputURI: "csv://core/demo/sample_cubed.csv"

# Classifier selection plus the names of the aggregate columns in the input.
# NOTE(review): countColumn/meanColumn/stdColumn look like cube-pipeline
# settings while `pipeline` names BasicBatchPipeline -- confirm this is intended.
classifier: "predicate"
countColumn: "count"
meanColumn: "mean"
stdColumn: "std"

# With the predicate classifier a row is marked as an outlier when
# `<metric> <predicate> <value>` holds; here: mean <= 34.0.
metric: "mean"
predicate: "<="
value: 34.0

# Explanation settings: the attribute columns to explain over and
# (presumably) the summarizer's minimum ratio and support thresholds.
summarizer: "apriori"
attributes:
- "location"
- "version"
minRatioMetric: 10.0
minSupport: 0.2
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,8 @@ public BasicBatchPipeline (PipelineConfig conf) {
this.conf = conf;
inputURI = conf.get("inputURI");

//classifierType = conf.get("classifier", "percentile");
metric = conf.get("metric");
//cutoff = conf.get("cutoff", 1.0);
//pctileHigh = conf.get("includeHi", true);
//pctileLow = conf.get("includeLo", true);

//summarizerType = conf.get("summarizer", "apriori");
attributes = conf.get("attributes");
//ratioMetric = conf.get("ratioMetric", "globalRatio");
minRiskRatio = conf.get("minRatioMetric", 3.0);
minSupport = conf.get("minSupport", 0.01);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package edu.stanford.futuredata.macrobase.pipeline;

import edu.stanford.futuredata.macrobase.analysis.classify.ArithmeticClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.CubeClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.QuantileClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.RawClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.*;
import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
import edu.stanford.futuredata.macrobase.analysis.summary.apriori.APExplanation;
import edu.stanford.futuredata.macrobase.analysis.summary.apriori.APrioriSummarizer;
Expand All @@ -26,6 +23,8 @@
public class CubePipeline implements Pipeline {
Logger log = LoggerFactory.getLogger("CubePipeline");

private final PipelineConfig conf;

// Ingest
private String inputURI;
private Map<String, String> restHeader;
Expand All @@ -48,6 +47,8 @@ public class CubePipeline implements Pipeline {
private boolean debugDump;

public CubePipeline(PipelineConfig conf) {
this.conf = conf;

inputURI = conf.get("inputURI");
restHeader = conf.get("restHeader", null);
jsonBody = conf.get("jsonBody", null);
Expand Down Expand Up @@ -157,6 +158,14 @@ private CubeClassifier getClassifier() throws MacrobaseException {
classifier.setIncludeLow(includeLo);
return classifier;
}
case "predicate": {
// default values for PredicateClassifier:
// {predicate: "==", value: 1.0}
final String metric = conf.get("metric");
final String predicateStr = conf.get("predicate", "==").trim();
final double metricValue = conf.get("value", 1.0);
return new PredicateCubeClassifier(countColumn, metric, predicateStr, metricValue);
}
case "raw": {
return new RawClassifier(
countColumn,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
package edu.stanford.futuredata.macrobase.analysis.classify;

import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import edu.stanford.futuredata.macrobase.util.MacrobaseException;

import java.util.function.DoublePredicate;

/**
 * PredicateCubeClassifier classifies an outlier based on a predicate (e.g., equality, less than, greater than)
 * and a hard-coded sentinel value. Unlike {@link PercentileClassifier}, outlier values are not determined based on a
 * proportion of the values in the metric column. Instead, the outlier values are defined explicitly by the user in the
 * conf.yaml file; for example:
 * <pre>
 * classifier: "predicate"
 * metric: "usage"
 * predicate: "=="
 * value: 1.0
 * </pre>
 * This would instantiate a PredicateCubeClassifier that classifies every value in the "usage" column equal to 1.0
 * as an outlier. Currently, we support six different predicates: "==", "!=", "&lt;", "&gt;", "&lt;=", and "&gt;=".
 */
public class PredicateCubeClassifier extends CubeClassifier {

    private final DoublePredicate predicate;
    private DataFrame output;

    // Never assigned anywhere in this class, so the getters below always return 0.0.
    private double lowCutoff;
    private double highCutoff;

    // Stored and exposed through the accessors below; process() does not consult them.
    private boolean includeHigh = true;
    private boolean includeLow = true;

    private final String columnName; //hack

    /** The comparison operators understood by this classifier, keyed by their textual symbol. */
    private enum PredicateType {
        EQUALS("=="), NOT_EQUALS("!="), LESS_THAN("<"), GREATER_THAN(">"),
        LEQ("<="), GEQ(">=");

        private final String str;

        PredicateType(String str) {
            this.str = str;
        }

        /**
         * Looks up the constant whose symbol equals {@code str}.
         *
         * @param str Predicate symbol, e.g. "<="; matched exactly (no trimming is done here)
         * @return the matching constant
         * @throws MacrobaseException if {@code str} is null or is not one of the supported symbols
         */
        private static PredicateType getEnum(final String str) throws MacrobaseException {
            for (PredicateType type : values()) {
                // type.str is never null, so a null argument simply matches nothing
                // and is reported as unsupported instead of raising a NullPointerException.
                if (type.str.equals(str)) {
                    return type;
                }
            }
            throw new MacrobaseException("PredicateClassifier: Predicate string " + str +
                    " not supported.");
        }
    }

    /**
     * @param countColumnName Name of the count column; forwarded unchanged to {@link CubeClassifier}
     * @param columnName Column on which to classify outliers
     * @param predicateStr Predicate used for classification: "==", "!=", "<", ">", "<=", or ">="
     * @param sentinel Sentinel value used when evaluating the predicate to determine outlier
     * @throws MacrobaseException if {@code predicateStr} is null or not one of the supported predicates
     */
    public PredicateCubeClassifier(final String countColumnName, final String columnName, final String predicateStr, final double sentinel)
            throws MacrobaseException {
        super(countColumnName);
        this.columnName = columnName;
        this.predicate = getPredicate(predicateStr, sentinel);
    }

    /**
     * @return Lambda function corresponding to the ``predicateStr''. The Lambda function takes in a single
     * argument, which will correspond to the value in the metric column. (A closure is created around the ``sentinel''
     * parameter.)
     * @throws MacrobaseException if ``predicateStr'' does not name a supported predicate
     */
    private DoublePredicate getPredicate(final String predicateStr, final double sentinel) throws MacrobaseException {
        final PredicateType type = PredicateType.getEnum(predicateStr);
        switch (type) {
            case EQUALS:
                return (double x) -> x == sentinel;
            case NOT_EQUALS:
                return (double x) -> x != sentinel;
            case LESS_THAN:
                return (double x) -> x < sentinel;
            case GREATER_THAN:
                return (double x) -> x > sentinel;
            case LEQ:
                return (double x) -> x <= sentinel;
            case GEQ:
                return (double x) -> x >= sentinel;
        }
        // Only reachable if a PredicateType constant is added without a matching case above;
        // fail loudly rather than silently treating it as an equality test.
        throw new MacrobaseException("PredicateClassifier: Predicate " + type + " not handled.");
    }

    /**
     * Scan through the metric column, and evaluate the predicate on every value in the column. A copy of the
     * ``input'' DataFrame is made and the result column (1.0 where the predicate holds, 0.0 otherwise) is added
     * to that copy under the inherited output column name.
     * @throws Exception
     */
    @Override
    public void process(DataFrame input) throws Exception {
        final double[] metrics = input.getDoubleColumnByName(columnName);
        final double[] resultColumn = new double[metrics.length];
        for (int i = 0; i < metrics.length; i++) {
            resultColumn[i] = predicate.test(metrics[i]) ? 1.0 : 0.0;
        }
        output = input.copy();
        output.addDoubleColumn(outputColumnName, resultColumn);
    }

    /**
     * @return the DataFrame produced by the most recent {@link #process(DataFrame)} call,
     * or null if process has not been called yet
     */
    @Override
    public DataFrame getResults() {
        return output;
    }

    /**
     * @return the low cutoff; this classifier never sets it, so the value is always 0.0
     */
    public double getLowCutoff() {
        return lowCutoff;
    }

    /**
     * @return the high cutoff; this classifier never sets it, so the value is always 0.0
     */
    public double getHighCutoff() {
        return highCutoff;
    }

    /**
     * @param includeHigh Whether to count high points as outliers. The flag is only stored;
     *                    {@link #process(DataFrame)} does not read it.
     * @return this
     */
    public PredicateCubeClassifier setIncludeHigh(boolean includeHigh) {
        this.includeHigh = includeHigh;
        return this;
    }

    /**
     * @param includeLow Whether to count low points as outliers. The flag is only stored;
     *                   {@link #process(DataFrame)} does not read it.
     * @return this
     */
    public PredicateCubeClassifier setIncludeLow(boolean includeLow) {
        this.includeLow = includeLow;
        return this;
    }

    /** @return the value last passed to {@link #setIncludeHigh(boolean)} (true by default) */
    public boolean isIncludeHigh() {
        return includeHigh;
    }

    /** @return the value last passed to {@link #setIncludeLow(boolean)} (true by default) */
    public boolean isIncludeLow() {
        return includeLow;
    }

}

0 comments on commit 8a24c49

Please sign in to comment.