From 7b2e61d24220d74393a454148c102204b4e2098f Mon Sep 17 00:00:00 2001 From: Peter Kraft Date: Wed, 9 May 2018 14:45:10 -0700 Subject: [PATCH] Generalization of encodeAttributesWithSupport --- .../aplinear/APLOutlierSummarizer.java | 15 +++- .../summary/util/AttributeEncoder.java | 82 +++++++++++++------ 2 files changed, 71 insertions(+), 26 deletions(-) diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLOutlierSummarizer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLOutlierSummarizer.java index e8bfa40b0..031756f0c 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLOutlierSummarizer.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLOutlierSummarizer.java @@ -34,8 +34,19 @@ public AggregationOp[] getAggregationOps() { @Override public int[][] getEncoded(List columns, DataFrame input) { - return encoder.encodeAttributesWithSupport(columns, minOutlierSupport, - input.getDoubleColumnByName(outlierColumn), useBitmaps); + List qualityMetrics = getQualityMetricList(); + List thresholds = getThresholds(); + List monotonicQualityMetrics = new ArrayList<>(); + List monotonicThresholds = new ArrayList<>(); + for(int i = 0; i < qualityMetrics.size(); i++) { + if (qualityMetrics.get(i).isMonotonic()) { + monotonicQualityMetrics.add(qualityMetrics.get(i)); + monotonicThresholds.add(thresholds.get(i)); + } + } + return encoder.encodeAttributesWithSupport(columns, useBitmaps, + 0, getAggregateColumns(input), getAggregationOps(), + monotonicQualityMetrics, monotonicThresholds); } @Override diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/AttributeEncoder.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/AttributeEncoder.java index c53de9d87..7d697fdab 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/AttributeEncoder.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/AttributeEncoder.java @@ -3,6 +3,8 @@ import java.util.*; import java.util.stream.Collectors; +import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.AggregationOp; +import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.QualityMetric; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -58,8 +60,12 @@ public void setColumnNames(List colNames) { * row i of columns. * @return A two-dimensional array of encoded values. */ - public int[][] encodeAttributesWithSupport(List columns, double minSupport, - double[] outlierColumn, boolean useBitmaps) { + public int[][] encodeAttributesWithSupport(List columns, boolean useBitmaps, + int outlierColumnIdx, + double[][] aggregateColumns, + AggregationOp[] aggregationOps, + List qualityMetrics, + List thresholds) { if (columns.isEmpty()) { return new int[0][0]; } @@ -72,35 +78,63 @@ public int[][] encodeAttributesWithSupport(List columns, double minSup encoder.put(i, new HashMap<>()); } } - // Create a map from strings to the number of times - // each string appears in an outlier. - int numOutliers = 0; - HashMap countMap = new HashMap<>(); + int numAggregates = aggregateColumns.length; + // Quality metrics are initialized with global aggregates to + // allow them to determine the appropriate relative thresholds + double[] globalAggregates = new double[numAggregates]; + for (int j = 0; j < numAggregates; j++) { + AggregationOp curOp = aggregationOps[j]; + globalAggregates[j] = curOp.initValue(); + double[] curColumn = aggregateColumns[j]; + for (int i = 0; i < numRows; i++) { + globalAggregates[j] = curOp.combine(globalAggregates[j], curColumn[i]); + } + } + for (QualityMetric q : qualityMetrics) { + q.initialize(globalAggregates); + } + // Row store for more convenient access + final double[][] aRows = new double[numRows][numAggregates]; + for (int i = 0; i < numRows; i++) { + for (int j = 0; j < numAggregates; j++) { + aRows[i][j] = aggregateColumns[j][i]; + } + } + + // Create a map from strings to aggregates + HashMap countMap = new HashMap<>(); for (int colIdx = 0; colIdx < numColumns; colIdx++) { String[] curCol = columns.get(colIdx); for (int rowIdx = 0; rowIdx < numRows; rowIdx++) { - if (outlierColumn[rowIdx] > 0.0) { - if (colIdx == 0) - numOutliers += outlierColumn[rowIdx]; - // prepend column index as String to column value to disambiguate - // between two identical values in different columns - String colVal = Integer.toString(colIdx) + curCol[rowIdx]; - Double curCount = countMap.get(colVal); - if (curCount == null) - countMap.put(colVal, outlierColumn[rowIdx]); - else - countMap.put(colVal, curCount + outlierColumn[rowIdx]); + // prepend column index as String to column value to disambiguate + // between two identical values in different columns + String colVal = Integer.toString(colIdx) + curCol[rowIdx]; + double[] candidateVal = countMap.get(colVal); + if (candidateVal == null) { + countMap.put(colVal, + Arrays.copyOf(aRows[rowIdx], numAggregates)); + } else { + for (int a = 0; a < numAggregates; a++) { + AggregationOp curOp = aggregationOps[a]; + candidateVal[a] = curOp.combine(candidateVal[a], aRows[rowIdx][a]); + } } } } - // Rank the strings that have minimum support among the outliers - // by the amount of support they have. - double minSupportThreshold = minSupport * numOutliers; + // Filter strings that do not pass all monotonic quality metrics List filterOnMinSupport = countMap.keySet().stream() - .filter(line -> countMap.get(line) >= minSupportThreshold) + .filter(line -> + { + QualityMetric.Action action = QualityMetric.Action.KEEP; + for (int i = 0; i < qualityMetrics.size(); i++) { + QualityMetric q = qualityMetrics.get(i); + double t = thresholds.get(i); + action = QualityMetric.Action.combine(action, q.getAction(countMap.get(line), t)); + } + return action == QualityMetric.Action.KEEP; + }) .collect(Collectors.toList()); - filterOnMinSupport.sort((s1, s2) -> countMap.get(s2).compareTo(countMap.get(s1))); HashMap stringToRank = new HashMap<>(filterOnMinSupport.size()); for (int i = 0; i < filterOnMinSupport.size(); i++) { @@ -129,7 +163,7 @@ public int[][] encodeAttributesWithSupport(List columns, double minSup // Again, prepend column index as String to column value to disambiguate // between two identical values in different columns String colNumAndVal = Integer.toString(colIdx) + colVal; - int oidx = (outlierColumn[rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier + int oidx = (aggregateColumns[outlierColumnIdx][rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier if (!curColEncoder.containsKey(colVal)) { if (stringToRank.containsKey(colNumAndVal)) { int newKey = stringToRank.get(colNumAndVal); @@ -160,7 +194,7 @@ public int[][] encodeAttributesWithSupport(List columns, double minSup if (useBitmaps && colCardinalities[colIdx] < cardinalityThreshold) { for (int rowIdx = 0; rowIdx < numRows; rowIdx++) { String colVal = curCol[rowIdx]; - int oidx = (outlierColumn[rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier + int oidx = (aggregateColumns[outlierColumnIdx][rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier int curKey = curColEncoder.get(colVal); if (curKey != noSupport) { if (bitmap[colIdx][oidx].containsKey(curKey)) {