Skip to content

Commit

Permalink
Generalization of encodeAttributesWithSupport
Browse files Browse the repository at this point in the history
  • Loading branch information
kraftp committed May 9, 2018
1 parent 18354d6 commit 7b2e61d
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 26 deletions.
Expand Up @@ -34,8 +34,19 @@ public AggregationOp[] getAggregationOps() {

/**
 * Encodes the attribute columns using only the monotonic quality metrics.
 *
 * <p>Each quality metric that reports {@code isMonotonic()} is forwarded to the
 * encoder together with the threshold stored at the same index in the threshold
 * list; non-monotonic metrics are not passed along.
 *
 * @param columns attribute columns to encode
 * @param input   data frame from which the aggregate columns are read
 * @return the two-dimensional array of encoded values produced by the encoder
 */
@Override
public int[][] getEncoded(List<String[]> columns, DataFrame input) {
    List<QualityMetric> qualityMetrics = getQualityMetricList();
    List<Double> thresholds = getThresholds();

    // Collect the monotonic metrics and their index-aligned thresholds.
    List<QualityMetric> monotonicQualityMetrics = new ArrayList<>();
    List<Double> monotonicThresholds = new ArrayList<>();
    for (int i = 0; i < qualityMetrics.size(); i++) {
        QualityMetric metric = qualityMetrics.get(i);
        if (metric.isMonotonic()) {
            monotonicQualityMetrics.add(metric);
            monotonicThresholds.add(thresholds.get(i));
        }
    }

    // The literal 0 is the outlier column index: the encoder reads the first
    // aggregate column to decide whether a row is an outlier.
    return encoder.encodeAttributesWithSupport(columns, useBitmaps,
            0, getAggregateColumns(input), getAggregationOps(),
            monotonicQualityMetrics, monotonicThresholds);
}

@Override
Expand Down
Expand Up @@ -3,6 +3,8 @@
import java.util.*;
import java.util.stream.Collectors;

import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.AggregationOp;
import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.QualityMetric;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -58,8 +60,12 @@ public void setColumnNames(List<String> colNames) {
* row i of columns.
* @return A two-dimensional array of encoded values.
*/
public int[][] encodeAttributesWithSupport(List<String[]> columns, double minSupport,
double[] outlierColumn, boolean useBitmaps) {
public int[][] encodeAttributesWithSupport(List<String[]> columns, boolean useBitmaps,
int outlierColumnIdx,
double[][] aggregateColumns,
AggregationOp[] aggregationOps,
List<QualityMetric> qualityMetrics,
List<Double> thresholds) {
if (columns.isEmpty()) {
return new int[0][0];
}
Expand All @@ -72,35 +78,63 @@ public int[][] encodeAttributesWithSupport(List<String[]> columns, double minSup
encoder.put(i, new HashMap<>());
}
}
// Create a map from strings to the number of times
// each string appears in an outlier.
int numOutliers = 0;
HashMap<String, Double> countMap = new HashMap<>();
int numAggregates = aggregateColumns.length;
// Quality metrics are initialized with global aggregates to
// allow them to determine the appropriate relative thresholds
double[] globalAggregates = new double[numAggregates];
for (int j = 0; j < numAggregates; j++) {
AggregationOp curOp = aggregationOps[j];
globalAggregates[j] = curOp.initValue();
double[] curColumn = aggregateColumns[j];
for (int i = 0; i < numRows; i++) {
globalAggregates[j] = curOp.combine(globalAggregates[j], curColumn[i]);
}
}
for (QualityMetric q : qualityMetrics) {
q.initialize(globalAggregates);
}
// Row store for more convenient access
final double[][] aRows = new double[numRows][numAggregates];
for (int i = 0; i < numRows; i++) {
for (int j = 0; j < numAggregates; j++) {
aRows[i][j] = aggregateColumns[j][i];
}
}

// Create a map from strings to aggregates
HashMap<String, double[]> countMap = new HashMap<>();
for (int colIdx = 0; colIdx < numColumns; colIdx++) {
String[] curCol = columns.get(colIdx);
for (int rowIdx = 0; rowIdx < numRows; rowIdx++) {
if (outlierColumn[rowIdx] > 0.0) {
if (colIdx == 0)
numOutliers += outlierColumn[rowIdx];
// prepend column index as String to column value to disambiguate
// between two identical values in different columns
String colVal = Integer.toString(colIdx) + curCol[rowIdx];
Double curCount = countMap.get(colVal);
if (curCount == null)
countMap.put(colVal, outlierColumn[rowIdx]);
else
countMap.put(colVal, curCount + outlierColumn[rowIdx]);
// prepend column index as String to column value to disambiguate
// between two identical values in different columns
String colVal = Integer.toString(colIdx) + curCol[rowIdx];
double[] candidateVal = countMap.get(colVal);
if (candidateVal == null) {
countMap.put(colVal,
Arrays.copyOf(aRows[rowIdx], numAggregates));
} else {
for (int a = 0; a < numAggregates; a++) {
AggregationOp curOp = aggregationOps[a];
candidateVal[a] = curOp.combine(candidateVal[a], aRows[rowIdx][a]);
}
}
}
}

// Rank the strings that have minimum support among the outliers
// by the amount of support they have.
double minSupportThreshold = minSupport * numOutliers;
// Filter strings that do not pass all monotonic quality metrics
List<String> filterOnMinSupport = countMap.keySet().stream()
.filter(line -> countMap.get(line) >= minSupportThreshold)
.filter(line ->
{
QualityMetric.Action action = QualityMetric.Action.KEEP;
for (int i = 0; i < qualityMetrics.size(); i++) {
QualityMetric q = qualityMetrics.get(i);
double t = thresholds.get(i);
action = QualityMetric.Action.combine(action, q.getAction(countMap.get(line), t));
}
return action == QualityMetric.Action.KEEP;
})
.collect(Collectors.toList());
filterOnMinSupport.sort((s1, s2) -> countMap.get(s2).compareTo(countMap.get(s1)));

HashMap<String, Integer> stringToRank = new HashMap<>(filterOnMinSupport.size());
for (int i = 0; i < filterOnMinSupport.size(); i++) {
Expand Down Expand Up @@ -129,7 +163,7 @@ public int[][] encodeAttributesWithSupport(List<String[]> columns, double minSup
// Again, prepend column index as String to column value to disambiguate
// between two identical values in different columns
String colNumAndVal = Integer.toString(colIdx) + colVal;
int oidx = (outlierColumn[rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier
int oidx = (aggregateColumns[outlierColumnIdx][rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier
if (!curColEncoder.containsKey(colVal)) {
if (stringToRank.containsKey(colNumAndVal)) {
int newKey = stringToRank.get(colNumAndVal);
Expand Down Expand Up @@ -160,7 +194,7 @@ public int[][] encodeAttributesWithSupport(List<String[]> columns, double minSup
if (useBitmaps && colCardinalities[colIdx] < cardinalityThreshold) {
for (int rowIdx = 0; rowIdx < numRows; rowIdx++) {
String colVal = curCol[rowIdx];
int oidx = (outlierColumn[rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier
int oidx = (aggregateColumns[outlierColumnIdx][rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier
int curKey = curColEncoder.get(colVal);
if (curKey != noSupport) {
if (bitmap[colIdx][oidx].containsKey(curKey)) {
Expand Down

0 comments on commit 7b2e61d

Please sign in to comment.