Skip to content

Commit

Permalink
Adding support for mean shift summarization
Browse files Browse the repository at this point in the history
  • Loading branch information
edgan8 committed Oct 20, 2017
1 parent a7b4286 commit bf00fd8
Show file tree
Hide file tree
Showing 12 changed files with 437 additions and 59 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package edu.stanford.futuredata.macrobase.analysis.summary.aplinear;

import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.QualityMetric;
import edu.stanford.futuredata.macrobase.analysis.summary.util.AttributeEncoder;

import java.util.ArrayList;
import java.util.List;

/**
 * Bundles the output of an explanation run: the attribute encoder, the names
 * of the aggregates, the quality metrics with their thresholds, and the
 * subgroups that were found.
 */
public class APLExplanation {
    private AttributeEncoder encoder;
    private List<String> aggregateNames;

    private ArrayList<QualityMetric> metrics;
    private List<Double> thresholds;
    private ArrayList<APLExplanationResult> results;

    /**
     * Creates an explanation.
     *
     * @param encoder encoder handed to each result when pretty-printing (stored by reference)
     * @param aggregateNames names handed to each result when pretty-printing (stored by reference)
     * @param metrics quality metrics; copied into a new list
     * @param thresholds thresholds; copied into a new list
     * @param results explanation results; copied into a new list
     */
    public APLExplanation(
            AttributeEncoder encoder,
            List<String> aggregateNames,
            List<QualityMetric> metrics,
            List<Double> thresholds,
            List<APLExplanationResult> results
    ) {
        this.encoder = encoder;
        this.aggregateNames = aggregateNames;
        // The three lists below are shallow-copied, so later changes to the
        // caller's lists do not show up here.
        this.metrics = new ArrayList<>(metrics);
        this.thresholds = new ArrayList<>(thresholds);
        this.results = new ArrayList<>(results);
    }

    /**
     * @return the internal result list itself (not a copy)
     */
    public List<APLExplanationResult> getResults() {
        return results;
    }

    /**
     * Builds a text report: a header line followed by one "---" separated
     * section per result.
     *
     * @return the formatted report
     */
    String prettyPrint() {
        StringBuilder report = new StringBuilder("Outlier Explanation:\n");
        for (APLExplanationResult result : results) {
            report.append("---\n");
            report.append(result.prettyPrint(encoder, aggregateNames));
        }
        return report.toString();
    }
}
Original file line number Diff line number Diff line change
@@ -1,19 +1,30 @@
package edu.stanford.futuredata.macrobase.analysis.summary.aplinear;

import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.QualityMetric;
import edu.stanford.futuredata.macrobase.analysis.summary.apriori.IntSet;
import edu.stanford.futuredata.macrobase.analysis.summary.util.AttributeEncoder;

import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.StringJoiner;

/**
* Subgroup which meets the quality threshold metrics.
*/
public class APLExplanationResult {
private QualityMetric[] metricTypes;
private IntSet matcher;
private double[] aggregates;
private double[] metrics;

public APLExplanationResult(
QualityMetric[] metricTypes,
IntSet matcher,
double[] aggregates,
double[] metrics
) {
this.metricTypes = metricTypes;
this.matcher = matcher;
this.aggregates = aggregates;
this.metrics = metrics;
Expand All @@ -23,4 +34,37 @@ public APLExplanationResult(
public String toString() {
    // Compact one-line form: matcher values, aggregates, then metric values.
    String matcherPart = Arrays.toString(matcher.values);
    String aggregatePart = Arrays.toString(aggregates);
    String metricPart = Arrays.toString(metrics);
    return "a=" + matcherPart + ":ag=" + aggregatePart + ":mt=" + metricPart;
}

/**
 * Formats this subgroup as three newline-terminated lines: the quality metric
 * values, the decoded attribute matches, and the aggregate values.
 *
 * @param encoder used to decode each matched id into a column name and a value
 * @param aggregateNames labels for the aggregates, looked up by the same index
 * @return the three-line description, ending with a newline
 */
public String prettyPrint(
        AttributeEncoder encoder,
        List<String> aggregateNames
) {
    // Line 2 of the output: every matched id rendered as "column=value".
    Set<Integer> matchedIds = matcher.getSet();
    StringJoiner matchText = new StringJoiner(", ");
    for (int id : matchedIds) {
        matchText.add(encoder.decodeColumnName(id)+"="+encoder.decodeValue(id));
    }

    // Line 1 of the output: "<metric name>: <value>" with three decimals.
    StringJoiner metricText = new StringJoiner(", ");
    for (int m = 0; m < metricTypes.length; m++) {
        String entry = String.format("%s: %.3f", metricTypes[m].name(), metrics[m]);
        metricText.add(entry);
    }

    // Line 3 of the output: "<aggregate name>: <value>" with three decimals.
    StringJoiner aggregateText = new StringJoiner(", ");
    for (int a = 0; a < aggregates.length; a++) {
        String entry = String.format("%s: %.3f", aggregateNames.get(a), aggregates[a]);
        aggregateText.add(entry);
    }

    return metricText.toString() + "\n"
            + matchText.toString() + "\n"
            + aggregateText.toString() + "\n";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package edu.stanford.futuredata.macrobase.analysis.summary.aplinear;

import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.GlobalRatioMetric;
import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.MeanDevMetric;
import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.QualityMetric;
import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.SupportMetric;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Summarizer for cube-style input that explains subgroups by mean shift.
 * Each input row supplies a mean and a standard deviation, plus an optional
 * count; these are turned into count, first-moment and second-moment
 * aggregate columns.
 */
public class APLMeanSummarizer extends APLSummarizer {
    private Logger log = LoggerFactory.getLogger("APLMeanSummarizer");

    // When null, every row is given a count of 1.0.
    private String countColumn = null;

    private String meanColumn = "mean";
    private String stdColumn = "std";

    // Thresholds returned by getThresholds(), in this order.
    private double minSupport = 0.1;
    private double minStdDev = 3.0;

    /**
     * @return the aggregate names, in the same order as the columns produced
     *         by {@link #getAggregateColumns(DataFrame)}
     */
    @Override
    public List<String> getAggregateNames() {
        String[] names = {"count", "m1", "m2"};
        return Arrays.asList(names);
    }

    /**
     * Builds the three aggregate columns from the input rows.
     *
     * @param input frame holding the mean and std columns and, if a count
     *              column name is set, that column as well
     * @return {@code {count, m1, m2}} where {@code m1 = mean * count} and
     *         {@code m2 = (std^2 + mean^2) * count} per row
     */
    @Override
    public double[][] getAggregateColumns(DataFrame input) {
        double[] means = input.getDoubleColumnByName(meanColumn);
        double[] stds = input.getDoubleColumnByName(stdColumn);
        int numRows = means.length;

        double[] counts;
        if (countColumn == null) {
            counts = new double[numRows];
            Arrays.fill(counts, 1.0);
        } else {
            counts = input.getDoubleColumnByName(countColumn);
        }

        double[] firstMoments = new double[numRows];
        double[] secondMoments = new double[numRows];
        for (int row = 0; row < numRows; row++) {
            double mean = means[row];
            double weight = counts[row];
            firstMoments[row] = mean*weight;
            double std = stds[row];
            secondMoments[row] = (std*std + mean*mean)*weight;
        }

        return new double[][] {counts, firstMoments, secondMoments};
    }

    /**
     * @return a new mutable list holding a support metric and a mean-deviation
     *         metric; the integer arguments are presumably indices into the
     *         aggregate columns (confirm against the metric classes)
     */
    @Override
    public List<QualityMetric> getQualityMetricList() {
        List<QualityMetric> metricList = new ArrayList<>();
        metricList.add(new SupportMetric(0));
        metricList.add(new MeanDevMetric(0, 1, 2));
        return metricList;
    }

    /**
     * @return the current minimum support and minimum standard deviation
     *         thresholds, in the same order as the quality metric list
     */
    @Override
    public List<Double> getThresholds() {
        Double[] limits = {minSupport, minStdDev};
        return Arrays.asList(limits);
    }

    /**
     * @return the {@code explanation} field, which is not declared in this
     *         class (presumably inherited from APLSummarizer; confirm)
     */
    @Override
    public APLExplanation getResults() {
        return explanation;
    }

    /** @return the count column name, or null if none is set */
    public String getCountColumn() {
        return countColumn;
    }

    /** @param countColumn column to read row counts from; null means a count of 1.0 per row */
    public void setCountColumn(String countColumn) {
        this.countColumn = countColumn;
    }

    /** @param meanColumn name of the column holding per-row means */
    public void setMeanColumn(String meanColumn) {
        this.meanColumn = meanColumn;
    }

    /** @param stdColumn name of the column holding per-row standard deviations */
    public void setStdColumn(String stdColumn) {
        this.stdColumn = stdColumn;
    }

    /** @param minSupport first threshold returned by {@link #getThresholds()} */
    public void setMinSupport(double minSupport) {
        this.minSupport = minSupport;
    }

    /** @param minStdDev second threshold returned by {@link #getThresholds()} */
    public void setMinStdDev(double minStdDev) {
        this.minStdDev = minStdDev;
    }

}
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package edu.stanford.futuredata.macrobase.analysis.summary.aplinear;

import edu.stanford.futuredata.macrobase.analysis.summary.BatchSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
import edu.stanford.futuredata.macrobase.analysis.summary.util.AttributeEncoder;
import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.GlobalRatioMetric;
import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.QualityMetric;
import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.SupportMetric;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -11,36 +11,26 @@
import java.util.Arrays;
import java.util.List;

public class APLOutlierSummarizer extends BatchSummarizer{
Logger log = LoggerFactory.getLogger("APLOutlierSummarizer");

private String countColumn = "Count";
private APrioriLinear aplKernel;

/**
* Summarizer that works over both cube and row-based labeled ratio-based
* outlier summarization.
*/
public class APLOutlierSummarizer extends APLSummarizer {
private Logger log = LoggerFactory.getLogger("APLOutlierSummarizer");
private String outlierColumn = "_OUTLIER";
private String countColumn = null;
private double minOutlierSupport = 0.1;
private double minRatioMetric = 2.0;

int numRows;
AttributeEncoder encoder;
List<APLExplanationResult> aplResults;

public APLOutlierSummarizer() {
List<QualityMetric> qualityMetricList = new ArrayList<>();
qualityMetricList.add(
new SupportMetric(0)
);
qualityMetricList.add(
new GlobalRatioMetric(0, 1)
);
aplKernel = new APrioriLinear(
qualityMetricList,
Arrays.asList(minOutlierSupport, minRatioMetric)
);
@Override
public List<String> getAggregateNames() {
    // Names for the two aggregate columns, in column order.
    String[] names = {"Outliers", "Count"};
    return Arrays.asList(names);
}

@Override
public void process(DataFrame input) throws Exception {
public double[][] getAggregateColumns(DataFrame input) {
double[] outlierCol = input.getDoubleColumnByName(outlierColumn);
numRows = outlierCol.length;
int numRows = outlierCol.length;
double[] countCol = null;
if (countColumn != null) {
countCol = input.getDoubleColumnByName(countColumn);
Expand All @@ -51,41 +41,51 @@ public void process(DataFrame input) throws Exception {
}
}

encoder = new AttributeEncoder();
encoder.setColumnNames(attributes);
long startTime = System.currentTimeMillis();
List<int[]> encoded = encoder.encodeAttributes(
input.getStringColsByName(attributes)
);
long elapsed = System.currentTimeMillis() - startTime;
log.debug("Encoded in: {}", elapsed);
log.debug("Encoded Categories: {}", encoder.getNextKey());

double[][] aggregateColumns = new double[2][];
aggregateColumns[0] = outlierCol;
aggregateColumns[1] = countCol;
aplResults = aplKernel.explain(encoded, aggregateColumns);
System.out.println(aplResults);

return aggregateColumns;
}

@Override
public Explanation getResults() {
return null;
public List<QualityMetric> getQualityMetricList() {
    // A fresh mutable list each call: support metric first, then global ratio.
    // The integer arguments are presumably aggregate column indices (confirm
    // against the metric classes).
    List<QualityMetric> metricList = new ArrayList<>();
    metricList.add(new SupportMetric(0));
    metricList.add(new GlobalRatioMetric(0, 1));
    return metricList;
}

@Override
public List<Double> getThresholds() {
    // Current threshold values, in the same order as the quality metric list.
    Double[] limits = {minOutlierSupport, minRatioMetric};
    return Arrays.asList(limits);
}

/**
 * Returns the {@code explanation} field. That field is not declared in the
 * visible part of this class (presumably inherited from APLSummarizer and
 * populated during processing; confirm).
 */
@Override
public APLExplanation getResults() {
return explanation;
}

/**
 * @return the configured count column name; may be null
 */
public String getCountColumn() {
return countColumn;
}

/**
 * Sets the count column name. When non-null, getAggregateColumns reads the
 * count column from the input under this name.
 *
 * @param countColumn column name, or null
 */
public void setCountColumn(String countColumn) {
this.countColumn = countColumn;
}

/**
 * @return the minimum ratio threshold (the second value returned by getThresholds)
 */
public double getMinRatioMetric() {
return minRatioMetric;
}

/**
 * @param minRatioMetric new minimum ratio threshold (the second value returned by getThresholds)
 */
public void setMinRatioMetric(double minRatioMetric) {
this.minRatioMetric = minRatioMetric;
}
/**
 * Sets the minimum outlier support, stored in {@code minOutlierSupport}
 * (the first value returned by getThresholds).
 *
 * @param minSupport new minimum outlier support threshold
 */
public void setMinSupport(double minSupport) {
this.minOutlierSupport = minSupport;
}
/**
 * @param outlierColumn name of the column that getAggregateColumns reads as the outlier column
 */
public void setOutlierColumn(String outlierColumn) {
this.outlierColumn = outlierColumn;
}
}

0 comments on commit bf00fd8

Please sign in to comment.