Skip to content

Commit

Permalink
Separating fpg and apriori code, cleanup
Browse files Browse the repository at this point in the history
Add new explanation result classes that keep track of
raw counts, for both interpretability and extensibility. This involved
separating the fpg and apriori dependencies; in the future we can migrate
fpgrowth to use the new code as well.
  • Loading branch information
edgan8 committed Aug 3, 2017
1 parent 6407090 commit 19654c2
Show file tree
Hide file tree
Showing 36 changed files with 457 additions and 281 deletions.
1 change: 1 addition & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Build lib then core; && stops the chain on the first failure, and
# `mvn clean package` runs both goals in a single Maven invocation.
cd lib && mvn install && cd ../core && mvn clean package && cd ..
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

import edu.stanford.futuredata.macrobase.analysis.classify.Classifier;
import edu.stanford.futuredata.macrobase.analysis.classify.PercentileClassifier;
import edu.stanford.futuredata.macrobase.analysis.summary.APrioriSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.BatchSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
import edu.stanford.futuredata.macrobase.analysis.summary.apriori.APrioriSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.BatchSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.fpg.FPGExplanation;
import edu.stanford.futuredata.macrobase.analysis.summary.fpg.FPGrowthEmerging;
import edu.stanford.futuredata.macrobase.analysis.summary.fpg.FPGrowthSummarizer;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import edu.stanford.futuredata.macrobase.datamodel.Schema;
import edu.stanford.futuredata.macrobase.ingest.CSVDataFrameLoader;
import edu.stanford.futuredata.macrobase.util.MacrobaseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -76,6 +78,15 @@ public BatchSummarizer getSummarizer(String outlierColumnName) throws MacrobaseE
summarizer.setMinRiskRatio(minRiskRatio);
return summarizer;
}
case "fpgrowth": {
// Frequent-pattern-growth based summarizer; configured from the same
// pipeline parameters as the apriori path above.
FPGrowthSummarizer summarizer = new FPGrowthSummarizer();
summarizer.setOutlierColumn(outlierColumnName);
summarizer.setAttributes(attributes);
summarizer.setMinSupport(minSupport);
summarizer.setMinRiskRatio(minRiskRatio);
// Explore multi-attribute itemsets, not just single attributes.
summarizer.setUseAttributeCombinations(true);
return summarizer;
}
default: {
throw new MacrobaseException("Bad Summarizer Type");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@

import edu.stanford.futuredata.macrobase.analysis.classify.ArithmeticClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.CubeClassifier;
import edu.stanford.futuredata.macrobase.analysis.summary.APrioriSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
import edu.stanford.futuredata.macrobase.analysis.summary.apriori.APrioriSummarizer;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import edu.stanford.futuredata.macrobase.datamodel.Schema;
import edu.stanford.futuredata.macrobase.ingest.CSVDataFrameLoader;
import edu.stanford.futuredata.macrobase.util.MacrobaseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.stanford.futuredata.macrobase.pipeline;

import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
import edu.stanford.futuredata.macrobase.analysis.summary.fpg.FPGExplanation;
import org.junit.Test;

import static org.junit.Assert.*;
Expand All @@ -13,7 +14,7 @@ public void testDemoQuery() throws Exception {
);
BasicBatchPipeline p = new BasicBatchPipeline(conf);
Explanation e = p.results();
assertEquals(3, e.getNumInliers());
assertEquals(3.0, e.numTotal(), 1e-10);
}

}
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package edu.stanford.futuredata.macrobase.analysis.summary;

import edu.stanford.futuredata.macrobase.analysis.summary.itemset.AttributeEncoder;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import edu.stanford.futuredata.macrobase.operator.Operator;

import java.util.ArrayList;
import java.util.List;
import java.util.function.DoublePredicate;

/**
* Takes a dataframe with binary classification and searches for explanations
Expand Down
Original file line number Diff line number Diff line change
@@ -1,111 +1,7 @@
package edu.stanford.futuredata.macrobase.analysis.summary;

import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.AttributeSet;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
* Represents a summarization result, which contains a list of attribute values
* and other statistics about the underlying process, e.g. num of tuples observed
* so far.
*/
public class Explanation {
private final long numOutliers;
private final long numInliers;
private List<AttributeSet> itemsets;
private final long creationTimeMs;

public Explanation(List<AttributeSet> resultList,
long numInliers,
long numOutliers,
long creationTimeMs) {
itemsets = new ArrayList<>(resultList);
this.numInliers = numInliers;
this.numOutliers = numOutliers;
this.creationTimeMs = creationTimeMs;
}

/**
* Removes redundant explanations
* @return New explanation with redundant itemsets removed.
*/
public Explanation prune() {
itemsets.sort((AttributeSet a, AttributeSet b) -> -a.compareTo(b));
List<AttributeSet> newItemsets = new ArrayList<>();
int n = itemsets.size();
for (int i = 0; i < n; i++) {
AttributeSet aSet = itemsets.get(i);
boolean redundant = false;
// an explanation is redundant if it has lower risk ratio (occurs after since sorted)
// than an explanation that involves a subset of the same attributes
for (int j = 0; j < i; j++) {
AttributeSet comparisonSet = itemsets.get(j);
if (aSet.contains(comparisonSet)) {
redundant = true;
break;
}
}
if (!redundant) {
newItemsets.add(aSet);
}
}

Explanation newExplanation = new Explanation(
newItemsets,
numInliers,
numOutliers,
creationTimeMs
);
return newExplanation;
}

public void sortByRiskRatio() {
itemsets.sort((AttributeSet a, AttributeSet b) -> -a.compareTo(b));
}

public void sortBySupport() {
itemsets.sort((AttributeSet a, AttributeSet b) -> -Double.compare(a.getSupport(),b.getSupport()));
}

public List<AttributeSet> getItemsets() {
return itemsets;
}

public long getNumOutliers() {
return numOutliers;
}

public long getNumInliers() {
return numInliers;
}

public long getCreationTimeMs() {
return creationTimeMs;
}

public String prettyPrint() {
StringBuilder header = new StringBuilder(String.format(
"Outlier Explanation:\n"
+ "numOutliers: %d\n"
+ "numInliers: %d\n"
+ "Itemsets: \n"
+ "--------\n",
numOutliers,
numInliers));
for (AttributeSet is : itemsets) {
header.append(is.prettyPrint());
}
return header.toString();
}

@Override
public String toString() {
return "Explanation{" +
"numOutliers=" + numOutliers +
", numInliers=" + numInliers +
", itemsets=" + itemsets +
'}';
}
/**
 * Summarization result produced by a summarizer. Exposes a human-readable
 * report plus the raw outlier / total row counts so callers can compute
 * their own rates and statistics.
 */
public interface Explanation {
/** Human-readable rendering of the explanation, for display or logging. */
String prettyPrint();
/** Number of rows classified as outliers (raw count, declared as double). */
double numOutliers();
/** Total number of rows examined. */
double numTotal();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package edu.stanford.futuredata.macrobase.analysis.summary.apriori;

import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;

import java.util.ArrayList;
import java.util.List;

/**
* An explanation attempts to summarize the differences between the outliers and inliers.
* It is composed of multiple ExplanationResult results, each of which has a matcher which
* is an atomic classifier for rows in a dataset.
*/
/**
 * An explanation attempts to summarize the differences between the outliers and inliers.
 * It is composed of multiple ExplanationResult entries, each of which has a matcher which
 * is an atomic classifier for rows in a dataset.
 */
public class APExplanation implements Explanation {
    // Defensive copy taken in the constructor; the reference never changes,
    // so all three fields are final.
    private final ArrayList<ExplanationResult> results;
    private final double numOutliers;
    private final double numTotal;

    /**
     * @param results individual explanation results to report
     * @param numOutliers number of rows classified as outliers
     * @param numTotal total number of rows examined
     */
    public APExplanation(
            List<ExplanationResult> results,
            double numOutliers,
            double numTotal
    ) {
        this.results = new ArrayList<>(results);
        this.numOutliers = numOutliers;
        this.numTotal = numTotal;
    }

    /** Sorts the results in place by descending support. */
    public void sortBySupport() {
        results.sort(
            (ExplanationResult a, ExplanationResult b) -> -Double.compare(a.support(), b.support())
        );
    }

    /**
     * Returns the backing result list (not a copy); external mutations are
     * visible to this object.
     */
    public ArrayList<ExplanationResult> getResults() {
        return results;
    }

    @Override
    public double numOutliers() {
        return numOutliers;
    }

    @Override
    public double numTotal() {
        return numTotal;
    }

    /** Renders header counts (truncated to whole numbers) followed by each result. */
    @Override
    public String prettyPrint() {
        StringBuilder header = new StringBuilder(String.format(
                "Outlier Explanation:\n"
                + "numOutliers: %d\n"
                + "numTotal: %d\n"
                + "Results: \n",
                (long) numOutliers,
                (long) numTotal));
        for (ExplanationResult is : results) {
            header.append(is.prettyPrint());
        }
        return header.toString();
    }
}
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
package edu.stanford.futuredata.macrobase.analysis.summary;
package edu.stanford.futuredata.macrobase.analysis.summary.apriori;

import edu.stanford.futuredata.macrobase.analysis.summary.itemset.AttributeEncoder;
import edu.stanford.futuredata.macrobase.analysis.summary.itemset.IntSet;
import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.AttributeSet;
import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetResult;
import edu.stanford.futuredata.macrobase.analysis.summary.BatchSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.util.AttributeEncoder;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;

/**
* Simple, direct itemset mining with pruning that is limited to low-order
* Simple, direct apriori mining with pruning that is limited to low-order
* interactions.
*/
public class APrioriSummarizer extends BatchSummarizer {
Expand Down Expand Up @@ -236,7 +234,7 @@ private void countSet(List<int[]> encoded, double[] countCol, double[] outlierCo
if (oCount < suppCount) {
numPruned++;
} else {
double ratio = oCount * 1.0 / (count * baseRate);
double ratio = computeRatio(oCount, count);
if (ratio > minRiskRatio) {
saved.add(curSet);
} else {
Expand Down Expand Up @@ -280,7 +278,7 @@ private void countSingles(List<int[]> encoded, double[] countCol, double[] outli
if (singleOCounts[i] < suppCount) {
numPruned++;
} else {
double ratio = singleOCounts[i]*1.0 / (singleCounts[i] * baseRate);
double ratio = computeRatio(singleOCounts[i], singleCounts[i]);
if (ratio > minRiskRatio) {
singleSaved.add(i);
} else {
Expand Down Expand Up @@ -313,8 +311,8 @@ private void countSingles(List<int[]> encoded, double[] countCol, double[] outli
}

@Override
public Explanation getResults() {
List<AttributeSet> results = new ArrayList<>();
public APExplanation getResults() {
List<ExplanationResult> results = new ArrayList<>();
for (int o = 1; o <= 3; o++) {
HashSet<IntSet> curResults = setSaved.get(o);
HashMap<IntSet, Integer> idxMapping = setIdxMapping.get(o);
Expand All @@ -324,28 +322,32 @@ public Explanation getResults() {
int idx = idxMapping.get(vs);
int oCount = oCounts[idx];
int count = counts[idx];
double lift = (oCount*1.0/count) / baseRate;
double support = oCount*1.0 / numOutliers;
ItemsetResult iResult = new ItemsetResult(
support,
ExplanationResult result = new ExplanationResult(
encoder.decodeSet(vs.getSet()),
oCount,
count,
lift,
vs.getSet()
numOutliers,
numEvents
);
AttributeSet aSet = new AttributeSet(iResult, encoder);
results.add(aSet);
results.add(result);
}
}
Explanation finalExplanation = new Explanation(
APExplanation finalExplanation = new APExplanation(
results,
numEvents - numOutliers,
numOutliers,
timings[1]+timings[2]+timings[3]
numEvents
);
finalExplanation.sortBySupport();
return finalExplanation;
}

/**
 * Risk ratio for a candidate set: the observed outlier count divided by the
 * number of outliers expected under the global outlier base rate.
 *
 * @param oCount observed outlier count for the set
 * @param count total count for the set
 * @return the risk ratio
 */
private double computeRatio(
    double oCount,
    double count
) {
    double expectedOutliers = count * baseRate;
    return oCount / expectedOutliers;
}

/**
* Set the column which indicates the number of raw rows in each cubed group.
* @param countColumn count column.
Expand Down

0 comments on commit 19654c2

Please sign in to comment.