Adding comments

stanford-futuredata · Apr 26, 2017 · 738c78f · 738c78f
1 parent c7a2e8e
commit 738c78f
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 12 deletions.
diff --git a/...in/java/edu/stanford/futuredata/macrobase/analysis/transform/MetricBucketTransformer.java b/...in/java/edu/stanford/futuredata/macrobase/analysis/transform/MetricBucketTransformer.java
@@ -6,16 +6,31 @@
 
 import java.util.*;
 
+/**
+ * Transform real valued columns into categorical string columns.
+ * Transformed columns are added to a copy of the dataframe, and their names
+ * carry a suffix to distinguish them from the original columns.
+ * Useful when correlated metrics should serve as explanatory values.
+ * By default transforms columns by bucketing them into low-med-high values
+ * based on percentile.
+ */
 public class MetricBucketTransformer implements Transformer {
+    // Transformed columns are added to the dataframe under a suffix.
     private String columnSuffix = "_a";
+    // Boundaries of the buckets used for classification. n boundaries -> n+1 buckets
     private double[] boundaryPercentiles = {10.0, 90.0};
-    private boolean useSimpleNames = false;
+    // Strings that encode which bucket a value falls in: when true, the column name
+    // plus a simple bucket index; when false, an encoding of the bucket's range.
+    private boolean simpleBucketValues = false;
 
     private List<String> metricColumns;
     private List<String> transformedColumnNames;
 
     private DataFrame transformedDF;
 
+    /**
+     * @param columns names of the metric columns to transform
+     */
     public MetricBucketTransformer(List<String> columns) {
         this.metricColumns = columns;
         int d = columns.size();
@@ -47,7 +62,7 @@ public void process(DataFrame input) throws Exception {
             }
 
             String[] bucketNames = new String[k+1];
-            if (useSimpleNames) {
+            if (simpleBucketValues) {
                 for (int i = 0; i < k+1; i++) {
                     bucketNames[i] = String.format("%s:%d", colName, i);
                 }
@@ -95,12 +110,12 @@ public void setBoundaryPercentiles(double[] boundaryPercentiles) {
         this.boundaryPercentiles = boundaryPercentiles;
     }
 
-    public boolean isUseSimpleNames() {
-        return useSimpleNames;
+    public boolean isSimpleBucketValues() {
+        return simpleBucketValues;
     }
 
-    public void setUseSimpleNames(boolean useSimpleNames) {
-        this.useSimpleNames = useSimpleNames;
+    public void setSimpleBucketValues(boolean simpleBucketValues) {
+        this.simpleBucketValues = simpleBucketValues;
     }
 
     public List<String> getTransformedColumnNames() {

diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java
@@ -8,15 +8,23 @@
 import java.util.function.Predicate;
 
 /**
- * Column-based dataframe object.
- * addColumn methods mutate the dataframe and are the primary
- * ways of initializing the data in the dataframe.
+ * Column-based DataFrame object.
+ * DataFrames are primarily meant for data transfer across operators while
+ * preserving column names and types. Complex processing should be done by
+ * extracting columns as arrays and operating on arrays directly.
+ *
+ * The addColumn methods are the primary means of mutating a dataframe and are
+ * especially useful during dataframe construction. DataFrames can also be
+ * initialized from a schema and a set of rows.
  */
 public class DataFrame {
     private Schema schema;
 
     private ArrayList<String[]> stringCols;
     private ArrayList<double[]> doubleCols;
+    // external indices define a global ordering on columns, but internally each
+    // column is stored with other columns of its type. Thus external indices must be
+    // converted into internal type-specific indices.
     private ArrayList<Integer> indexToTypeIndex;
 
     private int numRows;
@@ -61,7 +69,9 @@ public DataFrame(Schema schema, List<Row> rows) {
     }
 
     /**
-     * @return shallow copy of dataframe
+     * Shallow copy of the dataframe: the schema is recreated but the arrays backing the
+     * columns are reused.
+     * @return shallow DataFrame copy
      */
     public DataFrame copy() {
         DataFrame other = new DataFrame();

diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/MetricAsExplanationTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/MetricAsExplanationTest.java
@@ -59,16 +59,18 @@ public void testFindMetricExplanation() throws Exception {
         c.process(df);
         DataFrame cdf = c.getResults();
 
-        List<String> metricExplanationColumns = new ArrayList<String>();
+        List<String> metricExplanationColumns = new ArrayList<>();
         for (int i = 1; i < d; i++) {
             metricExplanationColumns.add("m"+i);
         }
         MetricBucketTransformer t = new MetricBucketTransformer(metricExplanationColumns);
         double[] boundaries = {5.0, 95.0};
         t.setBoundaryPercentiles(boundaries);
         t.process(cdf);
+        // tcdf now contains new columns with metrics transformed into bucketed attributes
         DataFrame tcdf = t.getResults();
 
+        // Need to retrieve the new names of the transformed columns
         List<String> bucketExplanationColumns = t.getTransformedColumnNames();
 
         BatchSummarizer bs = new BatchSummarizer();

diff --git a/...ava/edu/stanford/futuredata/macrobase/analysis/transform/MetricBucketTransformerTest.java b/...ava/edu/stanford/futuredata/macrobase/analysis/transform/MetricBucketTransformerTest.java
@@ -29,7 +29,7 @@ public void testSingleMetric() throws Exception {
         Collections.addAll(distinct, newCol);
         assertEquals(3, distinct.size());
 
-        t.setUseSimpleNames(true);
+        t.setSimpleBucketValues(true);
         t.process(df);
         tdf = t.getResults();
         assertEquals(2, tdf.getSchema().getNumColumns());