Skip to content

Commit

Permalink
Adding comments
Browse files Browse the repository at this point in the history
  • Loading branch information
edgan8 committed Apr 26, 2017
1 parent c7a2e8e commit 738c78f
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,31 @@

import java.util.*;

/**
* Transform real valued columns into categorical string columns.
* Transformed columns are added to a copy of the dataframe, with a suffix
* appended to their names to distinguish them from the original columns.
* Useful when correlated metrics are to be used as explanatory values.
* By default transforms columns by bucketing them into low-med-high values
* based on percentile.
*/
public class MetricBucketTransformer implements Transformer {
// Transformed columns are added to the dataframe under the original column name plus this suffix.
private String columnSuffix = "_a";
// Boundaries of the buckets used for classification. n boundaries -> n+1 buckets
private double[] boundaryPercentiles = {10.0, 90.0};
private boolean useSimpleNames = false;
// The strings used to encode which bucket a value falls in can be either a simple index
// or an encoding of the range of the bucket.
private boolean simpleBucketValues = false;

private List<String> metricColumns;
private List<String> transformedColumnNames;

private DataFrame transformedDF;

/**
* @param columns set of columns to transform
*/
public MetricBucketTransformer(List<String> columns) {
this.metricColumns = columns;
int d = columns.size();
Expand Down Expand Up @@ -47,7 +62,7 @@ public void process(DataFrame input) throws Exception {
}

String[] bucketNames = new String[k+1];
if (useSimpleNames) {
if (simpleBucketValues) {
for (int i = 0; i < k+1; i++) {
bucketNames[i] = String.format("%s:%d", colName, i);
}
Expand Down Expand Up @@ -95,12 +110,12 @@ public void setBoundaryPercentiles(double[] boundaryPercentiles) {
this.boundaryPercentiles = boundaryPercentiles;
}

public boolean isUseSimpleNames() {
return useSimpleNames;
public boolean isSimpleBucketValues() {
return simpleBucketValues;
}

public void setUseSimpleNames(boolean useSimpleNames) {
this.useSimpleNames = useSimpleNames;
public void setSimpleBucketValues(boolean simpleBucketValues) {
this.simpleBucketValues = simpleBucketValues;
}

public List<String> getTransformedColumnNames() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,23 @@
import java.util.function.Predicate;

/**
* Column-based dataframe object.
* addColumn methods mutate the dataframe and are the primary
* ways of initializing the data in the dataframe.
* Column-based DataFrame object.
* DataFrames are primarily meant for data transfer across operators while
* preserving column names and types. Complex processing should be done by
* extracting columns as arrays and operating on arrays directly.
*
* The addColumn methods are the primary means of mutating a dataframe and are
* especially useful during dataframe construction. DataFrames can also be
* initialized from a schema and a set of rows.
*/
public class DataFrame {
private Schema schema;

private ArrayList<String[]> stringCols;
private ArrayList<double[]> doubleCols;
// external indices define a global ordering on columns, but internally each
// column is stored with other columns of its type. Thus external indices must be
// converted into internal type-specific indices.
private ArrayList<Integer> indexToTypeIndex;

private int numRows;
Expand Down Expand Up @@ -61,7 +69,9 @@ public DataFrame(Schema schema, List<Row> rows) {
}

/**
* @return shallow copy of dataframe
* Shallow copy of the dataframe: the schema is recreated but the arrays backing the
* columns are reused.
* @return shallow DataFrame copy
*/
public DataFrame copy() {
DataFrame other = new DataFrame();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,18 @@ public void testFindMetricExplanation() throws Exception {
c.process(df);
DataFrame cdf = c.getResults();

List<String> metricExplanationColumns = new ArrayList<String>();
List<String> metricExplanationColumns = new ArrayList<>();
for (int i = 1; i < d; i++) {
metricExplanationColumns.add("m"+i);
}
MetricBucketTransformer t = new MetricBucketTransformer(metricExplanationColumns);
double[] boundaries = {5.0, 95.0};
t.setBoundaryPercentiles(boundaries);
t.process(cdf);
// tcdf now contains new columns with metrics transformed into bucketed attributes
DataFrame tcdf = t.getResults();

// Need to retrieve the new names of the transformed columns
List<String> bucketExplanationColumns = t.getTransformedColumnNames();

BatchSummarizer bs = new BatchSummarizer();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public void testSingleMetric() throws Exception {
Collections.addAll(distinct, newCol);
assertEquals(3, distinct.size());

t.setUseSimpleNames(true);
t.setSimpleBucketValues(true);
t.process(df);
tdf = t.getResults();
assertEquals(2, tdf.getSchema().getNumColumns());
Expand Down

0 comments on commit 738c78f

Please sign in to comment.