Skip to content

Commit

Permalink
by column instead of by row, System.gc
Browse files Browse the repository at this point in the history
  • Loading branch information
fabuzaid21 committed Apr 11, 2018
1 parent 75d35e5 commit c0b8d54
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public void process(DataFrame input) throws Exception {
int[][] encoded = getEncoded(input.getStringColsByName(attributes), input);
long elapsed = System.currentTimeMillis() - startTime;
log.info("Encoded in: {} ms", elapsed);
log.info("Encoded Categories: {}", encoder.getNextKey() - 1);
log.info("Distinct values encoded: {}", encoder.getNextKey() - 1);

thresholds = getThresholds();
qualityMetricList = getQualityMetricList();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package edu.stanford.futuredata.macrobase.analysis.summary.util;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableList.Builder;
import java.util.ArrayList;
import java.util.HashMap;
Expand Down Expand Up @@ -167,16 +168,14 @@ public int[][] encodeAttributesAsArray(List<String[]> columns) {
* encode Primary Key and Values as row-based
* @param foreignKeys
* @param primaryKeyAndValues
* @param encodedForeignKeys
* @param encodedPrimaryKeyAndValues
*/
public void encodeKeyValueAttributes(final List<String[]> foreignKeys,
final List<String[]> primaryKeyAndValues, Builder<int[]> encodedForeignKeys,
int[][] encodedPrimaryKeyAndValues) {
public List<int[]> encodeKeyValueAttributes(final List<String[]> foreignKeys,
final List<String[]> primaryKeyAndValues, int[][] encodedPrimaryKeyAndValues) {
final Builder<int[]> builder = ImmutableList.builder();
if (foreignKeys.isEmpty() && primaryKeyAndValues.isEmpty()) {
return;
return builder.build();
}

final int numKeys = foreignKeys.size() + 1; // add one for primary key
final int numColumns = numKeys + primaryKeyAndValues.size() - 1;
// one decoder for all the key columns
Expand Down Expand Up @@ -206,17 +205,18 @@ public void encodeKeyValueAttributes(final List<String[]> foreignKeys,
encodedCol[rowIdx] = curKey;
++rowIdx;
}
encodedForeignKeys.add(encodedCol);
builder.add(encodedCol);
++colIdx;
}

if (primaryKeyAndValues.isEmpty()) {
return;
return builder.build();
}

final int numRows = encodedPrimaryKeyAndValues.length;
final int numRows = encodedPrimaryKeyAndValues[0].length;
for (String[] curCol : primaryKeyAndValues) {
Map<String, Integer> curColEncoder = encoder.get(colIdx);
final int[] encodedColumn = encodedPrimaryKeyAndValues[colIdx - numKeys + 1];
for (int rowIdx = 0; rowIdx < numRows; rowIdx++) {
String colVal = curCol[rowIdx];
//noinspection Duplicates
Expand All @@ -226,11 +226,11 @@ public void encodeKeyValueAttributes(final List<String[]> foreignKeys,
columnDecoder.put(nextKey, colIdx);
nextKey++;
}
int curKey = curColEncoder.get(colVal);
encodedPrimaryKeyAndValues[rowIdx][colIdx - numKeys + 1] = curKey;
encodedColumn[rowIdx] = curColEncoder.get(colVal);
}
++colIdx;
}
return builder.build();
}

public List<int[]> encodeAttributes(List<String[]> columns) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ private void executeQueries(final String queries, final boolean fromFile) {
Statement stmt = parser.createStatement(statementStr);
log.debug(stmt.toString());
final DataFrame result;
System.gc();
if (stmt instanceof ImportCsv) {
final ImportCsv importStatement = (ImportCsv) stmt;
result = queryEngine.importTableFromCsv(importStatement);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,19 +278,16 @@ private DataFrame evaluateDiffJoin(final Join first, final Join second,
final String[] inlierProjected = inlierDf.project(joinColumnName).getStringColumn(0);
final AttributeEncoder encoder = new AttributeEncoder();
final int numExplainColumns = explainColumnNames.size();
final int[][] encodedPrimaryKeyAndValues = new int[common.getNumRows()][
numExplainColumns + 1]; // include primaryKey column
final int[][] encodedPrimaryKeyAndValues = new int[numExplainColumns + 1][
common.getNumRows()]; // include primaryKey column
final List<String> keyValueColumns = ImmutableList.<String>builder().add(joinColumnName)
.addAll(explainColumnNames).build();
final Builder<int[]> encodedForeignKeyBuilder = ImmutableList.builder();
final long encodingTime = System.currentTimeMillis();
log.info("Starting encoding");
encoder.encodeKeyValueAttributes(
final List<int[]> encodedForeignKeys = encoder.encodeKeyValueAttributes(
ImmutableList.of(outlierProjected, inlierProjected),
common.getStringColsByName(keyValueColumns),
encodedForeignKeyBuilder,
encodedPrimaryKeyAndValues);
final List<int[]> encodedForeignKeys = encodedForeignKeyBuilder.build();
log.info("Encoding time: {} ms", System.currentTimeMillis() - encodingTime);

// 1) Execute \delta(\proj_{A1} R, \proj_{A1} S);
Expand Down Expand Up @@ -571,15 +568,16 @@ private Map<Integer, Pair<RoaringBitmap, RoaringBitmap>> semiJoinAndMerge(
}
// 1) R \semijoin T: Go through the primary key column and see what candidateForeignKeys are contained.
// For every match, save the corresponding values
final int numCols = encodedValues[0].length;
for (int[] encodedValue : encodedValues) {
final int primaryKey = encodedValue[0];
final int numCols = encodedValues.length;
final int numRows = encodedValues[0].length;
for (int i = 0; i < numRows; ++i) {
final int primaryKey = encodedValues[0][i];
if (candidateForeignKeys.contains(primaryKey)) {
final Pair<RoaringBitmap, RoaringBitmap> foreignKeyBitmapPair = foreignKeyBitmapPairs
.get(primaryKey); // this always exists, never need to check for null
// extract the corresponding values for the candidate key
for (int j = 1; j < numCols; ++j) {
final int val = encodedValue[j];
final int val = encodedValues[j][i];
final Pair<RoaringBitmap, RoaringBitmap> valueBitmapPair = valueBitmapPairs
.get(val);
if (valueBitmapPair == null) {
Expand All @@ -598,17 +596,18 @@ private Map<Integer, Pair<RoaringBitmap, RoaringBitmap>> semiJoinAndMerge(
// 2) Go through again and check which saved values from the first pass map to new
// primary keys. If we find any new ones, merge their foreign key bitmaps
// with the existing value bitmap
for (int[] encodedValue : encodedValues) {
for (int j = 1; j < numCols; ++j) {
final int val = encodedValue[j];
final int[] encodedColumn = encodedValues[j];
for (int i = 0; i < numRows; ++i) {
final int val = encodedColumn[i];
final Pair<RoaringBitmap, RoaringBitmap> valueBitmapPair = valueBitmapPairs
.get(val);
if (valueBitmapPair == null) {
// never found in the first pass
continue;
}
// extract the corresponding foreign key, merge the foreign key bitmaps
final int primaryKey = encodedValue[0];
final int primaryKey = encodedValues[0][i];
if (candidateForeignKeys.contains(primaryKey)) {
// found in the first pass, but already included in valueBitmapPair
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,6 @@ public static void runQueriesFromFile(final SqlParser parser, final QueryEngine
if (stmt instanceof Query) {
final QueryBody q = ((Query) stmt).getQueryBody();
final DataFrame result = queryEngine.executeQuery(q);
try (OutputStreamWriter outFile = new OutputStreamWriter(
new FileOutputStream("output.csv"))) {
new CSVDataFrameWriter(",", "\n").writeToStream(result, outFile);
} catch (IOException e) {
e.printStackTrace();
}
assertTrue(expected.equals(result));
} else if (stmt instanceof ImportCsv) {
final ImportCsv importStatement = (ImportCsv) stmt;
Expand Down

0 comments on commit c0b8d54

Please sign in to comment.