In [4]:
var { DataFrame, Series, Int32 } = require('@rapidsai/cudf');
var { Field, Vector, Float32, Struct, List, FixedSizeList } = require('apache-arrow');

/**
 * Loads the San Francisco census-tract GeoJSON file and returns its polygons
 * as a DataFrame.
 *
 * Only features whose `geometry.type` is 'MultiPolygon' are used. Every entry
 * of such a feature's `geometry.coordinates` becomes its own row, so the row
 * count is the total number of polygons, not the number of features.
 *
 * @returns {DataFrame} A frame with two columns:
 *   - `id`: an Int32 sequence starting at 0, one value per polygon.
 *   - `polygons`: nested lists (rings -> coords -> points) of Float32.
 */
function readTracts() {

    const { features } = JSON.parse(require('fs').readFileSync(
        'modules/cudf/notebooks/data/san_francisco_censustracts.json',
        { encoding: 'utf8' }
    ));

    // flatMap flattens exactly one level, which matches what concat() did
    // inside the previous reduce(), but without re-copying the accumulated
    // array once per feature.
    const polygons = features
        .filter((f) => f.geometry.type === 'MultiPolygon')
        .flatMap(({ geometry }) => geometry.coordinates);

    /**
     * Wraps the nested coordinate arrays in an Arrow Vector typed as
     * List<rings: List<coords: List<points: Float32>>>.
     *
     * @param {Array} coordinates One nested array per polygon.
     * @returns {Vector} The Arrow vector built from `coordinates`.
     */
    function featureToVector(coordinates) {
        return Vector.from({
            values: coordinates,
            // NOTE(review): presumably disables chunking so the result is a
            // single contiguous vector - confirm against the Arrow builder docs.
            highWaterMark: Number.POSITIVE_INFINITY,
            type: new List(Field.new({
                name: 'rings', type: new List(Field.new({
                    name: 'coords', type: new List(Field.new({
                        name: 'points', type: new Float32()
                    }))
                }))
            })),
        });
    }

    return new DataFrame({
        id: Series.sequence({ type: new Int32, init: 0, size: polygons.length }),
        polygons: Series.new(featureToVector(polygons))
    });
}

// Time the GeoJSON load; the label passed to timeEnd must match the one
// passed to time for the elapsed duration to be printed.
console.time("read tracts GeoJSON")

// Declared with `var` at the top level of the cell; `tracts` is read again
// by a later cell in this notebook.
var tracts = readTracts()

console.timeEnd("read tracts GeoJSON")

// Preview: convert to Arrow, take the first five rows, and print them.
console.log(tracts.toArrow().slice(0, 5).toArray())

read tracts GeoJSON: 936.177ms
[
  { "id": 0, "polygons": [[[-121.59510803222656,36.11125946044922],[-121.54019165039062,36.11149978637695],[-121.5009536743164,36.11168670654297],[-121.50101470947266,36.112701416015625],[-121.46171569824219,36.112178802490234],[-121.46073150634766,36.109676361083984],[-121.45922088623047,36.109466552734375],[-121.45166015625,36.10475540161133],[-121.44945526123047,36.102691650390625],[-121.44661712646484,36.100765228271484],[-121.44343566894531,36.09928894042969],[-121.43986511230469,36.09914779663086],[-121.4381332397461,36.0999641418457],[-121.43785858154297,36.1020622253418],[-121.43653869628906,36.103065490722656],[-121.43529510498047,36.10282516479492],[-121.43417358398438,36.10136032104492],[-121.43099975585938,36.10008239746094],[-121.42717742919922,36.09919357299805],[-121.42394256591797,36.09545135498047],[-121.41561889648438,36.09150314331055],[-121.4128189086914,36.089027404785156],[-121.41139221191406,36.08846664428711],[-121.40650939941406

In [5]:
/**
 * Reads the San Francisco Uber trips CSV and returns a DataFrame with a
 * generated row id plus the trip columns.
 *
 * The first CSV line is used as the header. No `dataTypes` option is passed,
 * so column types are whatever `DataFrame.readCSV` picks on its own.
 *
 * @returns {DataFrame} Columns, in order: `id` (Int32 sequence starting at 0),
 *   `sourceid`, `dstid`, `month`, `day`, `start_hour`, `end_hour`, and
 *   `travel_time` (taken from the CSV's `mean_travel_time` column).
 */
function readTrips() {
    const csv = DataFrame.readCSV({
        header: 0,
        sourceType: 'files',
        sources: ['modules/cudf/notebooks/data/san_fran_uber.csv'],
    });

    // Columns copied through under their CSV names, in output order.
    const passThroughNames = ['sourceid', 'dstid', 'month', 'day', 'start_hour', 'end_hour'];

    // `id` goes in first so it stays the leading column of the result.
    const columns = {
        id: Series.sequence({ type: new Int32, init: 0, size: csv.numRows }),
    };

    for (const name of passThroughNames) {
        columns[name] = csv.get(name);
    }

    // The only renamed column: `mean_travel_time` is exposed as `travel_time`.
    columns.travel_time = csv.get('mean_travel_time');

    return new DataFrame(columns);
}

// Time the CSV load; the time/timeEnd labels must match exactly.
console.time("read trips CSV")

// Declared with `var` at the top level of the cell; `trips` is read again
// by the groupBy cell later in this notebook.
var trips = readTrips()

console.timeEnd("read trips CSV")

// Row count, formatted with locale-specific digit grouping.
console.log('number of trips:',
    trips.numRows.toLocaleString())

// Preview: convert to Arrow, take the first five rows, and print them.
console.log(trips.toArrow().slice(0, 5).toArray())

read trips CSV: 800.404ms
number of trips: 40,134,414
[
  { "id": undefined, "sourceid": 0, "dstid": 0, "month": 0, "day": 0, "start_hour": 0, "end_hour": 0, "travel_time": undefined },
  { "id": undefined, "sourceid": 0, "dstid": 0, "month": 0, "day": 0, "start_hour": 0, "end_hour": 0, "travel_time": undefined },
  { "id": undefined, "sourceid": 0, "dstid": 0, "month": 0, "day": 0, "start_hour": 0, "end_hour": 0, "travel_time": undefined },
  { "id": undefined, "sourceid": 0, "dstid": 0, "month": 0, "day": 0, "start_hour": 0, "end_hour": 0, "travel_time": undefined },
  { "id": undefined, "sourceid": 0, "dstid": 0, "month": 0, "day": 0, "start_hour": 0, "end_hour": 0, "travel_time": undefined }
]


In [6]:
// The timed region covers the whole chain below: column selection, groupBy,
// mean, and the final sort.
console.time("groupBy sourceId and compute average trip time")

// Keep only the two columns needed, average `travel_time` within each
// `sourceid` group, then order the result by ascending `sourceid`.
var averageTripTimesBySourceId = trips
    .select(['sourceid', 'travel_time'])
    .groupBy({ by: 'sourceid' })
    .mean()
    .sortValues({ sourceid: { ascending: true }})

console.timeEnd("groupBy sourceId and compute average trip time")

// NOTE: despite the 'unique trips' label, this prints the row count of the
// grouped result, i.e. the number of distinct `sourceid` values.
console.log('number of unique trips:',
    averageTripTimesBySourceId.numRows.toLocaleString())

// Preview: convert to Arrow, take the first ten rows, and print them.
console.log(averageTripTimesBySourceId.toArrow().slice(0, 10).toArray())

groupBy sourceId and compute average trip time: 14.773ms
number of unique trips: 1,795
[
  { "sourceid": 2, "travel_time": 712.0635998043698 },
  { "sourceid": 3, "travel_time": 864.1627137286222 },
  { "sourceid": 5, "travel_time": 99.9375 },
  { "sourceid": 6, "travel_time": 916.0432786167607 },
  { "sourceid": 7, "travel_time": 737.430311715476 },
  { "sourceid": 8, "travel_time": 693.9200012333446 },
  { "sourceid": 9, "travel_time": 864.5123458394589 },
  { "sourceid": 10, "travel_time": 162.76000000000002 },
  { "sourceid": 11, "travel_time": 1054.9460462926518 },
  { "sourceid": 12, "travel_time": 748.0697037037044 }
]


In [7]:
// Pick the rows of `tracts` addressed by the grouped `sourceid` column.
// NOTE(review): this uses `sourceid` values as row positions in `tracts`,
// so it assumes they line up with the zero-based polygon `id` sequence built
// in readTracts - confirm against the dataset.
var sourceTracts = tracts.gather(averageTripTimesBySourceId.get('sourceid'))

// Row count of the gathered frame, formatted with locale digit grouping.
console.log('number of source tracts:',
    sourceTracts.numRows.toLocaleString())

// Preview: convert to Arrow, take the first five rows, and print them.
console.log(sourceTracts.toArrow().slice(0, 5).toArray())

number of source tracts: 1,795
[
  { "id": 2, "polygons": [[[-121.67977905273438,37.158592224121094],[-121.67198181152344,37.1640625],[-121.67018127441406,37.165279388427734],[-121.66748809814453,37.167030334472656],[-121.66325378417969,37.16239547729492],[-121.6556167602539,37.156436920166016],[-121.64507293701172,37.14554214477539],[-121.64090728759766,37.1412353515625],[-121.64082336425781,37.141151428222656],[-121.64242553710938,37.13981246948242],[-121.64337921142578,37.139041900634766],[-121.6526870727539,37.13172912597656],[-121.66268920898438,37.1414794921875],[-121.663330078125,37.14345169067383],[-121.66718292236328,37.146278381347656],[-121.67707824707031,37.155975341796875],[-121.67977905273438,37.158592224121094]]] },
  { "id": 3, "polygons": [[[-121.35921478271484,38.57175064086914],[-121.3462905883789,38.5777473449707],[-121.34430694580078,38.57838439941406],[-121.33655548095703,38.579227447509766],[-121.3363037109375,38.578094482421875],[-121.33531951904297,38.578014373