In [None]:
var { DataFrame, Series, Int32 } = require('@rapidsai/cudf');
var { Field, Vector, Float32, Struct, List, FixedSizeList } = require('apache-arrow');

/**
 * Reads a census-tract GeoJSON file and builds a DataFrame with one row per
 * polygon.
 *
 * Only features whose geometry type is 'MultiPolygon' are used. Every entry of
 * such a feature's `geometry.coordinates` becomes its own row, so one feature
 * may contribute several rows and `id` numbers polygons, not features.
 *
 * @param {string} [path='data/san_francisco_censustracts.geojson']
 *   Location of the GeoJSON file to load.
 * @returns {DataFrame} A DataFrame with an Int32 `id` column (0, 1, 2, ...)
 *   and a `polygons` column of nested lists of Float32 values.
 */
function readTracts(path = 'data/san_francisco_censustracts.geojson') {

    /**
     * Converts an array of polygons (arrays of rings, each an array of
     * coordinate arrays) into an Arrow vector typed as
     * List<List<List<Float32>>>.
     */
    function featureToVector(coordinates) {
        return Vector.from({
            values: coordinates,
            highWaterMark: Number.POSITIVE_INFINITY,
            type: new List(Field.new({
                name: 'rings', type: new List(Field.new({
                    name: 'coords', type: new List(Field.new({
                        name: 'points', type: new Float32()
                    }))
                }))
            })),
        });
    }

    const { features } = JSON.parse(require('fs').readFileSync(
        path,
        { encoding: 'utf8' }
    ));

    // Append polygons one at a time. The previous reduce()+concat() form
    // re-copied the whole accumulated array for every feature.
    const polygons = [];
    for (const { geometry } of features) {
        if (geometry.type !== 'MultiPolygon') {
            continue;
        }
        for (const polygon of geometry.coordinates) {
            polygons.push(polygon);
        }
    }

    return new DataFrame({
        id: Series.sequence({ type: new Int32, init: 0, size: polygons.length }),
        polygons: Series.new(featureToVector(polygons))
    });
}

// Load the tracts into the notebook-wide `tracts` variable, timing the call.
console.time("read tracts GeoJSON");
var tracts = readTracts();
console.timeEnd("read tracts GeoJSON");

// Show the first five rows as plain JavaScript values.
{
    const firstTracts = tracts.toArrow().slice(0, 5);
    console.log(firstTracts.toArray());
}

In [None]:
/**
 * Reads 'data/san_fran_uber.csv' and returns its columns preceded by a
 * sequential Int32 `id` column.
 *
 * @returns {DataFrame} Columns, in order: id, sourceid, dstid, month, day,
 *   start_hour, end_hour, travel_time.
 */
function readTrips() {
    // CSV column name -> dtype. The key order here is also the order in which
    // the columns are copied into the returned DataFrame.
    const columnTypes = {
        sourceid: 'int16',
        dstid: 'int16',
        month: 'int8',
        day: 'int8',
        start_hour: 'int8',
        end_hour: 'int8',
        travel_time: 'float32'
    };

    const csv = DataFrame.readCSV({
        header: 0,
        sourceType: 'files',
        sources: ['data/san_fran_uber.csv'],
        dataTypes: columnTypes
    });

    // Start with the row-number column, then copy every CSV column across.
    const columns = {
        id: Series.sequence({ type: new Int32(), init: 0, size: csv.numRows })
    };
    for (const name of Object.keys(columnTypes)) {
        columns[name] = csv.get(name);
    }

    return new DataFrame(columns);
}

// Load the trips into the notebook-wide `trips` variable, timing the call.
console.time("read trips CSV");
var trips = readTrips();
console.timeEnd("read trips CSV");

// Report the row count using locale-formatted digits.
{
    const tripCount = trips.numRows;
    console.log('number of trips:', tripCount.toLocaleString());
}

// Show the first five rows as plain JavaScript values.
{
    const firstTrips = trips.toArrow().slice(0, 5);
    console.log(firstTrips.toArray());
}

In [None]:
// Group the trips by `sourceid`, take the mean of `travel_time` within each
// group, and order the result by ascending `sourceid`. The timer covers the
// whole select/groupBy/mean/sort chain.
console.time("groupBy sourceId and compute average trip time");

var averageTripTimesBySourceId = trips
    .select(['sourceid', 'travel_time'])
    .groupBy({ by: 'sourceid' })
    .mean()
    .sortValues({ sourceid: { ascending: true } });

console.timeEnd("groupBy sourceId and compute average trip time");

// The grouped frame has one row per distinct `sourceid`, so its row count is
// the number of distinct source ids (not a count of trips).
console.log('number of unique source ids:',
    averageTripTimesBySourceId.numRows.toLocaleString());

// Show the first ten rows as plain JavaScript values.
console.log(averageTripTimesBySourceId.toArrow().slice(0, 10).toArray());

In [None]:
// Pick out the tract rows named by the `sourceid` column of the averaged trips.
// NOTE(review): the sourceid values are handed to gather() as-is; this
// presumably relies on each sourceid matching the tract's row position in
// `tracts` — confirm against the data.
var sourceTracts = tracts.gather(
    averageTripTimesBySourceId.get('sourceid')
);

// Report how many tract rows were gathered.
{
    const sourceTractCount = sourceTracts.numRows;
    console.log('number of source tracts:', sourceTractCount.toLocaleString());
}

// Show the first five rows as plain JavaScript values.
{
    const firstSourceTracts = sourceTracts.toArrow().slice(0, 5);
    console.log(firstSourceTracts.toArray());
}