In [1]:
import pandas as pd
import geopandas
import numpy as np
import dask.dataframe as dd
from dask.distributed import Client
import distributed
import shapely
from shapely.geometry import Point

client = Client(set_as_default=True)

In [2]:
# citibike = dd.read_parquet('/data/citibike.parquet')
# subway = dd.read_parquet('/data/subway.parquet')

In [None]:
yellow = dd.read_parquet('/data/yellow.parquet')
yellow['triptype'] = 'yellow'
green = dd.read_parquet('/data/green.parquet')
green['triptype'] = 'green'
uber = dd.read_parquet('/data/uber.parquet')
uber['triptype'] = 'uber'


taxi = yellow.append(green).append(uber).set_index('pickup_datetime', npartitions=2000, compute=False)
taxi.to_parquet('/data/alltaxi.parquet', compression='SNAPPY', object_encoding='json')

In [3]:
location_fix = taxi[(taxi.pickup_latitude.notnull()) | (taxi.pickup_longitude.notnull())
                  | (taxi.dropoff_latitude.notnull()) | (taxi.dropoff_longitude.notnull())]

In [26]:
smallframe = location_fix.get_partition(2).compute()
smallframe.head()

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_location_id,dropoff_longitude,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,pickup_longitude,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id,triptype
0,2010-01-09 21:45:00,40.7356,-999,-73.998953,,0.5,12.5,,0.5,1,...,-73.958018,1,,0.0,0.0,13.5,4.22,-999,VTS,yellow
1,2010-01-13 13:47:00,40.78718,-999,-73.947842,,0.0,10.5,,0.5,1,...,-73.974443,1,,0.0,0.0,11.0,2.9,-999,VTS,yellow
2,2010-01-06 17:36:00,40.744515,-999,-73.98488,,1.0,6.1,,0.5,2,...,-73.972717,1,,2.0,0.0,9.6,1.07,-999,VTS,yellow
3,2010-01-17 01:04:00,40.740017,-999,-73.985965,,0.5,6.9,,0.5,1,...,-73.967425,1,,0.0,0.0,7.9,1.8,-999,VTS,yellow
4,2010-01-14 00:34:00,40.736997,-999,-74.007117,,0.5,4.1,,0.5,1,...,-73.999807,1,,1.0,0.0,6.1,0.7,-999,VTS,yellow


In [66]:
%%time
geom1 = [Point(xy) for xy in zip(smallframe.dropoff_longitude, smallframe.dropoff_latitude)]
geom2 = [Point(xy) for xy in zip(smallframe.pickup_longitude, smallframe.pickup_latitude)]

CPU times: user 56 s, sys: 836 ms, total: 56.8 s
Wall time: 55.7 s


In [47]:
smallframe2 = geopandas.GeoDataFrame(smallframe[['geometry']], crs={'init': 'epsg:4326'}, geometry=geom1)
smallframe2

Unnamed: 0,geometry
0,POINT (-73.998953 40.7356)
1,POINT (-73.94784199999998 40.78718)
2,POINT (-73.98488 40.744515)
3,POINT (-73.98596499999998 40.740017)
4,POINT (-74.00711699999998 40.736997)
5,POINT (-73.995925 40.744153)
6,POINT (-73.95941500000001 40.780097)
7,POINT (-73.953935 40.613213)
8,POINT (-73.99626499999998 40.715855)
9,POINT (-73.99329 40.739867)


In [48]:
smallframe3 = geopandas.GeoDataFrame(smallframe[['geometry']], crs={'init': 'epsg:4326'}, geometry=geom2)
smallframe3

Unnamed: 0,geometry
0,POINT (-73.95801799999998 40.7787)
1,POINT (-73.97444299999998 40.750512)
2,POINT (-73.972717 40.752623)
3,POINT (-73.96742500000001 40.75656)
4,POINT (-73.999807 40.739613)
5,POINT (-74.000597 40.729088)
6,POINT (-73.98012699999998 40.781337)
7,POINT (-74.01854199999998 40.632278)
8,POINT (-74.016042 40.705033)
9,POINT (-74.00465800000001 40.741877)


In [42]:
shapefile = geopandas.read_file('../shapefiles/taxi_zones_latlon.shp')
shapefile.drop(['OBJECTID', "Shape_Area", "Shape_Leng", "borough", "zone"], axis=1, inplace=True)
shapefile.head()

Unnamed: 0,LocationID,geometry
0,1,"POLYGON ((-74.18445299999998 40.6949959999999,..."
1,2,(POLYGON ((-73.82337597260664 40.6389870471767...
2,3,POLYGON ((-73.84792614099986 40.87134223399993...
3,4,"POLYGON ((-73.9717741096532 40.72582128133706,..."
4,5,POLYGON ((-74.17421738099991 40.56256808599988...


In [None]:
def geomerge(df):
    shapefile = geopandas.read_file('../shapefiles/taxi_zones_latlon.shp')
    

In [52]:
%%time
joined1 = geopandas.sjoin(smallframe2, shapefile, op='intersects')
joined2 = geopandas.sjoin(smallframe3, shapefile, op='intersects')
joined1 = joined1.drop(['geometry', 'index_right'], axis=1)
joined2 = joined2.drop(['geometry', 'index_right'], axis=1)

CPU times: user 6min 42s, sys: 55.9 s, total: 7min 38s
Wall time: 6min 35s


In [59]:
joined1.sort_index(inplace=True)
joined1

Unnamed: 0,LocationID
0,249
1,75
2,164
3,107
4,158
5,90
6,236
7,165
8,148
9,234


In [60]:
joined2.sort_index(inplace=True)
joined2

Unnamed: 0,LocationID
0,236
1,233
2,233
3,229
4,90
5,114
6,239
7,14
8,261
9,68


In [63]:
smallframe.dropoff_location_id = joined1.LocationID.astype(np.int64)
smallframe.pickup_location_id = joined2.LocationID.astype(np.int64)

In [64]:
smallframe

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_location_id,dropoff_longitude,ehail_fee,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,rate_code_id,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,trip_type,vendor_id,triptype,geometry
0,2010-01-09 21:45:00,40.735600,249.0,-73.998953,,0.5,12.5,,0.5,1,...,1,,0.00,0.00,13.50,4.22,-999,VTS,yellow,POINT (-73.998953 40.7356)
1,2010-01-13 13:47:00,40.787180,75.0,-73.947842,,0.0,10.5,,0.5,1,...,1,,0.00,0.00,11.00,2.90,-999,VTS,yellow,POINT (-73.94784199999998 40.78718)
2,2010-01-06 17:36:00,40.744515,164.0,-73.984880,,1.0,6.1,,0.5,2,...,1,,2.00,0.00,9.60,1.07,-999,VTS,yellow,POINT (-73.98488 40.744515)
3,2010-01-17 01:04:00,40.740017,107.0,-73.985965,,0.5,6.9,,0.5,1,...,1,,0.00,0.00,7.90,1.80,-999,VTS,yellow,POINT (-73.98596499999998 40.740017)
4,2010-01-14 00:34:00,40.736997,158.0,-74.007117,,0.5,4.1,,0.5,1,...,1,,1.00,0.00,6.10,0.70,-999,VTS,yellow,POINT (-74.00711699999998 40.736997)
5,2010-01-10 01:44:00,40.744153,90.0,-73.995925,,0.5,6.1,,0.5,1,...,1,,0.00,0.00,7.10,1.33,-999,VTS,yellow,POINT (-73.995925 40.744153)
6,2010-01-09 13:54:00,40.780097,236.0,-73.959415,,0.0,7.7,,0.5,1,...,1,,0.00,0.00,8.20,1.67,-999,VTS,yellow,POINT (-73.95941500000001 40.780097)
7,2010-01-24 17:40:00,40.613213,165.0,-73.953935,,0.0,13.7,,0.5,5,...,1,,0.00,0.00,14.20,4.43,-999,VTS,yellow,POINT (-73.953935 40.613213)
8,2010-01-03 17:21:00,40.715855,148.0,-73.996265,,0.0,6.9,,0.5,1,...,1,,2.00,0.00,9.40,1.67,-999,VTS,yellow,POINT (-73.99626499999998 40.715855)
9,2010-01-12 18:49:00,40.739867,234.0,-73.993290,,1.0,6.5,,0.5,2,...,1,,1.00,0.00,9.00,1.44,-999,VTS,yellow,POINT (-73.99329 40.739867)


In [None]:
# yellow = spark.sql("SELECT *, 'yellow' from parquet.`/data/yellow.parquet`").withColumnRenamed("yellow", 'triptype')
# yellow.createOrReplaceTempView('yellow')

# green = spark.sql("SELECT *, 'green' from parquet.`/data/green.parquet`").withColumnRenamed("green", 'triptype')
# green.createOrReplaceTempView('green')

# uber = spark.sql("SELECT *, 'uber' from parquet.`/data/uber.parquet`").withColumnRenamed("uber", 'triptype')
# uber.createOrReplaceTempView('uber')
# taxi_trips = yellow.unionAll(green).unionAll(uber)
# taxi_trips.createOrReplaceTempView('taxi_trips')

# subway = spark.sql('SELECT * from parquet.`/data/subway.parquet`')
# subway.createOrReplaceTempView('subway')

# citibike = spark.sql('SELECT * from parquet.`/data/citibike.parquet`')
# citibike.createOrReplaceTempView('citibike')

In [None]:
# spark.sql("SELECT dropoff_latitude, dropoff_longitude from taxi_trips where triptype='uber' limit 10").show()