In [1]:
import dask.dataframe as dd
from dask.distributed import Client

In [9]:
import os, os.path
import numpy as np
from glob import glob
import json

In [12]:
# Load notebook settings (e.g. the 'uber_raw_data_path' key read by get_uber)
# from config.json in the working directory. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open('config.json') as config_file:
    config = json.load(config_file)

In [2]:
# Instantiate a dask.distributed Client with default arguments and keep a
# reference to it in `c`.
# NOTE(review): with no arguments this presumably starts/uses a local cluster
# for the dask computations below -- confirm for your deployment.
c = Client()

In [26]:

def assign_taxi_zones(df, lon_var, lat_var, locid_var):
    """Joins DataFrame with Taxi Zones shapefile.

    This function takes longitude values provided by `lon_var`, and latitude
    values provided by `lat_var` in DataFrame `df`, and performs a spatial join
    with the NYC taxi_zones shapefile.

    The shapefile path is hard coded, as this function makes a hard assumption
    of latitude and longitude coordinates (EPSG:4326). It also assumes
    latitude=0 and longitude=0 is not a datapoint that can exist in your
    dataset: missing coordinates are filled with 0 and any row with a zero
    longitude or latitude is skipped. That is reasonable for a dataset of
    New York, but bad for a global dataset.

    Only rows where `df[locid_var]` is null and both coordinates are present
    and nonzero are candidates for an update. All other rows keep their
    existing `locid_var` value. Candidate rows that intersect no zone are
    expected to come back as missing values from the left join.

    The index of `df` is assumed to be unique: after the spatial join,
    duplicated index labels are collapsed to their first occurrence so that a
    point lying on a zone boundary yields a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame (e.g. one partition of a dask DataFrame passed through
        `map_partitions`) containing latitude, longitude, and location_id
        columns.
    lon_var : string
        Name of column in `df` containing longitude values. Invalid values
        should be np.nan.
    lat_var : string
        Name of column in `df` containing latitude values. Invalid values
        should be np.nan.
    locid_var : string
        Name of column in `df` containing taxi_zone location ids. Rows with
        non-null values are not overwritten.

    Returns
    -------
    pandas.Series
        Series named `locid_var` holding the location ids. If no row needs a
        lookup, or the spatial join raises ValueError, `df[locid_var]` is
        returned unchanged.
    """

    localdf = df[[lon_var, lat_var, locid_var]].copy()
    # localdf = localdf.reset_index()
    # Missing coordinates become 0, which is treated as "no data" below.
    localdf[lon_var] = localdf[lon_var].fillna(value=0.)
    localdf[lat_var] = localdf[lat_var].fillna(value=0.)
    localdf['replace_locid'] = (localdf[locid_var].isnull()
                                & (localdf[lon_var] != 0.)
                                & (localdf[lat_var] != 0.))

    # Nothing to look up: skip the shapefile read and the heavy imports.
    if not np.any(localdf['replace_locid']):
        return df[locid_var]

    # Imported lazily so partitions that need no lookup do not pay for (or
    # require) the geospatial stack.
    import traceback

    import geopandas
    from shapely.geometry import Point

    shape_df = geopandas.read_file('../shapefiles/taxi_zones_latlon.shp')
    shape_df = shape_df.drop(
        ['OBJECTID', "Shape_Area", "Shape_Leng", "borough", "zone"], axis=1)

    try:
        # NOTE(review): newer geopandas releases may expect crs='EPSG:4326'
        # and `predicate=` instead of `op=` -- confirm against the installed
        # version before changing.
        local_gdf = geopandas.GeoDataFrame(
            localdf, crs={'init': 'epsg:4326'},
            geometry=[Point(xy) for xy in
                      zip(localdf[lon_var], localdf[lat_var])])

        local_gdf = geopandas.sjoin(
            local_gdf, shape_df, how='left', op='intersects')

        # one point can intersect more than one zone -- for example if on
        # the boundary between two zones. Deduplicate by taking first valid.
        local_gdf = local_gdf[~local_gdf.index.duplicated(keep='first')]

        # Keep the joined zone id only for rows flagged for replacement; every
        # other row gets its original location id back. `where` builds a new
        # Series instead of writing through `.values`, which may be a copy or
        # a read-only view.
        zone_ids = local_gdf['LocationID'].where(
            local_gdf['replace_locid'], local_gdf[locid_var])

        return zone_ids.rename(locid_var)
    except ValueError as ve:
        # Best effort: report the failure and fall back to the input column.
        print(ve)
        traceback.print_exc()
        return df[locid_var]


In [13]:
def get_uber():
    """Build a single dask DataFrame from the raw 2014 and 2015 Uber CSVs.

    Files are located under ``config['uber_raw_data_path']``:

    * 2014 files (``uber*-???14.csv``) are read with the columns
      pickup_datetime, pickup_latitude, pickup_longitude, junk1; they get a
      placeholder ``pickup_location_id`` of -999.
    * 2015 files (``uber*15.csv``) are read with the columns junk1,
      pickup_datetime, junk2, pickup_location_id; they get NaN pickup
      coordinates.

    The two frames are concatenated. Every non-junk field listed in
    ``dtype_list`` that is missing from the data is added with a default
    (NaN for floats, -999 for ints, "" for objects). ``dropoff_datetime`` is
    set to the 1970-01-01 epoch placeholder and ``trip_type`` to ``'uber'``.

    Returns
    -------
    dask.dataframe.DataFrame
        Lazy frame whose columns are sorted alphabetically.
    """
    uber_schema_2014 = "pickup_datetime,pickup_latitude,pickup_longitude,junk1"
    # Sorted so that partition order does not depend on filesystem listing order.
    uber_glob_2014 = sorted(
        glob(os.path.join(config['uber_raw_data_path'], 'uber*-???14.csv')))

    uber_schema_2015 = "junk1,pickup_datetime,junk2,pickup_location_id"
    uber_glob_2015 = sorted(
        glob(os.path.join(config['uber_raw_data_path'], 'uber*15.csv')))

    dtype_list = {
        # 'dropoff_datetime': np.int64,
        'dropoff_latitude': np.float64,
        'dropoff_location_id': np.int64,
        'dropoff_longitude': np.float64,
        'ehail_fee': np.float64,
        'extra': np.float64,
        'fare_amount': np.float64,
        'improvement_surcharge': np.float64,
        'junk1': object,
        'junk2': object,
        'mta_tax': np.float64,
        'passenger_count': np.int64,
        'payment_type': object,
    #     'pickup_datetime': object, # set by parse_dates in pandas read_csv
        'pickup_latitude': np.float64,
        'pickup_location_id': np.int64,
        'pickup_longitude': np.float64,
        'rate_code_id': np.int64,
        'store_and_fwd_flag': object,
        'tip_amount': np.float64,
        'tolls_amount': np.float64,
        'total_amount': np.float64,
        'trip_distance': np.float64,
        'trip_type': object,
        'vendor_id': object
    }

    uber1 = dd.read_csv(uber_glob_2014, header=0,
                         na_values=["NA"],
                         parse_dates=[0,],
                         infer_datetime_format = True,
                         dtype=dtype_list,
                         names=uber_schema_2014.split(','))
    uber1 = uber1.drop(['junk1',], axis=1)
    # NOTE(review): -999 is not null, so a consumer that only fills *null*
    # location ids (as assign_taxi_zones does) will leave these rows at -999
    # -- confirm this is intended.
    uber1 = uber1.assign(pickup_location_id=-999)

    uber2 = dd.read_csv(uber_glob_2015, header=0,
                         na_values=["NA"],
                         parse_dates=[1,],
                         infer_datetime_format = True,
                         dtype=dtype_list,
                         names=uber_schema_2015.split(','))
    uber2 = uber2.drop(['junk1', 'junk2'], axis=1)
    uber2 = uber2.assign(pickup_latitude=np.nan, pickup_longitude=np.nan)

    # Align column order before stacking the two years.
    uber1 = uber1[sorted(uber1.columns)]
    uber2 = uber2[sorted(uber2.columns)]

    # dd.concat replaces DataFrame.append, which is deprecated/removed in
    # current dask and pandas releases.
    uberdf = dd.concat([uber1, uber2])

    default_values = {np.float64: np.nan, np.int64: -999, object: ""}
    # The junk columns only exist to absorb unwanted CSV fields on read; they
    # were dropped above and must not be re-created here.
    junk_fields = ('junk1', 'junk2')

    missing_columns = {}
    for field, field_dtype in dtype_list.items():
        if field in junk_fields:
            continue
        if field in uberdf.columns:
            uberdf[field] = uberdf[field].astype(field_dtype)
        else:
            missing_columns[field] = default_values[field_dtype]
    if missing_columns:
        uberdf = uberdf.assign(**missing_columns)

    # Uber data carries no dropoff time; use the epoch as a placeholder.
    uberdf['dropoff_datetime'] = np.datetime64("1970-01-01 00:00:00")
    #uberdf = uberdf.repartition(npartitions=20)

    uberdf['trip_type'] = 'uber'

    uberdf = uberdf[sorted(uberdf.columns)]

    return uberdf

In [27]:
# Build the combined Uber frame, then run the taxi-zone lookup once for the
# dropoff columns and once for the pickup columns (same order as before).
u = get_uber()

for endpoint in ("dropoff", "pickup"):
    locid_column = endpoint + "_location_id"
    u[locid_column] = u.map_partitions(
        assign_taxi_zones,
        endpoint + "_longitude",
        endpoint + "_latitude",
        locid_column,
        meta=(locid_column, np.float64))

In [29]:
# Show the last rows of the pickup-related columns as a quick sanity check.
preview_columns = 'pickup_datetime,pickup_latitude,pickup_longitude,pickup_location_id'.split(',')
u[preview_columns].tail()

Unnamed: 0,pickup_datetime,pickup_latitude,pickup_longitude,pickup_location_id
1026663,2015-05-08 15:43:00,,,186
1026664,2015-05-08 15:43:00,,,263
1026665,2015-05-08 15:43:00,,,90
1026666,2015-05-08 15:44:00,,,45
1026667,2015-05-08 15:44:00,,,144


In [30]:
# Write the combined Uber frame to CSV without the index column.
# NOTE(review): dask is expected to expand the `*` in the filename per
# partition (one file each) -- confirm against the dask docs.
# NOTE(review): the absolute output path is hard-coded; consider moving it
# into config.json alongside 'uber_raw_data_path'.
u.to_csv('/bigdata/csv/uber-*.csv', index=False)