In [10]:
import pandas as pd
import geopandas as gpd
import networkx as nx
import numpy as np
from datetime import datetime
from dateutil import parser
from azureml.opendatasets import NycTlcYellow


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


### Preprocess taxi
Note: This takes ~2.5 minutes when report_missing=False, and ~5 minutes when report_missing=True

In [37]:
def preprocess_filter(filename):
    filter = gpd.read_file(filename)
    filter = filter[filter['boro_name'] == 'Manhattan']
    return filter

def filter_taxi_trips(df):
    # Make sure rides are longer than one minute
    df = df[df['tpepDropoffDateTime'] - df['tpepPickupDateTime'] > np.timedelta64(1, 'm')]
    # Make sure rides are longer than .1 mile
    df = df[df['tripDistance'] > 0.1]
    # Make sure fare is non-zero 
    df = df[df['fareAmount'] > 0.0]
    return df

# TODO: redo for taxi trips lol
# TODO: filter start and end so that they are in manhattan
# TODO: let's see how many trips we lose when we filter out start/end
def preprocess_taxi(year, month, filter, report_missing=False):
    import calendar
    month_last_day = calendar.monthrange(year=int(year),month=int(month))[1]
    start_date = parser.parse(str(year)+'-'+str(month)+'-01')
    end_date = parser.parse(str(year)+'-'+str(month)+'-'+str(month_last_day))

    nyc_tlc = NycTlcYellow(start_date=start_date, end_date=end_date)
    taxi = nyc_tlc.to_pandas_dataframe()
    taxi = filter_taxi_trips(taxi)
    
    taxi = gpd.GeoDataFrame(taxi)
    taxi['ride_ID'] = taxi.index

    total_start_rides = -1
    total_end_rides = -1
    full_taxi = None
    for type in ['start', 'end']:
        # For tracking start/end routes outside manhattan
        if type == 'start' and report_missing:
            full_taxi = taxi.copy(deep=True)
            full_taxi = gpd.GeoDataFrame(full_taxi)

        taxi[f'{type}_geom'] = gpd.points_from_xy(taxi[f'{type}Lon'], taxi[f'{type}Lat'])
        taxi.set_geometry(f'{type}_geom', inplace=True)
        taxi = taxi.sjoin(filter)

        if 'index_right' in taxi.columns:
            taxi.drop(columns=['index_right'], inplace=True)
        taxi.drop_duplicates(subset=['ride_ID'], keep='first', inplace=True)

        # For tracking start/end routes outside manhattan
        if type == 'start' and report_missing:
            total_start_rides = len(taxi)
            print('Total start: ' + str(total_start_rides))
        elif report_missing:
            full_taxi[f'{type}_geom'] = gpd.points_from_xy(full_taxi[f'{type}Lon'], full_taxi[f'{type}Lat'])
            full_taxi.set_geometry(f'{type}_geom', inplace=True)
            end_taxi = full_taxi.sjoin(filter)
            end_taxi.drop_duplicates(subset=['ride_ID'], keep='first', inplace=True)
            total_end_rides = len(end_taxi)
            print('Total end: ' + str(total_end_rides))

    # Track how many rides either start OR end in manhattan, but not both   
    dropped = ((total_start_rides - len(taxi)) + (total_end_rides - len(taxi))) / (total_start_rides + total_end_rides - len(taxi))
    print('Dropped fraction: ' + str(dropped))

    taxi['starttime'] = pd.to_datetime(taxi['tpepPickupDateTime'])
    taxi['starttime_rounded'] = taxi['starttime'].dt.floor('d')

    return taxi

manhattan_filter = preprocess_filter('data_unwrangled/2010 Neighborhood Tabulation Areas (NTAs).geojson')
taxi = preprocess_taxi('2013', '7', manhattan_filter, report_missing=True)

[Info] read from /var/folders/qj/gh_j11514m37mqtfrlr885k40000gn/T/tmpf0lmau3b/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2013/puMonth=7/part-00000-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426339-60.c000.snappy.parquet
[Info] read from /var/folders/qj/gh_j11514m37mqtfrlr885k40000gn/T/tmpf0lmau3b/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2013/puMonth=7/part-00008-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426341-60.c000.snappy.parquet
[Info] read from /var/folders/qj/gh_j11514m37mqtfrlr885k40000gn/T/tmpf0lmau3b/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2013/puMonth=7/part-00016-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426328-58.c000.snappy.parquet
[Info] read from /var/folders/qj/gh_j11514m37mqtfrlr885k40000gn/T/tmpf0lmau3b/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2013/puMonth=7/part-00001-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  return geopandas.sjoin(left_df=self, right_df=df, *args, **kwargs)


Total start: 691138


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  return geopandas.sjoin(left_df=self, right_df=df, *args, **kwargs)
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  return geopandas.sjoin(left_df=self, right_df=df, *args, **kwargs)


Total end: 691138
Dropped fraction: 0.15196331403665922
