In [46]:
import pandas as pd
import geopandas as gpd
import networkx as nx
import numpy as np
import numpy.linalg as linalg
import matplotlib.pyplot as plt
import pickle
import momepy
import requests 
import zipfile
import os.path

# Data sources
# Default file names; every path is relative to the notebook's working directory.
# Street segment geometries (passed to preprocess_infrastructure, read with gpd.read_file).
filename_infrastructure = 'data_unwrangled/20221017_Centerline-clipped.shp'
# Crash records (passed to preprocess_collisions, read with pd.read_csv).
filename_collisions = 'data_unwrangled/Motor_Vehicle_Collisions_-_Crashes.csv'
# Area polygons; the driver cell keeps the rows whose boro_name is 'Manhattan' as the spatial filter.
filename_boundaries = 'data_unwrangled/2010 Neighborhood Tabulation Areas (NTAs).geojson'
# Pickled cache of per-node-pair flows, read and rewritten by flow_on_month.
filename_saved = 'data_unwrangled/flows.pickle'
# Weather table with a DATE column (read with pd.read_csv in the driver cell).
filename_weather = 'data_unwrangled/weather.csv'

In [11]:
def preprocess_infrastructure(filename, restrict_bicycle=False, show_plot=False):
    """Load the street segments and reduce them to a single connected network.

    Parameters
    ----------
    filename : str
        Path of the segment file, read with ``gpd.read_file``.
    restrict_bicycle : bool, default False
        When True, keep only segments whose ``BIKE_LANE`` value is not null.
    show_plot : bool, default False
        When True, draw the final graph with every node placed at its own
        coordinates.

    Returns
    -------
    infrastructure : GeoDataFrame
        Edge table produced by ``momepy.nx_to_gdf`` from the final graph.
    graph : networkx graph
        Primal graph built by ``momepy.gdf_to_nx`` from the reduced segments.

    Raises
    ------
    AssertionError
        If the file contains anything other than borough code ``'1'``.
    """
    # Load data
    infrastructure = gpd.read_file(filename)

    # All in Manhattan
    borocodes = list(infrastructure['borocode'].unique())
    assert borocodes == ['1'], f"expected only borocode '1', found {borocodes}"

    # Restrict to segments within the bicycle network
    if restrict_bicycle:
        # .copy() so the column assignment below writes to an independent frame
        # rather than to a slice of the frame that was just loaded.
        infrastructure = infrastructure[infrastructure['BIKE_LANE'].notna()].copy()

    # Add ID column
    infrastructure['segment_ID'] = infrastructure.index

    graph = momepy.gdf_to_nx(infrastructure, approach="primal")

    # Keep one large connected component. The candidates are collected first
    # and the subgraph is taken exactly once: taking a subgraph of an already
    # restricted view with the nodes of a *different* component would leave
    # an empty graph. When no component exceeds 1000 nodes the graph is kept
    # as it is.
    large_components = [
        component
        for component in nx.connected_components(graph)
        if len(component) > 1000
    ]
    if large_components:
        graph = graph.subgraph(max(large_components, key=len))

    _, infrastructure = momepy.nx_to_gdf(graph)

    # Round-trip once more so the returned graph is built from the reduced
    # edge table, and the returned table is derived from that same graph.
    graph = momepy.gdf_to_nx(infrastructure, approach="primal")
    _, infrastructure = momepy.nx_to_gdf(graph)

    if show_plot:
        # Nodes are indexable by position 0 and 1; use those as x and y.
        positions = {n: [n[0], n[1]] for n in graph.nodes}
        f, ax = plt.subplots(1, 1, figsize=(6, 10), sharex=True, sharey=True)
        nx.draw(graph, positions, ax=ax, node_size=5)

    return infrastructure, graph

# TODO: model additional constraints, e.g. permitted turn directions and which roads meet at each intersection.

In [12]:
def preprocess_collisions(filename, filter, infrastructure):
    """Load crash records and keep bicycle crashes that lie on the network.

    Parameters
    ----------
    filename : str
        Path of the crash CSV. Rows missing ``LATITUDE``, ``LONGITUDE`` or
        ``CRASH DATE`` are dropped.
    filter : GeoDataFrame
        Polygons used to spatially restrict the crashes (via ``sjoin``).
    infrastructure : GeoDataFrame
        Street segments; each crash is attached to its nearest segment.

    Returns
    -------
    GeoDataFrame
        One row per retained crash (``collision_ID``), dated 2016-01-01 or
        later, joined with the columns of ``filter`` and of the nearest
        segment.
    """
    collisions = pd.read_csv(filename).dropna(subset=['LATITUDE', 'LONGITUDE', 'CRASH DATE'])

    # Restrict to collisions involving bicycles: a row qualifies when any of
    # the five vehicle-type columns contains any marker (case-insensitive).
    markers = ['bike', 'bicyc', 'e - b', 'e-bik', 'e-unicycle', 'bk']
    pattern = '|'.join(markers)
    mask = pd.Series(False, index=collisions.index)
    for i in range(1, 6):
        # na=False: an empty vehicle slot is an explicit "no match" instead
        # of a NaN that would leak into the boolean mask.
        mask |= collisions[f'VEHICLE TYPE CODE {i}'].str.contains(pattern, case=False, na=False)
    collisions = collisions.loc[mask]

    # Restrict to filter
    collisions = collisions[collisions.LONGITUDE != 0]  # remove 0,0 coordinates
    collisions = gpd.GeoDataFrame(
        collisions,
        geometry=gpd.points_from_xy(collisions.LONGITUDE, collisions.LATITUDE),
    )

    collisions = collisions.sjoin(filter)
    collisions = collisions.drop(columns=['index_right'], errors='ignore')

    # Add ID column
    collisions['collision_ID'] = collisions.index

    # Connect collisions to infrastructure
    # NOTE(review): max_distance is expressed in the layers' coordinate units,
    # presumably degrees here -- TODO confirm the CRS of both layers.
    collisions = collisions.sjoin_nearest(infrastructure, max_distance=0.0001, how='inner')
    collisions = collisions.drop(columns=['index_right', 'index_left'], errors='ignore')

    # A crash matched by several rows of the joins is kept once.
    collisions = collisions.drop_duplicates(subset=['collision_ID'], keep='first')

    ## Let's start in 2016 since previous years of citibike have far fewer rides
    start_date = pd.Timestamp('2016-01-01')
    collisions['CRASH DATE'] = pd.to_datetime(collisions['CRASH DATE'])
    collisions = collisions.loc[collisions['CRASH DATE'] >= start_date]

    return collisions

In [13]:
def preprocess_citibike(year, month, filter):
    """Load one month of Citi Bike trips and keep rides inside ``filter``.

    Parameters
    ----------
    year : str or int
        Year of the trip file, e.g. ``'2016'``.
    month : str or int
        Month of the trip file; zero-padded to two characters, so ``1`` and
        ``'01'`` name the same file.
    filter : GeoDataFrame
        Polygons; a ride is kept only when both its start and its end station
        join to it.

    Returns
    -------
    GeoDataFrame
        The retained rides with ``ride_ID``, ``start_geom``, ``end_geom``, a
        parsed ``starttime`` and its day-floored ``starttime_rounded``.
        Rides whose ``tripduration`` exceeds 24 * 60 * 60 are removed.
    """
    # Honour the arguments: the file name is derived from year and month
    # instead of always pointing at January 2016.
    month = str(month).zfill(2)
    filename_citibike = f'data_unwrangled/citibike/{year}{month}-citibike-tripdata.csv'
    citibike = gpd.GeoDataFrame(pd.read_csv(filename_citibike))

    citibike['ride_ID'] = citibike.index

    for endpoint in ['start', 'end']:
        citibike[f'{endpoint}_geom'] = gpd.points_from_xy(
            citibike[f'{endpoint} station longitude'],
            citibike[f'{endpoint} station latitude'],
        )
        # The spatial join acts on the active geometry, so switch it to the
        # endpoint currently being filtered.
        citibike = citibike.set_geometry(f'{endpoint}_geom')
        citibike = citibike.sjoin(filter)
        citibike = citibike.drop(columns=['index_right'], errors='ignore')
        # A station matching several polygons would duplicate the ride.
        citibike = citibike.drop_duplicates(subset=['ride_ID'], keep='first')

    citibike['starttime'] = pd.to_datetime(citibike['starttime'])
    citibike['starttime_rounded'] = citibike['starttime'].dt.floor('d')
    citibike = citibike[citibike['tripduration'] <= 24 * 60 * 60]

    return citibike



In [14]:
def unique_stations(citibike, infrastructure, nodes):
    """Build a table with one row per station, matched to a segment and a node.

    Stations are collected from both the start and the end side of the rides,
    de-duplicated on ``station_id``, joined to the nearest row of
    ``infrastructure`` and then to the nearest entry of ``nodes``. The
    position of that entry in ``nodes`` is stored in ``node_index``.
    """
    # Gather the stations seen on either side of a ride
    per_endpoint = []
    for endpoint in ['start', 'end']:
        id_column = f'{endpoint} station id'
        renaming = {id_column: 'station_id', f'{endpoint}_geom': 'geometry'}
        first_seen = citibike.drop_duplicates(subset=[id_column], keep='first')
        first_seen = first_seen.rename(columns=renaming)
        per_endpoint.append(first_seen[list(renaming.values())])
    stations = pd.concat(per_endpoint)

    # A station that shows up on both sides is kept once
    stations.drop_duplicates(subset=['station_id'], keep='first', inplace=True)

    # Attach the nearest segment to every station
    stations = stations.set_geometry('geometry')
    stations = stations.sjoin_nearest(infrastructure, max_distance=0.01, how='left')
    if 'index_right' in stations.columns:
        stations.drop(columns=['index_right'], inplace=True)
    stations.drop_duplicates(subset=['station_id'], keep='first', inplace=True)

    # Attach the nearest graph node; the join's right-hand index becomes node_index
    node_points = np.array(nodes)
    node_xs = node_points[:, 0]
    node_ys = node_points[:, 1]
    node_geometry = gpd.GeoDataFrame(nodes, geometry=gpd.points_from_xy(node_xs, node_ys))
    nearest_nodes = stations.sjoin_nearest(node_geometry, max_distance=0.01, how='left')
    stations = nearest_nodes.rename(columns={'index_right': 'node_index'})
    stations.drop_duplicates(subset=['station_id'], keep='first', inplace=True)

    return stations

In [18]:

# Graph convention: edges correspond to road segments,
# and nodes correspond to intersections.

def calculate_flow(start, end, station_to_node, nodes, Lpinv, infrastructure, saved):
    """Return the per-segment flow for one station pair, caching it by node pair.

    Parameters
    ----------
    start, end : hashable
        Station ids; they must differ.
    station_to_node : dict
        Maps a station id to its graph node.
    nodes : list
        Graph nodes; the position of a node in this list is its row/column
        in ``Lpinv``.
    Lpinv : numpy.ndarray or numpy.matrix
        Pseudo-inverse of the graph Laplacian. Both container types are
        accepted.
    infrastructure : DataFrame
        Segment table whose ``node_start`` / ``node_end`` columns hold integer
        positions used to index the voltage vector.
    saved : dict
        Cache keyed by the ordered node pair; updated in place.

    Returns
    -------
    flow : numpy.ndarray
        One value per row of ``infrastructure``: the squared difference of
        the (rounded) voltages at the segment's two nodes, or all zeros when
        the resistance between the two nodes is zero.
    saved : dict
        The same cache object that was passed in.
    """
    assert start != end
    start = station_to_node[start]
    end = station_to_node[end]
    # The flow is symmetric in its endpoints, so both directions share a key.
    key = (start, end) if start < end else (end, start)
    if key not in saved:
        # Look up each node's position once; list.index is a linear scan.
        i_start = nodes.index(start)
        i_end = nodes.index(end)

        # Flatten to a 1-D ndarray so that the indexing below means the same
        # thing whether Lpinv is an ndarray or an np.matrix.
        potential = np.asarray(Lpinv[i_start] - Lpinv[i_end], dtype=np.float64).ravel()
        resistance = potential[i_start] - potential[i_end]
        if resistance != 0:
            voltages = (potential / resistance).round(5)
            edge_start = np.asarray(infrastructure['node_start'], dtype=np.intp)
            edge_end = np.asarray(infrastructure['node_end'], dtype=np.intp)
            saved[key] = (voltages[edge_start] - voltages[edge_end]) ** 2
        else:
            # Two stations may map to the same node; nothing flows then.
            saved[key] = np.zeros(len(infrastructure), dtype=np.float64)

    return saved[key], saved

def flow_on_month(citibike, infrastructure, station_to_node, nodes, Lpinv):
    """Accumulate the per-segment flow for every day present in ``citibike``.

    For each distinct ``starttime_rounded`` value, rides are counted per
    (start station, end station) pair, rides that start and end at the same
    station are ignored, and the flow of each pair (from ``calculate_flow``)
    is added to the day's total, weighted by its ride count.

    The pair flows are cached in the pickle at ``filename_saved``: it is read
    at the start when it exists and rewritten at the end.

    Returns
    -------
    dict
        Maps each day to an array with one total per row of
        ``infrastructure``. Segments without any flow receive the day's
        smallest non-zero total; a day with no flow at all stays all zero.
    """
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache
    # file that this notebook wrote itself.
    if os.path.isfile(filename_saved):
        with open(filename_saved, 'rb') as pickle_file:
            saved = pickle.load(pickle_file)
    else:
        # First run: there is no cache yet.
        saved = {}

    n_segments = len(infrastructure['segment_ID'])
    pair_columns = ['start station id', 'end station id']

    month = {}
    for day in citibike.starttime_rounded.unique():
        citibike_day = citibike[citibike['starttime_rounded'] == day][pair_columns]
        grouped = citibike_day.groupby(pair_columns).size().reset_index(name='count')

        # Round trips carry no flow between two distinct stations.
        grouped = grouped[grouped['start station id'] != grouped['end station id']]

        total = np.zeros(n_segments, dtype=np.float64)
        for (start, end, count) in zip(grouped['start station id'], grouped['end station id'], grouped['count']):
            current, saved = calculate_flow(start, end, station_to_node, nodes, Lpinv, infrastructure, saved)
            total += count * current

        # Ensure each segment has at least some flow. A day without any
        # positive flow has no minimum to borrow, so it is left untouched.
        nonzero = total[total != 0]
        if nonzero.size:
            total[total == 0] = nonzero.min()
        month[day] = total

    with open(filename_saved, 'wb') as pickle_file:
        pickle.dump(saved, pickle_file)

    return month

In [None]:
# Wrap downloading
def download_zip(url, filename):
    """Download ``url`` to ``filename`` unless that file already exists.

    The response is streamed into ``<filename>.part`` and renamed only once
    the transfer has finished, so an interrupted or failed download is never
    mistaken for a finished one on the next run.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. 404 for a month
        that is not published).
    """
    if os.path.isfile(filename):
        print('Already downloaded')
        return

    print('Downloading...')
    partial = f'{filename}.part'
    try:
        with requests.get(url, stream=True, timeout=60) as response:
            # Without this check an error page would be saved as the archive.
            response.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial, filename)
    finally:
        # Only still present when something above failed.
        if os.path.isfile(partial):
            os.remove(partial)
    print('Downloaded')

def unzip(filename_zipped, directory, filename_unzipped):
    """Extract a zip archive into ``directory`` unless the output already exists.

    ``filename_unzipped`` is only used as the "already done" marker: when it
    is an existing file the archive is not opened at all. Otherwise every
    member of ``filename_zipped`` is extracted into ``directory``.
    """
    already_there = os.path.isfile(filename_unzipped)
    if already_there:
        print('Already unzipped')
        return

    print('Unzipping...')
    with zipfile.ZipFile(filename_zipped, 'r') as archive:
        archive.extractall(directory)
    print('Unzipped')

def download_citibike(year, month):
    """Download and unpack the Citi Bike trip archive for one month.

    Parameters
    ----------
    year : str or int
        Year of the archive, e.g. ``'2016'``.
    month : str or int
        Month of the archive; zero-padded to two characters, so ``1`` and
        ``'01'`` name the same archive.

    The archive is stored in ``data_unwrangled/citibike/`` (created when it
    does not exist) and extracted into that same directory. Both steps are
    skipped when their output file is already present.
    """
    month = str(month).zfill(2)
    directory = 'data_unwrangled/citibike/'
    url = f"https://s3.amazonaws.com/tripdata/{year}{month}-citibike-tripdata.zip"
    save_path_zip = f"data_unwrangled/citibike/{year}{month}-citibike-tripdata.zip"
    save_path = f"data_unwrangled/citibike/{year}{month}-citibike-tripdata.csv"

    # Writing the archive fails when the target directory is missing.
    os.makedirs(directory, exist_ok=True)

    download_zip(url, save_path_zip)
    unzip(save_path_zip, directory, save_path)

def delete_citibike(year, month):
    """Remove the downloaded archive and the extracted CSV for one month.

    Parameters
    ----------
    year : str or int
        Year of the files, e.g. ``'2016'``.
    month : str or int
        Month of the files; zero-padded to two characters.

    A file that is already gone is skipped, so the other file is still
    removed and calling the function twice is harmless.
    """
    month = str(month).zfill(2)
    save_path_zip = f"data_unwrangled/citibike/{year}{month}-citibike-tripdata.zip"
    save_path = f"data_unwrangled/citibike/{year}{month}-citibike-tripdata.csv"
    for path in (save_path, save_path_zip):
        try:
            os.remove(path)
        except FileNotFoundError:
            # Nothing to clean up for this file.
            pass

In [53]:
# Driver cell: load every data source and compute the flows for one month.
years = [2016, 2017, 2018, 2019, 2020, 2021]
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

# Load filter of Manhattan
# Perhaps get a *connected* filter?
print('Filter loading...')
filter = gpd.read_file(filename_boundaries)
filter = filter[filter['boro_name'] == 'Manhattan']

print('Infrastructure loading...')
infrastructure, graph = preprocess_infrastructure(filename_infrastructure)
# The node order fixed here defines the row/column order of the Laplacian.
nodes = list(graph.nodes())
Laplacian = nx.laplacian_matrix(graph, nodelist=nodes)
print('Laplacian being inverted...')
Lpinv = linalg.pinv(Laplacian.todense()) # 4 min to run :(

print('Collisions loading...')
collisions = preprocess_collisions(filename_collisions, filter, infrastructure)

print('Citibike loading...')
# File names are built as f"{year}{month}", so the year must be '2016' and
# the month '01' (the two values used to be assigned the other way round).
year = '2016'
month = '01'
download_citibike(year, month)
citibike = preprocess_citibike(year=year, month=month, filter=filter)
stations = unique_stations(citibike, infrastructure, nodes)
delete_citibike(year, month)

print('Calculating flow...')
# node_index is a position in `nodes`; translate it to the node itself.
station_to_node = {station_id : nodes[index] for (station_id, index) in zip(stations.station_id, stations.node_index) }
monthly_flow = flow_on_month(citibike, infrastructure, station_to_node, nodes, Lpinv)

# Load weather data
print('Weather loading...')
weather = pd.read_csv(filename_weather)
weather['DATE'] = pd.to_datetime(weather['DATE'])

Filter loading...
Calculating flow...
