In [1]:
import pandas as pd
import geopandas as gpd
import networkx as nx
import numpy as np
from datetime import datetime
from dateutil import parser
# pip install azureml-opendatasets-runtimeusing
from azureml.opendatasets import NycTlcYellow
import calendar
import numpy.linalg as linalg
import matplotlib.pyplot as plt
import pickle
import momepy
import itertools
# torch stuff
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.optim import Adam, lr_scheduler
from tqdm import tqdm
import copy


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Only need to run this function once
def preprocess_lion():
    # Download data from https://www.dropbox.com/sh/927yoof5wq6ukeo/AAA--Iyb7UUDhfWIF2fncppba?dl=0
    # Put all files into 'data_unwrangled/LION' or change path below
    lion_folder = 'data_unwrangled/LION/'
    # Load all LION data
    links = gpd.read_file(lion_folder+'links.shp')
    # Only consider links in Manhattan
    links = links[links['LBoro']==1]
    # Only consider links that are normal streets
    links = links[links['FeatureTyp']=='0']
    # Only consider constructed links
    links = links[links['Status']=='2']
    # Only consider links that have vehicular traffic
    links = links[links['TrafDir'] != 'P']
    # Make sure there is a speed limit for each link
    links = links[links['POSTED_SPE'].notnull()]
    # Expected time to travel link at posted speed
    links['expected_time'] = links['POSTED_SPE'].astype(int)*links['SHAPE_Leng']
    # Ensure *undirected* graph is connected
    # Note: We could do this for directed graph but maximum size
    # of strongly connected component is 430
    graph = momepy.gdf_to_nx(links, approach="primal", directed=False)
    for component in nx.connected_components(graph):
        if len(component) > 10000:
            graph = graph.subgraph(component)
    # Use resulting links as infrastructure
    _, links = momepy.nx_to_gdf(graph)
    links.drop(columns=['node_start', 'node_end'], inplace=True)
    # Save both links so we can use it to construct directed graph
    links.to_file('data/links.json', driver='GeoJSON')
    # Load nodes
    nodes = gpd.read_file(lion_folder+'nodes.shp')
    # Drop unnecessary columns
    nodes.drop(columns=['OBJECTID_1', 'OBJECTID', 'GLOBALID', 'VIntersect'], inplace=True)
    # Find nodes that are connected to surviving links
    node_IDs = np.union1d(links['NodeIDFrom'], links['NodeIDTo']).astype(int)
    # Select nodes that are connected to surviving links
    selected_nodes = nodes[nodes['NODEID'].isin(node_IDs)]
    # Save to file
    selected_nodes.to_file('data/nodes.json', driver='GeoJSON')

def load_filter():
    filename_filter = 'data_unwrangled/2010 Neighborhood Tabulation Areas (NTAs).geojson'
    filter = gpd.read_file(filename_filter)
    filter = filter[filter['boro_name'] == 'Manhattan']
    return filter

def connect_collisions_to_nodes(collisions, nodes):
    collisions.to_crs(nodes.crs, inplace=True)
    return collisions.sjoin_nearest(nodes).drop(columns=['index_right'])

# Only need to run this function once for each year
def preprocess_collisions(year=2013):
    filename_collisions = 'data_unwrangled/Motor_Vehicle_Collisions_-_Crashes.csv'
    # Load collisions and drop empty rows
    df = pd.read_csv(filename_collisions, low_memory=False).dropna(subset=['LATITUDE', 'LONGITUDE', 'CRASH DATE'])
    # Drop empty location data
    df = df[df.LONGITUDE != 0] # remove 0,0 coordinates
    # Convert date to datetime
    df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
    # Get year
    df['year'] = df['CRASH DATE'].dt.year
    # Convert to geodataframe
    gdf = gpd.GeoDataFrame(df, crs='epsg:4326', geometry=gpd.points_from_xy(df.LONGITUDE, df.LATITUDE))
    # Filter to Manhattan
    gdf = gdf.sjoin(load_filter()).drop(columns=['index_right'])
    # Subset to year
    gdf_year = gdf[gdf['year']==year]
    # Load nodes
    nodes = gpd.read_file('data/nodes.json')
    # Connect collisions to nodes
    gdf_year = connect_collisions_to_nodes(gdf_year, nodes)
    # Save to file
    gdf_year.to_file(f'data/collisions_{year}.json', driver='GeoJSON')

In [3]:
def preprocess_taxi(df):
    # Make sure rides are longer than one minute
    df = df[df['tpepDropoffDateTime'] - df['tpepPickupDateTime'] > np.timedelta64(1, 'm')]
    # Make sure rides are shorter than 12 hours
    df = df[df['tpepDropoffDateTime'] - df['tpepPickupDateTime'] <= np.timedelta64(12, 'h')]
    # Make sure rides are longer than .1 mile
    df = df[df['tripDistance'] > 0.1]
    # Make sure fare is non-zero 
    df = df[df['fareAmount'] > 0.0]
    # Convert to geopandas
    gdf = gpd.GeoDataFrame(df)
    # Reset index ID (there are duplicate indices)
    gdf.reset_index(inplace=True)
    # Create ride ID
    gdf['ride_id'] = gdf.index
    # Make start time date time type
    gdf['start_time'] = pd.to_datetime(gdf['tpepPickupDateTime'])
    # Round start time to day
    gdf['start_day'] = gdf['start_time'].dt.round('d')
    return gdf

def filter_location(type, filter, taxi, make_copy=True):
    # Create a geometry column from the type coordinates
    taxi[f'{type}_geom'] = gpd.points_from_xy(taxi[f'{type}Lon'], taxi[f'{type}Lat'])
    taxi.set_geometry(f'{type}_geom', crs='epsg:4326', inplace=True)
    taxi = taxi.sjoin(filter).drop(columns=['index_right'])
    return taxi

def restrict_start_end(taxi, check_ratio=False):        
    # Load Manhattan objects
    filter_manhattan = load_filter()
    # Restrict to rides that start in Manhattan
    taxi_start = filter_location('start', filter_manhattan, taxi)
    # Restrict to rides that start and end in Manhattan
    taxi_start_end = filter_location('end', filter_manhattan, taxi_start)
    if check_ratio:
        # Check number of rides that start AND end in Manhattan / number of rides that start OR end in Manhattan
        taxi_end = filter_location('end', filter_manhattan, taxi)
        print(len(taxi_start_end)/(len(taxi_start)+len(taxi_end)-len(taxi_start_end))) # About 85%
    return taxi_start_end

def get_taxi_data(year, month):
    # Get query for first and last day of month in year
    month_last_day = calendar.monthrange(year=int(year),month=int(month))[1]
    start_date = parser.parse(str(year)+'-'+str(month)+'-01')
    end_date = parser.parse(str(year)+'-'+str(month)+'-'+str(month_last_day))
    end_date = parser.parse(str(year)+'-'+str(month)+'-04')
    print('Loading taxi data...', end=' ')
    nyc_tlc = NycTlcYellow(start_date=start_date, end_date=end_date)
    taxi_all = nyc_tlc.to_pandas_dataframe()
    print('complete!')
    print('Preprocessing data...', end=' ')
    taxi = preprocess_taxi(taxi_all)
    print('complete!')
    print('Restricting start and end...', end=' ')
    taxi_start_end = restrict_start_end(taxi)
    print('complete!')

    return taxi_start_end

In [4]:
def get_directed_graph(links):
    # Edges from NodeIDFrom to NodeIDTo for one-way "with" streets and two-way streets
    graph1 = nx.from_pandas_edgelist(
        links[np.logical_or(links['TrafDir'] == 'W', links['TrafDir'] == 'T')],
        source='NodeIDFrom', target='NodeIDTo', edge_attr=True, create_using=nx.DiGraph()
    )
    # Edges from NodeIDTo to NodeIDFrom for one-way "against" streets and two-way streets
    graph2 = nx.from_pandas_edgelist(
        links[np.logical_or(links['TrafDir'] == 'A', links['TrafDir'] == 'T')],
        source='NodeIDTo', target='NodeIDFrom', edge_attr=True, create_using=nx.DiGraph()
    )
    return nx.compose(graph1, graph2)

def connect_taxi_to_nodes(taxi, type_name, nodes):    
    taxi.set_geometry(type_name+'_geom', inplace=True)
    taxi.to_crs(nodes.crs, inplace=True)
    result = taxi.sjoin_nearest(nodes).drop(columns=['index_right'])
    result.rename(columns={'NODEID': type_name+'_NODEID'}, inplace=True)
    return result



In [5]:
# About 8 minutes for one million trips
def get_flows(taxi, graph, links):
    # Initialize dictionary for fast access
    flows = {np.datetime_as_string(day, unit='D') : {edge_id : 0 for edge_id in links['OBJECTID']} for day in taxi['start_day'].unique()}
    # Sort by start node so we can re-use predecessor graph
    taxi_sorted = taxi.sort_values(by='start_NODEID')
    previous_source = None
    for source, target, day in zip(taxi_sorted['start_NODEID'], taxi_sorted['end_NODEID'], taxi_sorted['start_day']):
        # Networkx pads node ID with leading zeroes
        source_padded = str(source).zfill(7)
        target_padded = str(target).zfill(7)
        day_pretty = np.datetime_as_string(np.datetime64(day), unit='D')
        # If we haven't already computed the predecessor graph
        if previous_source != source_padded:
            # Compute predecessor graph
            pred, dist = nx.dijkstra_predecessor_and_distance(graph, source=source_padded, weight='expected_time') 
            previous_source = source_padded
        # We ignore taxi rides that appear infeasible in the directed graph
        if target_padded in pred:
            current, previous = target_padded, None
            # Work our way backwards through the predecessor graph until we find the source
            while current != source_padded:
                current, previous = pred[current][0], current
                edge_id = graph.edges[current, previous]['OBJECTID']   
                # Update flows data structure as we go
                flows[day_pretty][edge_id] += 1
    # Convert to dataframe
    return pd.DataFrame.from_dict(flows)

In [6]:
class TrafficDataset(Dataset):
    def __init__(self):
        # Should take under a minute to load
        self.links = gpd.read_file('data/links.json')        
        self.nodes = gpd.read_file('data/nodes.json')
        self.graph = get_directed_graph(self.links)
        self.collisions = gpd.read_file('data/collisions_2013.json')
        self.weather = pd.read_csv('data/weather.csv')
        self.weather['date'] = pd.to_datetime(self.weather.DATE)
        years = ['2013']
        months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
        self.year_months = [(year, month) for year in years for month in months]
    
    def __len__(self):
        return len(self.year_months)
    
    def __getitem__(self, idx):
        year, month = self.year_months[idx]
        # If you're getting throttled, reset router IP address and computer IP address
        taxi = get_taxi_data(year, month)
        taxi = connect_taxi_to_nodes(taxi, 'start', self.nodes)
        taxi = connect_taxi_to_nodes(taxi, 'end', self.nodes)
        # Takes 8 minutes to run on 1 million trips
        flows = get_flows(taxi, self.graph, self.links)
        return torch.tensor(pd.DataFrame(taxi)), torch.tensor(pd.DataFrame(flows))

In [7]:
dataset = TrafficDataset()
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [8]:
_, flows = next(iter(dataloader))

Loading taxi data... [Info] read from /var/folders/qj/gh_j11514m37mqtfrlr885k40000gn/T/tmpcj8ekdbo/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2013/puMonth=12/part-00000-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426339-65.c000.snappy.parquet
[Info] read from /var/folders/qj/gh_j11514m37mqtfrlr885k40000gn/T/tmpcj8ekdbo/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2013/puMonth=12/part-00008-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426341-65.c000.snappy.parquet
[Info] read from /var/folders/qj/gh_j11514m37mqtfrlr885k40000gn/T/tmpcj8ekdbo/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2013/puMonth=12/part-00016-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426328-63.c000.snappy.parquet
[Info] read from /var/folders/qj/gh_j11514m37mqtfrlr885k40000gn/T/tmpcj8ekdbo/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2013/puMonth=12/part-00001-tid-889885883265882340

ValueError: could not determine the shape of object type 'DataFrame'

In [None]:
flows # row corresponds to link, column corresponds to day (number of people routed through that link on that day)
# link in each direction 
# make data loader load vectors we care about

In [41]:
for taxi, flows in dataloader:
    print(taxi)
    print(flows)

Loading taxi data... 

KeyboardInterrupt: 

In [38]:
class ConvGraphNet(torch.nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, hidden_count=1, dropout_percent=0.05):
        super(ConvGraphNet, self).__init__()
        self.dropout_percent = dropout_percent

        # Input layer
        self.input_layer = GCNConv(input_dim, hidden_dim)

        # Scalable hidden layers
        hidden_layers = []
        for _ in range(hidden_count):
            hidden_layers.append(GCNConv(hidden_dim, hidden_dim))
        self.hidden_layers = hidden_layers

        # Output layer
        self.output_layer = GCNConv(hidden_dim, output_dim)

    def forward(self, input, edge_index, labels=None):
        input = F.dropout(input, self.dropout_percent, self.training)
        input = F.relu(self.input_layer(input, edge_index, edge_weight=None))

        for hidden_layer in self.hidden_layers:
            input = F.relu(hidden_layer(input, edge_index))

        input = F.dropout(input, self.dropout_percent, self.training)
        input = self.output_conv(input, edge_index)

        if labels is None:
            return input

        loss = nn.CrossEntropyLoss()(input, labels)
        return input, loss


In [None]:
num_epochs = 100
num_batches = 1
learning_rate = 0.01
weight_decay = 0.001
warmup_steps = 2
training_steps = num_batches * num_epochs

def train(self, model, features, train_labels, validation_labels, edge_matrix, device):
    # put all to device
    features = features.to(device)
    train_labels = train_labels.to(device)
    model = model.to(device)
    edge_matrix = edge_matrix.to(device)  

    optimizer = Adam(self.model.parameters(), 
                     lr=learning_rate, 
                     weight_decay=weight_decay) # weight decay for L2 reg

    # Suggested learning rate warmup
    def warmup(current_step):
        if current_step < warmup_steps:
            return float(current_step / warmup_steps)
        else:                                 
            return max(0.0, float(training_steps - current_step) / float(max(1, training_steps - warmup_steps)))

    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup)

    print("Begin training.")

    lowest_loss = float("inf")
    best_accuracy = 0
    best_model_version = None
    
    for epoch in range(num_epochs):
        model.train()
        outputs = model(features, edge_matrix, train_labels)
        loss = outputs[1]

        model.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        validation_loss, validation_accuracy = self.evaluate(features, validation_labels, edge_matrix, device)
        print(f"Training loss: {loss.item()}")
        print(f"Validation loss: {validation_loss}") 
        print(f"Validation accuracy: {validation_accuracy}")
        print()

        if validation_loss < lowest_loss:
            lowest_loss = validation_loss
            best_accuracy = validation_accuracy
            best_model_version = copy.deepcopy(model.state_dict())

    print(f"Lowest loss: {lowest_loss}")
    print(f"Best accuracy: {best_accuracy}")
    print()

    # Return the best model we found at any point in training
    # not sure if this'll be a memory issue at some point
    return model.load_state_dict(best_model_version)

def evaluate(self, model, features, labels, edge_matrix, device):
    edge_matrix = edge_matrix.to(device)
    features = features.to(device)
    test_labels = test_labels.to(device)

    model.eval()

    outputs = model(features, edge_matrix, labels)
    loss = outputs[1].item()

    ignore_label = nn.CrossEntropyLoss().ignore_index
    predicted_label = torch.max(outputs[0], dim=1).indices[test_labels != ignore_label]
    true_label = labels[labels != -100]
    accuracy = torch.mean((true_label == predicted_label).type(torch.FloatTensor)).item()

    return loss, accuracy

In [None]:
features, labels, train_labels, validation_labels, test_labels, edge_matrix = # TODO: load_data(dataset)
device = None
num_classes = 2

model = ConvGraphNet(
    input_size = features.size(1),
    hidden_size = 32,
    output_size = num_classes,
    dropout = 0.2 # 0.5
)

train(model, features, train_labels, validation_labels, edge_matrix, device)

loss, accuracy = evaluate(model, features, test_labels, edge_matrix, device)

print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

In [None]:
# Link attributes are always the same -> connect that to get item function
# Different parts are weather and flows (day specific)
# All edge specific
# Collision are at nodes

In [None]:
import torch
from torch_geometric.data import Data

edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index)
# Data(edge_index=[2, 4], x=[3, 1])

In [None]:
import torch
from torch_geometric.data import Data

edge_index = torch.tensor([[0, 1],
                           [1, 0],
                           [1, 2],
                           [2, 1]], dtype=torch.long)
x = torch.tensor([[-1], [0], [1]], dtype=torch.float)

data = Data(x=x, edge_index=edge_index.t().contiguous())
# Data(edge_index=[2, 4], x=[3, 1])