In [24]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from math import sin, cos, sqrt, atan2, radians
import osrm
import time
import datetime
from tqdm import tqdm_notebook as tqdm

# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Show up to 5000 columns when a dataframe is displayed. The option used to be
# set twice in a row (500, then 5000); only the last value ever took effect.
pd.set_option('display.max_columns', 5000)

In [58]:
# Read in the taxi data
path = os.getcwd()

# Build the folder paths with os.path.join instead of hard-coded '\\' so the
# notebook also runs on non-Windows systems. The trailing '' component keeps a
# trailing path separator, so code that does `taxi_path + <file name>` or
# `save_path + <file name>` keeps working unchanged.
taxi_path = os.path.join(path, 'Taxi_Data', '')
save_path = os.path.join(path, 'Notebook_Data', '')

# float_precision = 'round_trip' makes pandas use its round-trip float converter
# when parsing the numeric columns.
df = pd.read_csv(os.path.join(taxi_path, 'train.csv'), float_precision = 'round_trip')

In [26]:
# Preview the first five rows of the raw taxi data
df.head(n = 5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [27]:
# Point the osrm client at the server that will generate the routing instructions
OSRM_HOST = "router.project-osrm.org"
osrm.RequestConfig.host = OSRM_HOST

In [46]:
# Initialise the routing dataframe: a copy of the taxi data with an extra
# 'route_distance' column that starts out as NaN for every row
routing_df = df.assign(route_distance = np.nan)

In [53]:
# NOTE: this list is never filled in this cell; it is kept only so that any
# other cell referring to the name still finds it.
route_distance_list = list()

# Retry policy for the routing server. The original notes say server failures
# were very common and that too many requests caused an error, hence the pause.
MAX_ATTEMPTS = 10          # tries per row before giving up and leaving NaN
RETRY_WAIT_SECONDS = 20    # pause after a failed request to rest the API

# Only rows that still have no routing information are requested, so running
# this cell again picks up whatever is still missing.
pending_rows = routing_df[routing_df['route_distance'].isna()]

# total= gives the progress bar a length (itertuples() alone has none).
for row in tqdm(pending_rows.itertuples(), total = len(pending_rows)):
    # Bounded retry: a row that keeps failing is skipped (its distance stays NaN)
    # instead of blocking the whole run forever.
    for attempt in range(MAX_ATTEMPTS):
        try:
            # Get the estimated route between pickup and dropoff
            result = osrm.simple_route([row.pickup_longitude, row.pickup_latitude],
                                       [row.dropoff_longitude, row.dropoff_latitude],
                                       output = 'route', overview = "full", geometry = 'wkt')

            # Store the distance of the first returned route in the dataframe
            routing_df.at[row.Index, 'route_distance'] = result[0]['distance']
        except Exception as error:
            # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit through, so the cell can still be interrupted.
            # Log when and why the request failed, then rest the API.
            print(datetime.datetime.now(), repr(error))
            time.sleep(RETRY_WAIT_SECONDS)
        else:
            # Success: move on to the next row
            break


In [60]:
# Write the routed dataframe to disk (without the index column) for later use
routing_output_file = save_path + 'routing_df.csv'
routing_df.to_csv(routing_output_file, index = False)