# Installation steps


 1.Install the latest JRE and get GraphHopper Server as zip from <a href=https://graphhopper.com/public/releases/graphhopper-web-0.10.3-bin.zip>Graphhopper API</a>. Unzip it.


2.Copy this OSM file into the SAME unzipped directory: <a href=https://download.geofabrik.de/north-america/us/new-york-latest.osm.pbf >new-york-latest.osm.pbf</a>


3.Start GraphHopper Maps via: java -jar graphhopper-web-0.10.3-with-dep.jar jetty.resourcebase=webapp config=config-example.properties datareader.file=new-york-latest.osm.pbf. 


4.Test to see if its running after you see 'Started server at HTTP 8989' by going to http://localhost:8989/ and you should see a map of New York.


5.Keep this running when executing our program because this is the API

In [81]:
# Check how graphhopper works
a,b,c,d = df['pickup_latitude'].loc[1],df['pickup_longitude'].loc[1],df['dropoff_latitude'].loc[1],df['dropoff_longitude'].loc[1]
request_str = 'http://localhost:8989/route?point=' + str(a) + '%2C' + str(b) + '&point=' + str(c) + '%2C' + str(d) + '&vehicle=car'
request = Request(request_str)
res=requests.get(request_str)
print("Distance = {}".format(json.loads(res.text)['paths'][0]['distance']))
print("Time = {}".format(json.loads(res.text)['paths'][0]['time']))
# Distance = 1158.322
# Time = 128375


Distance = 1049.667
Time = 111745


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
from h3 import h3
import json
from urllib.request import URLError, Request, urlopen
from itertools import combinations
from itertools import permutations
from dateutil import parser
from datetime import datetime, timedelta
import math
import networkx as nx

In [2]:
def get(dataframe,pool_time_window):
    a,b,c,d=[],[],[],[]
    df_distance =  pd.DataFrame(columns = ['pickup_h3','dropoff_h3','distance','duration'])
    for node_a, node_b in list(permutations(dataframe.index, 2)):
        temp_curr, temp_next = [], []

        from_location = dataframe.iloc[node_a]['pickup_h3']
        to_location = dataframe.iloc[node_b]['pickup_h3']

        e, f, g, h = dataframe.iloc[node_a]['pickup_latitude'], dataframe.iloc[node_a]['pickup_longitude'],\
        dataframe.iloc[node_b]['pickup_latitude'],dataframe.iloc[node_b]['pickup_longitude']
        
        request_str = 'http://localhost:8989/route?point=' + str(e) + '%2C' + str(f) + '&point=' + str(
            g) + '%2C' + str(h) + '&vehicle=car'
        request = Request(request_str)
        res = requests.get(request_str)
        distance = json.loads(res.text)['paths'][0]['distance']

        time = json.loads(res.text)['paths'][0]['time']
        minute, msec = divmod(time, 60000)
        if (distance / 1609.344) <=3:
            a.append(from_location)
            b.append(to_location)
            c.append(distance / 1609.344)  # convert meters to miles
            d.append(minute + (msec / 100000))  # convert ms to s and add to min

    df_distance['pickup_h3'] = a
    df_distance['dropoff_h3'] = b
    df_distance['distance'] = c
    df_distance['duration'] = d
    return df_distance
# df_distance = df_distance.set_index(['pickup_h3','dropoff_h3'])
# df_distance.to_csv('29jan2016_30jan2016.csv') 

# Run the below cell only if distance is not precomputed

In [3]:
def get_distance(pool_time_window,df):
    final_distance=[]
    for _,trips in df.groupby(['pool_window']):
        trips = trips.reset_index()
        df_distance=  get(trips,pool_time_window)
        final_distance.append(df_distance)
    df_distance = pd.concat(final_distance)
    df_distance.reset_index(drop=True,inplace=True) 
    return df_distance

# The nodes of the graph

In [4]:
class Node:
    def __init__(self,idx,data):
        self.id = idx
        self.pickup_location = (data.pickup_latitude,data.pickup_longitude,data.pickup_h3)
        self.dropoff_location = (data.dropoff_latitude,data.dropoff_longitude,data.dropoff_h3)
        self.pickup_time = data.pickup_time
        self.dropoff_time = data.dropoff_time
        self.distance = data.trip_distance
        self.duration = data.duration
        self.delay = data.delay
        self.passenger_count = data.passenger_count

In [5]:
def get_distance_duration(node_a,node_b,trip_type):
    if trip_type==2: 
        e, f, g, h = node_a.pickup_location[0], node_a.pickup_location[1], node_b.pickup_location[0],node_b.pickup_location[1]
    else:
        e, f, g, h = node_a.dropoff_location[0], node_a.dropoff_location[1], node_b.dropoff_location[0],node_b.dropoff_location[1]
    request_str = 'http://localhost:8989/route?point=' + str(e) + '%2C' + str(f) + '&point=' + str(
        g) + '%2C' + str(h) + '&vehicle=car'
    request = Request(request_str)
    res = requests.get(request_str)
    if 'paths' in json.loads(res.text):
        distance = json.loads(res.text)['paths'][0]['distance']
        time = json.loads(res.text)['paths'][0]['time']
        minute, msec = divmod(time, 60000)
        return distance / 1609.344 , minute + (msec / 100000)
    else:
        return float('inf'),float('inf')
    

In [6]:
def get_all_pairs(node_a,node_b,trip_type):
    if trip_type == 1:
        #Combination LGA--> a -->b
        #if no distance call graphhopper 
        if (node_a.dropoff_location[2],node_b.dropoff_location[2]) not in df_distance.index:
            a_b_distance,a_b_duration = get_distance_duration(node_a,node_b,trip_type)
        else:
            a_b_distance = df_distance.loc[(node_a.dropoff_location[2],node_b.dropoff_location[2])]['distance']
            a_b_duration = df_distance.loc[(node_a.dropoff_location[2],node_b.dropoff_location[2])]['duration']
        
        LGA_a_dist = node_a.distance
        a_b_dist   = a_b_distance
        LGA_a_dur  = node_a.duration
        a_b_dur    = a_b_duration
        
        #Combination LGA--> b -->a
        if (node_b.dropoff_location[2],node_a.dropoff_location[2]) not in df_distance.index:
            b_a_distance,b_a_duration = get_distance_duration(node_a,node_b,trip_type)
        else:
            b_a_distance = df_distance.loc[(node_b.dropoff_location[2],node_a.dropoff_location[2])]['distance']
            b_a_duration = df_distance.loc[(node_b.dropoff_location[2],node_a.dropoff_location[2])]['duration']
            
        LGA_b_dist = node_b.distance
        b_a_dist = b_a_distance
        LGA_b_dur = node_b.duration
        b_a_dur = b_a_duration
        
        path_1_total_dis,path_1_total_dur = LGA_a_dist + a_b_dist,LGA_a_dur + a_b_dur 
        path_1_a_dur,path_1_b_dur = LGA_a_dur,path_1_total_dur
        
        path_2_total_dis,path_2_total_dur = LGA_b_dist+b_a_dist,LGA_b_dur+b_a_dur
        path_2_a_dur,path_2_b_dur         = path_2_total_dur ,LGA_b_dur
               
    else:
        #Combination a--> b --> LGA
        if (node_a.pickup_location[2],node_b.pickup_location[2]) not in df_distance.index:
            a_b_distance,a_b_duration = get_distance_duration(node_a,node_b,trip_type)
        else:
            a_b_distance = df_distance.loc[(node_a.pickup_location[2],node_b.pickup_location[2])]['distance']
            a_b_duration = df_distance.loc[(node_a.pickup_location[2],node_b.pickup_location[2])]['duration']
        
        a_b_dist   = a_b_distance
        b_LGA_dist = node_b.distance 
        a_b_dur    = a_b_duration
        b_LGA_dur  = node_b.duration
        
        #Combination b--> a --> LGA
        if (node_b.pickup_location[2],node_a.pickup_location[2]) not in df_distance.index:
            b_a_distance,b_a_duration = get_distance_duration(node_b,node_a,trip_type)
        else:
            b_a_distance = df_distance.loc[(node_b.pickup_location[2],node_a.pickup_location[2])]['distance']
            b_a_duration = df_distance.loc[(node_b.pickup_location[2],node_a.pickup_location[2])]['duration']
        
        b_a_dist   = b_a_distance
        a_LGA_dist = node_a.distance 
        b_a_dur    = b_a_duration
        a_LGA_dur  = node_a.duration
        
        path_1_total_dis,path_1_total_dur = a_b_dist + b_LGA_dist,a_b_dur + b_LGA_dur 
        path_1_a_dur,path_1_b_dur = path_1_total_dur,b_LGA_dur
        
        path_2_total_dis,path_2_total_dur, = b_a_dist+a_LGA_dist,b_a_dur+a_LGA_dur
        path_2_a_dur,path_2_b_dur         = a_LGA_dur,path_2_total_dur
        
    return ((path_1_total_dis,path_1_total_dur,path_1_a_dur,path_1_b_dur),( path_2_total_dis,path_2_total_dur,path_2_a_dur,path_2_b_dur))
    

In [7]:
def calculate_edge_weight(node_a,node_b,trip_type):
    path1,path2 = get_all_pairs(node_a,node_b,trip_type)
    minimum_distance = float('inf')
    for path in (path1,path2):
        distance_contraint = (path[0] <= node_a.distance + node_b.distance)
        delay_constraint = (path[2] <= node_a.duration + node_a.delay) & (path[3] <= node_b.duration + node_b.delay)
        #add social constraint too...
        
        
        if distance_contraint and delay_constraint and path[0]< minimum_distance:
            minimum_distance = path[0]
    distance_saved = node_a.distance + node_b.distance - minimum_distance
    return distance_saved

In [8]:
def get_rsg(G,trip_type):
    for node_a,node_b in list(combinations(G,2)):
        if (node_a.passenger_count+node_b.passenger_count)<=4:
            distance_saved = calculate_edge_weight(node_a,node_b,trip_type)
            if distance_saved!= float('-inf') :
                G.add_edge(node_a,node_b, weight=distance_saved)
    return G

# Average distance saved per pool as a % of total distance of individual rides

In [9]:
def Average_distance_saved(merged_trips,Final_Graph):
    with_sharing , without_sharing = [],[]
    for i in range(len(merged_trips)):
        all_nodes =  set()
        total_dis_before_merging = 0
        total_dis_after_merging = 0
        for each_node in Final_Graph[i].nodes:
            total_dis_before_merging += each_node.distance
            all_nodes.add(each_node)
        #remove merged nodes from orginal rga graph
        for u,v in merged_trips[i].edges:
            all_nodes.remove(u)
            all_nodes.remove(v)
            total_dis_after_merging += Final_Graph[i].get_edge_data(u,v)['weight']
        #add unmerged solo trips also
        for solo in all_nodes:
            total_dis_after_merging += solo.distance
        with_sharing.append(total_dis_after_merging)
        without_sharing.append(total_dis_before_merging)

    return(sum([(1-x/y) for x, y in zip(with_sharing, without_sharing)])/len(without_sharing) * 100)   

# Average number of trips saved per pool as a % of number of individual trips

In [10]:
def Average_trip_saved(merged_trips,Final_Graph):
    saved_rides = []
    for idx in range(len(Final_Graph)):
        num_ind_trips = len(Final_Graph[idx].nodes)
        num_pooled_trips = len(merged_trips[idx].edges)
        saved_rides.append(num_pooled_trips/num_ind_trips * 100)
    return(sum(saved_rides)/len(saved_rides))

In [11]:
from tqdm import tqdm
def main_algoritm(trip_type):
    Final_Graph = []
    for _,trips in df.groupby(['pool_window']):
        nodes = []
        trips = trips.reset_index()
        for idx, row in trips.iterrows():
            nodes.append(Node(idx,trips.iloc[idx]))
        G = nx.Graph()
        G.add_nodes_from(nodes)
        Final_Graph.append(G)

    #Start of the code
    merged_trips = []
    cn=0
    for individual_graph in tqdm(Final_Graph,total=len(Final_Graph)):
        ride_sharing_graph = get_rsg(individual_graph,trip_type)
        #maximum weighted algorithm
        maximum_weighted_graph = nx.max_weight_matching(ride_sharing_graph, maxcardinality=True)
        g_match = nx.Graph()
        for u,v in maximum_weighted_graph:
            g_match.add_edge(u,v)

        merged_trips.append(g_match)
    print("Number of pools processed :{}".format(len(merged_trips)))
    average_distance_saved = Average_distance_saved(merged_trips,Final_Graph)
    average_trip_saved = Average_trip_saved(merged_trips,Final_Graph)
    print("Average distance saved for poolwindow {} is :{}".format(pool_time_window,average_distance_saved))
    print("Average trip saved for poolwindow {} is :{}".format(pool_time_window,average_trip_saved))     

# FROM LGA 
ps (Lots of room for improvement. Please feel free to change any part)

In [12]:
import time
def ceil_dt(dt, delta):
        return datetime.min + math.ceil((dt - datetime.min) / delta) * delta
mon = dict({'Jan':'01','Feb':'02','Mar':'03','Apr':'04','May':'05','June':'06','July':'07','Aug':'08','Sep':'09',\
            'Oct':'10','Nov':'11','Dec':'12'})
year = input("Enter year:")
Month = input ("Enter month:")
day = input("Enter day:")
month = mon[Month]
file_name = 'Data/LGA as pickup/LaGuardia_as_pickup_'+str(year)+'-'+str(Month)+'.csv'
distance_file_name = 'Distance/LGA_as_pickup/'+year+'-'+month+'-'+day+'.csv'
df = pd.read_csv(file_name)

start_date='2016-'+str(month)+'-'+str(day)+' 00:00:00'
end_date='2016-'+str(month)+'-'+str(day)+' 23:59:59'
columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime','passenger_count',\
       'trip_distance', 'pickup_longitude','pickup_latitude','dropoff_longitude', 'dropoff_latitude']
df = df[columns]
df.rename(columns={'tpep_pickup_datetime':'pickup_time',
       'tpep_dropoff_datetime':'dropoff_time'},inplace=True)
drop_index=df[(df.pickup_latitude==0)|(df.pickup_longitude==0)].index
df.drop(drop_index,inplace=True)
df['pickup_time'] = pd.to_datetime(df['pickup_time'])
df['dropoff_time'] = pd.to_datetime(df['dropoff_time'])
df['pickup_h3'] = df.apply(lambda x: h3.geo_to_h3(x['pickup_latitude'], x['pickup_longitude'], 10), axis=1)
df['dropoff_h3'] = df.apply(lambda x: h3.geo_to_h3(x['dropoff_latitude'], x['dropoff_longitude'], 8), axis=1)

df=df[(df['pickup_time'] >= start_date) & (df['dropoff_time'] <= end_date)]
df.reset_index(drop=True,inplace=True)
df['duration'] = (df['pickup_time']-df['dropoff_time']).dt.seconds
df['delay'] = df['duration'].apply(lambda x: x*0.20)


for pool_time_window in [5,10]:
    start_time = time.time()
    df['pool_window'] = df['pickup_time'].apply(lambda x: ceil_dt(x.to_pydatetime(), timedelta(minutes=pool_time_window)))
    df_distance = pd.read_csv(distance_file_name)
    print("Starting main algorithm...")
    main_algoritm(1)
    total_time = (time.time()-start_time)/60.0
    print("algorithm time taken for {} pool window is :{} minutes\n ".format(pool_time_window,total_time))

Enter year:2016
Enter month:May
Enter day:09
Starting main algorithm...


100%|██████████| 220/220 [10:55<00:00,  2.98s/it]


Number of pools processed :220
Average distance saved for poolwindow 5 is :nan
Average trip saved for poolwindow 5 is :41.51892467876708
algorithm time taken for 5 pool window is :10.984185445308686 minutes
 
Starting main algorithm...


 38%|███▊      | 44/116 [08:01<13:07, 10.94s/it]


KeyboardInterrupt: 

# TO LGA

In [None]:
file_name = 'Data/LGA as pickup/LaGuardia_as_pickup_'+str(year)+'-'+str(Month)+'.csv'
distance_file_name = 'Distance/LGA_drop_off/'+year+'-'+month+'-'+day+'.csv'
df = pd.read_csv(file_name)
columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime','passenger_count',\
       'trip_distance', 'pickup_longitude','pickup_latitude','dropoff_longitude', 'dropoff_latitude']
df = df[columns]
df.rename(columns={'tpep_pickup_datetime':'pickup_time',
       'tpep_dropoff_datetime':'dropoff_time'},inplace=True)
drop_index=df[(df.pickup_latitude==0)|(df.pickup_longitude==0)].index
df.drop(drop_index,inplace=True)
df['pickup_time'] = pd.to_datetime(df['pickup_time'])
df['dropoff_time'] = pd.to_datetime(df['dropoff_time'])
df['pickup_h3'] = df.apply(lambda x: h3.geo_to_h3(x['pickup_latitude'], x['pickup_longitude'], 10), axis=1)
df['dropoff_h3'] = df.apply(lambda x: h3.geo_to_h3(x['dropoff_latitude'], x['dropoff_longitude'], 8), axis=1)

df=df[(df['pickup_time'] >= start_date) & (df['dropoff_time'] <= end_date)]
df.reset_index(drop=True,inplace=True)
df['duration'] = (df['pickup_time']-df['dropoff_time']).dt.seconds
df['delay'] = df['duration'].apply(lambda x: x*0.20)
for pool_time_window in (5,10):
    start_time = time.time()
    df['pool_window'] = df['pickup_time'].apply(lambda x: ceil_dt(x.to_pydatetime(), timedelta(minutes=pool_time_window)))
    df_distance = pd.read_csv(distance_file_name)
    print("Starting main algorithm...")
    main_algoritm(2)
    total_time = (time.time()-start_time)/60.0
    print("algorithm time taken for {} pool window is :{} minutes\n ".format(pool_time_window,total_time))