# Generate Chains

In [1]:
import geopandas as gpd
import pandas as pd
import os
import datetime as dt
import time
from copy import deepcopy
import numpy as np
import osmnx as ox
import networkx as nx
import scipy
import sys
import pickle
import math
from shapely.geometry import Point
from pyproj import Transformer
import random

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

os.chdir(os.path.join(os.getcwd(), ".."))
print(os.getcwd())

/home/jovyan/work/code/paratransit-mdp


In [2]:
# global vars

#NUM_TRAIN_CHAINS = 100
NUM_TRAIN_CHAINS = 3
NUM_TEST_CHAINS = 15

In [3]:
# load trips

dtype = {'distance': float, 
         'pickup_lat': float, 
         'pickup_lon': float, 
         'dropoff_lat': float, 
         'dropoff_lon': float, 
         'am_wc': str, 
         'pickup_node_id': int, 
         'pickup_osmid': int, 
         'dropoff_node_id': int, 
         'dropoff_osmid': int, 
         'travel_time': float, 
         'pickup_datetime': str, 
         'dropoff_datetime': str}

file_path = os.path.join(os.getcwd(), "data", "CARTA", "processed", "para_transit_trips_2021.csv")
trips = pd.read_csv(file_path, dtype=dtype)

trips['pickup_datetime'] = trips['pickup_datetime'].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
trips['dropoff_datetime'] = trips['dropoff_datetime'].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

trips['pickup_time_since_midnight'] = trips['pickup_datetime'].apply(lambda x: int((x - x.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds()))
trips['dropoff_time_since_midnight'] = trips['dropoff_datetime'].apply(lambda x: int((x - x.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds()))

trips['date'] = trips['pickup_datetime'].apply(lambda x: x.date())
trips['dow'] = trips['date'].apply(lambda x: x.weekday())
print(len(trips))
trips = trips[trips['dow'].isin([0,1,2,3,4])]
print(len(trips))
trips = trips[(trips['pickup_time_since_midnight']>14400) & (trips['pickup_time_since_midnight']<79200)]
print(len(trips))
trips_copy = trips.copy(deep=True)
trips.head(2)

24523
23052
22831


Unnamed: 0,distance,pickup_lat,pickup_lon,dropoff_lat,dropoff_lon,am_wc,pickup_node_id,pickup_osmid,dropoff_node_id,dropoff_osmid,travel_time,pickup_datetime,dropoff_datetime,pickup_time_since_midnight,dropoff_time_since_midnight,date,dow
0,9.76,35.045644,-85.319982,35.022033,-85.241765,WC,5839,202694168,595,202579218,551.921936,2021-01-01 04:15:00,2021-01-01 04:24:11,15300,15851,2021-01-01,4
1,7.99,35.032584,-85.316599,35.022087,-85.241776,AM,236,202546601,595,202579218,524.678406,2021-01-01 04:15:00,2021-01-01 04:23:44,15300,15824,2021-01-01,4


In [4]:
# get trips per day distribution

trips_per_day = trips.groupby(by=['date']).size()
day_mean, day_std = trips_per_day.mean(), trips_per_day.std()
print(f"mean: {day_mean}, std: {day_std}")
num_samples = np.random.normal(day_mean, day_std, NUM_TRAIN_CHAINS)

mean: 176.984496124031, std: 26.090882176575366


In [5]:
# trip weights

columns = ['pickup_node_id', 'dropoff_node_id', 'pickup_time_since_midnight', 'dropoff_time_since_midnight']

trips = trips.groupby(by=columns).size().reset_index()
trips['count'] = trips[0]
trips = trips.drop(columns=[0])

print(len(trips), len(trips[trips['count']>1]), trips['count'].sum())

trips.head(2)

7510 1674 22831


Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time_since_midnight,dropoff_time_since_midnight,count
0,206,517,31500,32081,1
1,206,517,32400,32981,1


In [6]:
# create the chains

chains = []
for i in range(len(num_samples)):
    chain = trips.sample(n=int(num_samples[i]), weights=trips['count'])
    chain['chain_id'] = i
    chain = chain.sort_values(by=['pickup_time_since_midnight'])
    chain['chain_order'] = list(range(len(chain)))
    chains.append(chain)
chains = pd.concat(chains, ignore_index=True)
chains.head(2)

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time_since_midnight,dropoff_time_since_midnight,count,chain_id,chain_order
0,5123,8175,18900,19122,57,0,0
1,9775,4650,18900,19052,20,0,1


In [7]:
# format and save chains

cols = columns + ['chain_id', 'chain_order']
chains = chains[cols]
chains['pickup_node_id'] = chains['pickup_node_id'].astype(int)
chains['dropoff_node_id'] = chains['dropoff_node_id'].astype(int)
chains['pickup_time_since_midnight'] = chains['pickup_time_since_midnight'].astype(int)
chains['dropoff_time_since_midnight'] = chains['dropoff_time_since_midnight'].astype(int)
chains['chain_id'] = chains['chain_id'].astype(int)
chains['chain_order'] = chains['chain_order'].astype(int)

file_path = os.path.join(os.getcwd(), "data", "CARTA", "processed", "train_chains.csv")
#file_path = os.path.join(os.getcwd(), "data", "CARTA", "processed", "test_chains_gen.csv")
#file_path = os.path.join(os.getcwd(), "data", "CARTA", "processed", "val_chains.csv")
chains.to_csv(file_path, index=False)
chains.head(2)

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time_since_midnight,dropoff_time_since_midnight,chain_id,chain_order
0,5123,8175,18900,19122,0,0
1,9775,4650,18900,19052,0,1


# Generate Test Chains - Real

In [8]:
cols = columns + ['date']
trips_copy = trips_copy[cols]
print(len(trips_copy))
trips_copy = trips_copy.drop_duplicates()
print(len(trips_copy))

chains = []
dates = random.sample(trips_copy['date'].unique().tolist(), NUM_TEST_CHAINS)
for i in range(len(dates)):
    temp = trips_copy[trips_copy['date']==dates[i]].copy(deep=True)
    temp = temp.sort_values(by=['pickup_time_since_midnight'])
    temp['chain_id'] = i
    temp['chain_order'] = list(range(len(temp)))
    chains.append(temp)

chains = pd.concat(chains, ignore_index=True)

22831
22667


In [None]:
cols = columns + ['date']
trips_copy = trips_copy[cols]
print(len(trips_copy))
trips_copy = trips_copy.drop_duplicates()
print(len(trips_copy))

chains = []
dates = random.sample(trips_copy['date'].unique().tolist(), NUM_TEST_CHAINS)
for i in range(len(dates)):
    temp = trips_copy[trips_copy['date']==dates[i]].copy(deep=True)
    temp = temp.sort_values(by=['pickup_time_since_midnight'])
    temp['chain_id'] = i
    temp['chain_order'] = list(range(len(temp)))
    chains.append(temp)

chains = pd.concat(chains, ignore_index=True)

In [9]:
cols = columns + ['chain_id', 'chain_order']
chains = chains[cols]

chains['pickup_node_id'] = chains['pickup_node_id'].astype(int)
chains['dropoff_node_id'] = chains['dropoff_node_id'].astype(int)
chains['pickup_time_since_midnight'] = chains['pickup_time_since_midnight'].astype(int)
chains['dropoff_time_since_midnight'] = chains['dropoff_time_since_midnight'].astype(int)
chains['chain_id'] = chains['chain_id'].astype(int)
chains['chain_order'] = chains['chain_order'].astype(int)

file_path = os.path.join(os.getcwd(), "data", "CARTA", "processed", "test_chains.csv")
chains.to_csv(file_path, index=False)
chains.head(2)

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time_since_midnight,dropoff_time_since_midnight,chain_id,chain_order
0,229,2898,18900,19169,0,0
1,5346,2898,19800,19878,0,1


# Format Test Set for MA-RTV Simulator

* 5840,-85.319982,35.045644,596,-85.241765,35.022033,04:15:00,2021-01-01

In [40]:
def get_pickup_time(x):
    y = dt.timedelta(seconds=x)
    n = dt.datetime.strptime(str(y), "%H:%M:%S")
    return n.time().strftime("%H:%M:%S")

file_path = os.path.join(os.getcwd(), "data/CARTA/processed/test_chains.csv")
df = pd.read_csv(file_path)
df['pickup_time'] = df['pickup_time_since_midnight'].apply(lambda x: get_pickup_time(x))
df = df[['pickup_node_id', 'dropoff_node_id', 'pickup_time', 'chain_id', 'chain_order']]
df.head()

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time,chain_id,chain_order
0,229,2898,05:15:00,0,0
1,5346,2898,05:30:00,0,1
2,1905,10243,06:30:00,0,2
3,5139,4446,07:15:00,0,3
4,8879,1446,07:20:00,0,4


In [69]:
trips_temp = trips[['pickup_node_id', 'pickup_lon', 'pickup_lat', 'dropoff_node_id', 'dropoff_lon', 'dropoff_lat', 'pickup_datetime', 'date']]
trips_temp['pickup_time'] = trips_temp['pickup_datetime'].apply(lambda x: x.strftime("%H:%M:%S"))
trips_temp = trips_temp[['pickup_node_id', 'pickup_lon', 'pickup_lat', 'dropoff_node_id', 'dropoff_lon', 'dropoff_lat', 'pickup_time', 'date']]
trips_temp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trips_temp['pickup_time'] = trips_temp['pickup_datetime'].apply(lambda x: x.strftime("%H:%M:%S"))


Unnamed: 0,pickup_node_id,pickup_lon,pickup_lat,dropoff_node_id,dropoff_lon,dropoff_lat,pickup_time,date
0,5839,-85.319982,35.045644,595,-85.241765,35.022033,04:15:00,2021-01-01
1,236,-85.316599,35.032584,595,-85.241776,35.022087,04:15:00,2021-01-01
2,5897,-85.247725,35.024964,595,-85.241765,35.022033,04:20:00,2021-01-01
3,5586,-85.218661,35.015012,595,-85.241776,35.022087,04:30:00,2021-01-01
4,3053,-85.236001,35.047207,595,-85.241765,35.022033,04:40:00,2021-01-01


In [70]:
chain_dates = []
for chain_id in sorted(df['chain_id'].unique()):
    temp = df[df['chain_id']==chain_id]
    temp_j = temp.merge(trips_temp, on=['pickup_node_id', 'dropoff_node_id', 'pickup_time'], how='left', validate='one_to_many')
    r = temp_j['date'].value_counts()
    chain_dates.append(r.index[0])
df['date'] = df['chain_id'].apply(lambda x: chain_dates[x])
df.head()

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time,chain_id,chain_order,date
0,229,2898,05:15:00,0,0,2021-02-02
1,5346,2898,05:30:00,0,1,2021-02-02
2,1905,10243,06:30:00,0,2,2021-02-02
3,5139,4446,07:15:00,0,3,2021-02-02
4,8879,1446,07:20:00,0,4,2021-02-02


In [71]:
file_path = os.path.join(os.getcwd(), "data/travel_time_matrix/nodes.csv")
nodes = pd.read_csv(file_path, index_col='node_id')
nodes = nodes[['lat', 'lon']]
nodes.head()

Unnamed: 0_level_0,lat,lon
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,34.984104,-85.145232
1,34.986203,-85.15179
2,34.985842,-85.151119
3,34.984425,-85.147417
4,34.979835,-85.144704


In [72]:
df = df.merge(nodes, left_on='pickup_node_id', right_index=True, how='left', validate='many_to_one')
df['pickup_lon'] = df['lon']
df['pickup_lat'] = df['lat']
df = df.drop(columns=['lon', 'lat'])
df = df.merge(nodes, left_on='dropoff_node_id', right_index=True, how='left', validate='many_to_one')
df['dropoff_lon'] = df['lon']
df['dropoff_lat'] = df['lat']
df = df.drop(columns=['lon', 'lat'])
df.head()

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time,chain_id,chain_order,date,pickup_lon,pickup_lat,dropoff_lon,dropoff_lat
0,229,2898,05:15:00,0,0,2021-02-02,-85.306354,35.03478,-85.324314,35.015021
1,5346,2898,05:30:00,0,1,2021-02-02,-85.330106,35.008756,-85.324314,35.015021
2,1905,10243,06:30:00,0,2,2021-02-02,-85.248042,35.130594,-85.245774,35.133442
3,5139,4446,07:15:00,0,3,2021-02-02,-85.197954,35.020674,-85.245239,35.026334
4,8879,1446,07:20:00,0,4,2021-02-02,-85.309246,35.125555,-85.281457,35.001205


5840,-85.319982,35.045644,596,-85.241765,35.022033,04:15:00,2021-01-01

In [73]:
df = df[['pickup_node_id', 'pickup_lon', 'pickup_lat', 'dropoff_node_id', 'dropoff_lon', 'dropoff_lat', 'pickup_time', 'date']]
df['date'] = df['date'].apply(lambda x: x.isoformat())
df.head()

Unnamed: 0,pickup_node_id,pickup_lon,pickup_lat,dropoff_node_id,dropoff_lon,dropoff_lat,pickup_time,date
0,229,-85.306354,35.03478,2898,-85.324314,35.015021,05:15:00,2021-02-02
1,5346,-85.330106,35.008756,2898,-85.324314,35.015021,05:30:00,2021-02-02
2,1905,-85.248042,35.130594,10243,-85.245774,35.133442,06:30:00,2021-02-02
3,5139,-85.197954,35.020674,4446,-85.245239,35.026334,07:15:00,2021-02-02
4,8879,-85.309246,35.125555,1446,-85.281457,35.001205,07:20:00,2021-02-02


In [76]:
df['pickup_node_id'] = df['pickup_node_id'].apply(lambda x: x + 1)
df['dropoff_node_id'] = df['dropoff_node_id'].apply(lambda x: x + 1)
df.head()

Unnamed: 0,pickup_node_id,pickup_lon,pickup_lat,dropoff_node_id,dropoff_lon,dropoff_lat,pickup_time,date
0,230,-85.306354,35.03478,2899,-85.324314,35.015021,05:15:00,2021-02-02
1,5347,-85.330106,35.008756,2899,-85.324314,35.015021,05:30:00,2021-02-02
2,1906,-85.248042,35.130594,10244,-85.245774,35.133442,06:30:00,2021-02-02
3,5140,-85.197954,35.020674,4447,-85.245239,35.026334,07:15:00,2021-02-02
4,8880,-85.309246,35.125555,1447,-85.281457,35.001205,07:20:00,2021-02-02


In [78]:
file_path = os.path.join(os.getcwd(), 'data', 'format_samitha', 'test_set.csv')
df.to_csv(file_path, header=False, index=False)

In [87]:
df['pickup_time_dt'] = df['pickup_time'].apply(lambda x: dt.datetime.strptime(x, '%H:%M:%S'))
df.head()

Unnamed: 0,pickup_node_id,pickup_lon,pickup_lat,dropoff_node_id,dropoff_lon,dropoff_lat,pickup_time,date,pickup_time_dt
0,230,-85.306354,35.03478,2899,-85.324314,35.015021,05:15:00,2021-02-02,1900-01-01 05:15:00
1,5347,-85.330106,35.008756,2899,-85.324314,35.015021,05:30:00,2021-02-02,1900-01-01 05:30:00
2,1906,-85.248042,35.130594,10244,-85.245774,35.133442,06:30:00,2021-02-02,1900-01-01 06:30:00
3,5140,-85.197954,35.020674,4447,-85.245239,35.026334,07:15:00,2021-02-02,1900-01-01 07:15:00
4,8880,-85.309246,35.125555,1447,-85.281457,35.001205,07:20:00,2021-02-02,1900-01-01 07:20:00


In [88]:
df['pickup_time_dt'] = df['pickup_time_dt'].apply(lambda x: x - dt.timedelta(minutes=15))
df.head()

Unnamed: 0,pickup_node_id,pickup_lon,pickup_lat,dropoff_node_id,dropoff_lon,dropoff_lat,pickup_time,date,pickup_time_dt
0,230,-85.306354,35.03478,2899,-85.324314,35.015021,05:15:00,2021-02-02,1900-01-01 05:00:00
1,5347,-85.330106,35.008756,2899,-85.324314,35.015021,05:30:00,2021-02-02,1900-01-01 05:15:00
2,1906,-85.248042,35.130594,10244,-85.245774,35.133442,06:30:00,2021-02-02,1900-01-01 06:15:00
3,5140,-85.197954,35.020674,4447,-85.245239,35.026334,07:15:00,2021-02-02,1900-01-01 07:00:00
4,8880,-85.309246,35.125555,1447,-85.281457,35.001205,07:20:00,2021-02-02,1900-01-01 07:05:00


In [89]:
df['pickup_time_dt'] = df['pickup_time_dt'].apply(lambda x: x.time().strftime('%H:%M:%S'))
df.head()

Unnamed: 0,pickup_node_id,pickup_lon,pickup_lat,dropoff_node_id,dropoff_lon,dropoff_lat,pickup_time,date,pickup_time_dt
0,230,-85.306354,35.03478,2899,-85.324314,35.015021,05:15:00,2021-02-02,05:00:00
1,5347,-85.330106,35.008756,2899,-85.324314,35.015021,05:30:00,2021-02-02,05:15:00
2,1906,-85.248042,35.130594,10244,-85.245774,35.133442,06:30:00,2021-02-02,06:15:00
3,5140,-85.197954,35.020674,4447,-85.245239,35.026334,07:15:00,2021-02-02,07:00:00
4,8880,-85.309246,35.125555,1447,-85.281457,35.001205,07:20:00,2021-02-02,07:05:00


In [90]:
df = df[['pickup_node_id', 'pickup_lon', 'pickup_lat', 'dropoff_node_id', 'dropoff_lon', 'dropoff_lat', 'pickup_time_dt', 'date']]
df.head()

Unnamed: 0,pickup_node_id,pickup_lon,pickup_lat,dropoff_node_id,dropoff_lon,dropoff_lat,pickup_time_dt,date
0,230,-85.306354,35.03478,2899,-85.324314,35.015021,05:00:00,2021-02-02
1,5347,-85.330106,35.008756,2899,-85.324314,35.015021,05:15:00,2021-02-02
2,1906,-85.248042,35.130594,10244,-85.245774,35.133442,06:15:00,2021-02-02
3,5140,-85.197954,35.020674,4447,-85.245239,35.026334,07:00:00,2021-02-02
4,8880,-85.309246,35.125555,1447,-85.281457,35.001205,07:05:00,2021-02-02


In [91]:
file_path = os.path.join(os.getcwd(), 'data', 'format_samitha', 'test_set.csv')
df.to_csv(file_path, header=False, index=False)

# More Tests

In [14]:
file_path = os.path.join(os.getcwd(), 'data', 'format_samitha', 'test_set.csv')
names = ['pickup_node_id', 'pickup_lon', 'pickup_lat', 'dropoff_node_id', 'dropoff_lon', 'dropoff_lat', 'pickup_time', 'date']
requests = pd.read_csv(file_path, header=None, names=names)
requests.head()

Unnamed: 0,pickup_node_id,pickup_lon,pickup_lat,dropoff_node_id,dropoff_lon,dropoff_lat,pickup_time,date
0,230,-85.306354,35.03478,2899,-85.324314,35.015021,05:00:00,2021-02-02
1,5347,-85.330106,35.008756,2899,-85.324314,35.015021,05:15:00,2021-02-02
2,1906,-85.248042,35.130594,10244,-85.245774,35.133442,06:15:00,2021-02-02
3,5140,-85.197954,35.020674,4447,-85.245239,35.026334,07:00:00,2021-02-02
4,8880,-85.309246,35.125555,1447,-85.281457,35.001205,07:05:00,2021-02-02


In [15]:
def check_row(row):
    if row['pickup_node_id'] == row['dropoff_node_id']:
        return 0
    else:
        return 1

requests['keep'] = requests.apply(lambda row: check_row(row), axis=1)
print(len(requests))
requests = requests[requests['keep']==1]
print(len(requests))

2693
2685


In [16]:
requests = requests[['pickup_node_id', 'pickup_lon', 'pickup_lat', 'dropoff_node_id', 'dropoff_lon', 'dropoff_lat', 'pickup_time', 'date']]
file_path = os.path.join(os.getcwd(), 'data', 'format_samitha', 'test_set_r2.csv')
requests.to_csv(file_path, header=False, index=False)


In [4]:
file_path = os.path.join(os.getcwd(), 'data', 'format_samitha', 'nodes.csv')
names = ['node_id', 'lat', 'lon']
nodes = pd.read_csv(file_path, header=None, names=names)
nodes.head()

Unnamed: 0,node_id,lat,lon
0,1,34.984104,-85.145232
1,2,34.986203,-85.15179
2,3,34.985842,-85.151119
3,4,34.984425,-85.147417
4,5,34.979835,-85.144704


In [5]:
file_path = os.path.join(os.getcwd(), 'data', 'format_samitha', 'edges.csv')
names = ['source_node', 'target_node', 'travel_time']
edges = pd.read_csv(file_path, header=None, names=names)
edges.head()

Unnamed: 0,source_node,target_node,travel_time
0,6608,9596,1
1,9596,6608,1
2,857,9842,1
3,9842,857,1
4,10159,8559,1


In [6]:
# get that node id's exist
for k, v in requests.iterrows():
    if len(nodes[nodes['node_id']==v['pickup_node_id']]) != 1:
        print(f"Pickup node id {v['pickup_node_id']} not in nodes.csv")
    if len(nodes[nodes['node_id']==v['dropoff_node_id']]) != 1:
        print(f"Pickup node id {v['pickup_node_id']} not in nodes.csv")

In [7]:
edgelist = []
for k, v in edges.iterrows():
    edgelist.append((v['source_node'], v['target_node'], v['travel_time']))

DG = nx.DiGraph()
DG.add_weighted_edges_from(edgelist)

In [9]:
for k, v in requests.iterrows():
    if nx.has_path(DG, v['pickup_node_id'], v['dropoff_node_id']) is False:
        print(f"No path for {v['pickup_node_id']}-{v['dropoff_node_id']}")

In [13]:
for k, v in requests.iterrows():
    leng = nx.shortest_path_length(DG, source=v['pickup_node_id'], target=v['dropoff_node_id'], weight='weight')
    if leng < 1:
        print(f"{v['pickup_node_id']}-{v['dropoff_node_id']}: {leng}")

3420-3420: 0
3420-3420: 0
3879-3879: 0
3420-3420: 0
3879-3879: 0
3329-3329: 0
3329-3329: 0
1191-1191: 0


In [32]:
nx.is_strongly_connected(DG)

False

In [33]:
nx.is_weakly_connected(DG)

True

In [34]:
# travel time matrix

file_path = os.path.join(os.getcwd(), 'data', 'travel_time_matrix', 'travel_time_matrix.csv')
with open(file_path, 'rb') as fd:
    travel_time_matrix = np.loadtxt(fd, delimiter=",", dtype=float)
    travel_time_matrix = np.rint(travel_time_matrix) 

In [35]:
np.count_nonzero(np.isnan(travel_time_matrix))

0

In [36]:
np.where(travel_time_matrix < 0)

(array([    0,     0,     0, ..., 10787, 10787, 10787]),
 array([   20,   286,   442, ..., 10353, 10354, 10355]))

In [37]:
(travel_time_matrix < 0).sum()

709639

In [38]:
(travel_time_matrix > 0).sum()

115660436

In [39]:
709639/115660436

0.006135537998490685

In [None]:
file_path = os.path.join(os.getcwd(), 'data', 'CARTA', 'processed', 'val_chains.csv')
requests = pd.read_csv(file_path)
requests.head()

# induced

In [41]:
def get_pickup_time(x):
    y = dt.timedelta(seconds=x)
    n = dt.datetime.strptime(str(y), "%H:%M:%S")
    return n.time().strftime("%H:%M:%S")

file_path = os.path.join(os.getcwd(), "data/CARTA/processed/test_chains.csv")
df = pd.read_csv(file_path)
df['pickup_time'] = df['pickup_time_since_midnight'].apply(lambda x: get_pickup_time(x))
df = df[['pickup_node_id', 'dropoff_node_id', 'pickup_time', 'chain_id', 'chain_order']]
df.head()

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time,chain_id,chain_order
0,229,2898,05:15:00,0,0
1,5346,2898,05:30:00,0,1
2,1905,10243,06:30:00,0,2
3,5139,4446,07:15:00,0,3
4,8879,1446,07:20:00,0,4


In [42]:
file_path = os.path.join(os.getcwd(), "data/travel_time_matrix/nodes.csv")
nodes = pd.read_csv(file_path)
nodes.head()

Unnamed: 0,node_id,osmid,lat,lon
0,0,66923001,34.984104,-85.145232
1,1,66937537,34.986203,-85.15179
2,2,66937546,34.985842,-85.151119
3,3,66942787,34.984425,-85.147417
4,4,66943797,34.979835,-85.144704


In [44]:
file_path = os.path.join(os.getcwd(), "data/travel_time_matrix/edges.csv")
edges = pd.read_csv(file_path)
edges.head()

Unnamed: 0,source_osmid,target_osmid,source_node,target_node,travel_time
0,202724336,2689933658,6607,9595,0.106671
1,2689933658,202724336,9595,6607,0.106671
2,202595928,3639277853,856,9841,0.112868
3,3639277853,202595928,9841,856,0.112868
4,5863378097,1432394908,10158,8558,0.117527


In [45]:
edgelist = []
for k, v in edges.iterrows():
    edgelist.append((v['source_node'], v['target_node'], v['travel_time']))

DG = nx.DiGraph()
DG.add_weighted_edges_from(edgelist)

In [46]:
nx.is_strongly_connected(DG)

False

In [47]:
node_list = []
for x in df['pickup_node_id'].unique():
    node_list.append(x)
for x in df['dropoff_node_id'].unique():
    node_list.append(x)
node_list = list(set(node_list))

In [48]:
DG2 = DG.subgraph(node_list)

In [49]:
nx.is_strongly_connected(DG2)

False

# Cleanup Val set

In [28]:
file_path = os.path.join(os.getcwd(), 'data', 'CARTA', 'processed', 'val_chains.csv')
requests = pd.read_csv(file_path)
requests.head()

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time_since_midnight,dropoff_time_since_midnight,chain_id,chain_order
0,5123,8175,18900,19122,0,0
1,9775,4650,18900,19052,0,1
2,229,2898,18900,19169,0,2
3,855,8263,19200,19602,0,3
4,2588,8175,19800,19924,0,4


In [29]:
def check_row(row):
    if row['pickup_node_id'] == row['dropoff_node_id']:
        return 0
    else:
        return 1

requests['keep'] = requests.apply(lambda row: check_row(row), axis=1)
print(len(requests))
requests = requests[requests['keep']==1]
print(len(requests))

558
557


In [30]:
result = []
for chain_id in requests['chain_id'].unique():
    temp = requests[requests['chain_id']==chain_id]
    temp = temp.sort_values(by='chain_order', ascending=True)
    temp['chain_order_2'] = [x for x in range(len(temp))]
    result.append(temp)
result = pd.concat(result)
result.head()

Unnamed: 0,pickup_node_id,dropoff_node_id,pickup_time_since_midnight,dropoff_time_since_midnight,chain_id,chain_order,keep,chain_order_2
0,5123,8175,18900,19122,0,0,1,0
1,9775,4650,18900,19052,0,1,1,1
2,229,2898,18900,19169,0,2,1,2
3,855,8263,19200,19602,0,3,1,3
4,2588,8175,19800,19924,0,4,1,4


In [31]:
file_path = os.path.join(os.getcwd(), 'data', 'CARTA', 'processed', 'validation_chains.csv')
result['chain_order'] = result['chain_order_2']
result = result[['pickup_node_id', 'dropoff_node_id', 'pickup_time_since_midnight', 'dropoff_time_since_midnight', 'chain_id', 'chain_order']]
result.to_csv(file_path, index=False)
