# Find Arrival Timings of Various Buses

## Load Data

For now experiment with one day of data: 2018-03-05 

In [8]:
# import folium
import pandas as pd
import json
from math import *
import numpy as np

busLocationFile = "Data/BusLocation/2018-03-05.csv"
busStopFile = "Data/busstops.txt"

In [3]:
df = pd.read_csv(busLocationFile)
print(df.head())

   node_id vehicle_serial                  gps_time  latitude  longitude  \
0     2025        PC3957P  2018-03-04T22:59:01.000Z  1.309950  103.77200   
1     2026        PA9558D  2018-03-04T22:59:01.000Z  1.315100  103.69067   
2     2031        PC3785T  2018-03-04T22:59:01.000Z  1.291883  103.78049   
3     2043        PC4038K  2018-03-04T22:59:01.000Z  1.294683  103.77486   
4     2054        PC3989Y  2018-03-04T22:59:01.000Z  1.294500  103.77513   

   altitude  speed  heading  
0        28     14   192.07  
1        25      0    77.53  
2        70      0   139.86  
3        37      0   329.73  
4        48      0   290.10  


In [4]:
# get unique bus
df_id = df['node_id']
buses = set()
for busId in df_id:
    buses.add(busId)
buses = list(buses)

In [5]:
print(buses)

[2048, 2054, 2055, 2058, 2060, 2062, 2063, 2068, 2077, 2078, 2079, 2129, 2024, 2025, 2026, 2027, 2029, 2031, 2032, 2033, 2038, 2042, 2043, 2047]


In [6]:
df_2048 = df[df['node_id'] == 2048]
print(df_2048.head())

       node_id vehicle_serial                  gps_time  latitude  longitude  \
10143     2048        PC4019R  2018-03-05T13:34:01.000Z  1.296467  103.78330   
10156     2048        PC4019R  2018-03-05T13:34:02.000Z  1.296533  103.78325   
10169     2048        PC4019R  2018-03-05T13:34:03.000Z  1.296583  103.78318   
10182     2048        PC4019R  2018-03-05T13:34:04.000Z  1.296633  103.78312   
10195     2048        PC4019R  2018-03-05T13:34:05.000Z  1.296667  103.78308   

       altitude  speed  heading  
10143        39     34   307.96  
10156        41     32   306.98  
10169        42     30   305.21  
10182        42     26   305.44  
10195        42     21   304.02  


In [9]:
# load bus stop geolocation data
busStopData = json.load(open(busStopFile))

In [10]:
busStopLocations = {}
for d in busStopData:
    busStopLocations[d['name']] = (d['lat'], d['long'])

In [12]:
def dist(lat1, lon1, lat2, lon2):
    '''
    returns distance in meters
    '''
    R = 6373.0
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c * 1000
    return distance

def near(locA, locB):
    EPS = 10
    return abs(dist(locA[0], locA[1], locB[0], locB[1])) < EPS
    
    

In [13]:
def nearestBusStop(lat, long):
    for stop, loc in busStopLocations.items():
        if(near(loc, (lat, long))):
            return stop
    return None

In [14]:
for idx, row in df_2048.iterrows():
    nearestBusStop(row['latitude'], row['longitude'])

In [15]:
def arrivalTimes(df):
    times = [] # (bus stop, arrival time, depature time)
    isWaitingDepart = False
    for idx, row in df.iterrows():
        busstop = nearestBusStop(row['latitude'], row['longitude'])
        if busstop:
            # Entering
            if len(times) == 0 or times[-1][0] != busstop:
                times.append([busstop, row['gps_time'], None])
        elif len(times) and times[-1][-1] is None:
            # Leaving
            times[-1][-1] = row['gps_time']
    return times
    

In [16]:
arrivalTimes2048 = arrivalTimes(df_2048)
arrivalTimes2048

[['LT29', '2018-03-05T13:34:44.000Z', '2018-03-05T13:35:08.000Z'],
 ['University Hall', '2018-03-05T13:36:31.000Z', '2018-03-05T13:36:34.000Z'],
 ['Computer Centre', '2018-03-05T13:39:23.000Z', '2018-03-05T13:39:25.000Z'],
 ['Central Library', '2018-03-05T13:39:40.000Z', '2018-03-05T13:39:44.000Z'],
 ['LT13', '2018-03-05T13:41:10.000Z', '2018-03-05T13:41:29.000Z'],
 ['COM2', '2018-03-05T13:43:31.000Z', '2018-03-05T13:44:01.000Z'],
 ['BIZ2', '2018-03-05T13:46:08.000Z', '2018-03-05T13:46:22.000Z'],
 ['House 12', '2018-03-05T13:46:58.000Z', '2018-03-05T14:10:01.000Z'],
 ['COM2', '2018-03-05T14:11:02.000Z', '2018-03-05T14:11:17.000Z'],
 ['BIZ2', '2018-03-05T14:13:36.000Z', '2018-03-05T14:14:01.000Z'],
 ['House 12', '2018-03-05T14:14:27.000Z', '2018-03-05T14:14:29.000Z'],
 ['Between House 14 & 15',
  '2018-03-05T14:14:39.000Z',
  '2018-03-05T14:14:40.000Z'],
 ['PGP Terminal', '2018-03-05T14:15:17.000Z', '2018-03-05T14:15:20.000Z'],
 ['After Science Park Drive',
  '2018-03-05T14:31:14.000Z',

# Classify all buses

In [17]:
arrivalTimesofBuses = {b: arrivalTimes(df[df['node_id']==b]) for b in buses}

In [28]:
# create df for each
def createEventdf(bus):
    arrivalTimes = arrivalTimesofBuses[bus]
    df = pd.DataFrame(arrivalTimes, columns=['Bus Stop', 'Arrival', 'Departure'])
    df['bus']=bus
    return df

def createArrivalTimesDataFrame(arrivalTimesofBuses):
    

firstdf = createEventdf(buses[0])
for b in buses[1:]:
    print(b)
    firstdf.append(createEventdf(b))

2054
2055
2058
2060
2062
2063
2068
2077
2078
2079
2129
2024
2025
2026
2027
2029
2031
2032
2033
2038
2042
2043
2047


In [29]:
firstdf

Unnamed: 0,Bus Stop,Arrival,Departure,bus
0,LT29,2018-03-05T13:34:44.000Z,2018-03-05T13:35:08.000Z,2048
1,University Hall,2018-03-05T13:36:31.000Z,2018-03-05T13:36:34.000Z,2048
2,Computer Centre,2018-03-05T13:39:23.000Z,2018-03-05T13:39:25.000Z,2048
3,Central Library,2018-03-05T13:39:40.000Z,2018-03-05T13:39:44.000Z,2048
4,LT13,2018-03-05T13:41:10.000Z,2018-03-05T13:41:29.000Z,2048
5,COM2,2018-03-05T13:43:31.000Z,2018-03-05T13:44:01.000Z,2048
6,BIZ2,2018-03-05T13:46:08.000Z,2018-03-05T13:46:22.000Z,2048
7,House 12,2018-03-05T13:46:58.000Z,2018-03-05T14:10:01.000Z,2048
8,COM2,2018-03-05T14:11:02.000Z,2018-03-05T14:11:17.000Z,2048
9,BIZ2,2018-03-05T14:13:36.000Z,2018-03-05T14:14:01.000Z,2048


In [22]:
df1 = createEventdf(2048)
df2 = createEventdf(2129)

In [23]:
df1.head()

Unnamed: 0,Bus Stop,Arrival,Departure,bus
0,LT29,2018-03-05T13:34:44.000Z,2018-03-05T13:35:08.000Z,2048
1,University Hall,2018-03-05T13:36:31.000Z,2018-03-05T13:36:34.000Z,2048
2,Computer Centre,2018-03-05T13:39:23.000Z,2018-03-05T13:39:25.000Z,2048
3,Central Library,2018-03-05T13:39:40.000Z,2018-03-05T13:39:44.000Z,2048
4,LT13,2018-03-05T13:41:10.000Z,2018-03-05T13:41:29.000Z,2048


In [24]:
df2.head()

Unnamed: 0,Bus Stop,Arrival,Departure,bus
0,After Science Park Drive,2018-03-05T13:45:42.000Z,2018-03-05T13:45:44.000Z,2129
1,KR MRT Station,2018-03-05T13:46:05.000Z,2018-03-05T13:46:07.000Z,2129
2,PGP Terminal,2018-03-05T14:10:01.000Z,2018-03-05T14:14:04.000Z,2129
3,KR MRT Station,2018-03-05T14:16:25.000Z,2018-03-05T14:16:28.000Z,2129
4,NUH,2018-03-05T14:17:42.000Z,2018-03-05T14:17:44.000Z,2129


In [25]:
df1.append(df2)

Unnamed: 0,Bus Stop,Arrival,Departure,bus
0,LT29,2018-03-05T13:34:44.000Z,2018-03-05T13:35:08.000Z,2048
1,University Hall,2018-03-05T13:36:31.000Z,2018-03-05T13:36:34.000Z,2048
2,Computer Centre,2018-03-05T13:39:23.000Z,2018-03-05T13:39:25.000Z,2048
3,Central Library,2018-03-05T13:39:40.000Z,2018-03-05T13:39:44.000Z,2048
4,LT13,2018-03-05T13:41:10.000Z,2018-03-05T13:41:29.000Z,2048
5,COM2,2018-03-05T13:43:31.000Z,2018-03-05T13:44:01.000Z,2048
6,BIZ2,2018-03-05T13:46:08.000Z,2018-03-05T13:46:22.000Z,2048
7,House 12,2018-03-05T13:46:58.000Z,2018-03-05T14:10:01.000Z,2048
8,COM2,2018-03-05T14:11:02.000Z,2018-03-05T14:11:17.000Z,2048
9,BIZ2,2018-03-05T14:13:36.000Z,2018-03-05T14:14:01.000Z,2048


In [20]:
# filter out empty buses
# arrivalTimesofBuses = {k:v for k,v in arrivalTimesofBuses.items() if len(v)}
len(arrivalTimesofBuses)

16

Based on manual classification

| Bus   | Service|
|-------|--------|
| 2048  |  A2/A1 |	
| 2055  | 		 |		
| 2058  | 		 |	
| 2060  | 		 |	
| 2063  | 	D2	 |		
| 2068  | 	DEAD |		
| 2078  | 		 |		
| 2079  | 	D2	 |		
| 2129  | 	A1	 |			
| 2025  | 	B2	 |	
| 2026  | 	B1	 |			
| 2027  | 	D1	 |		
| 2031  | 	DEAD |		
| 2033  | 	A2	 |		
| 2038  | 	A2	 |		
| 2042  | 	A2	 |		


In [33]:
spots = list(busStopLocations.keys())

In [29]:
A1 = set(['PGP Terminal', 'KR MRT Station', 'LT29', 'University Hall', 'Opp. University Health Centre', 'Yusof Ishak House', 'Central Library', 'LT13', 'AS7', 'Opp. Hon Sui Sen Memorial Library', 'BIZ2', 'Opp. House 12', 'House 7'])
A1E= set(['KR MRT Station', 'LT29', 'Central Library', 'BIZ2', 'PGP Terminal'])
A2 = set(['PGP Terminal', 'Opp. Hon Sui Sen Memorial Library', 'COM2', 'Ventus (Opp. LT13)', 'Computer Centre', 'Opp. Yusof Ishak House', 'Museum', 'University Health Centre', 'Opp. University Hall', 'Opp. KR MRT Station'])
B1 = set(['Kent Ridge Terminal','Computer Centre', 'Opp. Yusof Ishak House', 'University Town', 'Yusof Ishak House', 'Central Library', 'LT13', 'AS7', 'BIZ2'])
B2 = set(['Opp. Hon Sui Sen Memorial Library', 'Ventus (Opp. LT13)', 'Computer Centre', 'Opp. Yusof Ishak House', 'University Town', 'Raffles Hall', 'Opp. Block EA', 'Kent Ridge Terminal'])
C = set(['Kent Ridge Terminal''Computer Centre', 'Opp. Yusof Ishak House', 'Museum', 'University Health Centre', 'Opp. University Hall', 'Block S17', 'LT29', 'University Hall', 'Raffles Hall', 'Opp. Block EA'])
D1 = set(['Opp. Hon Sui Sen Memorial Library', 'COM2', 'Ventus (Opp. LT13)', 'Computer Centre', 'Opp. Yusof Ishak House', 'Museum', 'University Town', 'Yusof Ishak House', 'Central Library', 'LT13', 'AS7', 'BIZ2'])
D2 = set(['PGP Terminal', 'KR MRT Station', 'LT29', 'University Hall', 'Opp. University Health Centre', 'Museum', 'University Town', 'University Health Centre', 'Opp. University Hall', 'Block S17', 'Opp. KR MRT Station', 'BIZ2'])

In [25]:
services = {'A1':A1,'A1E':A1E,'A2':A2,'B1':B1,'B2':B2,'C':C,'D1':D1,'D2':D2}

In [43]:
import plotly.plotly as py
import plotly.graph_objs as go

header = list(services.keys())
rows = buses

def getStopsForBus(arrivalTimesofBuses, bus):
    '''
    returns set of bus stops that bus passes through
    '''
    return set([tup[0] for tup in arrivalTimesofBuses[bus]])

def percentageCloseness(stops, service):
    '''
    input: both sets of bus stops
    returns: intersect(stops,service)/service*100
    '''
    return len(stops.intersection(service))/len(service)*100
    
    
data = [[percentageCloseness(getStopsForBus(arrivalTimesofBuses, bus), services[s]) 
         for s in header] for bus in buses]


In [44]:
data = np.array(data)

In [45]:
df_confidence = pd.DataFrame(data, index=rows, columns=header)

In [46]:
df_confidence

Unnamed: 0,A1,A1E,A2,B1,B2,C,D1,D2
2048,61.538462,80.0,60.0,50.0,25.0,30.0,58.333333,58.333333
2054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2055,0.0,0.0,0.0,0.0,12.5,0.0,0.0,0.0
2058,0.0,0.0,10.0,0.0,25.0,0.0,8.333333,0.0
2060,7.692308,20.0,10.0,12.5,12.5,0.0,16.666667,8.333333
2062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2063,23.076923,20.0,30.0,12.5,12.5,50.0,16.666667,66.666667
2068,0.0,0.0,10.0,0.0,0.0,10.0,8.333333,8.333333
2077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We will focus on A2 since there are multiple buses, which makes modeling more interestin. Ignore the presense of other buses for now

In [45]:
A2 = set([2048,2033,2038,2042])

dfs = [createEventdf(2048), createEventdf(2033), createEventdf(2038), createEventdf(2042)] 
A2_events = dfs[0]
for df in dfs[1:]:
    A2_events = A2_events.append(df)
# sort by bus stop
A2_events = A2_events.sort_values('Bus Stop')
A2_events

Unnamed: 0,Bus Stop,Arrival,Departure,bus
13,After Science Park Drive,2018-03-05T14:31:14.000Z,2018-03-05T14:31:16.000Z,2048
3,BIZ2,2018-03-05T14:18:36.000Z,2018-03-05T14:18:40.000Z,2042
24,BIZ2,2018-03-05T14:43:11.000Z,2018-03-05T14:43:29.000Z,2048
22,BIZ2,2018-03-05T14:32:39.000Z,2018-03-05T14:32:42.000Z,2038
9,BIZ2,2018-03-05T14:13:36.000Z,2018-03-05T14:14:01.000Z,2048
6,BIZ2,2018-03-05T13:46:08.000Z,2018-03-05T13:46:22.000Z,2048
12,BIZ2,2018-03-05T14:47:01.000Z,2018-03-05T14:47:05.000Z,2042
0,BIZ2,2018-03-04T23:09:34.000Z,2018-03-04T23:09:39.000Z,2033
2,BIZ2,2018-03-04T23:18:41.000Z,2018-03-04T23:18:48.000Z,2038
26,Between House 14 & 15,2018-03-05T14:44:15.000Z,2018-03-05T14:44:16.000Z,2048


In [40]:
# see if list of stops makes sense
set(A2_events['Bus Stop'])

{'After Science Park Drive',
 'BIZ2',
 'Between House 14 & 15',
 'Block S17',
 'COM2',
 'Central Library',
 'Computer Centre',
 'House 12',
 'Kent Ridge Terminal',
 'LT13',
 'LT29',
 'Museum',
 'Opp. Block EA',
 'Opp. KR MRT Station',
 'Opp. NUH',
 'Opp. University Health Centre',
 'Opp. Yusof Ishak House',
 'PGP Terminal',
 'Raffles Hall',
 'University Hall',
 'University Health Centre',
 'Ventus (Opp. LT13)',
 'Yusof Ishak House'}