%matplotlib inline
# NUS Shuttle Bus Servive Modeling

1. label all GPS readings with nearest bus stop(s)
2. For each bus find the stops it passes ans try label with bus service
3. Find arrival times for each bus stop for service A1 for now

## Load Data

In [1]:
%matplotlib inline
import pandas as pd
import json
from math import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

busLocationFile = "Data/BusLocation/2018-03-26.csv"
busStopFile = "Data/nusbusstops.json"
EPS = 15 # meters

In [2]:
df = pd.read_csv(busLocationFile)

In [3]:
df['gps_time'] = pd.to_datetime(df['gps_time'])
df['gps_time'] = df['gps_time'].apply(lambda x:x.strftime('%H:%M:%S'))
df['date_ordinal'] = pd.to_datetime(df['gps_time']).apply(lambda date: date.toordinal())
buses = list(set(df['node_id']))
busStopData = json.load(open(busStopFile))
busStopLocations = {d['name']:(d['lat'],d['lng']) for d in busStopData.values()}

In [42]:
df.head()

Unnamed: 0,node_id,vehicle_serial,gps_time,latitude,longitude,altitude,speed,heading,date_ordinal
0,2024,PC3947T,01:58:01,1.292933,103.77866,42,32,94.05,736791
1,2025,PC3957P,01:58:01,1.29335,103.77237,31,11,286.45,736791
2,2029,PC3763G,01:58:01,1.296333,103.77206,46,0,251.37,736791
3,2031,PC3785T,01:58:01,1.29745,103.78146,38,22,102.56,736791
4,2032,PC3876P,01:58:01,1.301333,103.7704,40,13,213.19,736791


In [9]:
busStopLocations
A1 = set(["Prince George's Park Terminal", 'PGPR', 'Kent Ridge MRT', 'LT29', 'UHall', 'Opp University Health Centre', 'YIH', 'Central Library', 'LT13', 'AS7', 'COM2 (CP13)', 'BIZ 2', 'Opp PGP Hse No 12', 'PGP Hse No 7'])

In [7]:
busStopData

{'AS7': {'lat': '1.2936110496521',
  'lng': '103.771942138672',
  'name': 'AS7',
  'no': 'AS7'},
 'BIZ 2': {'lat': '1.29333305358887',
  'lng': '103.775001525879',
  'name': 'BIZ 2',
  'no': 'BIZ2'},
 'BTC - Oei Tiong Ham Building': {'lat': '1.31972205638886',
  'lng': '103.817779541016',
  'name': 'BTC - Oei Tiong Ham Building',
  'no': 'BUKITTIMAH-BTC2'},
 'Botanic Gardens MRT': {'lat': '1.32249999046326',
  'lng': '103.815002441406',
  'name': 'Botanic Gardens MRT',
  'no': 'BG-MRT'},
 'COM2 (CP13)': {'lat': '1.29416704177856',
  'lng': '103.773612976074',
  'name': 'COM2 (CP13)',
  'no': 'COM2'},
 'Central Library': {'lat': '1.29666996002197',
  'lng': '103.772453308105',
  'name': 'Central Library',
  'no': 'CENLIB'},
 'College Green Hostel': {'lat': '1.32333302497864',
  'lng': '103.816108703613',
  'name': 'College Green Hostel',
  'no': 'CGH'},
 'Computer Centre': {'lat': '1.29694402217865',
  'lng': '103.772499084473',
  'name': 'Computer Centre',
  'no': 'COMCEN'},
 'Kent Rid

In [10]:
for stop in A1:
    print(busStopData[stop]['no'])

LT13
COM2
KR-MRT
PGP
LT29
UHALL
PGP12-OPP
CENLIB
STAFFCLUB-OPP
AS7
YIH
BIZ2
PGP7
PGPT


In [11]:
[busStopData[stop]['no'] for stop in A1]

['LT13',
 'COM2',
 'KR-MRT',
 'PGP',
 'LT29',
 'UHALL',
 'PGP12-OPP',
 'CENLIB',
 'STAFFCLUB-OPP',
 'AS7',
 'YIH',
 'BIZ2',
 'PGP7',
 'PGPT']

## Label GPS data with nearest bus stop(s)

Pular due to possible GPS inaccuracies

In [43]:
# Helper distance functions()
def dist(lat1, lon1, lat2, lon2):
    '''
    returns distance in meters
    '''
    lat1, lon1, lat2, lon2 = map(float, [lat1, lon1, lat2, lon2])
    R = 6373.0
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c * 1000
    return distance

def near(locA, locB):
    return abs(dist(locA[0], locA[1], locB[0], locB[1])) < EPS

def nearBusStops(lat, long):
    return [stop for stop, loc in busStopLocations.items() if near(loc, (lat, long))]

In [54]:
# add new column
df['bus stops'] = df.apply(lambda row:",".join(nearBusStops(row['latitude'], row['longitude'])), axis=1)

In [55]:
df.head()

Unnamed: 0,node_id,vehicle_serial,gps_time,latitude,longitude,altitude,speed,heading,date_ordinal,bus stops
0,2024,PC3947T,01:58:01,1.292933,103.77866,42,32,94.05,736791,
1,2025,PC3957P,01:58:01,1.29335,103.77237,31,11,286.45,736791,
2,2029,PC3763G,01:58:01,1.296333,103.77206,46,0,251.37,736791,
3,2031,PC3785T,01:58:01,1.29745,103.78146,38,22,102.56,736791,
4,2032,PC3876P,01:58:01,1.301333,103.7704,40,13,213.19,736791,


In [56]:
# # plot all bus stop arrival events to see where the holes are
# fig, ax = plt.subplots(figsize=(12,12))
# for key, grp in df[df['bus stops'] != ''].groupby(['node_id']):
#     ax = grp.plot(ax=ax,x='gps_time',y='speed',label=key)

## Label Bus Service

In [59]:
# Stops for each service

A1 = set(["'Prince George's Park Terminal'", 'PGPR', 'Kent Ridge MRT', 'LT29', 'UHall', 'Opp University Health Centre', 'YIH', 'Central Library', 'LT13', 'AS7', 'COM2 (CP13)', 'BIZ 2', 'Opp PGP Hse No 12', 'PGP Hse No 7'])
A2 = set(["'Prince George's Park Terminal'", 'PGPR', 'PGP Hse No 14 and No 15', 'PGP Hse No 12', 'Opp HSSML', 'Opp NUSS', 'COM2 (CP13)', 'Ventus (Opp LT13)', 'Computer Centre', 'Opp YIH', 'Museum', 'University Health Centre', 'Opp UHall', 'S17', 'Opp Kent Ridge MRT'])
# A1 = set(['PGP Terminal', 'KR MRT Station', 'NUH', 'LT29', 'University Hall', 'Opp. University Health Centre', 'Yusof Ishak House', 'Central Library', 'LT13', 'AS7', 'COM2', 'BIZ2', 'Opp. House 12', 'House 7'])
# A2 = set(['PGP Terminal', 'Opp. Hon Sui Sen Memorial Library', 'COM2', 'Ventus (Opp. LT13)', 'Computer Centre', 'Opp. Yusof Ishak House', 'Museum', 'University Health Centre', 'Opp. University Hall', 'Opp. KR MRT Station'])
# B1 = set(['Kent Ridge Terminal','Computer Centre', 'Opp. Yusof Ishak House', 'University Town', 'Yusof Ishak House', 'Central Library', 'LT13', 'AS7', 'BIZ2'])
# B2 = set(['Opp. Hon Sui Sen Memorial Library', 'Ventus (Opp. LT13)', 'Computer Centre', 'Opp. Yusof Ishak House', 'University Town', 'Raffles Hall', 'Opp. Block EA', 'Kent Ridge Terminal'])
# C = set(['Kent Ridge Terminal''Computer Centre', 'Opp. Yusof Ishak House', 'Museum', 'University Health Centre', 'Opp. University Hall', 'Block S17', 'LT29', 'University Hall', 'Raffles Hall', 'Opp. Block EA'])
# D1 = set(['Opp. Hon Sui Sen Memorial Library', 'COM2', 'Ventus (Opp. LT13)', 'Computer Centre', 'Opp. Yusof Ishak House', 'Museum', 'University Town', 'Yusof Ishak House', 'Central Library', 'LT13', 'AS7', 'BIZ2'])
# D2 = set(['PGP Terminal', 'KR MRT Station', 'LT29', 'University Hall', 'Opp. University Health Centre', 'Museum', 'University Town', 'University Health Centre', 'Opp. University Hall', 'Block S17', 'Opp. KR MRT Station', 'BIZ2'])

#services = {'A1':A1,'A1E':A1E,'A2':A2,'B1':B1,'B2':B2,'C':C,'D1':D1,'D2':D2}
services = {'A1':A1,'A2':A2}

def stopsForBus(bus):
    temp = df[df['node_id']==bus]
    stops = list(temp['bus stops'])
    S = set()
    for row in stops:
        for s in row.split(','):
            S.add(s)
    return S

def percentCloseness(stops, service):
    return len(stops.intersection(service))
busServiceConfidenceData = [[percentCloseness(stopsForBus(bus), services[s]) for s in services.keys()]
                            for bus in buses]
df_confidence = pd.DataFrame(busServiceConfidenceData, index=buses, columns=services.keys())
df_confidence[df_confidence['A1'] >= 8].index

Int64Index([2048, 2050, 2052, 2062, 2063, 2074, 2129, 2024, 2025, 2031, 2032,
            2038, 2042],
           dtype='int64')

In [77]:
candidate_A1 = [2048, 2050, 2052, 2062, 2063, 2074, 2129, 2024, 2025, 2031, 2032,
            2038, 2042]
df_stops_each_bus = pd.DataFrame([",".join(sorted(list(stopsForBus(b)))) for b in candidate_A1], candidate_A1)
df_stops_each_bus.to_csv('busstoparrivals.csv')

In [78]:
df_2048 = df[df['node_id']==2048]
df_2048 = df_2048[df_2048['bus stops']!='']
df_2048 = df_2048[['bus stops','gps_time', 'node_id']]

In [79]:
df_2048.to_csv('2048.csv')

In [80]:
df_2048.head()

Unnamed: 0,bus stops,gps_time,node_id
13131,"PGP Hse No 14 and No 15,PGP Hse No 7",02:11:02,2048
13154,"PGP Hse No 14 and No 15,PGP Hse No 7",02:11:03,2048
15594,Prince George's Park Terminal,02:13:18,2048
15618,Prince George's Park Terminal,02:13:19,2048
15642,Prince George's Park Terminal,02:13:20,2048


## Bus Service A1

In [18]:
# find all bus stops that are common
common_bus = ['BIZ 2', 'Central Library', 'LT13', 'Opp HSSML', 'S17', 'YIH']

In [19]:
df_a1 = df[(df.node_id.isin(candidate_A1))]
df_a1 = df_a1[df_a1['bus stops'] != '']
df_a1 = df_a1.sort_values(['bus stops', 'node_id', 'gps_time'])
# df_a1 = df_a1.sort_values('gps_time')
df_a1 = df_a1[['bus stops','gps_time', 'node_id']]

In [20]:
temp = df_a1[df_a1['bus stops']=='AS7']

In [21]:
temp

Unnamed: 0,bus stops,gps_time,node_id
2214,AS7,23:05:25,2025
10336,AS7,13:34:16,2038
10349,AS7,13:34:17,2038
10362,AS7,13:34:18,2038
26498,AS7,14:34:19,2038
26509,AS7,14:34:20,2038
26519,AS7,14:34:21,2038
26530,AS7,14:34:22,2038
13789,AS7,13:42:14,2048
13801,AS7,13:42:15,2048


In [66]:
from datetime import datetime
from datetime import timedelta
import time
import datetime
def str_to_time(s): return datetime.datetime.strptime(s, '%H:%M:%S')
def removeDuplicates(df):
    delta = timedelta(seconds=120)
    data = df.as_matrix().tolist()
    result = []
    for i, r in enumerate(data):
        if len(result)==0:
            result.append(r)
        else:
            if data[i][2] == result[-1][2] and data[i][0] == result[-1][0]:
                t_curr = str_to_time(data[i][1])
                t_prev = str_to_time(result[-1][1])
                if t_curr - t_prev > delta:
                    result.append(r)
            else:
                result.append(r)
    return pd.DataFrame(result,columns=df.columns)
                
temp2 = removeDuplicates(df_a1)

In [35]:
temp2.sort_values(['bus stops', 'gps_time'])

Unnamed: 0,bus stops,gps_time,node_id
1,AS7,13:34:16,2038
3,AS7,13:42:14,2048
2,AS7,14:34:19,2038
4,AS7,14:39:16,2048
5,AS7,14:54:01,2129
0,AS7,23:05:25,2025
12,BIZ 2,13:46:05,2048
13,BIZ 2,14:13:32,2048
10,BIZ 2,14:18:38,2042
15,BIZ 2,14:27:07,2129


In [293]:
data = temp.as_matrix().tolist()
data

[['AS7', '23:05:25', 2025],
 ['AS7', '13:34:16', 2038],
 ['AS7', '13:34:17', 2038],
 ['AS7', '13:34:18', 2038],
 ['AS7', '14:34:19', 2038],
 ['AS7', '14:34:20', 2038],
 ['AS7', '14:34:21', 2038],
 ['AS7', '14:34:22', 2038],
 ['AS7', '13:42:14', 2048],
 ['AS7', '13:42:15', 2048],
 ['AS7', '14:39:16', 2048],
 ['AS7', '14:54:01', 2129],
 ['AS7', '14:54:02', 2129],
 ['AS7', '14:54:03', 2129],
 ['AS7', '14:54:04', 2129],
 ['AS7', '14:54:05', 2129],
 ['AS7', '14:54:06', 2129],
 ['AS7', '14:54:07', 2129],
 ['AS7', '14:54:08', 2129],
 ['AS7', '14:54:09', 2129],
 ['AS7', '14:54:10', 2129],
 ['AS7', '14:54:11', 2129]]

In [29]:
from datetime import datetime
from datetime import timedelta
datetime_object = datetime.strptime('23:05:25', '%H:%M:%S')
def str_to_time(s): return datetime.strptime(s, '%H:%M:%S')

In [30]:
d1 = str_to_time('20:05:25')
d2 = str_to_time('23:05:35')
delta = timedelta(seconds=60)

In [317]:
delta < d2-d1

True

In [312]:
ans.seconds

10810