# Trip Segmentation & Super Summaries with STPH App Data

In [2]:
# !pip install pandas numpy geopy
# !pip install folium
# !pip install scipy

In [3]:
import pandas as pd
from datetime import timedelta
import os
import numpy as np
import random  # For mapping a random sample of trips
from datetime import datetime


# Project-local library that provides the trip segmentation and trip summary routines
import importlib
import stph_trips
importlib.reload(stph_trips)

<module 'stph_trips' from '/Users/pam/Documents/Git/STPH/Trip segmentation/stph_trips/__init__.py'>

<hr style="border:2px solid darkblue"> 

## Import STPH Mobile App data

In [4]:
def load_csv_files(folder_path):
    """
    Load every CSV file found directly inside a folder as a pandas DataFrame.

    File names are processed in sorted order. ``os.listdir`` returns entries in
    arbitrary order, so without sorting the row order of anything concatenated
    from the result could differ between machines or runs.

    :param folder_path: Path to the folder containing CSV files.
    :return: List of pandas DataFrames, one per file whose name ends in ``.csv``,
             ordered by file name. An empty list if the folder has no such files.
    """
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    return [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# Read every CSV export in the sample folder and stack them into one frame
stph_data = pd.concat(
    load_csv_files("/Users/pam/Documents/Git/STPH/Trip segmentation/sample_datasets/stph_app"),
    ignore_index=True,
)

In [5]:
# Keep only rows that have vehicle_details, the column the route data is parsed from
stph_data = stph_data.dropna(subset=['vehicle_details']).reset_index(drop=True)

In [6]:
# To break down the info from the vehicle_details
def extract_vehicle_info(df):
    """
    Split the ``***``-delimited ``vehicle_details`` string into separate columns.

    The nine fields are read positionally: permit/franchise number, vehicle type,
    case number, operator, route, plate number, chassis number, validity date
    (parsed with ``'%d %B %Y'``) and capacity (parsed as ``int``). A value that is
    missing, equal to ``'NA'``, or that cannot be split into the expected fields
    yields ``None`` in all nine columns; an unparseable validity or capacity
    yields ``None`` for that field only.

    The input frame is left untouched: the columns are added to a copy, so
    re-running the calling cell never sees a half-modified input. Empty frames
    are supported and simply gain the nine (empty) columns.

    :param df: DataFrame with a ``vehicle_details`` column.
    :return: Copy of ``df`` with the nine parsed columns added (or overwritten).
    """
    columns = ['permit_franchiseNumber', 'vehicle_type', 'case_number', 'operator', 'route',
               'plate_number', 'chassis_number', 'validity', 'capacity']
    blank = (None,) * len(columns)

    def parse_details(details):
        """Return the nine parsed fields for one ``vehicle_details`` value."""
        if pd.isna(details) or details == 'NA':
            return blank
        try:
            # Split on ***
            parts = details.split('***')

            # Extract known patterns
            permit = parts[0].replace('Permit or Franchise Number ', '').strip()
            vehicle_type = parts[1].strip()
            case_number = parts[2].replace('Case Number ', '').strip()
            operator = parts[3].replace('Operator:', '').strip()
            route = parts[4].replace('Route:', '').strip()
            plate = parts[5].replace('Plate:', '').strip()
            chassis = parts[6].replace('Chassis Number ', '').strip()

            # Parse and convert date
            validity_str = parts[7].replace('Validity:', '').strip()
            try:
                validity = datetime.strptime(validity_str, '%d %B %Y').date()
            except ValueError:
                validity = None

            # Extract capacity
            capacity_str = parts[8].replace('Capacity:', '').strip()
            try:
                capacity = int(capacity_str)
            except ValueError:
                capacity = None

            return (permit, vehicle_type, case_number, operator,
                    route, plate, chassis, validity, capacity)
        except Exception:
            # Deliberately best-effort: a malformed value (too few fields,
            # non-string content, ...) becomes an all-None row instead of
            # aborting the whole load.
            return blank

    result = df.copy()
    # Building the frame from a list of tuples (rather than apply -> Series)
    # keeps the shape at nine columns even when there are zero rows.
    parsed = pd.DataFrame(
        [parse_details(details) for details in result['vehicle_details']],
        columns=columns,
        index=result.index,
    )
    result[columns] = parsed

    return result

# Expand vehicle_details into one column per field (permit, route, plate number, capacity, ...)
stph_app_data = extract_vehicle_info(stph_data)

In [7]:
# Normalise column types: parse timestamps and split 'location' into numeric coordinates
stph_app_data['timestamp'] = pd.to_datetime(
    stph_app_data['timestamp'],
    format="%Y-%m-%d %H:%M:%S",
)
stph_app_data[['longitude', 'latitude']] = (
    stph_app_data['location']
    .str.strip('[]')
    .str.split(',', expand=True)
    .astype(float)
)
stph_app_data = stph_app_data.drop(columns='location')

# Retain only rows whose 'route' is exactly "Route " followed by a number from 1 to 21
stph_app_data = stph_app_data.loc[
    stph_app_data['route'].str.match(r'^Route (1[0-9]|2[0-1]|[1-9])$', na=False)
]

In [8]:
# Collapse to one row per timestamp and plate number, summing the boarding and alighting counts
agg_dict = {
    'current_number_of_passengers': 'max',
    'altitude_in_meters': 'max',
    'accuracy': 'first',
    'is_boarding': 'sum',
    'is_alighting': 'sum',
}
# Every remaining column keeps the first value seen within the group
agg_dict.update({
    column: 'first'
    for column in [
        'trip_code', 'device_code', 'driver_code', 'vehicle_code',
        'permit_franchiseNumber', 'vehicle_type', 'case_number', 'operator',
        'route', 'chassis_number', 'validity', 'capacity',
        'longitude', 'latitude',
    ]
})

# Group by 'timestamp' and 'plate_number', then aggregate
stph_app_data = (
    stph_app_data
    .groupby(['timestamp', 'plate_number'])
    .agg(agg_dict)
    .reset_index()
)

In [9]:
# Add distance travelled, per vehicle
from stph_trips.trip_summary_STPHapp import haversine

## Order each vehicle's records chronologically so that shift(1) yields the preceding record
stph_app_data = stph_app_data.sort_values(by=['plate_number', 'timestamp'])

## Previous latitude, longitude, and timestamp within each plate_number group
for field in ('latitude', 'longitude', 'timestamp'):
    stph_app_data['prev_' + field] = stph_app_data.groupby('plate_number')[field].shift(1)

## Seconds elapsed since the vehicle's previous record
stph_app_data['time_diff'] = (
    stph_app_data['timestamp'] - stph_app_data['prev_timestamp']
).dt.total_seconds()

## Distance from the previous record; 0 when there is no previous record
## or when the gap is not within 600 sec (10 min)
stph_app_data['distance_travelled'] = stph_app_data.apply(
    lambda row: 0
    if pd.isna(row['prev_latitude']) or not (row['time_diff'] <= 600)
    else haversine(row['prev_latitude'], row['prev_longitude'], row['latitude'], row['longitude']),
    axis=1)

## Remove the helper columns once the distance has been computed
stph_app_data = stph_app_data.drop(
    columns=['prev_latitude', 'prev_longitude', 'prev_timestamp', 'time_diff'])

In [10]:
# Preview the first three rows, which now include distance_travelled
stph_app_data.head(3)

Unnamed: 0,timestamp,plate_number,current_number_of_passengers,altitude_in_meters,accuracy,is_boarding,is_alighting,trip_code,device_code,driver_code,...,vehicle_type,case_number,operator,route,chassis_number,validity,capacity,longitude,latitude,distance_travelled
0,2024-12-01 04:24:05,ABC-8765,0,68.5,HIGH,0.0,0.0,64bd67df-13da-4ff5-9d92-447724f65b81,e59623c2f706977f,guest_green_fox_105,...,PUV,2024-TEST_Parasol,WVTC,Route 9,TEST,2030-12-25,22,122.534676,10.740532,0.0
15,2024-12-01 14:45:40,ABC-8765,0,62.7,HIGH,0.0,0.0,49fc9847-6969-4754-a950-b46c44976f1e,e59623c2f706977f,guest_green_fox_105,...,PUV,2024-TEST_Parasol,WVTC,Route 9,TEST,2030-12-25,22,122.499725,10.692713,0.0
17,2024-12-01 14:45:41,ABC-8765,0,62.7,HIGH,0.0,0.0,49fc9847-6969-4754-a950-b46c44976f1e,e59623c2f706977f,guest_green_fox_105,...,PUV,2024-TEST_Parasol,WVTC,Route 9,TEST,2030-12-25,22,122.499733,10.692716,0.000892


<hr style="border:2px solid darkblue"> 

## Trip segmentation

In [11]:
# Reload the trip_segmentor submodule so recent edits to its source are picked up in this kernel
importlib.reload(stph_trips.trip_segmentor)

<module 'stph_trips.trip_segmentor' from '/Users/pam/Documents/Git/STPH/Trip segmentation/stph_trips/trip_segmentor.py'>

In [12]:
# Build the route dictionary before segmenting.
# The variable must keep the name `routes_dict`; do not rename it.
routes_dict = stph_trips.obtain_route_dict(
    path_of_gtfs_shapefiles='/Users/pam/Documents/Git/STPH/Trip segmentation/gtfs_route_shape_files/',
    min_dist=100,
    max_dist=150,
)

In [21]:
# Thresholds forwarded unchanged to stph_trips.trip_segmentation for every vehicle
segmentation_settings = {
    ## Increased, default is 50 meters
    'my_dist_cutoff': 100,
    ## If a vehicle is not near a stop, should be no longer than 1 min (60 secs).
    ## Anything longer means that the vehicle is outside the route, and thus not a valid trip
    'zero_cutoff': 60,
    ## Proportion of the expected distance that is allowable for a trip to be valid
    'my_dist_threshold': 0.3,
    ## Minimum length (mins) for a trip to be considered "complete",
    ## i.e. trips no shorter than 15 minutes
    'my_time_threshold': 15,
    ## Maximum no. of minutes between two points for them to be in the same trip
    'max_time_gap': 10,
    'STPHapp_indicator': True,
}

# Segment each vehicle's feed separately, then stack the results
vehicleFeeds_with_tripID_list = []
for plate in stph_app_data['plate_number'].unique().tolist():
    print("Now processing: " + plate)
    plate_feeds = stph_app_data.loc[stph_app_data['plate_number'] == plate].reset_index(drop=True)
    segmented_feeds = stph_trips.trip_segmentation(
        vehicle_feeds_df=plate_feeds,
        routes_dict=routes_dict,
        **segmentation_settings,
    )
    vehicleFeeds_with_tripID_list.append(segmented_feeds)

all_vehicleFeeds_with_tripID = pd.concat(vehicleFeeds_with_tripID_list, ignore_index=True)

Now processing: ABC-8765
Now processing: PRS-1234
Now processing: XYZ-1234


In [22]:
# Preview the segmented feeds; trip_identifier is the column added by the segmentation step
all_vehicleFeeds_with_tripID.head(3)

Unnamed: 0,timestamp,plate_number,current_number_of_passengers,altitude_in_meters,accuracy,is_boarding,is_alighting,trip_code,device_code,driver_code,...,case_number,operator,route,chassis_number,validity,capacity,longitude,latitude,distance_travelled,trip_identifier
0,2024-12-01 14:45:40,ABC-8765,0,62.7,HIGH,0.0,0.0,49fc9847-6969-4754-a950-b46c44976f1e,e59623c2f706977f,guest_green_fox_105,...,2024-TEST_Parasol,WVTC,Route 9,TEST,2030-12-25,22,122.499725,10.692713,0.0,outbound_trip_20241201_144540
1,2024-12-01 14:45:41,ABC-8765,0,62.7,HIGH,0.0,0.0,49fc9847-6969-4754-a950-b46c44976f1e,e59623c2f706977f,guest_green_fox_105,...,2024-TEST_Parasol,WVTC,Route 9,TEST,2030-12-25,22,122.499733,10.692716,0.000892,outbound_trip_20241201_144540
2,2024-12-01 14:45:42,ABC-8765,0,62.7,HIGH,0.0,0.0,49fc9847-6969-4754-a950-b46c44976f1e,e59623c2f706977f,guest_green_fox_105,...,2024-TEST_Parasol,WVTC,Route 9,TEST,2030-12-25,22,122.499733,10.692723,0.000848,outbound_trip_20241201_144540


In [23]:
# Optional: Saving feeds with trip_identifier to CSV 
## all_vehicleFeeds_with_tripID.to_csv("STPH app feeds with Trip ID - Dec 2024.csv", index = False)

<hr style="border:2px solid darkblue"> 

## Mapping

In [None]:
# Combine plate number and trip identifier into a single 'trip_id' key.
# Only needed for the mapping cells: the random-sample cell below selects trips by 'trip_id'.
all_vehicleFeeds_with_tripID['trip_id'] = (
    all_vehicleFeeds_with_tripID['plate_number']
    + "_"
    + all_vehicleFeeds_with_tripID['trip_identifier']
)
all_vehicleFeeds_with_tripID.head(2)

In [None]:
from stph_trips.trip_segmentor import route_gtfs_stops_mapper

# Map a random sample of up to 10 trips, skipping any whose trip_id contains 'cuttrip'
all_complete_trips = [trip for trip in all_vehicleFeeds_with_tripID['trip_id'].unique().tolist() if 'cuttrip' not in trip]

# random.sample raises ValueError when asked for more items than exist, so cap the sample size
sample_trips = random.sample(all_complete_trips, min(10, len(all_complete_trips)))

for trip in sample_trips:
    feeds = all_vehicleFeeds_with_tripID[all_vehicleFeeds_with_tripID['trip_id'] == trip].sort_values(by = 'timestamp').reset_index(drop=True)
    try:
        route_gtfs_stops_mapper(feeds, output_html = trip + ".html")
    except Exception as exc:
        # Carry on with the remaining trips, but report the failure instead of hiding it
        print("Could not map trip " + trip + ": " + repr(exc))

In [None]:
# Algo-generated trips
# NOTE(review): these values look like app trip codes (UUIDs), while the trip_identifier
# values in the segmented feeds look like 'outbound_trip_20241201_144540' - confirm that
# this filter can actually match any rows.
sample_trips = ['c47d04e1-883a-4b98-a6e4-af558c19e496', 'd4c2b0ee-e389-42cb-95b1-a1868784feed']
for trip in sample_trips:
    feeds = all_vehicleFeeds_with_tripID[all_vehicleFeeds_with_tripID['trip_identifier'] == trip].sort_values(by = 'timestamp').reset_index(drop=True)
    try:
        route_gtfs_stops_mapper(feeds, output_html = trip + ".html")
    except Exception as exc:
        # Carry on with the remaining trips, but report the failure instead of hiding it
        print("Could not map trip " + trip + ": " + repr(exc))

In [None]:
# App-generated trips
# Build the app-segmented view here so this cell also runs on a fresh kernel: the frame is
# otherwise only created further down, in the "Trips segmented by the app" section.
# It is the aggregated feed with the app's trip_code exposed as trip_identifier.
stph_app_trips = stph_app_data.rename(columns = {'trip_code': 'trip_identifier'})

# NOTE(review): the output names (<trip>.html) are the same as in the algo-generated cell
# above, so running both overwrites the earlier maps - confirm whether that is intended.
sample_trips = ['c47d04e1-883a-4b98-a6e4-af558c19e496', 'd4c2b0ee-e389-42cb-95b1-a1868784feed']
for trip in sample_trips:
    feeds = stph_app_trips[stph_app_trips['trip_identifier'] == trip].sort_values(by = 'timestamp').reset_index(drop=True)
    try:
        route_gtfs_stops_mapper(feeds, output_html = trip + ".html")
    except Exception as exc:
        # Carry on with the remaining trips, but report the failure instead of hiding it
        print("Could not map trip " + trip + ": " + repr(exc))

<hr style="border:2px solid darkblue"> 

## Super summaries

### Trips segmented by own algo

In [24]:
# Reload the summary submodule so recent edits to its source are picked up,
# then import the summary function from the freshly reloaded module
importlib.reload(stph_trips.trip_summary_STPHapp)
from stph_trips.trip_summary_STPHapp import tripSuperSummary_STPHapp

In [25]:
# Build a super summary per vehicle from the algorithm-segmented feeds, then stack them
tripSuperSummaries_list = []

for plate in all_vehicleFeeds_with_tripID['plate_number'].unique().tolist():
    print("Now processing: " + plate)
    belongs_to_plate = all_vehicleFeeds_with_tripID['plate_number'] == plate
    plate_feeds = all_vehicleFeeds_with_tripID.loc[belongs_to_plate].reset_index(drop=True)
    tripSuperSummaries_list.append(
        tripSuperSummary_STPHapp(
            vehicle_feeds_with_tripID=plate_feeds,
            speed_cutoff=5,
            overwaiting_time=[90, 150],
            overspeeding_thresholds=[60, 65],
            harsh_acceleration=[2.5, 3.5],
            harsh_braking=[-2.5, -3.5],
        )
    )

tripSuperSummaries = pd.concat(tripSuperSummaries_list, ignore_index=True)

Now processing: ABC-8765
Now processing: PRS-1234
Now processing: XYZ-1234


In [26]:
# Preview the first three summaries of the algorithm-segmented trips
tripSuperSummaries.head(3)

Unnamed: 0,Vehicle ID,Route,Trip type,Trip status,Date,Start time,End time,Total trip duration (min),Average speed (kph),Maximum speed (kph),...,Average overwaiting time (90),Total overwaiting events (150),Average overwaiting time (150),Total harsh acceleration events 2.5,Total harsh acceleration events 3.5,Total harsh braking events -2.5,Total harsh braking events -3.5,Total overspeeding duration (60-65 kph),Total overspeeding duration (65 kph),Trip ID
0,ABC-8765,Route 9,Outbound,Complete trip,"Dec 01, 2024",02:45 PM,03:31 PM,45.98,15.3,56.24,...,267.0,3,383.0,15,6,7,3,0.0,0.0,outbound_trip_20241201_144540
1,ABC-8765,Route 9,Inbound,Complete trip,"Dec 01, 2024",03:33 PM,04:06 PM,32.72,18.54,48.18,...,0.0,0,0.0,27,14,10,3,0.0,0.0,inbound_trip_20241201_153330
2,ABC-8765,Route 9,Outbound,Complete trip,"Dec 01, 2024",04:06 PM,04:33 PM,26.95,22.88,75.49,...,175.0,1,175.0,31,17,22,4,3.0,2.0,outbound_trip_20241201_160614


In [28]:
# Optional: Saving trip super summaries as an Excel file
## tripSuperSummaries.to_excel("STPH App Trip super summaries - Dec 2024.xlsx", index = False)

### Trips segmented by the app

In [29]:
# Work on a copy of the aggregated feed, exposing the app's own trip_code as trip_identifier
stph_app_trips = stph_app_data.copy().rename(columns={'trip_code': 'trip_identifier'})

In [30]:
# Reload the app-trip summary submodule so recent edits to its source are picked up,
# then import the summary function from the freshly reloaded module
importlib.reload(stph_trips.trip_summary_STPH_AppTrips)
from stph_trips.trip_summary_STPH_AppTrips import tripSuperSummary_STPHapp2

In [31]:
# Build a super summary per vehicle from the app-segmented trips, then stack them
tripSuperSummaries_list = []

for plate in stph_app_trips['plate_number'].unique().tolist():
    print("Now processing: " + plate)
    belongs_to_plate = stph_app_trips['plate_number'] == plate
    plate_feeds = stph_app_trips.loc[belongs_to_plate].reset_index(drop=True)
    tripSuperSummaries_list.append(
        tripSuperSummary_STPHapp2(
            vehicle_feeds_with_tripID=plate_feeds,
            speed_cutoff=5,
            overwaiting_time=[90, 150],
            overspeeding_thresholds=[60, 65],
            harsh_acceleration=[2.5, 3.5],
            harsh_braking=[-2.5, -3.5],
        )
    )

tripSuperSummaries_apptrips = pd.concat(tripSuperSummaries_list, ignore_index=True)

Now processing: ABC-8765
Now processing: PRS-1234
Now processing: XYZ-1234


In [32]:
# Preview the first three summaries of the app-segmented trips
tripSuperSummaries_apptrips.head(3)

Unnamed: 0,Vehicle ID,Route,Trip type,Date,Start time,End time,Total trip duration (min),Average speed (kph),Maximum speed (kph),Total distance travelled (km),...,Average overwaiting time (90),Total overwaiting events (150),Average overwaiting time (150),Total harsh acceleration events 2.5,Total harsh acceleration events 3.5,Total harsh braking events -2.5,Total harsh braking events -3.5,Total overspeeding duration (60-65 kph),Total overspeeding duration (65 kph),Trip ID
0,ABC-8765,Route 9,App-generated Trip,"Dec 01, 2024",04:24 AM,04:24 AM,0.0,,,0.0,...,0.0,0,0.0,0,0,0,0,0.0,0.0,64bd67df-13da-4ff5-9d92-447724f65b81
1,ABC-8765,Route 9,App-generated Trip,"Dec 01, 2024",02:45 PM,04:07 PM,81.52,16.61,56.24,19.98,...,267.0,3,383.0,44,22,17,6,0.0,0.0,49fc9847-6969-4754-a950-b46c44976f1e
2,ABC-8765,Route 9,App-generated Trip,"Dec 01, 2024",04:07 PM,05:07 PM,59.93,20.43,75.49,19.95,...,0.0,0,0.0,62,35,38,10,3.0,2.0,a43d9b7b-4eb1-4320-9108-af28684c25f9


In [33]:
# Optional: Saving app-generated trip super summaries as an Excel file
## tripSuperSummaries_apptrips.to_excel("STPH App Trip super summaries - Dec 2024 - Trips from the app.xlsx", index = False)

*End of code*

---

Date of most recent edit: **8 April 2025**