# Trip Segmentation & Super Summaries with Telematics Data

In [1]:
# !pip install pandas numpy geopy
# !pip install folium
# !pip install scipy

In [2]:
import pandas as pd
from datetime import timedelta
import os
import numpy as np
import random  # For mapping a random sample of trips

# In-house library that performs the trip segmentation (reloaded below so local edits are picked up)
import importlib
import stph_trips
importlib.reload(stph_trips)

<module 'stph_trips' from '/Users/pam/Documents/Git/STPH/Trip segmentation/stph_trips/__init__.py'>

<hr style="border:2px solid darkblue"> 

## Import telematics data

In [3]:
def import_and_concatenate_csvs(folder_path):
    """Read every CSV file in a folder and stack them into a single DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory to scan. Only entries whose name ends with ".csv" are read;
        sub-folders are not searched.

    Returns
    -------
    pandas.DataFrame or None
        All CSV contents concatenated row-wise with a fresh 0..n-1 index, or
        None (after printing a message) when the folder contains no CSV file.
    """
    # os.listdir returns entries in arbitrary order; sorting the names makes the
    # row order of the combined frame reproducible between runs and machines.
    csv_paths = [
        os.path.join(folder_path, file_name)
        for file_name in sorted(os.listdir(folder_path))
        if file_name.endswith(".csv")
    ]

    if not csv_paths:
        print("No CSV files found in the specified folder.")
        return None

    return pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths], ignore_index=True)

# NOTE: absolute, machine-specific location of the raw telematics CSV files;
# point this at your own copy of the data before running the notebook.
folder_path = "/Users/pam/Documents/Git/STPH/Trip segmentation/sample_datasets/telematics"
telematics_data = import_and_concatenate_csvs(folder_path)

# import_and_concatenate_csvs returns None when the folder holds no CSV files.
# Stop here with a clear message instead of failing in a later cell on None.
if telematics_data is None:
    raise FileNotFoundError("No CSV files found in: " + folder_path)

#### Note

**TODO:** The code block below is hard-coded and needs to be updated. The matching of vehicle_code / imei with the route should be pulled from an independent data source that is updated whenever a new vehicle is added to our dataset.

In [4]:
# Lookup table for cleaning the telematics data: which route each vehicle (imei) serves
route_by_imei = {
    '350612076055969': 'Route 9',
    '350612079013932': 'Route 9',
    '350612076078946': 'Route 17',
    '350612076068145': 'Route 17',
    '350612076064508': 'Route 9',
    '350544507513318': 'Route 17',
}
imeis = list(route_by_imei.keys())
routes = list(route_by_imei.values())

# One row per vehicle, with columns 'imei' and 'route'
list_of_ejeepneys = pd.DataFrame({'imei': imeis, 'route': routes})

In [5]:
# Fix timestamp: drop the "+00:00" offset text and the ISO "T" separator, then keep
# only the first 19 characters ("YYYY-MM-DD HH:MM:SS").
# regex=False is passed explicitly: "+00:00" is not a valid regular expression, and
# whether str.replace treats its pattern as a regex by default depends on the pandas version.
telematics_data['timestamp'] = (
    telematics_data['timestamp']
    .apply(str)
    .str.replace('+00:00', '', regex=False)
    .str.replace('T', ' ', regex=False)
    .str[:19]
)
telematics_data['timestamp'] = pd.to_datetime(telematics_data['timestamp'], format = '%Y-%m-%d %H:%M:%S')

# Convert imei and deviceCode to string.
# Only a trailing ".0" (left over when an identifier was parsed as a float) is removed.
# The pattern is anchored and escaped so that digits inside the identifier are never
# touched, whatever the pandas default for `regex` is.
telematics_data['deviceCode'] = telematics_data['deviceCode'].apply(str).str.replace(r'\.0$', '', regex=True)
telematics_data['imei'] = telematics_data['imei'].apply(str).str.replace(r'\.0$', '', regex=True)

# Chronological order, ties broken by device
telematics_data = telematics_data.sort_values(by = ['timestamp', 'deviceCode'])

In [6]:
# Attach the route of each reading through its imei. Readings whose imei is not in
# the lookup table end up without a route and are discarded; the index is then renumbered.
telematics_with_route = (
    telematics_data
    .merge(list_of_ejeepneys, on = 'imei', how = 'left')
    .dropna(subset = ['route'])
    .reset_index(drop = True)
)

# The pre-merge frame is no longer needed; release it to save memory
del telematics_data

In [7]:
# Distance travelled between consecutive readings, computed separately for each device
from stph_trips.trip_summary_telematics import haversine

## Order rows by device and time so that "previous row" means "previous reading of the same device"
telematics_with_route = telematics_with_route.sort_values(['deviceCode', 'timestamp'])

## Temporary helper columns: position and time of the previous reading of the same device.
## The first reading of every device has no predecessor and gets a missing value.
helper_columns = []
for column in ['latitude', 'longitude', 'timestamp']:
    telematics_with_route['prev_' + column] = telematics_with_route.groupby('deviceCode')[column].shift(1)
    helper_columns.append('prev_' + column)

## Seconds elapsed since the previous reading
telematics_with_route['time_diff'] = (
    telematics_with_route['timestamp'] - telematics_with_route['prev_timestamp']
).dt.total_seconds()
helper_columns.append('time_diff')


def _distance_from_previous_reading(row):
    """Return the haversine distance from the previous reading to this one.

    Returns 0 when there is no previous reading, or when the gap to it is not
    within 600 seconds (10 min).
    NOTE(review): the distance unit is whatever `haversine` returns - TODO confirm.
    """
    if pd.isna(row['prev_latitude']) or not (row['time_diff'] <= 600):
        return 0
    return haversine(row['prev_latitude'], row['prev_longitude'], row['latitude'], row['longitude'])


## Compute the distance row by row, then remove the helper columns
telematics_with_route['distanceTravelled'] = telematics_with_route.apply(_distance_from_previous_reading, axis=1)
telematics_with_route = telematics_with_route.drop(columns=helper_columns)

In [8]:
# Renumber the rows 0..n-1 (the frame was re-sorted by device and time in the previous cell),
# then preview the first three rows
telematics_with_route = telematics_with_route.reset_index(drop = True)
telematics_with_route.head(3)

Unnamed: 0,imei,latitude,longitude,timestamp,accelerationInMetersPerSecondSquared,distanceTravelledInKm,distanceTravelledSinceFaultCodesOrClearedInKm,tripOdomenterInKm,vehicleSpeedInKph,boardingCount,...,fuelPressure,fuelRailTemperatureIntake,manifoldAbsolutePressure,massAirFlowRate,throttlePosition,motorCurrentInXxx,averageFuelUse,directFuelRailPressure,route,distanceTravelled
0,350612076068145,10.688977,122.525517,2025-01-23 06:42:07,0,0,21126,0,0,0,...,4026,0,0,1966,7,0,362,3896,Route 17,0.0
1,350612076078946,14.685118,121.085595,2025-01-07 20:46:33,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Route 17,0.0
2,350612076078946,14.685118,121.085595,2025-01-07 20:46:34,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Route 17,0.0


<hr style="border:2px solid darkblue"> 

## Trip segmentation

In [9]:
# Produce the route dictionary first. The object name `routes_dict` shall not be changed.
# NOTE(review): the shapefile path below is absolute and machine-specific - adjust it for your environment.
# NOTE(review): min_dist / max_dist are presumably distances in meters - TODO confirm against stph_trips.
routes_dict = stph_trips.obtain_route_dict(path_of_gtfs_shapefiles = '/Users/pam/Desktop/venv/STPH Trip Summary and Segmentation/Route Stops/Shapes/',
                                           min_dist = 100, max_dist = 150)

In [10]:
# Segmentation settings applied identically to every vehicle
segmentation_settings = dict(
    my_dist_cutoff = 100,
        ## Increased, default is 50 meters
    zero_cutoff = 60,
        ## If a vehicle is not near a stop, should be no longer than 1 min (60 secs).
        ## Anything longer means that the vehicle is outside the route, and thus not a valid trip
    my_dist_threshold = 0.3,
        ## Proportion of the expected distance that is allowable for a trip to be valid
    my_time_threshold = 15,
        ## Minimum length (mins) for a trip to be considered "complete"
    max_time_gap = 10,
        ## Maximum no. of minutes between two points for them to be in the same trip
    STPHapp_indicator = False,
)

vehicleFeeds_with_tripID_list = []

# One pass per vehicle; sort=False keeps the vehicles in order of first appearance
for vehicle, vehicle_rows in telematics_with_route.groupby('imei', sort = False):
    print("Now processing: " + vehicle)
    vehicleFeeds_with_tripID_list.append(
        stph_trips.trip_segmentation(vehicle_feeds_df = vehicle_rows.reset_index(drop=True),
                                     routes_dict = routes_dict,
                                     **segmentation_settings)
    )

# Stack the per-vehicle results into one frame with a fresh index
all_vehicleFeeds_with_tripID = pd.concat(vehicleFeeds_with_tripID_list, ignore_index = True)

Now processing: 350612076068145
Now processing: 350612076078946
Now processing: 350612079013932


In [11]:
# Preview the first five rows of the segmented feeds (now carrying a trip_identifier column)
all_vehicleFeeds_with_tripID.head(5)

Unnamed: 0,imei,latitude,longitude,timestamp,accelerationInMetersPerSecondSquared,distanceTravelledInKm,distanceTravelledSinceFaultCodesOrClearedInKm,tripOdomenterInKm,vehicleSpeedInKph,boardingCount,...,fuelRailTemperatureIntake,manifoldAbsolutePressure,massAirFlowRate,throttlePosition,motorCurrentInXxx,averageFuelUse,directFuelRailPressure,route,distanceTravelled,trip_identifier
0,350612079013932,10.69261,122.499613,2025-01-20 06:00:12,0,0,-26425,661744,9,0,...,0,0,2449,32,0,0,8328,Route 9,0.002349,outbound_trip_20250120_060012
1,350612079013932,10.69263,122.499623,2025-01-20 06:00:13,0,0,-26425,661744,9,0,...,0,0,2680,30,0,0,7536,Route 9,0.002488,outbound_trip_20250120_060012
2,350612079013932,10.692655,122.499638,2025-01-20 06:00:14,0,0,-26425,661750,10,0,...,0,0,2794,30,0,0,7844,Route 9,0.003217,outbound_trip_20250120_060012
3,350612079013932,10.692673,122.499648,2025-01-20 06:00:15,0,0,-26425,661750,9,0,...,0,0,2811,28,0,0,6613,Route 9,0.002319,outbound_trip_20250120_060012
4,350612079013932,10.692695,122.499658,2025-01-20 06:00:16,0,0,-26425,661755,10,0,...,0,0,2897,29,0,0,7228,Route 9,0.002639,outbound_trip_20250120_060012


In [12]:
# Optional: Save the raw telematics data file with trip_id and route columns
## all_vehicleFeeds_with_tripID.to_csv("Telematics vehicle feeds with Trip ID.csv", index = False)

<hr style="border:2px solid darkblue"> 

## Super summaries

In [13]:
# Reload the summary submodule so that edits made to it during this session are picked up,
# then bring the summary function into the notebook namespace
importlib.reload(stph_trips.trip_summary_telematics)
from stph_trips.trip_summary_telematics import tripSuperSummary_telematics

In [14]:
def _summarise_vehicle(vehicle_feeds):
    """Build the trip super summary for one vehicle's feeds using this notebook's thresholds.

    NOTE(review): the units of the thresholds below are defined by
    tripSuperSummary_telematics and are not visible here - TODO confirm.
    """
    return tripSuperSummary_telematics(
        vehicle_feeds_with_tripID = vehicle_feeds,
        speed_cutoff = 5,
        overwaiting_time = [90, 150],
        overspeeding_thresholds = [60, 65],
        harsh_acceleration = [(2.5, 2000), (2.5, 2500), (3.5, 2000), (3.5, 2500)],
        harsh_braking = [(-2.5, -1000), (-3.5, -1000)],
    )


tripSuperSummaries_list = []

# One pass per vehicle; sort=False keeps the vehicles in order of first appearance
for vehicle, vehicle_rows in all_vehicleFeeds_with_tripID.groupby('imei', sort = False):
    print("Now processing: " + vehicle)
    tripSuperSummaries_list.append(_summarise_vehicle(vehicle_rows.reset_index(drop=True)))

# Stack the per-vehicle summaries into one frame with a fresh index
tripSuperSummaries = pd.concat(tripSuperSummaries_list, ignore_index = True)

Now processing: 350612079013932


In [15]:
# Preview the first five trip-level summary rows
tripSuperSummaries.head(5)

Unnamed: 0,Vehicle ID,Route,Trip type,Trip status,Date,Start time,End time,Total trip duration (min),Average speed (kph),Maximum speed (kph),...,Average overwaiting time (150),"Total harsh acceleration events (2.5, 2000)","Total harsh acceleration events (2.5, 2500)","Total harsh acceleration events (3.5, 2000)","Total harsh acceleration events (3.5, 2500)","Total harsh braking events (-2.5, -1000)","Total harsh braking events (-3.5, -1000)",Total overspeeding duration (60-65 kph),Total overspeeding duration (65 kph),Trip ID
0,350612079013932,Route 9,Outbound,Complete trip,"Jan 20, 2025",06:00 AM,06:29 AM,29.48,19.55,61.0,...,312.0,0,0,0,0,0,0,1.0,0.0,outbound_trip_20250120_060012
1,350612079013932,Route 9,Inbound,Complete trip,"Jan 20, 2025",06:41 AM,07:02 AM,20.75,22.97,69.0,...,0.0,0,0,0,0,0,0,13.0,5.0,inbound_trip_20250120_064122
2,350612079013932,Route 9,Outbound,Complete trip,"Jan 20, 2025",07:02 AM,07:59 AM,56.92,7.85,54.0,...,236.0,0,0,0,0,0,0,0.0,0.0,outbound_trip_20250120_070208
3,350612079013932,Route 9,Outbound,Cut trip,"Jan 20, 2025",07:59 AM,08:18 AM,18.93,6.55,36.0,...,0.0,0,0,0,0,0,0,0.0,0.0,outbound_cuttrip_20250120_075905
4,350612079013932,Route 9,Inbound,Complete trip,"Jan 20, 2025",08:23 AM,08:47 AM,23.93,21.34,58.0,...,0.0,0,0,0,0,0,0,0.0,0.0,inbound_trip_20250120_082313


In [16]:
# Optional: Saving trip super summaries as an Excel file
## tripSuperSummaries.to_excel("Trip super summaries.xlsx", index = False)

<hr style="border:2px solid darkblue"> 

## [Optional] Mapping complete trips for manual visual verification

In [None]:
# Optional: free the pre-segmentation frame, which the mapping cells below do not use.
# Guarded so that re-running this cell does not raise NameError once the name is gone.
if 'telematics_with_route' in globals():
    del telematics_with_route

# Prefix the trip identifier with the imei to form the `tripID` column used for mapping
all_vehicleFeeds_with_tripID['tripID'] = all_vehicleFeeds_with_tripID['imei'] + "_" + all_vehicleFeeds_with_tripID['trip_identifier']
all_vehicleFeeds_with_tripID.head(2)

### Plotting complete trips

A random sample of 10 complete trips:

In [None]:
# Complete trips are the tripIDs that do not carry the 'cuttrip' marker
all_complete_trips = [trip for trip in all_vehicleFeeds_with_tripID['tripID'].unique().tolist() if 'cuttrip' not in trip]

# random.sample raises ValueError when asked for more items than are available,
# so cap the sample at the number of complete trips.
sample_trips = random.sample(all_complete_trips, min(10, len(all_complete_trips)))

for trip in sample_trips:
    feeds = all_vehicleFeeds_with_tripID[all_vehicleFeeds_with_tripID['tripID'] == trip].sort_values(by = 'timestamp').reset_index(drop=True)
    try:
        stph_trips.route_gtfs_stops_mapper(feeds, output_html = trip + ".html")
    except Exception as error:
        # Best effort: one failed map must not stop the others, but report it
        # instead of hiding it (a bare `except` would also swallow KeyboardInterrupt).
        print("Could not map " + trip + ": " + repr(error))

In [None]:
# Map the same sampled trips again, this time prefixing each output file name with the trip's route
for trip in sample_trips:
    feeds = all_vehicleFeeds_with_tripID[all_vehicleFeeds_with_tripID['tripID'] == trip].sort_values(by = 'timestamp').reset_index(drop=True)
    try:
        stph_trips.route_gtfs_stops_mapper(feeds, output_html = feeds['route'].values[0] + " - " + trip + ".html")
    except Exception as error:
        # Best effort: one failed map must not stop the others, but report it
        # instead of hiding it (a bare `except` would also swallow KeyboardInterrupt).
        print("Could not map " + trip + ": " + repr(error))

Plotting up to 5 complete trips for each vehicle (imei):

In [None]:
# For every vehicle, map a random sample of at most 5 of its complete trips
for imei in all_vehicleFeeds_with_tripID['imei'].unique().tolist():
    print(imei)
    imei_df = all_vehicleFeeds_with_tripID[all_vehicleFeeds_with_tripID['imei'] == imei]
    all_complete_trips = [trip for trip in imei_df['tripID'].unique().tolist() if 'cuttrip' not in trip]

    # min() caps the sample at what is available; a vehicle without complete trips
    # yields an empty sample, so the mapping loop below simply does not run.
    sample_trips = random.sample(all_complete_trips, min(5, len(all_complete_trips)))

    for trip in sample_trips:
        feeds = imei_df[imei_df['tripID'] == trip].sort_values(by = 'timestamp').reset_index(drop=True)
        try:
            stph_trips.route_gtfs_stops_mapper(feeds, output_html = feeds['route'].values[0] + " - " + trip + ".html")
        except Exception as error:
            # Best effort: one failed map must not stop the others, but report it
            # instead of hiding it (a bare `except` would also swallow KeyboardInterrupt).
            print("Could not map " + trip + ": " + repr(error))

*End of code*

---

Date of most recent edit: **8 April 2025**