In [2]:
import pandas as pd

# Load GPS data
df = pd.read_csv('data/Workbook.csv')

# Clean data
df = df.dropna(subset=['Longitude', 'Latitude', 'TimeStamp'])
df = df[df['Accuracy'] < 50]  # Filter high-accuracy points
df = df[df['Provider'] == 'gps']  # Optional: Use GPS provider only
df = df.sort_values('TimeStamp')

# Convert timestamps
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%d/%m/%Y %I:%M:%S %p')

In [3]:
df.shape

(99147, 13)

In [4]:
# import pandas as pd
# from sklearn.cluster import DBSCAN
# from geopy.distance import great_circle
# import numpy as np

# # Convert to coordinate tuples
# coords = df[['Latitude', 'Longitude']].to_numpy()

# # Apply DBSCAN clustering to detect dense clusters
# kms_per_radian = 6371.0088
# epsilon = 0.03 / kms_per_radian  # 30 meters radius in radians
# db = DBSCAN(eps=epsilon, min_samples=5, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))

# # Labeling
# df['cluster'] = db.labels_

# # Remove points from dense clusters (optional: keep only noise points or non-core clusters)
# filtered_df = df[df['cluster'] == -1]  # Keep only noise/outliers


# print(f"Original points: {len(df)}, Cleaned points: {len(filtered_df)}")


In [5]:
from geopy.distance import geodesic
import matplotlib.pyplot as plt

DISTANCE_THRESHOLD_METERS = 20  # Minimum distance to retain a point
# === STEP 2: REMOVE CONCENTRATED POINTS ===
cleaned_points = []
last_point = None

for _, row in df.iterrows():
    current_point = (row['Latitude'], row['Longitude'])
    if last_point is None:
        cleaned_points.append(row)
        last_point = current_point
    else:
        distance = geodesic(last_point, current_point).meters
        if distance >= DISTANCE_THRESHOLD_METERS:
            cleaned_points.append(row)
            last_point = current_point

# === STEP 3: SAVE CLEANED DATA ===
cleaned_df = pd.DataFrame(cleaned_points)

In [6]:
cleaned_df.shape

(745, 13)

In [7]:
from geopy.distance import geodesic

# Function to detect stops
def detect_stops(df, distance_threshold=30, time_threshold=60):
    stops = []
    prev_point = None
    stop_start = None
    stop_points = []
    for index, row in df.iterrows():
        point = (row['Latitude'], row['Longitude'])
        bearing = row['Bearing']
        if prev_point is None:
            prev_point = point
            stop_start = row['DateTime']
            stop_points.append(index)
            continue
        distance = geodesic(prev_point, point).meters
        if distance < distance_threshold and bearing == 0:
            stop_points.append(index)
        else:
            time_diff = (row['DateTime'] - stop_start).total_seconds()
            if time_diff > time_threshold:
                stops.append({
                    'start_time': stop_start,
                    'end_time': row['DateTime'],
                    'indices': stop_points,
                    'location': prev_point
                })
            prev_point = point
            stop_start = row['DateTime']
            stop_points = [index]
    if stop_points and (df.iloc[-1]['DateTime'] - stop_start).total_seconds() > time_threshold:
        stops.append({
            'start_time': stop_start,
            'end_time': df.iloc[-1]['DateTime'],
            'indices': stop_points,
            'location': prev_point
        })
    return stops

stops = detect_stops(df)
stop_indices = [idx for stop in stops for idx in stop['indices']]
road_df = df[~df.index.isin(stop_indices)]

In [8]:
road_df.shape

(74156, 13)

In [9]:
import pandas as pd
import requests
import folium
import polyline
import time
from geopy.distance import geodesic

In [10]:
def visualize_all_locations(df):
    """
    Visualize all locations from a CSV file on a folium map.
    :param csv_file_path: Path to the CSV file containing location data
    :return: folium.Map object
    """
    # Load the CSV data into a DataFrame
    # try:
    #     df = df
    # except Exception as e:
    #     print(f"Error loading CSV file {csv_file_path}: {e}")
    #     return None
    try:
        # Create a map centered at the mean of all latitudes and longitudes
        map_center = [df['Latitude'].mean(), df['Longitude'].mean()]
        m = folium.Map(location=map_center, zoom_start=10)
    except Exception as e:
        print(f"Error getting gps data: {e}")
        return None

    # Add a marker for each location
    for _, row in df.iterrows():
        folium.Marker(
            location=[row['Latitude'], row['Longitude']],
            # popup=f"CODE: {row['CODE']}<br>LOCATION: {row['LOCATION']}<br>ADDRESS: {row['ADDRESS']}<br>BRAND: {row['BRAND']}",
            popup=f"Accuracy: {row['Accuracy']}<br>Latitude: {row['Latitude']}<br>Longitude: {row['Longitude']}<br>DeviceTime: {row['DeviceTime']}",
            icon=folium.Icon(color='blue', icon='info-sign')
        ).add_to(m)

    # Add title
    title_html = '<h3 align="center" style="font-size:16px">All Locations Map</h3>'
    m.get_root().html.add_child(folium.Element(title_html))

    return m

In [11]:
visualize_all_locations(cleaned_df)

In [12]:
# Segment trip based on time gaps and movement
road_df['TimeDiff'] = road_df['DateTime'].diff().dt.total_seconds()
segments = []
current_segment = []
for index, row in road_df.iterrows():
    if not current_segment or (row['TimeDiff'] < 600 and row['Bearing'] != 0):
        current_segment.append((row['Latitude'], row['Longitude']))
    else:
        if len(current_segment) > 1:
            segments.append(current_segment)
        current_segment = [(row['Latitude'], row['Longitude'])]
if len(current_segment) > 1:
    segments.append(current_segment)

# Moderate downsampling (e.g., every 3rd point or distance-based)
def downsample_segment(segment, max_waypoints=20):
    if len(segment) <= max_waypoints:
        return segment
    step = max(1, len(segment) // max_waypoints)
    return segment[::step]

waypoints_per_segment = [downsample_segment(segment) for segment in segments]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  road_df['TimeDiff'] = road_df['DateTime'].diff().dt.total_seconds()


In [13]:
waypoints_per_segment

[[(6.89274, 80.03597),
  (6.89277, 80.03596),
  (6.89274, 80.03595),
  (6.89275, 80.03596),
  (6.89274, 80.03597),
  (6.89273, 80.03597),
  (6.89271, 80.03598),
  (6.8927, 80.03599),
  (6.89269, 80.03598),
  (6.89269, 80.03598),
  (6.89268, 80.03598),
  (6.89267, 80.03599),
  (6.89268, 80.036),
  (6.89268, 80.036),
  (6.89269, 80.036),
  (6.89269, 80.036),
  (6.89269, 80.036),
  (6.89269, 80.036),
  (6.89269, 80.036),
  (6.89269, 80.036),
  (6.89272, 80.03597),
  (6.89272, 80.03597),
  (6.89272, 80.03597)],
 [(6.89271, 80.03596),
  (6.89268, 80.03594),
  (6.8927, 80.03595),
  (6.89269, 80.03597),
  (6.89269, 80.036),
  (6.89269, 80.03601),
  (6.89269, 80.03601),
  (6.89268, 80.03598),
  (6.89267, 80.03598),
  (6.89267, 80.03598),
  (6.89268, 80.03599),
  (6.89272, 80.03599),
  (6.89272, 80.03597),
  (6.89274, 80.03596),
  (6.89276, 80.03597),
  (6.89278, 80.03597),
  (6.89279, 80.03596),
  (6.89279, 80.03596),
  (6.89279, 80.03596),
  (6.89279, 80.03592),
  (6.89287, 80.03589)],
 [(6.8

In [14]:
import folium
from folium.plugins import MarkerCluster
import random

In [15]:
# Initialize Folium map
m = folium.Map(zoom_start=10, tiles='OpenStreetMap')

# Add marker cluster for raw GPS points
marker_cluster = MarkerCluster().add_to(m)
for _, row in df.iterrows():
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=3,
        color='gray',
        fill=True,
        fill_color='gray',
        fill_opacity=0.5,
        popup=f"Raw GPS: {row['TimeStamp']}, Accuracy: {row.get('Accuracy', 'N/A')}"
    ).add_to(marker_cluster)

# Define colors for different segments
colors = ['blue', 'red', 'green', 'purple', 'orange', 'darkblue', 'darkred', 'darkgreen']

# Plot waypoints and polylines for each segment
for i, segment in enumerate(waypoints_per_segment):
    if len(segment) < 1:
        continue
    color = colors[i % len(colors)]
    
    # Add polyline for the segment
    folium.PolyLine(
        locations=segment,
        color=color,
        weight=5,
        opacity=0.7,
        popup=f"Segment {i+1}"
    ).add_to(m)
    
    # Add markers for waypoints
    for j, (lat, lon) in enumerate(segment):
        folium.Marker(
            location=[lat, lon],
            popup=f"Segment {i+1}, Waypoint {j+1}",
            icon=folium.Icon(color=color, icon='circle')
        ).add_to(m)

In [None]:
m

In [None]:
import googlemaps
from datetime import datetime

# Initialize Google Maps client
gmaps = googlemaps.Client(key='AIzaSyACpYMHnmkd8DWxWS3KTJ70EeKIRYN2xHM')

# Function to get road distance for a segment
def get_road_distance(waypoints):
    if len(waypoints) < 2:
        return 0
    try:
        directions = gmaps.directions(
            origin=waypoints[0],
            destination=waypoints[-1],
            waypoints=waypoints[1:-1] if len(waypoints) > 2 else None,
            mode='driving',
            departure_time=datetime.now()
        )
        distance = sum(leg['distance']['value'] for leg in directions[0]['legs']) / 1000  # Convert to km
        return distance
    except Exception as e:
        print(f"Error for waypoints {waypoints}: {e}")
        return 0

# Calculate total distance
total_distance = sum(get_road_distance(segment) for segment in waypoints_per_segment)
print(f"Total road distance: {total_distance:.2f} km")

NameError: name 'waypoints_per_segment' is not defined

In [None]:
# Further downsample if segment has too many waypoints
waypoints_per_segment = [segment[::10] if len(segment) > 20 else segment for segment in waypoints_per_segment]

In [None]:
import pandas as pd
import requests
import folium
import polyline
import time
from geopy.distance import geodesic
from math import radians, sin, cos, sqrt, atan2

# ======= CONFIGURATION =======
API_KEY = 'AIzaSyACpYMHnmkd8DWxWS3KTJ70EeKIRYN2xHM'  # Replace with your actual API key
FILE_PATH = 'data/Gps-Collection-2.csv'  # Path to your CSV file
ACCURACY_THRESHOLD = 100  # Max acceptable GPS accuracy (meters)
MIN_MOVE_DISTANCE_M = 10  # Minimum movement to keep points (meters)
MAX_SNAP_DISTANCE_M = 10 # Max snap-to-road distance (meters)
TIME_THRESHOLD_S = 60  # 1 minute for stop detection
DISTANCE_THRESHOLD_M = 10  # Max distance for stop clustering
TIME_GAP_THRESHOLD_S = 1800  # 30 minutes for trip segmentation
# =============================

# Step 1: Load and preprocess data
df = pd.read_csv(FILE_PATH)
df.columns = df.columns.str.strip()
df = df.dropna(subset=['Longitude', 'Latitude', 'TimeStamp'])
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%d/%m/%Y %I:%M:%S %p')
df = df.sort_values('DateTime')

# Detect outliers
def detect_outliers(df, max_distance_m=500):
    outliers = []
    prev_point = None
    for index, row in df.iterrows():
        point = (row['Latitude'], row['Longitude'])
        if prev_point is None:
            prev_point = point
            continue
        distance = geodesic(prev_point, point).meters
        if distance > max_distance_m:
            outliers.append(index)
        prev_point = point
    return outliers

outlier_indices = detect_outliers(df)
df = df[~df.index.isin(outlier_indices)]
print(f"Removed {len(outlier_indices)} outliers (jumps > 500m)")

# Step 2: Segment trips by time gaps
df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()
trip_segments = []
current_segment = [df.index[0]]
for i in range(1, len(df)):
    if df.iloc[i]['TimeDiff'] > TIME_GAP_THRESHOLD_S:
        trip_segments.append(df.loc[current_segment])
        current_segment = [df.index[i]]
    else:
        current_segment.append(df.index[i])
trip_segments.append(df.loc[current_segment])

# Filter segments and select the longest
def estimate_segment_distance(segment_df):
    if len(segment_df) < 2:
        return 0
    distance = 0
    for i in range(1, len(segment_df)):
        distance += geodesic(
            (segment_df.iloc[i-1]['Latitude'], segment_df.iloc[i-1]['Longitude']),
            (segment_df.iloc[i]['Latitude'], segment_df.iloc[i]['Longitude'])
        ).km
    return distance

target_segment = None
max_distance = 0
for segment in trip_segments:
    segment = segment[segment['Accuracy'] <= ACCURACY_THRESHOLD]
    if len(segment) < 2:
        continue
    seg_distance = estimate_segment_distance(segment)
    print(f"Segment from {segment['DateTime'].min()} to {segment['DateTime'].max()}: ~{seg_distance:.2f} km")
    if seg_distance > max_distance:
        max_distance = seg_distance
        target_segment = segment

# Check if a segment was selected
if target_segment is None or target_segment.empty:
    raise ValueError("No valid segments found. Check data integrity.")

df_filtered = target_segment
raw_points = list(zip(df_filtered['Latitude'], df_filtered['Longitude']))
print(f"Selected largest segment: {len(raw_points)} points (accuracy ≤ {ACCURACY_THRESHOLD}m)")

# Print start and end coordinates of the segment
print(f"Segment start: ({df_filtered.iloc[0]['Latitude']}, {df_filtered.iloc[0]['Longitude']})")
print(f"Segment end: ({df_filtered.iloc[-1]['Latitude']}, {df_filtered.iloc[-1]['Longitude']})")

# Step 3: Detect and exclude stops
def detect_stops(df, distance_threshold=10, time_threshold=60):
    stops = []
    prev_point = None
    stop_start = None
    stop_points = []
    for index, row in df.iterrows():
        point = (row['Latitude'], row['Longitude'])
        bearing = row['Bearing']
        accuracy = row['Accuracy']
        if prev_point is None:
            prev_point = point
            stop_start = row['DateTime']
            stop_points.append(index)
            continue
        distance = geodesic(prev_point, point).meters
        adjusted_threshold = max(distance_threshold, accuracy / 2)  # Adjust threshold by accuracy
        if distance < adjusted_threshold and (bearing == 0 or pd.isna(bearing)):
            stop_points.append(index)
        else:
            time_diff = (row['DateTime'] - stop_start).total_seconds()
            if time_diff >= time_threshold:
                stops.append({
                    'start_time': stop_start,
                    'end_time': row['DateTime'],
                    'indices': stop_points,
                    'location': prev_point
                })
            prev_point = point
            stop_start = row['DateTime']
            stop_points = [index]
    if stop_points and (df.iloc[-1]['DateTime'] - stop_start).total_seconds() >= time_threshold:
        stops.append({
            'start_time': stop_start,
            'end_time': df.iloc[-1]['DateTime'],
            'indices': stop_points,
            'location': prev_point
        })
    return stops

stops = detect_stops(df_filtered)
stop_indices = [idx for stop in stops for idx in stop['indices']]
road_df = df_filtered[~df_filtered.index.isin(stop_indices)]
road_points = list(zip(road_df['Latitude'], road_df['Longitude']))
print(f"Remaining after stop detection: {len(road_points)} points ({len(stop_indices)} stop points removed)")
# Debug stops
for stop in stops:
    print(f"Stop at {stop['location']} from {stop['start_time']} to {stop['end_time']}, {len(stop['indices'])} points")

# Step 4: Filter by minimum movement
def filter_by_distance(points, min_distance_m=8):
    if not points:
        return []
    filtered = [points[0]]
    for pt in points[1:]:
        if geodesic(filtered[-1], pt).meters >= min_distance_m:
            filtered.append(pt)
    return filtered

filtered_points = filter_by_distance(road_points, MIN_MOVE_DISTANCE_M)
print(f"Remaining after movement filtering: {len(filtered_points)} points")

# Step 5: Snap to nearest road
def snap_to_nearest_road_filtered(points, max_snap_distance_m=50):
    snapped = []
    for i in range(0, len(points), 100):
        batch = points[i:i+100]
        path = "|".join([f"{lat},{lng}" for lat, lng in batch])
        url = f"https://roads.googleapis.com/v1/snapToRoads?path={path}&interpolate=true&key={API_KEY}"
        try:
            r = requests.get(url)
            r.raise_for_status()
            data = r.json()
            if 'snappedPoints' in data:
                for p in data['snappedPoints']:
                    snapped_point = (p['location']['latitude'], p['location']['longitude'])
                    if 'originalIndex' in p:
                        idx = p['originalIndex']
                        original = batch[idx]
                        distance = geodesic(original, snapped_point).meters
                        if distance <= max_snap_distance_m:
                            snapped.append(snapped_point)
                    else:
                        snapped.append(snapped_point)  # Include interpolated points
            else:
                print(f"Snap error: {data.get('status')} - {data.get('error_message')}")
        except requests.RequestException as e:
            print(f"Snap request failed: {e}")
        time.sleep(0.1)
    return snapped

snapped_path = snap_to_nearest_road_filtered(filtered_points, MAX_SNAP_DISTANCE_M)
print(f"Snapped points within {MAX_SNAP_DISTANCE_M}m of roads: {len(snapped_path)}")

# Step 6: Calculate road distance using Directions API
def get_road_distance(points):
    total_km = 0
    for i in range(0, len(points), 24):
        segment = points[i:i+25]
        if len(segment) < 2:
            continue
        origin = segment[0]
        destination = segment[-1]
        waypoints = "|".join([f"via:{lat},{lng}" for lat, lng in segment[1:-1]]) if len(segment) > 2 else ""
        url = (
            f"https://maps.googleapis.com/maps/api/directions/json?"
            f"origin={origin[0]},{origin[1]}&destination={destination[0]},{destination[1]}"
            f"&waypoints={waypoints}&mode=driving&key={API_KEY}"
        )
        try:
            r = requests.get(url)
            r.raise_for_status()
            data = r.json()
            if data["status"] == "OK":
                for leg in data["routes"][0]["legs"]:
                    total_km += leg["distance"]["value"] / 1000
            else:
                print(f"Directions error: {data.get('status')} - {data.get('error_message')}")
        except requests.RequestException as e:
            print(f"Directions request failed: {e}")
        time.sleep(0.1)
    return total_km

total_km = get_road_distance(snapped_path)
print(f"✅ Total road distance: {total_km:.2f} km")

# Step 7: Visualize the route
def get_directions_geometry(points):
    route_coords = []
    for i in range(0, len(points), 24):
        segment = points[i:i+25]
        if len(segment) < 2:
            continue
        origin = segment[0]
        destination = segment[-1]
        waypoints = "|".join([f"via:{lat},{lng}" for lat, lng in segment[1:-1]]) if len(segment) > 2 else ""
        url = (
            f"https://maps.googleapis.com/maps/api/directions/json?"
            f"origin={origin[0]},{origin[1]}&destination={destination[0]},{destination[1]}"
            f"&waypoints={waypoints}&mode=driving&key={API_KEY}"
        )
        try:
            r = requests.get(url)
            r.raise_for_status()
            data = r.json()
            if data["status"] == "OK":
                poly = data["routes"][0]["overview_polyline"]["points"]
                route_coords.extend(polyline.decode(poly))
            else:
                print(f"Polyline error: {data.get('status')} - {data.get('error_message')}")
        except requests.RequestException as e:
            print(f"Polyline request failed: {e}")
        time.sleep(0.1)
    return route_coords

route_coords = get_directions_geometry(snapped_path)

# Plot map
m = folium.Map(location=route_coords[0] if route_coords else [6.13852, 80.10066], zoom_start=10)
if route_coords:
    folium.PolyLine(route_coords, color="blue", weight=5).add_to(m)
    folium.Marker(route_coords[0], tooltip="Start", icon=folium.Icon(color='green')).add_to(m)
    folium.Marker(route_coords[-1], tooltip="End", icon=folium.Icon(color='red')).add_to(m)
m.save("route_map.html")
print("Map saved as route_map.html")

# Step 8: Validate with corrected Haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth's radius in km
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

raw_distance = 0
for i in range(1, len(df_filtered)):
    dist = haversine_distance(
        df_filtered.iloc[i-1]['Latitude'], df_filtered.iloc[i-1]['Longitude'],
        df_filtered.iloc[i]['Latitude'], df_filtered.iloc[i]['Longitude']
    )
    if dist > 1:  # Log large jumps
        print(f"Large jump at index {i}: {dist:.2f} km")
    raw_distance += dist
print(f"Raw GPS distance (selected segment): {raw_distance:.2f} km")

road_raw_distance = 0
for i in range(1, len(road_df)):
    dist = haversine_distance(
        road_df.iloc[i-1]['Latitude'], road_df.iloc[i-1]['Longitude'],
        road_df.iloc[i]['Latitude'], road_df.iloc[i]['Longitude']
    )
    if dist > 1:
        print(f"Large jump in road points at index {i}: {dist:.2f} km")
    road_raw_distance += dist
print(f"Raw GPS distance (road points): {road_raw_distance:.2f} km")

# Direct start-to-end distance check
if snapped_path:
    start = snapped_path[0]
    end = snapped_path[-1]
    url = (
        f"https://maps.googleapis.com/maps/api/directions/json?"
        f"origin={start[0]},{start[1]}&destination={end[0]},{end[1]}"
        f"&mode=driving&key={API_KEY}"
    )
    try:
        r = requests.get(url)
        r.raise_for_status()
        data = r.json()
        if data["status"] == "OK":
            direct_distance = sum(leg["distance"]["value"] for leg in data["routes"][0]["legs"]) / 1000
            print(f"Direct road distance (start to end): {direct_distance:.2f} km")
        else:
            print(f"Direct distance error: {data.get('status')} - {data.get('error_message')}")
    except requests.RequestException as e:
        print(f"Direct distance request failed: {e}")

In [None]:
m