In [8]:
import pandas as pd
from keplergl import KeplerGl
import openrouteservice
from shapely.geometry import LineString
from datetime import datetime, timedelta
import time # For potential rate limiting
import os
import io

In [2]:
df = pd.read_csv('output.csv')

In [4]:
df.head(10)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435
5,id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,N,443
6,id1813257,1,2016-06-17 22:34:59,2016-06-17 22:40:40,4,-73.969017,40.757839,-73.957405,40.765896,N,341
7,id1324603,2,2016-05-21 07:54:58,2016-05-21 08:20:49,1,-73.969276,40.797779,-73.92247,40.760559,N,1551
8,id1301050,1,2016-05-27 23:12:23,2016-05-27 23:16:38,1,-73.999481,40.7384,-73.985786,40.732815,N,255
9,id0012891,2,2016-03-10 21:45:01,2016-03-10 22:05:26,1,-73.981049,40.744339,-73.973,40.789989,N,1225


In [5]:
df['path'] = df.apply(lambda row: [
    [row['pickup_longitude'], row['pickup_latitude']],
    [row['dropoff_longitude'], row['dropoff_latitude']]
], axis=1)

df['timestamp'] = pd.to_datetime(df['pickup_datetime']).astype(int) // 10**9  # Unix time

# Keep only necessary columns
df_final = df[['id', 'path', 'timestamp']]
df_final.to_json("kepler_routes.json", orient='records')

In [10]:
# --- CONFIGURATION ---
# IMPORTANT: Set your OpenRouteService API key as an environment variable
# export ORS_API_KEY='YOUR_API_KEY_HERE'
# OR, for quick testing (NOT recommended for shared code):
# ORS_API_KEY = 'YOUR_API_KEY_HERE'
ORS_API_KEY = '5b3ce3597851110001cf624810d1e1e3700e42bfa650e2a2f3cb5aa8'
if not ORS_API_KEY:
    raise ValueError("Please set the ORS_API_KEY environment variable or define it in the script.")

df = pd.read_csv('output.csv')
# If loading from a file:
# df = pd.read_csv('your_file.csv')

# Convert datetime columns
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

# For demonstration, let's process only a small subset to avoid hitting API limits quickly
# df = df.head(2) # Uncomment to test with fewer rows

# --- FUNCTION TO GET ROUTE AND INTERPOLATE TIMESTAMPS ---
def get_routed_trip_data(row):
    pickup_coords = (row['pickup_longitude'], row['pickup_latitude'])
    dropoff_coords = (row['dropoff_longitude'], row['dropoff_latitude'])

    try:
        # 1. Get route from ORS
        # ORS expects (longitude, latitude)
        routes = ors_client.directions(
            coordinates=[pickup_coords, dropoff_coords],
            profile='driving-car', # or 'cycling-regular', 'foot-walking', etc.
            format='geojson',
            instructions=False, # We don't need turn-by-turn instructions
            # geometry_simplify='true' # Can simplify geometry if needed
        )
        
        # Extract the coordinates from the GeoJSON response
        # The geometry is a LineString
        route_coords = routes['features'][0]['geometry']['coordinates']
        
        if not route_coords or len(route_coords) < 2:
            print(f"No valid route found for trip {row['id']}")
            return None, None

        # 2. Create Shapely LineString for the route
        route_geometry = LineString(route_coords)

        # 3. Interpolate timestamps along the route
        # We'll distribute the actual trip duration proportionally to the segment lengths
        
        pickup_time_obj = row['pickup_datetime']
        dropoff_time_obj = row['dropoff_datetime']
        actual_total_duration_seconds = (dropoff_time_obj - pickup_time_obj).total_seconds()

        if actual_total_duration_seconds <= 0: # Handle cases where duration is zero or negative
             print(f"Trip {row['id']} has zero or negative duration. Using simple start/end times.")
             return route_geometry, [
                 int(pickup_time_obj.timestamp() * 1000),
                 int(dropoff_time_obj.timestamp() * 1000)
             ]


        # Get segment durations from ORS if available (preferred for more realistic interpolation)
        # ORS can provide 'segments' with 'duration' and 'distance' for each leg.
        # For a simple two-point query, there's usually one segment in 'routes'
        # Let's use the overall route duration from ORS to proportion the *actual* trip duration.
        # Note: ORS returns duration in seconds.
        
        # We need the timestamps for *each vertex* in route_coords
        trip_timestamps_ms = []
        num_vertices = len(route_coords)
        
        # First timestamp is pickup time
        trip_timestamps_ms.append(int(pickup_time_obj.timestamp() * 1000))

        if num_vertices > 2:
            # The ORS response `routes['features'][0]['properties']['summary']['duration']` is the *estimated* travel time.
            # We want to use the *actual* trip duration from our data.
            # We'll assume constant speed along the *estimated route* to distribute the *actual duration*.
            # This involves calculating cumulative distance along the path.

            # To be more accurate, ORS provides 'segments' which might have 'steps'
            # and each step has a duration. We can use these to proportion.
            # For simplicity here, we'll do a simpler proportion based on vertex count,
            # assuming segments are roughly equal. A more advanced method would use segment distances.
            
            # Simple linear interpolation of time based on vertex index:
            for i in range(1, num_vertices - 1):
                fraction_of_trip = i / (num_vertices - 1)
                interpolated_time_obj = pickup_time_obj + timedelta(seconds=actual_total_duration_seconds * fraction_of_trip)
                trip_timestamps_ms.append(int(interpolated_time_obj.timestamp() * 1000))

        # Last timestamp is dropoff time (or very close to it if interpolated)
        trip_timestamps_ms.append(int(dropoff_time_obj.timestamp() * 1000))
        
        # Ensure timestamps are monotonically increasing (important for Kepler)
        for i in range(1, len(trip_timestamps_ms)):
            if trip_timestamps_ms[i] < trip_timestamps_ms[i-1]:
                trip_timestamps_ms[i] = trip_timestamps_ms[i-1] # Correct if slightly out of order

        return route_geometry, trip_timestamps_ms

    except openrouteservice.exceptions.ApiError as e:
        print(f"ORS API Error for trip {row['id']}: {e}")
        # Fallback: straight line
        straight_line = LineString([pickup_coords, dropoff_coords])
        timestamps = [
            int(row['pickup_datetime'].timestamp() * 1000),
            int(row['dropoff_datetime'].timestamp() * 1000)
        ]
        return straight_line, timestamps
    except Exception as e:
        print(f"Error processing trip {row['id']}: {e}")
        # Fallback: straight line
        straight_line = LineString([pickup_coords, dropoff_coords])
        timestamps = [
            int(row['pickup_datetime'].timestamp() * 1000),
            int(row['dropoff_datetime'].timestamp() * 1000)
        ]
        return straight_line, timestamps

# --- PROCESS DATA ---
processed_trips = []
for index, row in df.iterrows():
    print(f"Processing trip {row['id']} ({index+1}/{len(df)})...")
    route_geom, route_timestamps = get_routed_trip_data(row)
    
    if route_geom and route_timestamps:
        processed_trips.append({
            'id': row['id'],
            'vendor_id': row['vendor_id'],
            'pickup_datetime': row['pickup_datetime'], # Keep original for filtering
            'geometry': route_geom,          # This will be our LineString
            'trip_timestamps': route_timestamps # List of timestamps for each vertex
        })
    
    # Basic rate limiting to be kind to the ORS API (free tier allows 40 req/min)
    if (index + 1) % 30 == 0: # After every 30 requests
        print("Pausing for 60 seconds to respect API rate limits...")
        time.sleep(60)
    else:
        time.sleep(0.5) # Small pause between requests


df_kepler = pd.DataFrame(processed_trips)

if df_kepler.empty:
    print("No trips were successfully processed. Exiting.")
    exit()

# --- KEPLER.GL VISUALIZATION ---
map_1 = KeplerGl(height=700, data={'taxi_routes': df_kepler})

# Create a default config (Kepler.gl is good at auto-detecting Trip layers)
# Or, provide a more specific config:
min_time_ms = int(df_kepler['pickup_datetime'].min().timestamp() * 1000)
# max_time_ms = int(df_kepler['pickup_datetime'].max().timestamp() * 1000) # This might clip ongoing trips
# Better to use the max of all timestamps within trip_timestamps for the filter's upper bound
max_time_ms_list = []
for ts_list in df_kepler['trip_timestamps']:
    if ts_list:
        max_time_ms_list.append(max(ts_list))
max_time_ms = max(max_time_ms_list) if max_time_ms_list else min_time_ms + 3600000 # Default to 1 hour if empty

trip_layer_config = {
    'version': 'v1',
    'config': {
        'visState': {
            'layers': [{
                'id': 'routed_trip_layer',
                'type': 'trip',
                'config': {
                    'dataId': 'taxi_routes',
                    'label': 'Routed Taxi Trips',
                    'color': [255, 153, 31], # Orange
                    'columns': {'geojson': '_geojson'}, # Kepler will use the 'geometry' column
                    'isVisible': True,
                    'visualChannels': {},
                    'animation': {'enabled': True},
                    'trip': {
                        'trailLength': 300, # Adjust for desired trail length
                        'currentTime': None,
                        'opacity': 0.8,
                        'thickness': 3,
                        'color': [255, 153, 31],
                        'timestamps': 'trip_timestamps' # Crucial: our list of interpolated timestamps
                    }
                }
            }],
            'filters': [{
                'dataId': ['taxi_routes'],
                'id': 'time_filter_animate_routed',
                'name': 'pickup_datetime', # Filter based on when the trip starts
                'type': 'timeRange',
                'value': [min_time_ms, max_time_ms],
                'enlarged': True,
                'plotType': 'histogram',
                'animationWindow': 'incremental',
                'speed': 1
            }],
            'interactionConfig': {
                'tooltip': {
                    'fieldsToShow': {
                        'taxi_routes': [
                            {'name': 'id', 'format': None},
                            {'name': 'vendor_id', 'format': None},
                            {'name': 'pickup_datetime', 'format': 'YYYY-MM-DD HH:mm:ss'}
                        ]
                    },
                    'enabled': True
                }
            },
            'mapState': {
                'latitude': 40.75,
                'longitude': -73.98,
                'zoom': 11
            }
        }
    }
}

map_1.config = trip_layer_config

# Display the map
map_1

# Optional: Save to HTML
# map_1.save_to_html(file_name='routed_taxi_animation.html', config=map_1.config)
# print("Map saved to routed_taxi_animation.html")

Processing trip id2875421 (1/50)...
Error processing trip id2875421: name 'ors_client' is not defined
Processing trip id2377394 (2/50)...
Error processing trip id2377394: name 'ors_client' is not defined
Processing trip id3858529 (3/50)...
Error processing trip id3858529: name 'ors_client' is not defined
Processing trip id3504673 (4/50)...
Error processing trip id3504673: name 'ors_client' is not defined
Processing trip id2181028 (5/50)...
Error processing trip id2181028: name 'ors_client' is not defined
Processing trip id0801584 (6/50)...
Error processing trip id0801584: name 'ors_client' is not defined
Processing trip id1813257 (7/50)...
Error processing trip id1813257: name 'ors_client' is not defined
Processing trip id1324603 (8/50)...
Error processing trip id1324603: name 'ors_client' is not defined
Processing trip id1301050 (9/50)...
Error processing trip id1301050: name 'ors_client' is not defined
Processing trip id0012891 (10/50)...
Error processing trip id0012891: name 'ors_cl