In [1]:
# Function to add random variation to travel time
def add_variation(base_time, day_of_week, time_str, customer_id):
    # Add variation based on customer habit (some customers consistently take longer routes)
    # Fix the customer_id parsing to handle different prefixes (EE, EC, etc.)
    try:
        # Extract numeric part of customer ID
        numeric_part = ''.join(filter(str.isdigit, customer_id))
        if numeric_part:
            customer_factor = 0.9 + (float(int(numeric_part) % 20) / 100)
        else:
            # Fallback if no numeric part found
            customer_factor = 1.0
    except ValueError:
        # If any conversion error occurs, use default value
        customer_factor = 1.0
    
    # Calculate traffic factor based on time of day and day of week
    traffic_factor = calculate_traffic_factor(time_str, day_of_week)
    
    # Add some random variation (±10%)
    random_factor = random.uniform(0.9, 1.1)
    
    # Calculate final travel time
    final_time = base_time * traffic_factor * customer_factor * random_factor
    
    # Traffic condition labels
    if traffic_factor < 1.1:
        traffic_condition = "light"
    elif traffic_factor < 1.3:
        traffic_condition = "normal"
    elif traffic_factor < 1.5:
        traffic_condition = "moderate"
    else:
        traffic_condition = "heavy"
    
    return final_time, traffic_condition

In [12]:
import pandas as pd
import numpy as np
import requests
import datetime
import time
import random
import os
from math import radians, sin, cos, sqrt, atan2
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Constants
API_KEY = "your_openrouteservice_api_key"  # Replace with your OpenRouteService API key
BASE_URL = "https://api.openrouteservice.org/v2/directions/driving-car"
NUM_DAYS = 90  # 3 months of data
START_DATE = datetime.datetime(2025, 1, 6)  # Starting from January 6, 2025 (Monday)
TRAFFIC_PATTERNS = {
    0: {  # Monday
        "morning_peak": {"start": "07:30", "end": "09:30", "factor": 1.5},
        "evening_peak": {"start": "17:00", "end": "19:00", "factor": 1.6},
    },
    1: {  # Tuesday
        "morning_peak": {"start": "07:30", "end": "09:30", "factor": 1.4},
        "evening_peak": {"start": "17:00", "end": "19:00", "factor": 1.5},
    },
    2: {  # Wednesday
        "morning_peak": {"start": "07:30", "end": "09:30", "factor": 1.4},
        "evening_peak": {"start": "17:00", "end": "19:00", "factor": 1.5},
    },
    3: {  # Thursday
        "morning_peak": {"start": "07:30", "end": "09:30", "factor": 1.4},
        "evening_peak": {"start": "17:00", "end": "19:00", "factor": 1.6},
    },
    4: {  # Friday
        "morning_peak": {"start": "07:30", "end": "09:30", "factor": 1.3},
        "evening_peak": {"start": "16:00", "end": "19:00", "factor": 1.7},
    },
    5: {  # Saturday
        "morning_peak": {"start": "10:00", "end": "12:00", "factor": 1.2},
        "evening_peak": {"start": "16:00", "end": "18:00", "factor": 1.3},
    },
    6: {  # Sunday
        "morning_peak": {"start": "11:00", "end": "13:00", "factor": 1.1},
        "evening_peak": {"start": "17:00", "end": "19:00", "factor": 1.2},
    },
}

# File paths configuration
CUSTOMER_DATA_FILE = "CustomerAO2.csv"
TRIPS_OUTPUT_FILE = "trips_full.csv"
INCOMPLETE_TRIPS_FILE = "incomplete_trips.csv"
COMPLETED_TRIPS_FILE = "trips_completed.csv"

# Function to calculate distance between two lat-long points using Haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [float(lat1), float(lon1), float(lat2), float(lon2)])
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    distance = 6371 * c  # Radius of Earth in kilometers
    
    return distance

# Function to get typical travel time using OpenRouteService API
def get_travel_time(home_lat, home_lon, office_lat, office_lon):
    retry_count = 0
    max_retries = 3
    backoff_factor = 2
    
    while retry_count < max_retries:
        try:
            # For actual API integration, uncomment the following code
            # headers = {"Authorization": API_KEY}
            # params = {
            #     "start": f"{home_lon},{home_lat}",
            #     "end": f"{office_lon},{office_lat}"
            # }
            # response = requests.get(BASE_URL, headers=headers, params=params, timeout=10)
            # response.raise_for_status()
            # data = response.json()
            # travel_time_seconds = data["routes"][0]["summary"]["duration"]
            
            # For this example, we'll simulate the API response using the Haversine distance
            distance = calculate_distance(home_lat, home_lon, office_lat, office_lon)
            # Assuming an average speed of 30km/h in city traffic
            travel_time_seconds = (distance / 30) * 3600
            
            return travel_time_seconds
        except Exception as e:
            print(f"Error fetching travel time: {e}")
            retry_count += 1
            if retry_count < max_retries:
                time.sleep(backoff_factor ** retry_count)
            else:
                # Return an estimated time based on direct distance as fallback
                distance = calculate_distance(home_lat, home_lon, office_lat, office_lon)
                return (distance / 30) * 3600  # Assuming 30km/h average speed

# Function to calculate traffic factor based on time and day of week
def calculate_traffic_factor(time_str, day_of_week):
    hour, minute = map(int, time_str.split(':'))
    current_time = hour * 60 + minute  # Convert to minutes
    
    day_pattern = TRAFFIC_PATTERNS[day_of_week]
    
    # Check if current time is in morning peak
    # Fix: Properly convert time string to minutes
    morning_hour, morning_minute = map(int, day_pattern["morning_peak"]["start"].split(':'))
    morning_start = morning_hour * 60 + morning_minute
    
    morning_hour_end, morning_minute_end = map(int, day_pattern["morning_peak"]["end"].split(':'))
    morning_end = morning_hour_end * 60 + morning_minute_end
    
    if morning_start <= current_time <= morning_end:
        # Calculate gradual factor increase/decrease near peak hours
        peak_center = (morning_start + morning_end) / 2
        distance_from_center = abs(current_time - peak_center)
        peak_duration = (morning_end - morning_start) / 2
        
        if distance_from_center <= peak_duration:
            return day_pattern["morning_peak"]["factor"] * (1 - 0.5 * distance_from_center / peak_duration)
    
    # Check if current time is in evening peak
    # Fix: Properly convert time string to minutes
    evening_hour, evening_minute = map(int, day_pattern["evening_peak"]["start"].split(':'))
    evening_start = evening_hour * 60 + evening_minute
    
    evening_hour_end, evening_minute_end = map(int, day_pattern["evening_peak"]["end"].split(':'))
    evening_end = evening_hour_end * 60 + evening_minute_end
    
    if evening_start <= current_time <= evening_end:
        # Calculate gradual factor increase/decrease near peak hours
        peak_center = (evening_start + evening_end) / 2
        distance_from_center = abs(current_time - peak_center)
        peak_duration = (evening_end - evening_start) / 2
        
        if distance_from_center <= peak_duration:
            return day_pattern["evening_peak"]["factor"] * (1 - 0.5 * distance_from_center / peak_duration)
    
    # Base traffic outside peak hours (varies slightly by time of day)
    base_factor = 1.0
    if 0 <= hour < 6:  # Late night/early morning
        base_factor = 0.8
    elif 10 <= hour < 16:  # Midday
        base_factor = 1.1
    elif 20 <= hour <= 23:  # Evening
        base_factor = 0.9
        
    return base_factor

# Function to add random variation to travel time
# Function to add random variation to travel time
def add_variation(base_time, day_of_week, time_str, customer_id):
    # Add variation based on customer habit (some customers consistently take longer routes)
    # Fix the customer_id parsing to handle different prefixes (EE, EC, etc.)
    try:
        # Extract numeric part of customer ID
        numeric_part = ''.join(filter(str.isdigit, customer_id))
        if numeric_part:
            customer_factor = 0.9 + (float(int(numeric_part) % 20) / 100)
        else:
            # Fallback if no numeric part found
            customer_factor = 1.0
    except ValueError:
        # If any conversion error occurs, use default value
        customer_factor = 1.0
    
    # Calculate traffic factor based on time of day and day of week
    traffic_factor = calculate_traffic_factor(time_str, day_of_week)
    
    # Add some random variation (±10%)
    random_factor = random.uniform(0.9, 1.1)
    
    # Calculate final travel time
    final_time = base_time * traffic_factor * customer_factor * random_factor
    
    # Traffic condition labels
    if traffic_factor < 1.1:
        traffic_condition = "light"
    elif traffic_factor < 1.3:
        traffic_condition = "normal"
    elif traffic_factor < 1.5:
        traffic_condition = "moderate"
    else:
        traffic_condition = "heavy"
    
    return final_time, traffic_condition

# Function to generate a complete dataset
def generate_dataset(customer_data):
    all_trips = []
    trip_id = 1
    
    # Loop through each day in the 3-month period
    current_date = START_DATE
    for _ in range(NUM_DAYS):
        date_str = current_date.strftime("%d-%m-%Y")
        day_of_week = current_date.weekday()  # Monday=0, Sunday=6
        day_name = current_date.strftime("%A")
        
        # For each customer
        for _, customer in customer_data.iterrows():
            # Handle different column naming in different files
            if "cust_id" in customer:
                customer_id = customer["cust_id"]
            elif "Customer_ID" in customer:
                customer_id = customer["Customer_ID"]
            else:
                raise ValueError("Could not find customer ID column")
                
            home_lat = customer["home_lat"]
            home_lon = customer["home_lon"]
            office_lat = customer["office_lat"]
            office_lon = customer["office_lon"]
            
            # Skip weekends for some customers (70% probability)
            if day_of_week >= 5 and random.random() < 0.7:
                continue
                
            # Morning trip (home to office)
            if random.random() < 0.95:  # 95% chance of going to office
                # Generate departure time (more likely during morning hours)
                if day_of_week < 5:  # Weekday
                    hour = random.choices([7, 8, 9, 10], weights=[0.3, 0.4, 0.2, 0.1])[0]
                else:  # Weekend
                    hour = random.choices([9, 10, 11, 12], weights=[0.2, 0.3, 0.3, 0.2])[0]
                
                minute = random.randint(0, 59)
                departure_time = f"{hour:02d}:{minute:02d}"
                
                # Get base travel time from API or calculation
                base_time_seconds = get_travel_time(home_lat, home_lon, office_lat, office_lon)
                base_time_minutes = base_time_seconds / 60
                
                # Add variation based on time, day, etc.
                final_time_minutes, traffic_condition = add_variation(
                    base_time_minutes, day_of_week, departure_time, customer_id
                )
                
                # Calculate arrival time
                departure_datetime = datetime.datetime.strptime(departure_time, "%H:%M")
                arrival_datetime = departure_datetime + datetime.timedelta(minutes=final_time_minutes)
                arrival_time = arrival_datetime.strftime("%H:%M")
                
                # Create trip record
                trip = {
                    "Trip_ID": f"TRIP{trip_id:04d}",
                    "Customer_ID": customer_id,
                    "Date": date_str,
                    "Day_of_Week": day_name,
                    "Direction": 0,  # 0 for home to office
                    "Departure_Time": departure_time,
                    "Arrival_Time": arrival_time,
                    "Duration_min": round(final_time_minutes, 1),
                    "traffic_conditions": traffic_condition
                }
                all_trips.append(trip)
                trip_id += 1
                
                # Evening trip (office to home)
                if random.random() < 0.98:  # 98% chance of returning home if they went to office
                    # Generate departure time (more likely during evening hours)
                    if day_of_week < 5:  # Weekday
                        hour = random.choices([16, 17, 18, 19, 20], weights=[0.1, 0.3, 0.3, 0.2, 0.1])[0]
                    else:  # Weekend
                        hour = random.choices([15, 16, 17, 18], weights=[0.3, 0.3, 0.3, 0.1])[0]
                    
                    minute = random.randint(0, 59)
                    departure_time = f"{hour:02d}:{minute:02d}"
                    
                    # For return journey (slightly different characteristics)
                    base_time_seconds = get_travel_time(office_lat, office_lon, home_lat, home_lon)
                    base_time_minutes = base_time_seconds / 60
                    
                    # Add variation based on time, day, etc.
                    final_time_minutes, traffic_condition = add_variation(
                        base_time_minutes, day_of_week, departure_time, customer_id
                    )
                    
                    # Calculate arrival time
                    departure_datetime = datetime.datetime.strptime(departure_time, "%H:%M")
                    arrival_datetime = departure_datetime + datetime.timedelta(minutes=final_time_minutes)
                    arrival_time = arrival_datetime.strftime("%H:%M")
                    
                    # Create trip record
                    trip = {
                        "Trip_ID": f"TRIP{trip_id:04d}",
                        "Customer_ID": customer_id,
                        "Date": date_str,
                        "Day_of_Week": day_name,
                        "Direction": 1,  # 1 for office to home
                        "Departure_Time": departure_time,
                        "Arrival_Time": arrival_time,
                        "Duration_min": round(final_time_minutes, 1),
                        "traffic_conditions": traffic_condition
                    }
                    all_trips.append(trip)
                    trip_id += 1
        
        # Move to next day
        current_date += datetime.timedelta(days=1)
    
    return pd.DataFrame(all_trips)

# Function to train the prediction model
def train_prediction_model(trip_data):
    # Prepare features
    features = trip_data.copy()
    
    # Extract hour and minute from departure time
    features['departure_hour'] = features['Departure_Time'].apply(lambda x: int(x.split(':')[0]))
    features['departure_minute'] = features['Departure_Time'].apply(lambda x: int(x.split(':')[1]))
    
    # Convert day of week to numeric
    day_mapping = {
        'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
        'Friday': 4, 'Saturday': 5, 'Sunday': 6
    }
    features['day_numeric'] = features['Day_of_Week'].map(day_mapping)
    
    # One-hot encode direction
    features['direction_to_office'] = (features['Direction'] == 0).astype(int)
    features['direction_to_home'] = (features['Direction'] == 1).astype(int)
    
    # One-hot encode traffic conditions
    traffic_dummies = pd.get_dummies(features['traffic_conditions'], prefix='traffic')
    features = pd.concat([features, traffic_dummies], axis=1)
    
    # Select features for the model
    X = features[['departure_hour', 'departure_minute', 'day_numeric', 
                 'direction_to_office', 'direction_to_home',
                 'traffic_light', 'traffic_normal', 'traffic_moderate', 'traffic_heavy']]
    
    y = features['Duration_min']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a RandomForest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Model Mean Absolute Error: {mae:.2f} minutes")
    print(f"Model R² Score: {r2:.4f}")
    
    return model

# Function to predict arrival time and duration for a new trip
def predict_trip(model, customer_data, trip_info):
    # Extract trip information
    customer_id = trip_info['Customer_ID']
    date_str = trip_info['Date']
    day_of_week = trip_info['Day_of_Week']
    direction = trip_info['Direction']
    departure_time = trip_info['Departure_Time']
    
    # Create feature vector
    departure_hour = int(departure_time.split(':')[0])
    departure_minute = int(departure_time.split(':')[1])
    
    day_mapping = {
        'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
        'Friday': 4, 'Saturday': 5, 'Sunday': 6
    }
    day_numeric = day_mapping[day_of_week]
    
    # Determine traffic conditions based on time and day
    traffic_factor = calculate_traffic_factor(departure_time, day_numeric)
    
    if traffic_factor < 1.1:
        traffic_condition = "light"
    elif traffic_factor < 1.3:
        traffic_condition = "normal"
    elif traffic_factor < 1.5:
        traffic_condition = "moderate"
    else:
        traffic_condition = "heavy"
    
    # Create one-hot encoded features
    traffic_light = 1 if traffic_condition == "light" else 0
    traffic_normal = 1 if traffic_condition == "normal" else 0
    traffic_moderate = 1 if traffic_condition == "moderate" else 0
    traffic_heavy = 1 if traffic_condition == "heavy" else 0
    
    direction_to_office = 1 if direction == 0 else 0
    direction_to_home = 1 if direction == 1 else 0
    
    # Create feature vector
    features = [[
        departure_hour, departure_minute, day_numeric,
        direction_to_office, direction_to_home,
        traffic_light, traffic_normal, traffic_moderate, traffic_heavy
    ]]
    
    # Make prediction
    predicted_duration = model.predict(features)[0]
    
    # Calculate arrival time
    departure_datetime = datetime.datetime.strptime(departure_time, "%H:%M")
    arrival_datetime = departure_datetime + datetime.timedelta(minutes=predicted_duration)
    arrival_time = arrival_datetime.strftime("%H:%M")
    
    return {
        'Arrival_Time': arrival_time,
        'Duration_min': round(predicted_duration, 1),
        'traffic_conditions': traffic_condition
    }

# Save model for future use
def save_model(model, filename="travel_time_model.joblib"):
    import joblib
    joblib.dump(model, filename)
    print(f"Model saved to {filename}")

# Main function
def main():
    try:
        # Load customer data
        try:
            # Try the first customer data file name
            customer_data = pd.read_csv(CUSTOMER_DATA_FILE)
            print(f"Successfully loaded customer data from {CUSTOMER_DATA_FILE}")
        except FileNotFoundError:
            # Try alternative file name
            alt_file = "customer_ao.csv"
            customer_data = pd.read_csv(alt_file)
            print(f"Successfully loaded customer data from {alt_file}")
        
        # Generate dataset
        print("Generating trip dataset...")
        trips_df = generate_dataset(customer_data)
        
        # Save to CSV
        trips_df.to_csv(TRIPS_OUTPUT_FILE, index=False)
        print(f"Generated {len(trips_df)} trip records and saved to {TRIPS_OUTPUT_FILE}")
        
        # Train prediction model
        print("Training prediction model...")
        model = train_prediction_model(trips_df)
        
        # Save the model for future use
        save_model(model)
        
        # Complete the incomplete trips data
        try:
            incomplete_trips = pd.read_csv(INCOMPLETE_TRIPS_FILE)
            
            # Fill in missing data
            for idx, trip in incomplete_trips.iterrows():
                if pd.isna(trip['Arrival_Time']) or pd.isna(trip['Duration_min']) or pd.isna(trip['traffic_conditions']):
                    predictions = predict_trip(model, customer_data, trip)
                    
                    incomplete_trips.at[idx, 'Arrival_Time'] = predictions['Arrival_Time']
                    incomplete_trips.at[idx, 'Duration_min'] = predictions['Duration_min']
                    incomplete_trips.at[idx, 'traffic_conditions'] = predictions['traffic_conditions']
            
            # Save completed trips
            incomplete_trips.to_csv(COMPLETED_TRIPS_FILE, index=False)
            print(f"Completed missing trip data and saved to {COMPLETED_TRIPS_FILE}")
        except FileNotFoundError:
            print(f"No incomplete trips file found at {INCOMPLETE_TRIPS_FILE}")
            
    except Exception as e:
        print(f"Error in main function: {e}")

if __name__ == "__main__":
    main()

Successfully loaded customer data from CustomerAO2.csv
Generating trip dataset...
Generated 6826 trip records and saved to trips_full.csv
Training prediction model...
Model Mean Absolute Error: 12.28 minutes
Model R² Score: -0.2129
Model saved to travel_time_model.joblib
Error in main function: No columns to parse from file
