In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Function to calculate distance between two coordinate points using Haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Uses the Haversine formula on a sphere of radius 6371 km.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance in kilometres as a float.
    """
    earth_radius_km = 6371

    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2

    # Rounding error can push `a` marginally above 1 for (near-)antipodal
    # points, which would make asin raise a domain error. Only the upper
    # bound is capped so that NaN input still propagates as NaN.
    if a > 1.0:
        a = 1.0

    c = 2 * math.asin(math.sqrt(a))
    return c * earth_radius_km

# Function to convert time string to minutes since midnight
def time_to_minutes(time_str):
    """Convert a clock-time string to whole minutes since midnight.

    Args:
        time_str: A string in ``HH:MM:SS`` or ``HH:MM`` form. Leading and
            trailing whitespace is ignored.

    Returns:
        ``hour * 60 + minute`` as an int; seconds are discarded. Any
        non-string input (e.g. ``None`` or a number) yields 0.

    Raises:
        ValueError: If a string matches neither supported format.
    """
    if not isinstance(time_str, str):
        # Lenient contract: non-string input maps to 0 instead of raising.
        return 0

    # Values read from CSV files often carry stray padding.
    cleaned = time_str.strip()

    for fmt in ('%H:%M:%S', '%H:%M'):
        try:
            parsed = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        return parsed.hour * 60 + parsed.minute

    raise ValueError(
        f"time {time_str!r} does not match 'HH:MM:SS' or 'HH:MM'"
    )

# Function to convert minutes to time string
def minutes_to_time(minutes):
    """Format a minute count as a zero-padded ``HH:MM`` string.

    Args:
        minutes: Minutes since midnight. Fractional values (for example a
            float produced by arithmetic) are truncated to whole minutes.

    Returns:
        The ``HH:MM`` string. The hour is not wrapped at 24, so 1500
        minutes gives ``"25:00"``.
    """
    # The ``d`` format code rejects floats, so coerce to int before formatting.
    hours, mins = divmod(int(minutes), 60)
    return f"{hours:02d}:{mins:02d}"

# Function to calculate arrival time based on departure and locations
def calculate_trip_duration(home_lat, home_lon, office_lat, office_lon, departure_time, day_of_week):
    """Estimate a trip's duration and traffic label from simple rules.

    The straight-line distance between the two points is scaled by 1.3 to
    approximate road distance, divided by a speed chosen from the departure
    hour, and finally scaled by a day-of-week factor.

    Args:
        home_lat, home_lon: Coordinates of the trip origin, in degrees.
        office_lat, office_lon: Coordinates of the trip destination, in degrees.
        departure_time: Either a time string understood by
            ``time_to_minutes`` or a number of minutes since midnight.
        day_of_week: Day name; "Monday" and "Friday" lengthen the trip,
            every other value leaves it unchanged.

    Returns:
        A ``(duration_minutes, traffic_condition)`` tuple where the duration
        is truncated to an int and the label is "Heavy", "Moderate" or
        "Light".
    """
    # Straight-line distance, inflated because roads are not straight lines
    straight_line_km = calculate_distance(home_lat, home_lon, office_lat, office_lon)
    road_distance = straight_line_km * 1.3

    # Strings are parsed; anything else is taken to be minutes already
    departure_minutes = (
        time_to_minutes(departure_time)
        if isinstance(departure_time, str)
        else departure_time
    )
    departure_hour = departure_minutes // 60

    # Speed (km/h) and traffic label for the departure hour
    if 17 <= departure_hour < 20:  # Evening peak
        base_speed, traffic_condition = 18, "Heavy"
    elif 8 <= departure_hour < 10:  # Morning peak
        base_speed = 20
        traffic_condition = "Moderate" if departure_hour < 9 else "Heavy"
    else:  # Off-peak
        base_speed, traffic_condition = 30, "Light"

    base_duration = (road_distance / base_speed) * 60

    # Mondays and Fridays are slower than mid-week days
    day_factor = 1.1 if day_of_week == "Monday" else 1.15 if day_of_week == "Friday" else 1.0

    # Truncate to whole minutes
    return int(base_duration * day_factor), traffic_condition

# Load customer data from CSV
def load_customer_data(file_path):
    """Read the customer CSV at *file_path* and return it as a DataFrame."""
    customers = pd.read_csv(file_path)
    return customers

# Generate 3 months of trip data for all customers
def generate_trip_data(customers_df, start_date=None, end_date=None, seed=None):
    """Generate synthetic weekday commute trips for every customer.

    For each customer one typical morning departure (hour 7-9) and one
    typical evening departure (hour 17-19) is drawn at random. Every
    Monday-Friday in the date range then gets a Home→Office and an
    Office→Home trip whose departure is the typical time shifted by a random
    -15..+15 minutes. Durations come from ``calculate_trip_duration``.

    Args:
        customers_df: DataFrame with the columns ``Customer_ID``,
            ``home_lat``, ``home_lon``, ``office_lat`` and ``office_lon``.
        start_date: First ``datetime`` to generate trips for. Defaults to
            2025-01-06.
        end_date: Last ``datetime`` (inclusive). Defaults to 2025-03-31.
        seed: Optional seed for a private ``numpy.random.RandomState`` so the
            output is reproducible. When ``None`` the global ``numpy.random``
            state is used.

    Returns:
        A DataFrame with one row per trip. The columns are always present,
        even when ``customers_df`` is empty.
    """
    columns = [
        "Trip_ID", "Customer_ID", "Date", "Day_of_Week", "Direction",
        "Departure_Time", "Arrival_Time", "Duration_min", "Traffic_Condition",
    ]
    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]

    if start_date is None:
        start_date = datetime(2025, 1, 6)  # First Monday of January 2025
    if end_date is None:
        end_date = datetime(2025, 3, 31)

    # Same randint() API either way; a seed only swaps in a private generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Every Monday-Friday between start_date and end_date (inclusive)
    day_count = (end_date - start_date).days + 1
    candidate_days = (start_date + timedelta(days=offset) for offset in range(day_count))
    weekdays = [day for day in candidate_days if day.weekday() < 5]

    trip_data = []
    trip_id = 1

    for _, customer in customers_df.iterrows():
        # Look the row values up once rather than once per generated trip.
        customer_id = customer["Customer_ID"]
        home = (customer["home_lat"], customer["home_lon"])
        office = (customer["office_lat"], customer["office_lon"])

        # Typical departure times for this customer (varied slightly per day)
        morning_base = datetime.strptime(
            f"{rng.randint(7, 10):02d}:{rng.randint(0, 60):02d}:00", "%H:%M:%S"
        ).time()
        evening_base = datetime.strptime(
            f"{rng.randint(17, 20):02d}:{rng.randint(0, 60):02d}:00", "%H:%M:%S"
        ).time()

        legs = (
            ("Home→Office", morning_base, home, office),
            ("Office→Home", evening_base, office, home),
        )

        for date in weekdays:
            day_of_week = day_names[date.weekday()]
            for direction, base_time, origin, destination in legs:
                trip_data.append(_build_trip_record(
                    trip_id, customer_id, date, day_of_week,
                    direction, base_time, origin, destination, rng,
                ))
                trip_id += 1

    return pd.DataFrame(trip_data, columns=columns)


def _build_trip_record(trip_id, customer_id, date, day_of_week, direction,
                       base_time, origin, destination, rng):
    """Build the record for one trip, jittering its departure by ±15 minutes."""
    variation = int(rng.randint(-15, 16))  # ±15 minutes
    departure = datetime.combine(date, base_time) + timedelta(minutes=variation)
    departure_str = departure.strftime("%H:%M:%S")

    duration, traffic = calculate_trip_duration(
        origin[0], origin[1],
        destination[0], destination[1],
        departure_str, day_of_week
    )
    arrival = departure + timedelta(minutes=duration)

    return {
        "Trip_ID": f"TRIP{trip_id:05d}",
        "Customer_ID": customer_id,
        "Date": date.strftime("%Y-%m-%d"),
        "Day_of_Week": day_of_week,
        "Direction": direction,
        "Departure_Time": departure_str,
        "Arrival_Time": arrival.strftime("%H:%M:%S"),
        "Duration_min": duration,
        "Traffic_Condition": traffic,
    }

# Feature engineering for the prediction model
def prepare_features(trips_df, customer_df):
    """Join trips with customer locations and derive numeric model features.

    Args:
        trips_df: Trip records containing ``Customer_ID``, ``Day_of_Week``,
            ``Departure_Time`` and ``Direction`` columns.
        customer_df: Customer records containing ``Customer_ID`` and the
            ``home_lat``/``home_lon``/``office_lat``/``office_lon`` columns.

    Returns:
        The merged DataFrame with these columns added: ``Day_Numeric``,
        ``Departure_Minutes``, ``Departure_Hour``, ``Time_Of_Day``,
        ``Time_Of_Day_Numeric``, ``Distance`` and ``Direction_Numeric``.
    """
    weekday_codes = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4}
    period_codes = {'Morning': 0, 'Midday': 1, 'Evening': 2}

    def label_period(hour):
        # 05-11 is Morning, 17-22 is Evening, every other hour is Midday
        if 5 <= hour < 12:
            return 'Morning'
        if 17 <= hour < 23:
            return 'Evening'
        return 'Midday'

    def trip_distance(row):
        # Measure from the trip's origin to its destination
        if row['Direction'] == 'Home→Office':
            return calculate_distance(
                row['home_lat'], row['home_lon'],
                row['office_lat'], row['office_lon']
            )
        return calculate_distance(
            row['office_lat'], row['office_lon'],
            row['home_lat'], row['home_lon']
        )

    def encode_direction(direction):
        return 0 if direction == 'Home→Office' else 1

    # Attach each customer's coordinates to their trips
    df = pd.merge(trips_df, customer_df, on='Customer_ID')

    # Day name -> 0..4
    df['Day_Numeric'] = df['Day_of_Week'].map(weekday_codes)

    # Departure as minutes since midnight, and the hour it falls in
    df['Departure_Minutes'] = df['Departure_Time'].apply(time_to_minutes)
    df['Departure_Hour'] = df['Departure_Minutes'] // 60

    # Coarse period of the day, as a label and as a number
    df['Time_Of_Day'] = df['Departure_Hour'].apply(label_period)
    df['Time_Of_Day_Numeric'] = df['Time_Of_Day'].map(period_codes)

    # Straight-line distance for the trip
    df['Distance'] = df.apply(trip_distance, axis=1)

    # Home→Office is 0, anything else is 1
    df['Direction_Numeric'] = df['Direction'].apply(encode_direction)

    return df

# Train ride time prediction model
def train_prediction_model(prepared_data):
    """Train a random-forest duration model and print hold-out metrics.

    The data is split 80/20 (``random_state=42``), a 100-tree
    ``RandomForestRegressor`` is fitted on the training part, and MAE, RMSE,
    R² and the share of predictions within 5% / 10% of the actual duration
    are printed for the test part.

    Args:
        prepared_data: DataFrame containing the feature columns listed in
            ``features`` below plus the ``Duration_min`` target.

    Returns:
        A ``(model, features)`` tuple: the fitted regressor and the list of
        feature column names it was trained on.
    """
    # Features to use for prediction
    features = [
        'Distance', 'Departure_Minutes', 'Day_Numeric',
        'Time_Of_Day_Numeric', 'Direction_Numeric'
    ]

    X = prepared_data[features]
    y = prepared_data['Duration_min']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on test set
    y_pred = model.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Share of predictions whose error is within 5% / 10% of the actual duration
    within_5_percent = _percent_within_tolerance(y_test, y_pred, 0.05)
    within_10_percent = _percent_within_tolerance(y_test, y_pred, 0.10)

    print("Model Performance:")
    print(f"MAE: {mae:.2f} minutes")
    print(f"RMSE: {rmse:.2f} minutes")
    print(f"R² Score: {r2:.4f}")
    print(f"Predictions within 5% of actual: {within_5_percent:.2f}%")
    print(f"Predictions within 10% of actual: {within_10_percent:.2f}%")

    return model, features


def _percent_within_tolerance(actual, predicted, tolerance):
    """Return the percentage of predictions within ``tolerance`` of actual."""
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    # Division-free form of |error| / actual <= tolerance: a zero-minute trip
    # cannot cause a divide-by-zero and only counts as a hit when predicted
    # exactly.
    hits = np.abs(predicted - actual) <= tolerance * np.abs(actual)
    return np.mean(hits) * 100

# Function to predict ride duration for a new trip
def predict_ride_duration(model, features_list, customer_id, home_lat, home_lon, office_lat, office_lon, 
                          departure_time, day_of_week, direction):
    """Predict the duration and arrival time of a single trip.

    Args:
        model: Fitted regressor exposing ``predict``.
        features_list: Feature column names in the order the model was
            trained on; used to order the input columns. May be ``None`` or
            empty, in which case the built-in column order is used.
        customer_id: Identifier of the customer (not used in the
            calculation).
        home_lat, home_lon: Home coordinates, in degrees.
        office_lat, office_lon: Office coordinates, in degrees.
        departure_time: Departure time as an ``HH:MM:SS`` or ``HH:MM`` string.
        day_of_week: Day name; names other than Monday-Friday are encoded
            as 0, the same code as Monday.
        direction: ``'Home→Office'``; any other value is treated as
            Office→Home.

    Returns:
        A ``(predicted_duration, arrival_time)`` tuple: the prediction
        truncated to whole minutes and the arrival time as ``HH:MM:SS``.

    Raises:
        ValueError: If ``departure_time`` matches neither supported format.
    """
    # Calculate distance
    if direction == 'Home→Office':
        distance = calculate_distance(home_lat, home_lon, office_lat, office_lon)
    else:  # Office→Home
        distance = calculate_distance(office_lat, office_lon, home_lat, home_lon)

    # Convert departure time to minutes
    departure_minutes = time_to_minutes(departure_time)

    # Get departure hour
    departure_hour = departure_minutes // 60

    # Time of day category: 0 = Morning, 2 = Evening, 1 = Midday
    time_of_day = 0 if 5 <= departure_hour < 12 else 2 if 17 <= departure_hour < 23 else 1

    # Convert day of week to numeric
    day_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4}
    day_numeric = day_mapping.get(day_of_week, 0)

    # Direction as numeric
    direction_numeric = 0 if direction == 'Home→Office' else 1

    # Create a DataFrame with the new data point
    new_data = pd.DataFrame({
        'Distance': [distance],
        'Departure_Minutes': [departure_minutes],
        'Day_Numeric': [day_numeric],
        'Time_Of_Day_Numeric': [time_of_day],
        'Direction_Numeric': [direction_numeric]
    })

    # Present the columns in the order the model was trained on.
    if features_list is not None and len(features_list) > 0:
        new_data = new_data[list(features_list)]

    # Make the prediction
    predicted_duration = int(model.predict(new_data)[0])

    # Parse the departure with the same formats time_to_minutes accepts, so
    # an "HH:MM" input does not fail after the prediction has been made.
    clock = departure_time.strip() if isinstance(departure_time, str) else departure_time
    try:
        departure_datetime = datetime.strptime(clock, '%H:%M:%S')
    except ValueError:
        departure_datetime = datetime.strptime(clock, '%H:%M')

    # Calculate arrival time
    arrival_datetime = departure_datetime + timedelta(minutes=predicted_duration)
    arrival_time = arrival_datetime.strftime('%H:%M:%S')

    return predicted_duration, arrival_time

# Main function to run the application
def main():
    """Run the demo pipeline end to end.

    Loads ``CustomerAO2.csv``, generates synthetic trips and saves them to
    ``bangalore_trips_data.csv``, trains the duration model, and prints an
    example prediction for customer ``EE01``. If that customer is not in the
    file, the first customer in the file is used instead.
    """
    # Load customer data
    customer_df = load_customer_data('CustomerAO2.csv')

    print(f"Loaded {len(customer_df)} customer records")

    # Generate trip data
    trips_df = generate_trip_data(customer_df)

    print(f"Generated {len(trips_df)} trip records")

    # Save the generated data to CSV (optional)
    trips_df.to_csv('bangalore_trips_data.csv', index=False)
    print("Saved trip data to bangalore_trips_data.csv")

    # Prepare features for model
    prepared_data = prepare_features(trips_df, customer_df)

    # Train the prediction model
    model, features = train_prediction_model(prepared_data)

    # Example prediction for a new trip
    customer_id = 'EE01'
    matches = customer_df[customer_df['Customer_ID'] == customer_id]
    if matches.empty:
        # The example ID is hard-coded; fall back to the first customer
        # instead of failing with an out-of-bounds IndexError.
        customer_record = customer_df.iloc[0]
        fallback_id = customer_record['Customer_ID']
        print(f"Customer {customer_id} not found; using {fallback_id} for the example prediction")
        customer_id = fallback_id
    else:
        customer_record = matches.iloc[0]

    home_lat = customer_record['home_lat']
    home_lon = customer_record['home_lon']
    office_lat = customer_record['office_lat']
    office_lon = customer_record['office_lon']
    departure_time = '08:30:00'
    day_of_week = 'Monday'
    direction = 'Home→Office'

    predicted_duration, arrival_time = predict_ride_duration(
        model, features, customer_id, home_lat, home_lon,
        office_lat, office_lon, departure_time, day_of_week, direction
    )

    print("\nExample Prediction:")
    print(f"Customer: {customer_id}")
    print(f"From {'Home' if direction == 'Home→Office' else 'Office'} to {'Office' if direction == 'Home→Office' else 'Home'}")
    print(f"Departure Time: {departure_time} on {day_of_week}")
    print(f"Predicted Duration: {predicted_duration} minutes")
    print(f"Predicted Arrival Time: {arrival_time}")

# Run the application
if __name__ == "__main__":
    # Run the pipeline only when this file is executed directly, not when
    # it is imported as a module.
    main()

Loaded 50 customer records
Generated 6100 trip records
Saved trip data to bangalore_trips_data.csv
Model Performance:
MAE: 0.07 minutes
RMSE: 0.60 minutes
R² Score: 0.9995
Predictions within 5% of actual: 99.10%
Predictions within 10% of actual: 99.75%

Example Prediction:
Customer: EE01
From Home to Office
Departure Time: 08:30:00 on Monday
Predicted Duration: 43 minutes
Predicted Arrival Time: 09:13:00
