In [26]:
import pandas as pd
import numpy as np
import pickle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from geopy.distance import geodesic

In [27]:
# Load the two raw inputs: the trip log and the per-customer table that is
# joined onto it in the next cell. Paths are relative to the notebook's
# working directory.
trips_df, customer_df = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/trips_full.csv", "Data/customer_ao.csv")
)

In [28]:
# Attach customer columns to every trip. A left join keeps all trips, so a
# trip whose Customer_ID has no match in customer_df gets NaN in the
# customer columns.
merged_df = trips_df.merge(customer_df, how='left', on='Customer_ID')

In [29]:
def get_coords(row):
    """Return the ``(start, end)`` coordinate pairs for one trip row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``Direction``, ``home_lat``, ``home_lon``, ``office_lat`` and
            ``office_lon``.

    Returns:
        A 2-tuple ``((start_lat, start_lon), (end_lat, end_lon))``. When
        ``Direction`` equals 0 the trip runs home -> office; any other value
        is treated as office -> home.
    """
    home = (row['home_lat'], row['home_lon'])
    office = (row['office_lat'], row['office_lon'])
    return (home, office) if row['Direction'] == 0 else (office, home)

In [30]:
# Resolve each trip's (start, end) points row by row, then split the pairs
# into two columns.
route_endpoints = merged_df.apply(get_coords, axis=1)
start_points, end_points = zip(*route_endpoints)
merged_df['start_coords'] = start_points
merged_df['end_coords'] = end_points

# Geodesic distance in kilometres between each trip's start and end point.
merged_df['distance_km'] = [
    geodesic(origin, destination).km
    for origin, destination in zip(merged_df['start_coords'], merged_df['end_coords'])
]

In [31]:
# Hour-of-day feature parsed from the "HH:MM" departure string.
departure_ts = pd.to_datetime(merged_df['Departure_Time'], format='%H:%M')
merged_df['Departure_Hour'] = departure_ts.dt.hour

# Replace Day_of_Week with integer category codes (codes follow the sorted
# order of the distinct values). The column is overwritten in place and the
# value -> code mapping is not kept anywhere in this cell.
day_categories = merged_df['Day_of_Week'].astype('category')
merged_df['Day_of_Week'] = day_categories.cat.codes

In [32]:
# Integer-encode the traffic_conditions column; `le` keeps the fitted label
# mapping and is saved later in the notebook.
# NOTE: the column is overwritten in place, so re-running only this cell
# would refit the encoder on the already-encoded integers.
le = LabelEncoder()
encoded_traffic = le.fit_transform(merged_df['traffic_conditions'])
merged_df['traffic_conditions'] = encoded_traffic

In [33]:
# Regression target: arrival time as minutes since midnight, parsed from
# the "HH:MM" arrival string.
arrival_ts = pd.to_datetime(merged_df['Arrival_Time'], format='%H:%M')
merged_df['Arrival_Minutes'] = arrival_ts.dt.hour * 60 + arrival_ts.dt.minute

In [34]:
# Model inputs (column order is the order the estimator is trained on).
features = [
    'Day_of_Week',
    'Direction',
    'Departure_Hour',
    'distance_km',
    'traffic_conditions',
]

# Design matrix and target (arrival time in minutes since midnight).
X, y = merged_df[features], merged_df['Arrival_Minutes']

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Gradient-boosted regressor with library-default hyperparameters; only the
# seed is pinned so repeated runs give the same model.
model = GradientBoostingRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [37]:
# Score the held-out rows.
# NOTE(review): the target is arrival minute-of-day and Departure_Hour is
# one of the features, so a high R² may largely reflect departure time;
# consider also reporting the error in minutes — confirm with the author.
preds = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=preds)
print(f"R² Score: {r2:.4f}")

R² Score: 0.9968


In [38]:
def minutes_to_time(mins):
    """Format minutes since midnight as a zero-padded ``"HH:MM"`` string.

    The value is rounded to the nearest whole minute (rather than truncated,
    which showed e.g. 599.9 as "09:59") and wrapped onto a 24-hour clock so
    that negative or >= 1440 inputs still yield a valid time of day.

    Args:
        mins: Minutes since midnight; may be fractional (e.g. a regression
            prediction).

    Returns:
        The clock time as ``"HH:MM"`` with ``00 <= HH <= 23``.

    Raises:
        ValueError: If ``mins`` is NaN.
        OverflowError: If ``mins`` is infinite.
    """
    minutes_per_day = 24 * 60
    # float() first so NumPy scalars round exactly like built-in floats.
    total_minutes = int(round(float(mins))) % minutes_per_day
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}"

In [39]:
# Show the first five test-set predictions as clock times for a quick
# sanity check.
sample_predictions = list(map(minutes_to_time, preds[:5]))
print("Sample Predicted Arrival Times:", sample_predictions)

Sample Predicted Arrival Times: ['09:55', '18:56', '19:09', '17:29', '17:36']


In [40]:
# Persist the fitted regressor and the traffic-condition encoder, first
# with joblib and then again with plain pickle as a backup copy.
# NOTE(review): only the traffic encoder is saved here; the Day_of_Week
# category codes created earlier in the notebook are not persisted by this
# cell — confirm whether inference code needs that mapping.
try:
    joblib.dump(model, "arrival_model.pkl")
    joblib.dump(le, "traffic_encoder.pkl")
    print("Models saved with joblib")

    with open("arrival_model_pickle.pkl", "wb") as f:
        pickle.dump(model, f)
    with open("traffic_encoder_pickle.pkl", "wb") as f:
        pickle.dump(le, f)
    print("Models saved with pickle as backup")

except Exception as e:
    print(f"Error saving models: {e}")
    # Re-raise so the cell fails visibly: swallowing the error would let a
    # "Run All" finish with missing or partially written artifacts.
    raise

Models saved with joblib
Models saved with pickle as backup
