### Test and evaluate whether it is worthwhile to train a model on the categorical features to predict Base_Fare, Per_Km_Rate and Per_Minute_Rate

In [5]:
import pandas as pd

# Read the raw trip-pricing CSV, then keep only the rows that have no
# missing value in any column.
raw_trips = pd.read_csv("../src/taxipred/data/taxi_trip_pricing.csv")
df = raw_trips.dropna()

df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
5,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028
12,41.79,Night,Weekend,3.0,High,Clear,4.6,1.77,0.11,86.95,88.1328
14,9.91,Evening,Weekday,2.0,High,Clear,2.32,1.26,0.34,41.72,28.9914


In [6]:
# Inputs are the four categorical trip descriptors; targets are the three
# pricing components we try to predict from them.
feature_cols = ["Time_of_Day", "Day_of_Week", "Traffic_Conditions", "Weather"]
target_cols = ["Base_Fare", "Per_Km_Rate", "Per_Minute_Rate"]

y = df[target_cols]

# One-hot encode the categoricals, dropping the first level of each to avoid
# redundant columns. Multiplying by 1 turns the dummy columns into 0/1 numbers.
X = pd.get_dummies(df[feature_cols], drop_first=True) * 1
X

Unnamed: 0,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Day_of_Week_Weekend,Traffic_Conditions_Low,Traffic_Conditions_Medium,Weather_Rain,Weather_Snow
0,0,1,0,0,1,0,0,0
2,1,0,0,1,0,0,0,0
5,0,0,0,1,0,1,0,0
12,0,0,1,1,0,0,0,0
14,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
990,1,0,0,0,1,0,0,0
992,0,0,0,0,0,1,0,0
994,1,0,0,0,0,1,0,0
995,0,0,0,1,0,1,0,0


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Sanity-check the resulting sizes.
X_train.shape, X_test.shape, y_train.shape

((449, 8), (113, 8), (449, 3))

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Fit one random forest per target column by wrapping the regressor in
# MultiOutputRegressor. The forest is seeded with the same value used for the
# train/test split so that the predictions, the metrics derived from them and
# the persisted model are reproducible on a fresh Restart & Run All (an
# unseeded forest gives different numbers on every run).
model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
model.fit(X_train, y_train)

# Predictions for the held-out rows: a 2-D array with one column per target,
# in the same column order as y.
y_preds = model.predict(X_test)

In [9]:
# Ground-truth targets of the first held-out row, shown for an eyeball
# comparison with the model's prediction for that same row.
y_test.iloc[0]

Base_Fare          3.59
Per_Km_Rate        0.76
Per_Minute_Rate    0.17
Name: 881, dtype: float64

In [10]:
# Model prediction for the first held-out row; the three values follow the
# column order of y (Base_Fare, Per_Km_Rate, Per_Minute_Rate).
y_preds[0]

array([3.69734145, 1.15198949, 0.28771808])

In [11]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

# Score every target separately: column `idx` of the prediction array is
# compared against the matching column of the held-out targets.
results = {
    target: {
        "RMSE": root_mean_squared_error(y_test[target], y_preds[:, idx]),
        "MAE": mean_absolute_error(y_test[target], y_preds[:, idx]),
    }
    for idx, target in enumerate(y.columns)
}

# One column per target, one row per metric.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Base_Fare,Per_Km_Rate,Per_Minute_Rate
RMSE,0.956926,0.489765,0.1151
MAE,0.803285,0.425955,0.097176


In [12]:
import joblib
from sklearn.base import clone

# Train the model that gets persisted on ALL available rows. This is done on
# an unfitted clone (same hyper-parameters) rather than by refitting `model`
# in place: `model` stays the estimator that was evaluated on the hold-out
# set, so re-running the prediction/metric cells later cannot silently score
# a model that has already seen the test rows.
final_model = clone(model).fit(X, y)

# NOTE(review): only the estimator is saved; whoever loads it must rebuild the
# one-hot columns in exactly the same order as X.columns.
joblib.dump(final_model, "../src/taxipred/models/feature_price_multiregressor.joblib")

['../src/taxipred/models/feature_price_multiregressor.joblib']