In [1]:
import pandas as pd

# Load both cleaned datasets: one with only the numeric columns and one that
# also keeps the categorical columns. The first CSV column is used as the index.
no_categorical_csv = "../src/taxipred/data/taxi_trip_pricing_cleaned_no_categorical.csv"
categorical_csv = "../src/taxipred/data/taxi_trip_pricing_cleaned.csv"

df_no_categorical = pd.read_csv(no_categorical_csv, index_col=0)
df_categorical = pd.read_csv(categorical_csv, index_col=0)

# Preview the first five rows of the numeric-only frame.
df_no_categorical.head(n=5)

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,3.0,3.56,0.8,0.32,53.82,36.2624
2,36.87,1.0,2.7,1.21,0.15,37.27,52.9032
3,30.33,4.0,3.48,0.51,0.15,116.81,36.4698
4,25.83,3.0,2.93,0.63,0.32,22.64,15.618
5,8.64,2.0,2.55,1.71,0.48,89.33,60.2028


In [2]:
# Preview the first five rows of the frame that keeps the categorical columns.
df_categorical.head(n=5)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,Clear,3.48,0.51,0.15,116.81,36.4698
4,25.83,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618
5,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028


In [3]:
# (rows, columns) of each frame, categorical variant first.
(df_categorical.shape, df_no_categorical.shape)

((925, 11), (925, 7))

### Encode categorical features

In [5]:
# One-hot encode the non-numeric columns. drop_first=True leaves out the first
# level of every encoded feature, so k levels become k-1 indicator columns.
# NOTE: this rebinds df_categorical, so the un-encoded frame is no longer
# reachable under that name after this cell has run.
df_categorical = pd.get_dummies(data=df_categorical, drop_first=True)

# Column count after encoding.
df_categorical.shape

(925, 15)

## Final evaluation on the held-out test split (X_test, y_test) for both datasets

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

# Evaluation order: numeric-only frame first, one-hot encoded frame second.
# The printed result lines follow this order.
dfs = [df_no_categorical, df_categorical]

for df in dfs:
    # Separate the target column from the predictors.
    y = df["Trip_Price"]
    X = df.drop(columns="Trip_Price")

    # Hold out 30 % of the rows; the fixed seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # fit() returns the fitted estimator, so it can be chained here.
    model = RandomForestRegressor(min_samples_leaf=2, random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the held-out predictions with all three error metrics in one pass.
    # The names mae/mse/rmse are echoed by the f-string below, so keep them.
    mae, mse, rmse = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, root_mean_squared_error)
    )
    print(f"| {mae= } | {mse= } | {rmse= } |")

| mae= 4.499306261617458 | mse= 42.25627059692166 | rmse= 6.500482335713379 |
| mae= 4.602600984110763 | mse= 43.49167685733746 | rmse= 6.594821973134488 |


## Export models

In [7]:
from pathlib import Path

import joblib

# Re-imported here so the export does not depend on the evaluation cell having
# been run first in this kernel session.
from sklearn.ensemble import RandomForestRegressor

MODELS_DIR = Path("../src/taxipred/models")
# joblib.dump writes straight to the target file and does not create missing
# parent folders, so make sure the directory is there first.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# File stem -> training frame. Pairing them explicitly (instead of zipping two
# parallel lists) keeps each model tied to its dataset and removes the reliance
# on `dfs` being defined by another cell.
export_frames = {
    "taxi_regressor": df_no_categorical,
    "taxi_regressor_categorical": df_categorical,
}
paths = list(export_frames)  # same value as before; kept for anything that still reads `paths`

for path, df in export_frames.items():
    X, y = df.drop(columns="Trip_Price"), df["Trip_Price"]

    # Refit on every row (no hold-out) using the same hyperparameters as the
    # evaluation cell.
    model = RandomForestRegressor(min_samples_leaf=2, random_state=42)
    model.fit(X, y)

    # NOTE(review): df_categorical is expected to be the one-hot encoded frame
    # at this point; whatever loads this model must supply the same columns —
    # confirm against the serving code.
    joblib.dump(model, MODELS_DIR / f"{path}.joblib")