In [1]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV

# Candidate regressors, each wrapped in MultiOutputRegressor.
# Dictionary keys are the estimator class names, in the order listed here.
models = {
    type(estimator).__name__: MultiOutputRegressor(estimator)
    for estimator in (
        LinearRegression(),
        RandomForestRegressor(),
        KNeighborsRegressor(),
        XGBRegressor(),
        RidgeCV(),
        ElasticNetCV(),
    )
}

# Sanity check: list every wrapper together with the estimator it holds.
for key, model in models.items():
    print(f"{key} : {model.estimator}")

LinearRegression : LinearRegression()
RandomForestRegressor : RandomForestRegressor()
KNeighborsRegressor : KNeighborsRegressor()
XGBRegressor : XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=None,
             n_jobs=None, num_parallel_tree=None, ...)
RidgeCV : RidgeCV()
ElasticNetCV : ElasticNetCV()


In [2]:
from sklearn.model_selection import train_test_split

import pandas as pd

# Load the `_no_categorical` variant of the cleaned taxi data; the CSV's
# first column is used as the index.
df = pd.read_csv(
    filepath_or_buffer="../src/taxipred/data/taxi_trip_pricing_cleaned_no_categorical.csv",
    index_col=0,
)

# Disabled hold-out split, kept for reference. `target` is not defined in
# this cell, so these lines cannot run as they stand.
# X, y = df.drop(columns=target), df[target]
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)
# y_train

In [3]:
# Show the loaded frame as the cell's output (the rendering elides the middle rows).
df

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,3.0,3.56,0.80,0.320000,53.82,36.2624
2,36.87,1.0,2.70,1.21,0.150000,37.27,52.9032
3,30.33,4.0,3.48,0.51,0.150000,116.81,36.4698
4,25.83,3.0,2.93,0.63,0.320000,22.64,15.6180
5,8.64,2.0,2.55,1.71,0.480000,89.33,60.2028
...,...,...,...,...,...,...,...
995,5.49,4.0,2.39,0.62,0.490000,58.39,34.4049
996,45.95,4.0,3.12,0.61,0.292916,61.96,62.1295
997,7.70,3.0,2.08,1.78,0.292916,54.18,33.1236
998,47.56,1.0,2.67,0.82,0.170000,114.94,61.2090


In [9]:
from model_testing import test_models
import pandas as pd
# Load the cleaned data set (the file without the `_no_categorical` suffix).
# NOTE(review): the original scratch note here read "_no_categorical" --
# presumably a reminder of the suffix to append for the other file variant; confirm.
df = pd.read_csv("../src/taxipred/data/taxi_trip_pricing_cleaned.csv", index_col=0)

target = "Trip_Price"

# One-hot encode the non-numeric columns, dropping the first level of each.
# `dtype=int` makes the dummy columns 0/1 integers directly, instead of
# multiplying the whole frame by 1 to coerce booleans.
df = pd.get_dummies(df, drop_first=True, dtype=int)

# `test_models` comes from the local `model_testing` module; the output below
# shows one row per model/scaler combination with mae, mse, rmse, rmse_% and r2_score.
result = test_models(df, target)
result.sort_values(by="rmse_%")



Unnamed: 0,model,scaler,mae,mse,rmse,rmse_%,r2_score
3,RandomForestRegressor,MinMaxScaler,4.502965,44.42132,6.664932,0.135508,0.910137
2,RandomForestRegressor,,4.531139,46.758238,6.838,0.139026,0.90541
6,XGBRegressor,,5.041203,52.002562,7.21128,0.146616,0.894801
7,XGBRegressor,MinMaxScaler,5.041203,52.002562,7.21128,0.146616,0.894801
11,ElasticNetCV,MinMaxScaler,6.596511,70.800735,8.414317,0.171075,0.856773
8,RidgeCV,,6.591788,70.812323,8.415006,0.171089,0.85675
9,RidgeCV,MinMaxScaler,6.59007,70.815279,8.415181,0.171093,0.856744
0,LinearRegression,,6.592182,70.864454,8.418103,0.171152,0.856644
1,LinearRegression,MinMaxScaler,6.592182,70.864454,8.418103,0.171152,0.856644
10,ElasticNetCV,,9.618881,156.538578,12.511538,0.254377,0.683329


In [8]:
# Order the comparison table by ascending relative RMSE.
# NOTE(review): `result` is not assigned in this cell; it relies on another
# cell having populated it beforehand.
result.sort_values(by="rmse_%", ascending=True)

Unnamed: 0,model,scaler,mae,mse,rmse,rmse_%,r2_score
3,RandomForestRegressor,MinMaxScaler,4.380512,45.004554,6.708543,0.136394,0.908958
2,RandomForestRegressor,,4.474088,45.759821,6.764601,0.137534,0.90743
6,XGBRegressor,,5.205549,52.607471,7.253101,0.147466,0.893577
7,XGBRegressor,MinMaxScaler,5.205549,52.607471,7.253101,0.147466,0.893577
8,RidgeCV,,6.485292,69.901026,8.360683,0.169985,0.858593
9,RidgeCV,MinMaxScaler,6.483583,69.914459,8.361487,0.170001,0.858566
0,LinearRegression,,6.485047,69.957001,8.36403,0.170053,0.85848
1,LinearRegression,MinMaxScaler,6.485047,69.957001,8.36403,0.170053,0.85848
11,ElasticNetCV,MinMaxScaler,6.512725,70.109241,8.373126,0.170238,0.858172
5,KNeighborsRegressor,MinMaxScaler,7.230217,88.991479,9.43353,0.191797,0.819974


In [2]:
from model_testing import test_models
import pandas as pd

# Multi-target experiment: model the three rate columns from the four
# descriptor columns only.
target = ["Base_Fare", "Per_Km_Rate", "Per_Minute_Rate"]
feature_columns = ["Time_of_Day", "Day_of_Week", "Traffic_Conditions", "Weather"]

# NOTE(review): dropna() runs on the full raw frame, so rows are also discarded
# for missing values in columns that are not used below -- confirm this is intended.
df = pd.read_csv("../src/taxipred/data/taxi_trip_pricing.csv").dropna()

# Keep the descriptor columns followed by the targets (same column order as before).
df = df[feature_columns + target]

# One-hot encode the non-numeric columns, dropping the first level of each.
# `dtype=int` yields 0/1 integer dummies without multiplying the frame by 1.
df = pd.get_dummies(df, drop_first=True, dtype=int)

result = test_models(df, target)

# Show only the averaged metrics, ordered by ascending average RMSE.
result[["model", "scaler", "mae_avg", "rmse_avg", "rmse_%_avg", "r2_avg"]].sort_values(by="rmse_avg")

Unnamed: 0,model,scaler,mae_avg,rmse_avg,rmse_%_avg,r2_avg
0,LinearRegression,,0.412636,0.474581,0.41036,-0.009114
1,LinearRegression,MinMaxScaler,0.412636,0.474581,0.41036,-0.009114
9,RidgeCV,MinMaxScaler,0.413363,0.474745,0.409303,-0.007822
8,RidgeCV,,0.413363,0.474745,0.409303,-0.007822
11,ElasticNetCV,MinMaxScaler,0.417443,0.478239,0.407605,-0.014662
10,ElasticNetCV,,0.417443,0.478239,0.407605,-0.014662
5,KNeighborsRegressor,MinMaxScaler,0.417863,0.482756,0.415929,-0.039032
4,KNeighborsRegressor,,0.417863,0.482756,0.415929,-0.039032
2,RandomForestRegressor,,0.442057,0.520839,0.422834,-0.162742
3,RandomForestRegressor,MinMaxScaler,0.446005,0.525218,0.424124,-0.175232


In [3]:
# Display the full per-target metric table; the commented line below is the
# averaged summary view, kept for reference.
# NOTE(review): in the displayed output, `rmse_%_avg` equals
# `rmse_%_Per_Minute_Rate` on every row rather than the mean of the three
# `rmse_%_*` columns, while `mae_avg`, `rmse_avg` and `r2_avg` do match the
# per-target means. The `rmse_%` averaging in `test_models` looks wrong -- verify.
# result[["model", "scaler", "mae_avg", "rmse_avg", "rmse_%_avg", "r2_avg"]].sort_values(by="rmse_avg")
result

Unnamed: 0,model,scaler,mae_Base_Fare,rmse_Base_Fare,rmse_%_Base_Fare,r2_Base_Fare,mae_Per_Km_Rate,rmse_Per_Km_Rate,rmse_%_Per_Km_Rate,r2_Per_Km_Rate,mae_Per_Minute_Rate,rmse_Per_Minute_Rate,rmse_%_Per_Minute_Rate,r2_Per_Minute_Rate,mae_avg,rmse_avg,rmse_%_avg,r2_avg
0,LinearRegression,,0.737497,0.851463,0.247175,0.014541,0.402542,0.459956,0.367445,-0.000535,0.097868,0.112322,0.41036,-0.041347,0.412636,0.474581,0.41036,-0.009114
1,LinearRegression,MinMaxScaler,0.737497,0.851463,0.247175,0.014541,0.402542,0.459956,0.367445,-0.000535,0.097868,0.112322,0.41036,-0.041347,0.412636,0.474581,0.41036,-0.009114
2,RandomForestRegressor,,0.801301,0.955419,0.277353,-0.240779,0.427567,0.491361,0.392533,-0.141829,0.097303,0.115737,0.422834,-0.105617,0.442057,0.520839,0.422834,-0.162742
3,RandomForestRegressor,MinMaxScaler,0.813923,0.969552,0.281456,-0.27776,0.425962,0.490011,0.391454,-0.135562,0.098131,0.11609,0.424124,-0.112374,0.446005,0.525218,0.424124,-0.175232
4,KNeighborsRegressor,,0.745352,0.870787,0.252785,-0.030696,0.409054,0.463633,0.370382,-0.016597,0.099182,0.113847,0.415929,-0.069803,0.417863,0.482756,0.415929,-0.039032
5,KNeighborsRegressor,MinMaxScaler,0.745352,0.870787,0.252785,-0.030696,0.409054,0.463633,0.370382,-0.016597,0.099182,0.113847,0.415929,-0.069803,0.417863,0.482756,0.415929,-0.039032
6,XGBRegressor,,0.83266,0.985131,0.285978,-0.319152,0.434102,0.502712,0.401601,-0.195192,0.099919,0.11967,0.437202,-0.182034,0.45556,0.535837,0.437202,-0.232126
7,XGBRegressor,MinMaxScaler,0.83266,0.985131,0.285978,-0.319152,0.434102,0.502712,0.401601,-0.195192,0.099919,0.11967,0.437202,-0.182034,0.45556,0.535837,0.437202,-0.232126
8,RidgeCV,,0.739291,0.852407,0.247449,0.012355,0.403262,0.459795,0.367316,0.000168,0.097537,0.112033,0.409303,-0.035989,0.413363,0.474745,0.409303,-0.007822
9,RidgeCV,MinMaxScaler,0.739291,0.852407,0.247449,0.012355,0.403262,0.459795,0.367316,0.000168,0.097537,0.112033,0.409303,-0.035989,0.413363,0.474745,0.409303,-0.007822
