# Testing out some models

In [None]:
import pandas as pd
from taxipred.utils.constants import CLEANED_DATA

df = pd.read_csv(CLEANED_DATA / "taxi_nonan_dropped_columns.csv")
df.head()

Unnamed: 0,trip_distance_km,time_of_day,day_of_week,traffic_conditions,weather,per_km_rate,trip_duration_minutes,trip_price
0,19.35,Morning,Weekday,Low,Clear,0.8,53.82,36.2624
1,36.87,Evening,Weekend,High,Clear,1.21,37.27,52.9032
2,8.64,Afternoon,Weekend,Medium,Clear,1.71,89.33,60.2028
3,41.79,Night,Weekend,High,Clear,1.77,86.95,88.1328
4,9.91,Evening,Weekday,High,Clear,1.26,41.72,28.9914


### Separating features and target

In [373]:
X, y = df.drop(columns="trip_price"), df["trip_price"]
X.shape, y.shape

((562, 7), (562,))

# Linear Regression

### Time for some dummy encoding
When I did this, I learned that dummy encoding requires train and test columns to be exactly the same,

which is fine so far since we encode them before the split, but if a new categories appears in production, this approach becomes more fragile.

In [374]:
dummy_columns = [
    "time_of_day",
    "day_of_week",
    "traffic_conditions",
    "weather"
]

X = pd.get_dummies(X, columns=dummy_columns, dtype=int, drop_first=True)
X.head()

Unnamed: 0,trip_distance_km,per_km_rate,trip_duration_minutes,time_of_day_Evening,time_of_day_Morning,time_of_day_Night,day_of_week_Weekend,traffic_conditions_Low,traffic_conditions_Medium,weather_Rain,weather_Snow
0,19.35,0.8,53.82,0,1,0,0,1,0,0,0
1,36.87,1.21,37.27,1,0,0,1,0,0,0,0
2,8.64,1.71,89.33,0,0,0,1,0,1,0,0
3,41.79,1.77,86.95,0,0,1,1,0,0,0,0
4,9.91,1.26,41.72,1,0,0,0,0,0,0,0


# Train|Test-split

In [375]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print(f"""Shapes:
X_train: {X_train.shape}
X_test: {X_test.shape}
y_train: {y_train.shape}
y_test: {y_test.shape}""")

Shapes:
X_train: (376, 11)
X_test: (186, 11)
y_train: (376,)
y_test: (186,)


# S-s-s-s-scaaaling

In [376]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

print(f"""X_train min: {scaled_X_train.min()}
X_train max: {scaled_X_train.max()}
X_test min: {scaled_X_test.min()}
X_test max: {scaled_X_test.max()}""")

X_train min: -1.7713962822536558
X_train max: 5.685906130602294
X_test min: -1.7514815912563007
X_test max: 5.701295599620912


# Training ðŸ’ª

In [377]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(scaled_X_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


# Predict

In [378]:
y_pred = model.predict(scaled_X_test)
y_pred.mean(), y_test.mean()

(np.float64(58.52596591325338), np.float64(57.388769554269466))

# Evaluate

In [379]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"""MAE: {mae}
RMSE: {rmse}""")

MAE: 11.560074802893082
RMSE: 16.599734784460995


# Random Forest

In [380]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [381]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"""MAE: {mae}
RMSE: {rmse}""")

MAE: 7.994960690671603
RMSE: 11.524904933398629


## Conclusion:
Random forest is better for this dataset, given the parameters. It also handles larger errors better.

Lets try with the filled nan-values. Here, I make a quick function so I don't have to copy-paste all the code.

In [382]:
def quick_dirty_train_eval(df, model, target, dummy_columns, scaler=None):
    X, y = df.drop(columns=[target]), df[target]
    X = pd.get_dummies(X, columns=dummy_columns, dtype=int, drop_first=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    if scaler is not None:
        scaled_X_train = scaler.fit_transform(X_train)
        scaled_X_test = scaler.transform(X_test)
        model.fit(scaled_X_train, y_train)
        y_pred = model.predict(scaled_X_test)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return {"mae": mae, "rmse": rmse, "y_pred": y_pred}

In [None]:
df = pd.read_csv(CLEANED_DATA / "taxi_fillednan_dropped_columns.csv")
model = LinearRegression()
target = "trip_price"
scaler = StandardScaler()sns.boxplot(x="Traffic_Conditions", y="Trip_Price", data=data)
result = quick_dirty_train_eval(df, model, target, dummy_columns, scaler=scaler)
print(f"MAE: {result["mae"]}, RMSE: {result["rmse"]}")

MAE: 11.29618622405259, RMSE: 18.227574988033446


In [384]:
model = RandomForestRegressor(random_state=42)
scaler = None
result = result = quick_dirty_train_eval(df, model, target, dummy_columns, scaler=scaler)
print(f"MAE: {result["mae"]}, RMSE: {result["rmse"]}")

MAE: 9.058615419087824, RMSE: 13.680954965193193


## Let's try with all the columns

In [None]:
df = pd.read_csv(CLEANED_DATA / "taxi_fillednan_all_columns.csv")
model = LinearRegression()
target = "trip_price"
scaler = StandardScaler()
result = quick_dirty_train_eval(df, model, target, dummy_columns, scaler=scaler)
print(f"MAE: {result["mae"]}, RMSE: {result["rmse"]}")

MAE: 9.447620065560066, RMSE: 16.5940588276874


In [386]:
model = RandomForestRegressor(random_state=42)
scaler = None
result = result = quick_dirty_train_eval(df, model, target, dummy_columns, scaler=scaler)
print(f"MAE: {result["mae"]}, RMSE: {result["rmse"]}")

MAE: 6.033287788350274, RMSE: 11.201734971265655


In [388]:
baseline_mae = mean_absolute_error(y_test, [y_train.mean()] * len(y_test))
print(baseline_mae)

25.49147155683648


## Conclusions:
- Random forest works better for this dataset
- The model is better when using all columns
- The results are better when we fill in the NaN-values

### Decision / moving forward
- Implement a reproducible preprocessing + model pipeline.
- Use RandomForestRegressor as baseline.
- Use the full processed feature set (drop only if needed for UX/leakage).
- Keep NaN imputation for this dataset; validate with an extra split/CV.
