In [17]:
# model_building.ipynb

# -----------------------------
# Import Libraries
# -----------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# ------------
# Load Dataset
# ------------
df = pd.read_csv("data/tripfare_feature_engg.csv")
print("Dataset shape:", df.shape)



Dataset shape: (208024, 34)


In [18]:
# List object columns
print(df.select_dtypes(include='object').columns)


Index(['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag',
       'pickup_day'],
      dtype='object')


In [19]:

# Define Features and Target
# Drop columns not needed for modeling
drop_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'pickup_day', 'store_and_fwd_flag']

X = df.drop(columns=drop_cols + ['total_amount'])
y = df['total_amount']

In [20]:
df[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,208024.0,208024.0,208024.0,208024.0
mean,-73.972823,40.753706,-73.973955,40.752968
std,0.035505,0.026996,0.034313,0.028939
min,-74.229523,40.575928,-74.234215,40.537407
25%,-73.99102,40.741074,-73.990738,40.740795
50%,-73.97982,40.756161,-73.978912,40.755741
75%,-73.96414,40.770512,-73.965828,40.767689
max,-73.715897,40.897079,-73.700356,40.899857


In [21]:
# -----------------------------
# Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# -----------------------------
# Define Regression Models
# -----------------------------
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(random_state=42)
}


# -----------------------------
# Train & Evaluate Models
# -----------------------------

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append({
        'Model': name,
        'R2': r2,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae
    })

results_df = pd.DataFrame(results)
print("\nModel Evaluation Results:\n")
print(results_df.sort_values(by='R2', ascending=False))


Train shape: (166419, 29) Test shape: (41605, 29)

Model Evaluation Results:

              Model        R2           MSE          RMSE           MAE
0  LinearRegression  1.000000  3.024759e-27  5.499781e-14  2.447682e-14
1             Ridge  1.000000  5.101893e-06  2.258737e-03  9.569104e-05
3      RandomForest  0.999365  1.043115e-01  3.229728e-01  2.157547e-02
4  GradientBoosting  0.999211  1.297531e-01  3.602126e-01  1.726781e-01
2             Lasso  0.994418  9.175817e-01  9.579049e-01  5.040332e-01


In [22]:
# -----------------------------
# Hyperparameter Tuning for RandomForest
# -----------------------------
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],           # number of trees
    'max_depth': [10, 20, 30, None],           # maximum depth of trees
    'min_samples_split': [2, 5, 10],           # minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],             # minimum samples per leaf
    'max_features': ['auto', 'sqrt']           # features considered for split
}

# Initialize RandomForest
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# RandomizedSearchCV for tuning
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,           # number of random combinations
    cv=3,                # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='r2'
)

# Fit on training data
rf_random.fit(X_train, y_train)

# Best estimator and parameters
best_rf = rf_random.best_estimator_
print("Best RandomForest Parameters:\n", rf_random.best_params_)

# Evaluate on test set
y_pred_rf = best_rf.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

print(f"\nRandomForest Performance after Tuning:\nR2: {r2}\nRMSE: {rmse}\nMAE: {mae}")



Fitting 3 folds for each of 20 candidates, totalling 60 fits


33 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
23 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        self._parameter_constraints,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Best RandomForest Parameters:
 {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}

RandomForest Performance after Tuning:
R2: 0.9964846988907609
RMSE: 0.7601522200863982
MAE: 0.14882607579217177


In [23]:
import joblib

# Save the tuned RandomForest
joblib.dump(best_rf, "data/best_taxi_fare_model.pkl")
print("Tuned RandomForest model saved as 'data/best_taxi_fare_model.pkl'")


Tuned RandomForest model saved as 'data/best_taxi_fare_model.pkl'
