#### Libraries

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import numpy as np

- Load the training dataset

In [44]:
train_data = pd.read_csv('../data/splits/X_train.csv')

- Define features and target variable 

In [45]:
y = pd.read_csv('../data/splits/y_train.csv')

In [46]:
print(train_data.columns)
X = train_data

Index(['Trip ID', 'Distance (km)', 'Start Hour', 'Day of the Week',
       'Weather_Clear', 'Weather_Cloudy', 'Weather_Rain'],
      dtype='object')


#### Feature Engineering: Create new features
- Add binary feature for weekend

In [47]:
X['Is_Weekend'] = (X['Day of the Week'] == 'Weekend').astype(int)

- Drop the original 'Day of the Week' column

In [48]:
X.drop(columns=['Day of the Week'], axis=1)

Unnamed: 0,Trip ID,Distance (km),Start Hour,Weather_Clear,Weather_Cloudy,Weather_Rain,Is_Weekend
0,63,16.7,21,True,False,False,0
1,87,14.9,15,False,True,False,0
2,65,6.3,4,False,False,True,0
3,51,19.4,11,False,False,True,0
4,41,3.3,11,True,False,False,0
...,...,...,...,...,...,...,...
64,6,4.0,22,False,True,False,0
65,58,4.7,23,False,False,True,0
66,22,3.7,11,False,False,True,0
67,73,1.1,22,True,False,False,0


- Define categorical and numerical features

In [49]:
categorical_features = ['Weather_Clear', 'Weather_Cloudy', 'Weather_Rain', 'Is_Weekend']
numerical_features = ['Distance (km)', 'Start Hour']

- Create a column transformer for preprocessing

In [50]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # Scale numerical features
        ('cat', OneHotEncoder(), categorical_features)   # One-hot encode categorical features
    ])

- Create a pipeline with Ridge regression

In [51]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))  # Alpha is the regularization strength
])

- Train the model with cross-validation

In [52]:
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
mean_mse = np.mean(-cross_val_scores)

- Fit the model on the full training set

In [53]:
model.fit(X, y)

- Load the test dataset

In [54]:
test_data = pd.read_csv('../data/splits/X_test.csv')

- Prepare the test features

In [55]:
print(test_data.columns)
X_test = test_data
y_test = pd.read_csv('../data/splits/y_test.csv')

Index(['Trip ID', 'Distance (km)', 'Start Hour', 'Day of the Week',
       'Weather_Clear', 'Weather_Cloudy', 'Weather_Rain'],
      dtype='object')


- add is weekend for the test data 

In [56]:
X_test['Is_Weekend'] = (X_test['Day of the Week'] == 'Weekend').astype(int)

- Test the model

In [57]:
y_pred = model.predict(X_test)

- Evaluate the model

In [58]:
mse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


-  Display evaluation results

In [59]:
print("Model Evaluation Metrics:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R²): {r2:.2f}")
print(f"Cross-Validated Mean MSE: {mean_mse:.2f}")

Model Evaluation Metrics:
Mean Squared Error (MSE): 4.51
Mean Absolute Error (MAE): 3.69
R-squared (R²): 0.96
Cross-Validated Mean MSE: 26.21
