In [2]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
src_path = os.path.join(project_root, "src")
sys.path.append(src_path)

from regression import *
from utils import *

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

train_80= pd.read_csv("../data/train80.csv")
train_70= pd.read_csv("../data/train70.csv")
test_20= pd.read_csv("../data/test20.csv")
test_30= pd.read_csv("../data/test30.csv")


#### Linear Regression

In [4]:
feature_cols = [ 'haversine_distance',
                'manhattan_distance',
                'pickup_hour_sin',
                'pickup_hour_cos', 
                'pickup_weekday_sin', 
                'pickup_weekday_cos', 
                'pickup_minute_sin', 
                'pickup_minute_cos', 
                'weekend_indicator', 
                'rush_hour', 
                'store_and_fwd_flag', 
                'direction_NS', 
                'direction_EW', 
                'vendor_id', 
                'passenger_count' ]

X1_train = train_80[feature_cols]
y1_train = train_80['log_trip_duration']
X1_test= test_20[feature_cols]
y1_test= test_20['log_trip_duration']

X2_train = train_70[feature_cols]
y2_train = train_70['log_trip_duration']
X2_test= test_30[feature_cols]
y2_test= test_30['log_trip_duration']

lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_regression', MyLinearRegression())
])

print("--- Training Linear Regression with split ratio 0.2 ---\n")
print(f"Training on {len(X1_train)} samples, testing on {len(X1_test)} samples.")
lr_pipeline.fit(X1_train, y1_train)
y1_train_pred_lr = lr_pipeline.predict(X1_train)
y1_test_pred_lr = lr_pipeline.predict(X1_test)
rmse_train_X1_lr = np.sqrt(mse(y1_train, y1_train_pred_lr))
rmse_test_X1_lr = np.sqrt(mse(y1_test, y1_test_pred_lr))
print(f"Linear Regression Training RMSE (log): {rmse_train_X1_lr:.4f}")
print(f"Linear Regression Test RMSE (log): {rmse_test_X1_lr:.4f}\n")

print("--- Training Linear Regression with split ratio 0.3 ---\n")
print(f"Training on {len(X2_train)} samples, testing on {len(X2_test)} samples.")
lr_pipeline.fit(X2_train, y2_train)
y2_train_pred_lr = lr_pipeline.predict(X2_train)
y2_test_pred_lr = lr_pipeline.predict(X2_test)
rmse_train_X2_lr = np.sqrt(mse(y2_train, y2_train_pred_lr))
rmse_test_X2_lr = np.sqrt(mse(y2_test, y2_test_pred_lr))
print(f"Linear Regression Training RMSE (log): {rmse_train_X2_lr:.4f}")
print(f"Linear Regression Test RMSE (log): {rmse_test_X2_lr:.4f}\n")

print("--- Evaluation Complete ---")

--- Training Linear Regression with split ratio 0.2 ---

Training on 1166912 samples, testing on 291728 samples.
Linear Regression Training RMSE (log): 0.6387
Linear Regression Test RMSE (log): 0.6437

--- Training Linear Regression with split ratio 0.3 ---

Training on 1021048 samples, testing on 437592 samples.
Linear Regression Training RMSE (log): 0.6422
Linear Regression Test RMSE (log): 0.6345

--- Evaluation Complete ---


#### Polynomial Regression

In [5]:
feature_cols = [ 'haversine_distance',
                'manhattan_distance',
                'pickup_hour_sin',
                'pickup_hour_cos', 
                'pickup_weekday_sin', 
                'pickup_weekday_cos', 
                'pickup_minute_sin', 
                'pickup_minute_cos', 
                'weekend_indicator', 
                'rush_hour', 
                'store_and_fwd_flag', 
                'direction_NS', 
                'direction_EW', 
                'vendor_id', 
                'passenger_count' ]

degree = 2

X1_train = train_80[feature_cols].values
y1_train = train_80['log_trip_duration']
X1_test= test_20[feature_cols].values
y1_test= test_20['log_trip_duration']

X2_train = train_70[feature_cols].values
y2_train = train_70['log_trip_duration']
X2_test= test_30[feature_cols].values
y2_test= test_30['log_trip_duration']

X1_train_poly = create_polynomial_features(X1_train, degree)
X1_test_poly = create_polynomial_features(X1_test, degree)
X2_train_poly = create_polynomial_features(X2_train, degree)
X2_test_poly = create_polynomial_features(X2_test, degree)

scaler = StandardScaler()
poly_model = MyLinearRegression()

X1_train_scaled = scaler.fit_transform(X1_train_poly)
X1_test_scaled = scaler.transform(X1_test_poly)
X2_train_scaled = scaler.fit_transform(X2_train_poly)
X2_test_scaled = scaler.transform(X2_test_poly)

poly_model.fit(X1_train_scaled, y1_train)
poly_model.fit(X2_train_scaled, y2_train)

y1_train_pred_poly = poly_model.predict(X1_train_scaled)
y1_test_pred_poly = poly_model.predict(X1_test_scaled)
y2_train_pred_poly = poly_model.predict(X2_train_scaled)
y2_test_pred_poly = poly_model.predict(X2_test_scaled)
   
rmse_train_X1_poly = np.sqrt(mse(y1_train, y1_train_pred_poly))
rmse_test_X1_poly = np.sqrt(mse(y1_test, y1_test_pred_poly))
rmse_train_X2_poly = np.sqrt(mse(y2_train, y2_train_pred_poly))
rmse_test_X2_poly = np.sqrt(mse(y2_test, y2_test_pred_poly))

print(f"--- Training Polynomial Regression (Degree {degree}) for split ratio 0.2 ---\n")
print(f"Polynomial Regression Training RMSE (log): {rmse_train_X1_poly:.4f}")
print(f"Polynomial Regression Test RMSE (log): {rmse_test_X1_poly:.4f}\n")

print(f"--- Training Polynomial Regression (Degree {degree}) for split ratio 0.3 ---\n")
print(f"Polynomial Regression Training RMSE (log): {rmse_train_X2_poly:.4f}")
print(f"Polynomial Regression Test RMSE (log): {rmse_test_X2_poly:.4f}\n")

print("--- Evaluation Complete ---")

--- Training Polynomial Regression (Degree 2) for split ratio 0.2 ---

Polynomial Regression Training RMSE (log): 0.5940
Polynomial Regression Test RMSE (log): 0.5986

--- Training Polynomial Regression (Degree 2) for split ratio 0.3 ---

Polynomial Regression Training RMSE (log): 0.5939
Polynomial Regression Test RMSE (log): 0.5973

--- Evaluation Complete ---


#### Ridge Regression

In [None]:
feature_cols = [ 'haversine_distance',
                'manhattan_distance',
                'pickup_hour_sin',
                'pickup_hour_cos', 
                'pickup_weekday_sin', 
                'pickup_weekday_cos', 
                'pickup_minute_sin', 
                'pickup_minute_cos', 
                'weekend_indicator', 
                'rush_hour', 
                'store_and_fwd_flag', 
                'direction_NS', 
                'direction_EW', 
                'vendor_id', 
                'passenger_count' ]

degree = 2
alpha_value = 1.0

X1_train = train_80[feature_cols].values
y1_train = train_80['log_trip_duration']
X1_test= test_20[feature_cols].values
y1_test= test_20['log_trip_duration']

X2_train = train_70[feature_cols].values
y2_train = train_70['log_trip_duration']
X2_test= test_30[feature_cols].values
y2_test= test_30['log_trip_duration']

scaler_linear = StandardScaler()
scaler_poly = StandardScaler()
ridge_linear_model = MyRidgeRegression(alpha=alpha_value)
ridge_poly_model = MyRidgeRegression(alpha=alpha_value)

X1_train_scaled_linear = scaler_linear.fit_transform(X1_train)
X1_test_scaled_linear = scaler_linear.transform(X1_test)
X2_train_scaled_linear = scaler_linear.fit_transform(X2_train)
X2_test_scaled_linear = scaler_linear.transform(X2_test)

ridge_linear_model.fit(X1_train_scaled_linear, y1_train)
ridge_linear_model.fit(X2_train_scaled_linear, y2_train)

y1_train_pred_linear = ridge_linear_model.predict(X1_train_scaled_linear)
y1_test_pred_linear = ridge_linear_model.predict(X1_test_scaled_linear)
y2_train_pred_linear = ridge_linear_model.predict(X2_train_scaled_linear)
y2_test_pred_linear = ridge_linear_model.predict(X2_test_scaled_linear)

rmse_train_y1_linear = np.sqrt(mse(y1_train, y1_train_pred_linear))
rmse_test_y1_linear = np.sqrt(mse(y1_test, y1_test_pred_linear))
rmse_train_y2_linear = np.sqrt(mse(y2_train, y2_train_pred_linear))
rmse_test_y2_linear = np.sqrt(mse(y2_test, y2_test_pred_linear))

X1_train_poly = create_polynomial_features(X1_train, degree)
X1_test_poly = create_polynomial_features(X1_test, degree)
X2_train_poly = create_polynomial_features(X2_train, degree)
X2_test_poly = create_polynomial_features(X2_test, degree)

X1_train_scaled_poly = scaler_poly.fit_transform(X1_train_poly)
X1_test_scaled_poly = scaler_poly.transform(X1_test_poly)
X2_train_scaled_poly = scaler_poly.fit_transform(X2_train_poly)
X2_test_scaled_poly = scaler_poly.transform(X2_test_poly)

ridge_poly_model.fit(X1_train_scaled_poly, y1_train)
ridge_poly_model.fit(X2_train_scaled_poly, y2_train)

y1_train_pred_poly = ridge_poly_model.predict(X1_train_scaled_poly)
y1_test_pred_poly = ridge_poly_model.predict(X1_test_scaled_poly)
y2_train_pred_poly = ridge_poly_model.predict(X2_train_scaled_poly)
y2_test_pred_poly = ridge_poly_model.predict(X2_test_scaled_poly)

rmse_train_y1_poly = np.sqrt(mse(y1_train, y1_train_pred_poly))
rmse_test_y1_poly = np.sqrt(mse(y1_test, y1_test_pred_poly))
rmse_train_y2_poly = np.sqrt(mse(y2_train, y2_train_pred_poly))
rmse_test_y2_poly = np.sqrt(mse(y2_test, y2_test_pred_poly))

print(f"--- Starting Ridge Regression Evaluation (alpha={alpha_value})  for split ratio 0.2---\n")
print(f"  [Linear Features] Ridge Training RMSE (log): {rmse_train_y1_linear:.4f}")
print(f"  [Linear Features] Ridge Test RMSE (log): {rmse_test_y1_linear:.4f}")
print(f"  [Polynomial Features] Ridge Training RMSE (log): {rmse_train_y1_poly:.4f}")
print(f"  [Polynomial Features] Ridge Test RMSE (log): {rmse_test_y1_poly:.4f}\n")

print(f"--- Starting Ridge Regression Evaluation (alpha={alpha_value})  for split ratio 0.3---\n")
print(f"  [Linear Features] Ridge Training RMSE (log): {rmse_train_y2_linear:.4f}")
print(f"  [Linear Features] Ridge Test RMSE (log): {rmse_test_y2_linear:.4f}")
print(f"  [Polynomial Features] Ridge Training RMSE (log): {rmse_train_y2_poly:.4f}")
print(f"  [Polynomial Features] Ridge Test RMSE (log): {rmse_test_y2_poly:.4f}\n")

print("--- Evaluation Complete ---")

--- Starting Ridge Regression Evaluation (alpha=10.0)  for split ratio 0.2---

  [Linear Features] Ridge Training RMSE (log): 0.6388
  [Linear Features] Ridge Test RMSE (log): 0.6442
  [Polynomial Features] Ridge Training RMSE (log): 0.5945
  [Polynomial Features] Ridge Test RMSE (log): 0.6010

--- Starting Ridge Regression Evaluation (alpha=10.0)  for split ratio 0.3---

  [Linear Features] Ridge Training RMSE (log): 0.6422
  [Linear Features] Ridge Test RMSE (log): 0.6345
  [Polynomial Features] Ridge Training RMSE (log): 0.5945
  [Polynomial Features] Ridge Test RMSE (log): 0.5990

--- Evaluation Complete ---


####  Lasso Regression

In [12]:
feature_cols = [ 'haversine_distance',
                'manhattan_distance',
                'pickup_hour_sin',
                'pickup_hour_cos', 
                'pickup_weekday_sin', 
                'pickup_weekday_cos', 
                'pickup_minute_sin', 
                'pickup_minute_cos', 
                'weekend_indicator', 
                'rush_hour', 
                'store_and_fwd_flag', 
                'direction_NS', 
                'direction_EW', 
                'vendor_id', 
                'passenger_count' ]

degree = 2
alpha_value = 0.01
n_iters = 1000

X1_train = train_80[feature_cols].values
y1_train = train_80['log_trip_duration']
X1_test= test_20[feature_cols].values
y1_test= test_20['log_trip_duration']

X2_train = train_70[feature_cols].values
y2_train = train_70['log_trip_duration']
X2_test= test_30[feature_cols].values
y2_test= test_30['log_trip_duration']

scaler_linear = StandardScaler()
scaler_poly = StandardScaler()
lasso_linear_model = MyLassoRegression(alpha=alpha_value, n_iterations=n_iters)
lasso_poly_model = MyLassoRegression(alpha=alpha_value, n_iterations=n_iters)

X1_train_scaled_linear = scaler_linear.fit_transform(X1_train)
X1_test_scaled_linear = scaler_linear.transform(X1_test)
X2_train_scaled_linear = scaler_linear.fit_transform(X2_train)
X2_test_scaled_linear = scaler_linear.transform(X2_test)

lasso_linear_model.fit(X1_train_scaled_linear, y1_train)
lasso_linear_model.fit(X2_train_scaled_linear, y2_train)

y1_train_pred_linear = lasso_linear_model.predict(X1_train_scaled_linear)
y1_test_pred_linear = lasso_linear_model.predict(X1_test_scaled_linear)
y2_train_pred_linear = lasso_linear_model.predict(X2_train_scaled_linear)
y2_test_pred_linear = lasso_linear_model.predict(X2_test_scaled_linear)

rmse_train_y1_linear = np.sqrt(mse(y1_train, y1_train_pred_linear))
rmse_test_y1_linear = np.sqrt(mse(y1_test, y1_test_pred_linear))
rmse_train_y2_linear = np.sqrt(mse(y2_train, y2_train_pred_linear))
rmse_test_y2_linear = np.sqrt(mse(y2_test, y2_test_pred_linear))

X1_train_poly = create_polynomial_features(X1_train, degree)
X1_test_poly = create_polynomial_features(X1_test, degree)
X2_train_poly = create_polynomial_features(X2_train, degree)
X2_test_poly = create_polynomial_features(X2_test, degree)

X1_train_scaled_poly = scaler_poly.fit_transform(X1_train_poly)
X1_test_scaled_poly = scaler_poly.transform(X1_test_poly)
X2_train_scaled_poly = scaler_poly.fit_transform(X2_train_poly)
X2_test_scaled_poly = scaler_poly.transform(X2_test_poly)

lasso_poly_model.fit(X1_train_scaled_poly, y1_train)
lasso_poly_model.fit(X2_train_scaled_poly, y2_train)

y1_train_pred_poly = lasso_poly_model.predict(X1_train_scaled_poly)
y1_test_pred_poly = lasso_poly_model.predict(X1_test_scaled_poly)
y2_train_pred_poly = lasso_poly_model.predict(X2_train_scaled_poly)
y2_test_pred_poly = lasso_poly_model.predict(X2_test_scaled_poly)

rmse_train_y1_poly = np.sqrt(mse(y1_train, y1_train_pred_poly))
rmse_test_y1_poly = np.sqrt(mse(y1_test, y1_test_pred_poly))
rmse_train_y2_poly = np.sqrt(mse(y2_train, y2_train_pred_poly))
rmse_test_y2_poly = np.sqrt(mse(y2_test, y2_test_pred_poly))

print(f"--- Starting Lasso Regression Evaluation (alpha={alpha_value})  for split ratio 0.2---\n")
print(f"  [Linear Features] Lasso Training RMSE (log): {rmse_train_y1_linear:.4f}")
print(f"  [Linear Features] Lasso Test RMSE (log): {rmse_test_y1_linear:.4f}")
print(f"  [Polynomial Features] Lasso Training RMSE (log): {rmse_train_y1_poly:.4f}")
print(f"  [Polynomial Features] Lasso Test RMSE (log): {rmse_test_y1_poly:.4f}\n")

print(f"--- Starting Lasso Regression Evaluation (alpha={alpha_value})  for split ratio 0.3---\n")
print(f"  [Linear Features] Lasso Training RMSE (log): {rmse_train_y2_linear:.4f}")
print(f"  [Linear Features] Lasso Test RMSE (log): {rmse_test_y2_linear:.4f}")
print(f"  [Polynomial Features] Lasso Training RMSE (log): {rmse_train_y2_poly:.4f}")
print(f"  [Polynomial Features] Lasso Test RMSE (log): {rmse_test_y2_poly:.4f}\n")

print("--- Evaluation Complete ---")

--- Starting Lasso Regression Evaluation (alpha=0.01)  for split ratio 0.2---

  [Linear Features] Lasso Training RMSE (log): 0.6388
  [Linear Features] Lasso Test RMSE (log): 0.6442
  [Polynomial Features] Lasso Training RMSE (log): 0.5948
  [Polynomial Features] Lasso Test RMSE (log): 0.6014

--- Starting Lasso Regression Evaluation (alpha=0.01)  for split ratio 0.3---

  [Linear Features] Lasso Training RMSE (log): 0.6422
  [Linear Features] Lasso Test RMSE (log): 0.6345
  [Polynomial Features] Lasso Training RMSE (log): 0.5948
  [Polynomial Features] Lasso Test RMSE (log): 0.5993

--- Evaluation Complete ---


In [7]:
import time
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

X_train_to_tune = X1_train_scaled_poly
y_train_to_tune = y1_train
y_train_values = y_train_to_tune.values
param_grid = {
    'alpha': np.logspace(-3, 0, 4) # Search alphas: 0.001, 0.01, 0.1, 1
}

ridge_model = Ridge(random_state=42)
print(f"Starting GridSearchCV on Ridge Regression (alpha range: {param_grid['alpha']}) with {len(X_train_to_tune)} samples...")
start_time_grid = time.time()
grid_search = GridSearchCV(
    estimator=ridge_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=4
)

grid_search.fit(X_train_to_tune, y_train_values)
end_time_grid = time.time()
tuning_time = end_time_grid - start_time_grid

best_alpha = grid_search.best_params_['alpha']
best_rmse = np.sqrt(abs(grid_search.best_score_))
test_pred = grid_search.predict(X1_test_scaled_poly)
test_rmse = np.sqrt(mse(y1_test, test_pred))

print(f"\n--- GridSearchCV Results ---")
print(f"Tuning Time: {tuning_time:.2f}s")
print(f"Best Alpha found: {best_alpha}")
print(f"Best Cross-Validation Training RMSE (log): {best_rmse:.4f}")
print(f"Final Test RMSE (log) with best model: {test_rmse:.4f}")

Starting GridSearchCV on Ridge Regression (alpha range: [0.001 0.01  0.1   1.   ]) with 1166912 samples...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ........................................alpha=0.001; total time=   1.1s
[CV] END ........................................alpha=0.001; total time=   1.1s
[CV] END .........................................alpha=0.01; total time=   1.1s
[CV] END ........................................alpha=0.001; total time=   1.1s
[CV] END .........................................alpha=0.01; total time=   1.0s
[CV] END ..........................................alpha=0.1; total time=   1.1s
[CV] END ..........................................alpha=0.1; total time=   1.1s
[CV] END .........................................alpha=0.01; total time=   1.2s
[CV] END ..........................................alpha=0.1; total time=   1.1s
[CV] END ..........................................alpha=1.0; total time=   1.0s
[CV] END ..............

In [8]:
import time
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

X_train_to_tune = X1_train_scaled_poly
y_train_to_tune = y1_train
y_train_values = y_train_to_tune.values

param_grid_lasso = {
    'alpha': np.logspace(-3, 0, 4)  # [0.001, 0.01, 0.1, 1.0]
}

lasso_model = Lasso(random_state=42, max_iter=2000)

print(f"Starting GridSearchCV on Lasso Regression (alpha range: {param_grid_lasso['alpha']}) with {len(X_train_to_tune)} samples...")
start_time_grid_lasso = time.time()

grid_search_lasso = GridSearchCV(
    estimator=lasso_model,
    param_grid=param_grid_lasso,
    scoring='neg_mean_squared_error',
    cv=3,       # 3-fold CV to save time on your large dataset
    verbose=2,  # Higher verbose to see progress
    n_jobs=4  # Use 4 CPU cores
)

grid_search_lasso.fit(X_train_to_tune, y_train_values)
end_time_grid_lasso = time.time()
tuning_time_lasso = end_time_grid_lasso - start_time_grid_lasso

best_alpha_lasso = grid_search_lasso.best_params_['alpha']
best_rmse_lasso_cv = np.sqrt(abs(grid_search_lasso.best_score_))

best_lasso_model = grid_search_lasso.best_estimator_
test_pred_lasso = best_lasso_model.predict(X1_test_scaled_poly)
test_rmse_lasso = np.sqrt(mse(y1_test, test_pred_lasso))

print(f"\n--- Lasso GridSearchCV Results ---")
print(f"Tuning Time: {tuning_time_lasso:.2f}s")
print(f"Best Alpha found: {best_alpha_lasso}")
print(f"Best Cross-Validation Training RMSE (log): {best_rmse_lasso_cv:.4f}")
print(f"Final Test RMSE (log) with best model: {test_rmse_lasso:.4f}")

Starting GridSearchCV on Lasso Regression (alpha range: [0.001 0.01  0.1   1.   ]) with 1166912 samples...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END .........................................alpha=0.01; total time=  19.4s
[CV] END .........................................alpha=0.01; total time=   8.7s
[CV] END .........................................alpha=0.01; total time= 2.3min
[CV] END ..........................................alpha=0.1; total time=   9.2s
[CV] END ..........................................alpha=0.1; total time=   6.3s
[CV] END ..........................................alpha=0.1; total time=  10.1s
[CV] END ..........................................alpha=1.0; total time=   3.7s
[CV] END ..........................................alpha=1.0; total time=   3.8s
[CV] END ..........................................alpha=1.0; total time=   3.9s


  model = cd_fast.enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time= 3.8min


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END ........................................alpha=0.001; total time= 4.0min
[CV] END ........................................alpha=0.001; total time= 4.0min

--- Lasso GridSearchCV Results ---
Tuning Time: 244.70s
Best Alpha found: 0.01
Best Cross-Validation Training RMSE (log): 0.6210
Final Test RMSE (log) with best model: 0.6233


In [13]:

import time
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

final_results = {}

def evaluate_model(model, X_train_data, y_train_data, X_test_data, y_test_data, model_name):
    """Fits model, records test RMSE, and prints training time."""
    print(f"Training {model_name}...")
    start_time = time.time()
    model.fit(X_train_data, y_train_data)
    y_test_pred = model.predict(X_test_data)
    rmse = np.sqrt(mse(y_test_data, y_test_pred))
    end_time = time.time()
    training_time = end_time - start_time
    final_results[model_name] = {'Test RMSE (log)': rmse, 'Training Time (s)': training_time}
    print(f"{model_name} finished. Test RMSE: {rmse:.4f} | Training Time: {training_time:.2f}s\n")

xgb_model = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, n_jobs=-1, random_state=42)
evaluate_model(xgb_model, X1_train_scaled_poly, y1_train, X1_test_scaled_poly,y1_test ,"XGBoost (split ratio 0.2)")
evaluate_model(xgb_model, X2_train_scaled_poly, y2_train, X2_test_scaled_poly,y2_test ,"XGBoost (split ratio 0.3)")

lgbm_model = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, n_jobs=-1, random_state=42)
evaluate_model(lgbm_model, X1_train_scaled_poly, y1_train, X1_test_scaled_poly,y1_test, "LightGBM (Split ratio 0.2)")
evaluate_model(lgbm_model, X2_train_scaled_poly, y2_train, X2_test_scaled_poly,y2_test, "LightGBM (Split ratio 0.3)")

N_COMPONENTS = 15
N_SAMPLE_FOR_SVR = 50000
pca = PCA(n_components=N_COMPONENTS, random_state=42)
X1_train_pca = pca.fit_transform(X1_train_scaled_poly)
X1_test_pca = pca.transform(X1_test_scaled_poly)
X2_train_pca = pca.fit_transform(X2_train_scaled_poly)
X2_test_pca = pca.transform(X2_test_scaled_poly)
print("PCA complete.")

n_available_rows_X1 = len(X1_train_pca)
n_available_rows_X2 = len(X2_train_pca)
actual_sample_size_X1 = min(N_SAMPLE_FOR_SVR, n_available_rows_X1)
actual_sample_size_X2 = min(N_SAMPLE_FOR_SVR, n_available_rows_X2)
sample_indices_X1 = np.random.choice(n_available_rows_X1, actual_sample_size_X1, replace=False)
sample_indices_X2 = np.random.choice(n_available_rows_X2, actual_sample_size_X2, replace=False)
X1_train_svr_pca = X1_train_pca[sample_indices_X1]
y1_train_svr_pca = y1_train.iloc[sample_indices_X1]
X2_train_svr_pca = X2_train_pca[sample_indices_X2]
y2_train_svr_pca = y2_train.iloc[sample_indices_X2]

svr_model_pca = SVR(kernel='rbf', C=1.0, epsilon=0.1) 
evaluate_model(
    svr_model_pca,X1_train_svr_pca, y1_train_svr_pca, X1_test_pca,y1_test.values,f"SVR ( RBF + PCA + Sampled {actual_sample_size_X1} for split ratio 0.2)"
)
evaluate_model(
    svr_model_pca, X2_train_svr_pca, y2_train_svr_pca, X2_test_pca,y2_test.values,f"SVR ( RBF + PCA + Sampled {actual_sample_size_X2} for split ratio 0.3)"
)

print("\n" + "="*60)
print("          FINAL MODEL COMPARISON (Test RMSE & Time)         ")
print("="*60)
df_results = pd.DataFrame.from_dict(final_results, orient='index')
df_results = df_results.sort_values(by='Test RMSE (log)')
df_results['Rank'] = np.arange(1, len(df_results) + 1)
print(df_results.to_markdown(floatfmt=".4f"))


Training XGBoost (split ratio 0.2)...
XGBoost (split ratio 0.2) finished. Test RMSE: 0.4566 | Training Time: 7.31s

Training XGBoost (split ratio 0.3)...
XGBoost (split ratio 0.3) finished. Test RMSE: 0.4556 | Training Time: 5.80s

Training LightGBM (Split ratio 0.2)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.123429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12460
[LightGBM] [Info] Number of data points in the train set: 1166912, number of used features: 135
[LightGBM] [Info] Start training from score 6.467280
LightGBM (Split ratio 0.2) finished. Test RMSE: 0.4555 | Training Time: 9.58s

Training LightGBM (Split ratio 0.3)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.110441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12474
[Lig