In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
path = str(Path.cwd().parent)
print(path)
sys.path.insert(1, path)

import numpy as np
import pandas as pd
import skforecast

print(skforecast.__version__)

/home/joaquin/Documents/GitHub/skforecast
0.19.0


In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
import timeit

In [3]:
class FastLinearRegression:
    """
    Fast linear regression with using numpy linalg.solve as primary method and
    numpy lstsq as fallback method in case of multicollinearity. This class is
    designed to be a lightweight alternative to sklearn's LinearRegression.
    
    Attributes
    ----------
    intercept_ : float
        The intercept term
    coef_ : np.ndarray
        The coefficient array
    """
    
    def __init__(self):
        self.intercept_ = None
        self.coef_ = None
    
    def fit(self, X, y):
        """
        Fit the linear regression model.
        
        Parameters
        ----------
        X : np.ndarray
            Feature matrix of shape (n_samples, n_features)
        y : np.ndarray
            Target values of shape (n_samples,)
            
        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        
        # Add intercept column
        X_with_intercept = np.column_stack([np.ones(len(X)), X])
        
        try:
            # Try fastest method: closed-form solution
            XtX = X_with_intercept.T @ X_with_intercept
            coefficients = np.linalg.solve(XtX, X_with_intercept.T @ y)
            
        except np.linalg.LinAlgError:
            # Fallback to lstsq (handles rank-deficient matrices)
            coefficients = np.linalg.lstsq(X_with_intercept, y, rcond=None)[0]
        
        self.intercept_ = coefficients[0]
        self.coef_ = coefficients[1:]
        
        return self
    
    def predict(self, X):
        """
        Predict using the linear model.
        
        Parameters
        ----------
        X : np.ndarray
            Feature matrix of shape (n_samples, n_features)
            
        Returns
        -------
        y_pred : np.ndarray
            Predicted values of shape (n_samples,)
        """
        if self.intercept_ is None or self.coef_ is None:
            raise ValueError("Model must be fitted before making predictions")
        
        X = np.asarray(X)
        return X @ self.coef_ + self.intercept_

In [4]:

# Generate test data with different sizes
# ==============================================================================
datasets = {
    "Small (100x5)": (100, 5),
    "Medium (1000x10)": (1000, 10),
    "Large (10000x20)": (10000, 20),
    "XLarge (50000x50)": (50000, 50),
}

results = []

for name, (n_samples, n_features) in datasets.items():
    print(f"\n{name} dataset: {n_samples} samples, {n_features} features")
    print("=" * 60)
    
    # Generate random data
    np.random.seed(123)
    X, y = make_regression(n_samples=n_samples, n_features=n_features, noise=0.1)
    
    # Benchmark sklearn LinearRegression
    lr_sklearn = LinearRegression()
    start = timeit.default_timer()
    lr_sklearn.fit(X, y)
    y_pred_sklearn = lr_sklearn.predict(X)
    elapsed_sklearn = timeit.default_timer() - start
    
    # Benchmark FastLinearRegression
    lr_fast = FastLinearRegression()
    start = timeit.default_timer()
    lr_fast.fit(X, y)
    y_pred_fast = lr_fast.predict(X)
    elapsed_fast = timeit.default_timer() - start
    
    # Compare coefficients
    coef_diff = np.max(np.abs(lr_sklearn.coef_ - lr_fast.coef_))
    intercept_diff = np.abs(lr_sklearn.intercept_ - lr_fast.intercept_)
    pred_diff = np.max(np.abs(y_pred_sklearn - y_pred_fast))
    
    # Store results
    results.append({
        "Dataset": name,
        "sklearn time (s)": elapsed_sklearn,
        "Fast time (s)": elapsed_fast,
        "Speedup": elapsed_sklearn / elapsed_fast,
        "Max coef diff": coef_diff,
        "Intercept diff": intercept_diff,
        "Max pred diff": pred_diff,
    })
    
    print(f"sklearn time: {elapsed_sklearn:.6f} s")
    print(f"Fast time:    {elapsed_fast:.6f} s")
    print(f"Speedup:      {elapsed_sklearn / elapsed_fast:.2f}x")
    print(f"Max coefficient difference: {coef_diff:.2e}")
    print(f"Intercept difference:       {intercept_diff:.2e}")
    print(f"Max prediction difference:  {pred_diff:.2e}")

# Summary table
# ==============================================================================
results_df = pd.DataFrame(results)
print("\n\nSummary Results:")
print("=" * 80)
print(results_df.to_string(index=False))


Small (100x5) dataset: 100 samples, 5 features
sklearn time: 0.001565 s
Fast time:    0.000205 s
Speedup:      7.64x
Max coefficient difference: 2.13e-13
Intercept difference:       3.94e-15
Max prediction difference:  6.82e-13

Medium (1000x10) dataset: 1000 samples, 10 features
sklearn time: 0.004866 s
Fast time:    0.000293 s
Speedup:      16.58x
Max coefficient difference: 1.71e-13
Intercept difference:       4.92e-15
Max prediction difference:  1.02e-12

Large (10000x20) dataset: 10000 samples, 20 features
sklearn time: 0.039853 s
Fast time:    0.012508 s
Speedup:      3.19x
Max coefficient difference: 1.14e-13
Intercept difference:       3.16e-16
Max prediction difference:  9.66e-13

XLarge (50000x50) dataset: 50000 samples, 50 features
sklearn time: 1.174337 s
Fast time:    0.122701 s
Speedup:      9.57x
Max coefficient difference: 9.24e-13
Intercept difference:       1.37e-15
Max prediction difference:  5.83e-12


Summary Results:
          Dataset  sklearn time (s)  Fast time