In [2]:
%%time
import numpy as np
import pandas as pd
import time
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error

# Import various regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

# Load dataset
data = load_diabetes()
X, y = data.data, data.target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # random_state=42

# Define regressors to test
regressors = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(),
    "Support Vector Machine": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
#   "Neural Network (MLP)": MLPRegressor()
}

# Evaluate regressors using cross-validation
best_model = None
best_score = float("inf")  # Using MSE, lower is better
results = {}

for name, reg in regressors.items():
    pipeline = make_pipeline(StandardScaler(), reg)
    start_time = time.time()
    scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring="neg_mean_squared_error")
    end_time = time.time()
    mean_score = -np.mean(scores)  # Convert to positive MSE
    elapsed_time = end_time - start_time
    results[name] = {"Mean Squared Error": mean_score, "Time (seconds)": elapsed_time}

    if mean_score < best_score:  # Lower MSE is better
        best_score = mean_score
        best_model = pipeline

# Fit the best model and evaluate on test data
best_model.fit(X_train, y_train)
test_mse = mean_squared_error(y_test, best_model.predict(X_test))

# Display results
results_df = pd.DataFrame.from_dict(results, orient="index")
results_df = results_df.sort_values(by="Mean Squared Error")

print("Model Performance:")
print(results_df)
print(f"\nBest Model: {best_model.named_steps}\nTest Mean Squared Error: {test_mse:.4f}")

Model Performance:
                        Mean Squared Error  Time (seconds)
Ridge Regression               2769.327130        0.002794
Linear Regression              2772.009236        0.007952
Lasso Regression               2775.291297        0.010754
AdaBoost                       3195.103025        0.283566
Random Forest                  3288.777483        0.944686
K-Nearest Neighbors            3295.015673        0.008910
Gradient Boosting              3427.435039        0.403535
Support Vector Machine         4792.489688        0.021765
Decision Tree                  7213.664194        0.017416

Best Model: {'standardscaler': StandardScaler(), 'ridge': Ridge()}
Test Mean Squared Error: 3586.7139
CPU times: total: 1.72 s
Wall time: 1.71 s
