# Paramsemble (Parametric Ensemble Regression) - Basic Usage Guide

This notebook demonstrates the core functionality of the Paramsemble package, including:

- Training ensemble models with ElasticNet and MARS methods
- Model serialization and loading
- Scoring new data with saved models
- SQL export for database deployment
- Performance visualization and comparison

## Setup and Data Generation

First, let's import the necessary libraries and generate synthetic data for demonstration.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from paramsemble import ParamsembleRegressor

# Set random seed for reproducibility
np.random.seed(42)

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Build a reproducible synthetic regression problem:
# 1000 rows, 15 features (10 informative), noise=10.0, fixed seed.
X, y = make_regression(n_samples=1000, n_features=15, n_informative=10,
                       noise=10.0, random_state=42)

# One column label per feature: feature_0, feature_1, ...
feature_names = [f'feature_{i}' for i in range(X.shape[1])]

# Wrap the raw arrays in labelled pandas containers
X_df = pd.DataFrame(data=X, columns=feature_names)
y_series = pd.Series(data=y, name='target')

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_series,
    test_size=0.3,
    random_state=42,
)

# Report the resulting dataset dimensions, one line each
print(
    f"Training set size: {X_train.shape[0]} samples",
    f"Test set size: {X_test.shape[0]} samples",
    f"Number of features: {X_train.shape[1]}",
    sep="\n",
)

## Example 1: ElasticNet Ensemble Method

Let's train a Paramsemble model using the ElasticNet ensemble method.

In [None]:
# Initialize Paramsemble with ElasticNet ensemble
regressor_elastic = ParamsembleRegressor(
    m=50,                    # Generate 50 feature combinations
    f=5,                     # Each combination has 5 features
    sample='unique',         # No duplicate features in combinations
    method='elastic',        # Use ElasticNet for ensemble
    spread=10,               # Select top 10 models for ensemble
    ELM2json='models/constituent_models_elastic.json',
    modeljson='models/ensemble_model_elastic.json',
    random_state=42
)

# The JSON outputs above live under models/ (presumably written during fit()).
# Create the directory up front so a fresh checkout does not fail on a missing
# folder; this is a no-op when it already exists.
Path('models').mkdir(parents=True, exist_ok=True)

print("Training Paramsemble with ElasticNet ensemble...")
# The test split is passed to fit() alongside the training split.
regressor_elastic.fit(X_train, y_train, X_test, y_test)
print("Training complete!")

In [None]:
# View baseline metrics recorded on the fitted regressor
print("\n=== Baseline Model Performance ===")
print(f"Baseline wMAPE: {regressor_elastic.baseline_metrics_['wmape']:.4f}")
print(f"Baseline R²: {regressor_elastic.baseline_metrics_['r2']:.4f}")

# View number of selected models
print(f"\nNumber of models selected for ensemble: {len(regressor_elastic.selected_models_)}")

In [None]:
# Generate predictions on the held-out test features
y_pred_elastic = regressor_elastic.predict(X_test)

# Calculate performance metrics against the true test targets
r2_elastic = r2_score(y_test, y_pred_elastic)
mae_elastic = mean_absolute_error(y_test, y_pred_elastic)

print("\n=== ElasticNet Ensemble Performance ===")
print(f"R² Score: {r2_elastic:.4f}")
print(f"Mean Absolute Error: {mae_elastic:.4f}")

## Example 2: MARS Ensemble Method

Now let's train a Paramsemble model using the MARS (Multivariate Adaptive Regression Splines) ensemble method.

In [None]:
# Initialize Paramsemble with MARS ensemble
# (same settings as the ElasticNet example except for `method` and the output paths)
regressor_mars = ParamsembleRegressor(
    m=50,
    f=5,
    sample='unique',
    method='mars',           # Use MARS for ensemble
    spread=10,
    ELM2json='models/constituent_models_mars.json',
    modeljson='models/ensemble_model_mars.json',
    random_state=42
)

# The JSON outputs above live under models/ (presumably written during fit()).
# Create the directory up front; this is a no-op when it already exists.
Path('models').mkdir(parents=True, exist_ok=True)

print("Training Paramsemble with MARS ensemble...")
regressor_mars.fit(X_train, y_train, X_test, y_test)
print("Training complete!")

In [None]:
# Generate predictions on the held-out test features
y_pred_mars = regressor_mars.predict(X_test)

# Calculate performance metrics against the true test targets
r2_mars = r2_score(y_test, y_pred_mars)
mae_mars = mean_absolute_error(y_test, y_pred_mars)

print("\n=== MARS Ensemble Performance ===")
print(f"R² Score: {r2_mars:.4f}")
print(f"Mean Absolute Error: {mae_mars:.4f}")

## Model Serialization and Scoring Workflow

Demonstrate how to save models and use them for scoring new data.

In [None]:
# Generate new scoring data with the same feature layout as the training set.
# Only the feature matrix is needed, so take element [0] of the returned tuple
# instead of binding the target to `_` (which IPython uses for the last output).
X_new = make_regression(
    n_samples=100,
    n_features=15,
    n_informative=10,
    noise=10.0,
    random_state=123
)[0]

# Build the frame in a single expression and attach a 1-based `id` column whose
# length is derived from the data, so changing n_samples cannot desynchronise it.
X_new_df = pd.DataFrame(X_new, columns=feature_names).assign(
    id=range(1, len(X_new) + 1)
)

print(f"New scoring dataset: {X_new_df.shape[0]} samples")

In [None]:
# Score using saved ElasticNet model.
# The `id` column is dropped from the feature frame and passed separately.
predictions_elastic = regressor_elastic.score_from_json(
    X_new_df.drop('id', axis=1),
    'models/ensemble_model_elastic.json',
    id_column=X_new_df['id']
)

print("\n=== Predictions from Saved ElasticNet Model ===")
print(predictions_elastic.head(10))

In [None]:
# Score using saved MARS model.
# The `id` column is dropped from the feature frame and passed separately.
predictions_mars = regressor_mars.score_from_json(
    X_new_df.drop('id', axis=1),
    'models/ensemble_model_mars.json',
    id_column=X_new_df['id']
)

print("\n=== Predictions from Saved MARS Model ===")
print(predictions_mars.head(10))

## SQL Export for Database Deployment

Export the trained model as SQL code for deployment in database environments.

In [None]:
# Export ElasticNet model to SQL
sql_code_elastic = regressor_elastic.export_sql(
    'models/ensemble_model_elastic.json',
    table_name='input_features',
    id_column='sample_id'
)

print("\n=== SQL Export (ElasticNet Model) ===")
print("\nFirst 1500 characters of generated SQL:")
print(sql_code_elastic[:1500])
# Only announce truncation when the preview actually cut something off.
if len(sql_code_elastic) > 1500:
    print("\n... (truncated)")

In [None]:
# Save SQL to file.
# A single path variable drives both the write and the usage hints below, so
# the printed commands always point at the file that was actually written.
sql_path = Path('models/ensemble_model_elastic.sql')
sql_path.parent.mkdir(parents=True, exist_ok=True)  # no-op if models/ already exists
# Explicit encoding keeps the output independent of the platform default.
sql_path.write_text(sql_code_elastic, encoding='utf-8')

# as_posix() keeps forward slashes in the messages on every platform.
sql_file = sql_path.as_posix()
print(f"SQL code saved to: {sql_file}")
print("\nYou can now execute this SQL in your database:")
print(f"  - PostgreSQL: psql -d your_database -f {sql_file}")
print(f"  - MySQL: mysql -u user -p database < {sql_file}")
print(f"  - SQL Server: sqlcmd -S server -d database -i {sql_file}")

## Performance Visualization and Comparison

Visualize and compare the performance of different models.

In [None]:
# Side-by-side actual-vs-predicted scatter plots, one panel per ensemble
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Endpoints of the red dashed y = x reference line (perfect predictions)
diag = [y_test.min(), y_test.max()]

# (axis, predictions, extra scatter kwargs, panel title)
# The ElasticNet panel uses matplotlib's default colour; MARS is drawn in green.
panels = (
    (axes[0], y_pred_elastic, {},
     f'ElasticNet Ensemble\nR² = {r2_elastic:.4f}, MAE = {mae_elastic:.2f}'),
    (axes[1], y_pred_mars, {'color': 'green'},
     f'MARS Ensemble\nR² = {r2_mars:.4f}, MAE = {mae_mars:.2f}'),
)

for panel_ax, panel_pred, scatter_kwargs, panel_title in panels:
    panel_ax.scatter(y_test, panel_pred, alpha=0.5, s=30, **scatter_kwargs)
    panel_ax.plot(diag, diag, 'r--', lw=2)
    panel_ax.set_xlabel('Actual Values', fontsize=12)
    panel_ax.set_ylabel('Predicted Values', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Residual plots: residual (actual - predicted) against the predicted value
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Residuals for each ensemble on the test set
residuals_elastic = y_test - y_pred_elastic
residuals_mars = y_test - y_pred_mars

# (axis, predictions, residuals, extra scatter kwargs, panel title)
panels = (
    (axes[0], y_pred_elastic, residuals_elastic, {},
     'ElasticNet Ensemble - Residual Plot'),
    (axes[1], y_pred_mars, residuals_mars, {'color': 'green'},
     'MARS Ensemble - Residual Plot'),
)

for panel_ax, panel_pred, panel_resid, scatter_kwargs, panel_title in panels:
    panel_ax.scatter(panel_pred, panel_resid, alpha=0.5, s=30, **scatter_kwargs)
    # Zero line: points above it are under-predicted, points below over-predicted
    panel_ax.axhline(y=0, color='r', linestyle='--', lw=2)
    panel_ax.set_xlabel('Predicted Values', fontsize=12)
    panel_ax.set_ylabel('Residuals', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Model comparison bar chart: baseline R² next to each ensemble's test-set R²
models = ['Baseline\n(Random Forest)', 'ElasticNet\nEnsemble', 'MARS\nEnsemble']
r2_scores = [
    regressor_elastic.baseline_metrics_['r2'],
    r2_elastic,
    r2_mars
]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(models, r2_scores, color=['#1f77b4', '#ff7f0e', '#2ca02c'], alpha=0.7)
ax.set_ylabel('R² Score', fontsize=12)
ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')

# Zoom the y-axis to the score range with 5% padding on each side. The padding
# is based on the magnitude so a negative R² widens the range instead of
# clipping the bar; for positive scores this equals min*0.95 / max*1.05.
r2_low, r2_high = min(r2_scores), max(r2_scores)
ax.set_ylim([r2_low - 0.05 * abs(r2_low), r2_high + 0.05 * abs(r2_high)])
ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, score in zip(bars, r2_scores):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{score:.4f}',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Constituent model performance distribution.
# Each entry of constituent_models_ / selected_models_ is indexed by 'r2' here.
constituent_r2_scores = [model['r2'] for model in regressor_elastic.constituent_models_]
selected_r2_scores = [model['r2'] for model in regressor_elastic.selected_models_]

fig, ax = plt.subplots(figsize=(12, 6))
# Overlay the selected models on top of the full population of constituent models
ax.hist(constituent_r2_scores, bins=20, alpha=0.6, label='All Constituent Models', color='blue')
ax.hist(selected_r2_scores, bins=10, alpha=0.8, label='Selected for Ensemble', color='orange')
# Vertical reference line at the baseline model's R²
ax.axvline(regressor_elastic.baseline_metrics_['r2'], color='red', linestyle='--',
           linewidth=2, label='Baseline R²')
ax.set_xlabel('R² Score', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Distribution of Constituent Model Performance', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Create summary table.
# MAE is only computed in this notebook for the two ensembles, and wMAPE is only
# read from the baseline metrics, hence the 'N/A' placeholders.
summary_data = {
    'Model': ['Baseline (Random Forest)', 'ElasticNet Ensemble', 'MARS Ensemble'],
    'R² Score': [
        regressor_elastic.baseline_metrics_['r2'],
        r2_elastic,
        r2_mars
    ],
    'MAE': [
        'N/A',
        f"{mae_elastic:.4f}",
        f"{mae_mars:.4f}"
    ],
    'wMAPE': [
        f"{regressor_elastic.baseline_metrics_['wmape']:.4f}",
        'N/A',
        'N/A'
    ]
}

summary_df = pd.DataFrame(summary_data)
print("\n=== Model Performance Summary ===")
print(summary_df.to_string(index=False))

print(f"\n\nTotal constituent models trained: {len(regressor_elastic.constituent_models_)}")
print(f"Models selected for ensemble: {len(regressor_elastic.selected_models_)}")
print(f"Feature combinations per model: {regressor_elastic.f}")
print(f"Total features available: {X_train.shape[1]}")

## Conclusion

This notebook demonstrated:

1. **Training Paramsemble models** with both ElasticNet and MARS ensemble methods
2. **Model serialization** - saving models to JSON for later use
3. **Scoring workflow** - loading saved models and generating predictions on new data
4. **SQL export** - converting trained models to executable SQL for database deployment
5. **Performance visualization** - comparing model performance with various plots

### Key Takeaways:

- Paramsemble automatically generates and evaluates multiple feature combinations
- The ensemble approach combines the best-performing constituent models
- Both ElasticNet and MARS methods can capture different patterns in the data
- Models can be easily serialized and deployed in production environments
- SQL export enables database-native predictions without Python dependencies

### Next Steps:

- Experiment with different hyperparameters (m, f, spread)
- Try different sampling methods ('unique' vs 'replace')
- Apply Paramsemble to your own regression problems
- Deploy SQL models in your database environment