# Paramsemble Advanced Examples

This notebook demonstrates advanced usage patterns and real-world scenarios:

- Working with real datasets
- Hyperparameter tuning
- Feature importance analysis
- Model inspection
- Production deployment workflows
- Sampling strategy comparison

In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_california_housing, load_diabetes
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from paramsemble import ParamsembleRegressor

np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

## Example 1: California Housing Dataset

Let's apply Paramsemble to the California Housing dataset to predict median house values.

In [None]:
# Fetch the California Housing data and wrap it in labelled pandas objects
housing = fetch_california_housing()
y = pd.Series(housing.target, name='MedHouseValue')
X = pd.DataFrame(housing.data, columns=housing.feature_names)

# Quick look at what was loaded: shape, column names, and a preview
print("Dataset shape:", X.shape)
print("\nFeatures:", X.columns.tolist(), sep="\n")
print("\nFirst few rows:", X.head(), sep="\n")

In [None]:
# Split data FIRST, then standardize.
# Fitting the scaler on the full dataset would let the test rows influence the
# mean/variance used to transform the training rows (data leakage), which makes
# the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features using statistics learned from the training rows only
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X.columns,
    index=X_train_raw.index,  # keep the row labels produced by the split
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X.columns,
    index=X_test_raw.index,
)

# Kept so the name `X_scaled` remains available; it is now the full feature
# matrix transformed with the train-only scaler.
X_scaled = pd.DataFrame(
    scaler.transform(X),
    columns=X.columns
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

In [None]:
# Train Paramsemble model
# Constructor settings are gathered in one mapping so they are easy to scan;
# the fitted model is written to the path given by `modeljson`.
housing_params = dict(
    m=100,
    f=4,
    sample='unique',
    method='elastic',
    spread=15,
    modeljson='models/housing_model.json',
    random_state=42,
)
regressor_housing = ParamsembleRegressor(**housing_params)

print("Training Paramsemble on California Housing data...")
regressor_housing.fit(X_train, y_train, X_test, y_test)
print("Training complete!")

In [None]:
# Evaluate performance on the held-out test split.
# The metric helpers (r2_score, mean_squared_error, mean_absolute_error) come
# from the notebook's import cell rather than being imported mid-notebook.
y_pred = regressor_housing.predict(X_test)

print("\n=== Model Performance ===")
# Baseline R² is read from the fitted regressor; ensemble metrics are computed here
print(f"Baseline R²: {regressor_housing.baseline_metrics_['r2']:.4f}")
print(f"Ensemble R²: {r2_score(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")

## Example 2: Hyperparameter Tuning

Compare different hyperparameter configurations to find the best setup.

In [None]:
# Test different configurations
# Each entry is one ParamsembleRegressor setup to train and score.
configs = [
    {'m': 50, 'f': 3, 'spread': 10, 'method': 'elastic'},
    {'m': 100, 'f': 4, 'spread': 15, 'method': 'elastic'},
    {'m': 150, 'f': 5, 'spread': 20, 'method': 'elastic'},
    {'m': 100, 'f': 4, 'spread': 15, 'method': 'mars'},
]

results = []
for i, config in enumerate(configs):
    print(f"\nTesting configuration {i+1}/{len(configs)}: {config}")

    regressor = ParamsembleRegressor(
        m=config['m'],
        f=config['f'],
        spread=config['spread'],
        method=config['method'],
        sample='unique',
        random_state=42
    )

    # Fit and predict with the model built above. The original cell called
    # these on an undefined name (`paramsemble`), which raised NameError.
    regressor.fit(X_train, y_train, X_test, y_test)
    # Separate name so the housing model's `y_pred` from the evaluation cell
    # is not silently overwritten.
    y_pred_config = regressor.predict(X_test)

    r2 = r2_score(y_test, y_pred_config)
    mae = mean_absolute_error(y_test, y_pred_config)

    results.append({
        'config': f"m={config['m']}, f={config['f']}, spread={config['spread']}, {config['method']}",
        'r2': r2,
        'mae': mae
    })

    print(f"  R²: {r2:.4f}, MAE: {mae:.4f}")

# One row per configuration, in the order tested
results_df = pd.DataFrame(results)
print("\n=== Configuration Comparison ===")
print(results_df.to_string(index=False))

In [None]:
# Visualize hyperparameter comparison: one horizontal bar panel per metric
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

bar_positions = range(len(results_df))
config_labels = [f"Config {pos+1}" for pos in bar_positions]

# (results column, bar colour, x-axis label, panel title)
panel_specs = [
    ('r2', 'steelblue', 'R² Score', 'R² Score by Configuration'),
    ('mae', 'coral', 'Mean Absolute Error', 'MAE by Configuration'),
]

for panel_ax, (metric_col, bar_color, x_label, panel_title) in zip(axes, panel_specs):
    panel_ax.barh(bar_positions, results_df[metric_col], color=bar_color, alpha=0.7)
    panel_ax.set_yticks(bar_positions)
    panel_ax.set_yticklabels(config_labels)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## Example 3: Feature Importance Analysis

Analyze which features appear most frequently in the selected constituent models.

In [None]:
# Count feature appearances in selected models
selected_models = regressor_housing.selected_models_
n_selected = len(selected_models)

feature_counts = {}
all_features = (name for model in selected_models for name in model['features'])
for name in all_features:
    feature_counts[name] = feature_counts.get(name, 0) + 1

# Create DataFrame and sort (most frequently used features first)
importance_rows = []
for name, n_hits in feature_counts.items():
    importance_rows.append({
        'Feature': name,
        'Count': n_hits,
        # share of selected models that include this feature
        'Percentage': n_hits / n_selected * 100,
    })
feature_importance = pd.DataFrame(importance_rows).sort_values('Count', ascending=False)

print("\n=== Feature Importance (by appearance in selected models) ===")
print(feature_importance.to_string(index=False))

In [None]:
# Visualize feature importance
fig, ax = plt.subplots(figsize=(12, 6))

bars = ax.barh(
    feature_importance['Feature'],
    feature_importance['Count'],
    color='teal',
    alpha=0.7,
)

ax.set_xlabel('Number of Appearances in Selected Models', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Feature Importance in Paramsemble Ensemble', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

# Add percentage labels just past the end of each bar, vertically centred on it
for bar, pct in zip(bars, feature_importance['Percentage']):
    label_x = bar.get_width()
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(label_x, label_y, f' {pct:.1f}%', ha='left', va='center', fontsize=10)

plt.tight_layout()
plt.show()

## Example 4: Model Inspection

Examine the saved model JSON to understand the ensemble structure.

In [None]:
# Load and inspect model JSON (the file written when the housing model was trained)
with open('models/housing_model.json', 'r') as model_file:
    model_data = json.load(model_file)

# Top-level summary of the saved ensemble
print("=== Model Structure ===")
print("Ensemble method: {}".format(model_data['method']))
print("Number of constituent models: {}".format(len(model_data['constituent_models'])))
print("\nMetadata:")
for meta_key, meta_value in model_data['metadata'].items():
    print("  {}: {}".format(meta_key, meta_value))

In [None]:
# Examine a constituent model (the first entry in the saved list)
print("\n=== Example Constituent Model ===")
example_model = model_data['constituent_models'][0]

print("Model ID: {}".format(example_model['model_id']))
print("Features: {}".format(example_model['features']))
print("wMAPE: {:.4f}".format(example_model['wmape']))
print("R²: {:.4f}".format(example_model['r2']))

# equation_dict maps each term name to its numeric coefficient
print("\nEquation coefficients:")
for term, weight in example_model['equation_dict'].items():
    print("  {}: {:.6f}".format(term, weight))

In [None]:
# Examine ensemble equation
print("\n=== Ensemble Equation ===")
print("Coefficients for combining constituent model predictions:")
ensemble_terms = model_data['ensemble_equation']
for term_name in ensemble_terms:
    print("  {}: {:.6f}".format(term_name, ensemble_terms[term_name]))

## Example 5: Production Deployment Workflow

Demonstrates a complete workflow for deploying Paramsemble models in production.

In [None]:
# Step 1: Train and save model
print("Step 1: Training production model...")

# Settings for the production model; `modeljson` is where the fitted model is saved
production_params = dict(
    m=100,
    f=4,
    spread=15,
    method='elastic',
    modeljson='models/production_model.json',
    random_state=42,
)
regressor_prod = ParamsembleRegressor(**production_params)
regressor_prod.fit(X_train, y_train, X_test, y_test)

print("✓ Model trained and saved")

In [None]:
# Step 2: Export to SQL for database deployment
print("\nStep 2: Exporting to SQL...")

# Generate the SQL text from the saved model JSON
sql_code = regressor_prod.export_sql(
    'models/production_model.json',
    table_name='housing_features',
    id_column='property_id',
)

# Persist the generated SQL next to the model file
sql_output_path = 'models/production_model.sql'
with open(sql_output_path, 'w') as sql_file:
    sql_file.write(sql_code)

print("✓ SQL export saved to: " + sql_output_path)
print("  SQL length: {} characters".format(len(sql_code)))

In [None]:
# Step 3: Simulate batch scoring
print("\nStep 3: Batch scoring simulation...")

# Create batch of new data (at most `batch_size` rows from the test split)
batch_size = 1000
X_batch = X_test.head(batch_size).copy()
# head() returns fewer rows when the test split is smaller than batch_size, so
# size the ID range from the actual frame to avoid a length-mismatch error.
X_batch['property_id'] = range(1, len(X_batch) + 1)

# Score using saved model: features go in without the ID column, IDs are
# passed separately so they can be attached to the output.
predictions = regressor_prod.score_from_json(
    X_batch.drop('property_id', axis=1),
    'models/production_model.json',
    id_column=X_batch['property_id']
)

print(f"✓ Scored {len(predictions)} properties")
print("\nSample predictions:")
print(predictions.head(10))

In [None]:
# Step 4: Save predictions to CSV for downstream use
print("\nStep 4: Saving predictions...")
predictions_csv_path = 'models/batch_predictions.csv'
predictions.to_csv(predictions_csv_path, index=False)  # omit the DataFrame index
print("✓ Predictions saved to: " + predictions_csv_path)

In [None]:
# Step 5: Model monitoring - check prediction distribution
print("\nStep 5: Model monitoring...")

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, stats_ax = axes
predicted = predictions['predicted']

# Left panel: histogram of predicted values
hist_ax.hist(predicted, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
hist_ax.set_xlabel('Predicted Value', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Distribution of Predictions', fontsize=14, fontweight='bold')
hist_ax.grid(True, alpha=0.3)

# Right panel: summary statistics rendered as a text box.
# The trailing empty entry keeps the final newline of the text block.
stats_lines = [
    "Prediction Statistics:",
    "",
    f"Mean: {predicted.mean():.4f}",
    f"Median: {predicted.median():.4f}",
    f"Std Dev: {predicted.std():.4f}",
    f"Min: {predicted.min():.4f}",
    f"Max: {predicted.max():.4f}",
    "",
    f"Total Predictions: {len(predictions)}",
    "",
]
stats_text = "\n".join(stats_lines)

text_box = dict(boxstyle='round', facecolor='wheat', alpha=0.3)
stats_ax.text(0.1, 0.5, stats_text, fontsize=12, family='monospace',
              verticalalignment='center', bbox=text_box)
stats_ax.axis('off')
stats_ax.set_title('Summary Statistics', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("✓ Model monitoring complete")

## Example 6: Comparing Sampling Methods

Compare 'unique' vs 'replace' sampling strategies.

In [None]:
# Load diabetes dataset (smaller for quick comparison)
diabetes = load_diabetes()
y_diab = pd.Series(diabetes.target)
X_diab = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_diab,
    y_diab,
    test_size=0.2,
    random_state=42,
)

n_samples, n_features = X_diab.shape[0], X_diab.shape[1]
print(f"Diabetes dataset: {n_samples} samples, {n_features} features")

In [None]:
# Train with 'unique' sampling
print("\nTraining with 'unique' sampling...")

unique_params = dict(
    m=50,
    f=5,
    sample='unique',
    method='elastic',
    spread=10,
    random_state=42,
)
regressor_unique = ParamsembleRegressor(**unique_params)
regressor_unique.fit(X_train_d, y_train_d, X_test_d, y_test_d)

# Score on the diabetes test split
y_pred_unique = regressor_unique.predict(X_test_d)
r2_unique = r2_score(y_test_d, y_pred_unique)

print("R² with 'unique' sampling: {:.4f}".format(r2_unique))

In [None]:
# Train with 'replace' sampling
print("\nTraining with 'replace' sampling...")

replace_params = dict(
    m=50,
    f=5,
    sample='replace',
    method='elastic',
    spread=10,
    random_state=42,
)
regressor_replace = ParamsembleRegressor(**replace_params)
regressor_replace.fit(X_train_d, y_train_d, X_test_d, y_test_d)

# Score on the diabetes test split
y_pred_replace = regressor_replace.predict(X_test_d)
r2_replace = r2_score(y_test_d, y_pred_replace)

print("R² with 'replace' sampling: {:.4f}".format(r2_replace))

In [None]:
# Compare sampling methods: one bar per strategy, labelled with its R²
fig, ax = plt.subplots(figsize=(10, 6))

methods = ['Unique Sampling', 'Replace Sampling']
scores = [r2_unique, r2_replace]
colors = ['#2ecc71', '#e74c3c']

bars = ax.bar(methods, scores, color=colors, alpha=0.7, edgecolor='black', linewidth=2)
ax.set_ylabel('R² Score', fontsize=12)
ax.set_title('Sampling Method Comparison', fontsize=14, fontweight='bold')

# Pad the y-range by 5% of each bound's magnitude. R² can be negative, and
# scaling a negative bound by 0.95 / 1.05 would move it TOWARD zero and clip
# the bars; subtracting/adding |bound| * 0.05 always widens the range. For
# positive scores this gives the same limits as min * 0.95 and max * 1.05.
lowest, highest = min(scores), max(scores)
ax.set_ylim([lowest - 0.05 * abs(lowest), highest + 0.05 * abs(highest)])
ax.grid(True, alpha=0.3, axis='y')

# Print each score at the end of its bar, horizontally centred
for bar, score in zip(bars, scores):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{score:.4f}',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## Summary

This advanced notebook demonstrated:

1. **Real-world datasets** - Applied Paramsemble to California Housing and Diabetes datasets
2. **Hyperparameter tuning** - Compared different configurations to optimize performance
3. **Feature importance** - Analyzed which features contribute most to the ensemble
4. **Model inspection** - Examined the internal structure of saved models
5. **Production workflow** - Complete deployment pipeline from training to scoring
6. **Sampling strategies** - Compared 'unique' vs 'replace' sampling methods

### Best Practices:

- **Standardize features** before training for better Lasso performance
- **Tune hyperparameters** (m, f, spread) based on your dataset size and complexity
- **Monitor predictions** in production to detect data drift
- **Use SQL export** for high-performance database deployments
- **Save models** with descriptive names and version control

### Performance Tips:

- Larger `m` values explore more feature combinations but increase training time
- Smaller `f` values create simpler models that may generalize better
- Adjust `spread` based on the number of models that outperform baseline
- MARS ensembles can capture non-linear patterns but are more complex
- ElasticNet ensembles are faster and often sufficient for linear relationships