In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, QuantileRegressor
from sklearn.metrics import mean_squared_error, r2_score
from mapie.regression import MapieRegressor, MapieQuantileRegressor
from mapie.conformity_scores import AbsoluteConformityScore, ResidualNormalisedScore
from mapie.subsample import Subsample
from mapie.metrics import regression_coverage_score
from scipy.stats import uniform
from scikeras.wrappers import KerasRegressor
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from keras import Sequential, Input
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# Factory for the small feed-forward regressor that KerasRegressor wraps
def create_nn_model(input_dim):
    """Build and compile a fully connected regression network.

    The network accepts ``input_dim`` features, applies two 64-unit ReLU
    layers and ends in a single linear output unit. It is compiled with a
    default-configured Adam optimizer and a mean-squared-error loss.
    """
    hidden_units = 64

    # Layers are created in the same order as they are stacked.
    layers = [Input(shape=(input_dim,))]
    layers.extend(Dense(hidden_units, activation='relu') for _ in range(2))
    layers.append(Dense(1, activation='linear'))

    network = Sequential(layers)
    network.compile(optimizer=Adam(), loss='mean_squared_error')
    return network

# MAPIE configurations to benchmark, keyed by strategy name.
# Each value holds the keyword arguments forwarded to the MAPIE estimator.
# Insertion order is the order in which the strategies are run, and any key
# containing "cqr" is routed to the conformal-quantile-regression branch.
STRATEGIES = {
    "split_standard": {"method": "base", "cv": "split", "conformity_score": AbsoluteConformityScore()},
    "split_alternate": {"method": "base", "cv": "split", "conformity_score": ResidualNormalisedScore()},
    "jackknife_plus": {"method": "plus", "cv": -1},
    "jackknife": {"method": "base", "cv": -1},
    "jackknife_minmax": {"method": "minmax", "cv": -1},
    "cv": {"method": "base", "cv": 10},
    "cv_plus": {"method": "plus", "cv": 10},
    "cv_minmax": {"method": "minmax", "cv": 10},
    "jackknife_plus_ab_standard": {
        "method": "plus",
        "cv": Subsample(n_resamplings=50),
        "conformity_score": AbsoluteConformityScore(),
    },
    "cqr_standard": {"method": "quantile", "cv": "split", "alpha": 0.1},
}

# Collect one row of metrics in the module-level `results` list
def save_results(strategy, dataset_name, random_state, mse, r2, coverage, avg_width):
    """Append one run's metrics to the global ``results`` list as a dict.

    The dict keys become the column names of the results table:
    "Strategy", "Dataset", "Random State", "MSE", "R2", "Coverage" and
    "Average Width".
    """
    record = dict([
        ("Strategy", strategy),
        ("Dataset", dataset_name),
        ("Random State", random_state),
        ("MSE", mse),
        ("R2", r2),
        ("Coverage", coverage),
        ("Average Width", avg_width),
    ])
    results.append(record)

# Function to train MAPIE models and collect results
def train_mapie_models(X, y, dataset_name, random_state):
    """Fit every configured MAPIE strategy on one data split and record its metrics.

    The data is split into 200 training samples and 800 test samples (any
    remaining samples are discarded). For each entry of ``STRATEGIES`` the
    function either runs conformal quantile regression (strategy names
    containing "cqr") or wraps each base model (random forest, neural network,
    ridge) in a ``MapieRegressor``. MSE, R2, empirical coverage and mean
    interval width on the test set are stored through ``save_results``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Presumably needs more than 1000 rows so that both
        fixed-size splits are possible -- confirm against the datasets used.
    y : array-like of shape (n_samples,)
        Regression target.
    dataset_name : str
        Label stored in the "Dataset" field of every result row.
    random_state : int
        Seed for the data splits, the random forest, the hyperparameter
        search folds and the MAPIE estimators, so that a given seed
        reproduces the same run. It is also stored with each result row.

    Returns
    -------
    None
        Results are appended to the module-level ``results`` list.
    """
    # Keep the leftover samples out of the way instead of rebinding the `y` argument.
    X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=200, random_state=random_state)
    X_test, _, y_test, _ = train_test_split(X_rest, y_rest, train_size=800, random_state=random_state)

    print(f"Shapes after splitting: X_train: {X_train.shape}, X_test: {X_test.shape}")

    # Base models for MAPIE strategies.
    # NOTE: the Keras model is not seeded here, so neural-network rows can
    # still vary between runs with the same random_state.
    base_models = {
        "rf_model": RandomForestRegressor(n_estimators=50, criterion='squared_error', random_state=random_state),
        "nn_model": KerasRegressor(model=create_nn_model, input_dim=X_train.shape[1], epochs=50, batch_size=10, verbose=0),
        "ridge_model": CustomRidge()
    }

    # Iterates over each strategy and base model to evaluate their performance
    for strategy, params in STRATEGIES.items():
        print(f"Starting {strategy}")
        start_time = time.time()

        # Specific case for conformal quantile regression
        if "cqr" in strategy:
            quantile = 0.5  # Median
            estimator = QuantileRegressor(quantile=quantile)

            # Optimizing hyperparameters using randomized search; the folds are
            # seeded so the selected regularization strength is reproducible.
            params_distributions = dict(alpha=uniform(0, 1))
            optim_model = RandomizedSearchCV(
                estimator,
                param_distributions=params_distributions,
                n_jobs=-1,
                n_iter=50,
                cv=KFold(n_splits=10, shuffle=True, random_state=random_state),
                random_state=random_state
            )
            optim_model.fit(X_train, y_train)
            best_estimator = optim_model.best_estimator_

            # No explicit calibration set is given, so MAPIE carves one out of
            # the training data using the supplied random_state.
            mapie_cqr = MapieQuantileRegressor(best_estimator, **params)
            mapie_cqr.fit(
                X_train, y_train,
                X_calib=None, y_calib=None,
                random_state=random_state
            )
            y_pred_cqr, y_pis_cqr = mapie_cqr.predict(X_test)

            # y_pis_cqr[:, 0, 0] / [:, 1, 0] are the lower / upper interval bounds.
            lower_cqr = y_pis_cqr[:, 0, 0]
            upper_cqr = y_pis_cqr[:, 1, 0]
            mse_cqr = mean_squared_error(y_test, y_pred_cqr)
            r2_cqr = r2_score(y_test, y_pred_cqr)
            coverage_cqr = regression_coverage_score(y_test, lower_cqr, upper_cqr)
            avg_width_cqr = np.mean(upper_cqr - lower_cqr)

            save_results(f"CQR_{strategy}", dataset_name, random_state, mse_cqr, r2_cqr, coverage_cqr, avg_width_cqr)

        else:
            for model_name, model in base_models.items():
                print(f"Training {model_name} with {strategy}")
                # random_state makes MAPIE's own random sampling (e.g. the
                # hold-out drawn for cv="split") repeatable for a given seed.
                mapie = MapieRegressor(estimator=model, **params, n_jobs=-1, random_state=random_state)
                mapie.fit(X_train, y_train)
                y_pred, y_pis = mapie.predict(X_test, alpha=0.1)

                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
                coverage = regression_coverage_score(y_test, y_pis[:, 0], y_pis[:, 1])
                avg_width = np.mean(y_pis[:, 1] - y_pis[:, 0])

                save_results(f"{model_name}_{strategy}", dataset_name, random_state, mse, r2, coverage, avg_width)

        end_time = time.time()
        print(f"Time taken for {strategy}: {end_time - start_time:.2f} seconds")

# Ridge variant whose penalty is derived from the training matrix at fit time
class CustomRidge(Ridge):
    """Ridge regression that sets ``alpha`` from the data on every ``fit`` call.

    The regularization strength is 0.001 times the 2-norm of ``X`` as
    computed by ``np.linalg.norm(X, 2)``.
    """

    def fit(self, X, y, sample_weight=None):
        """Overwrite ``self.alpha`` from ``X`` and delegate to ``Ridge.fit``."""
        data_scale = np.linalg.norm(X, ord=2)
        self.alpha = 0.001 * data_scale
        return super().fit(X, y, sample_weight=sample_weight)

# Read every benchmark CSV up front, then run the experiment on each of them
dataset_files = {
    "Dataset165": "dataset_165.csv",
    "Dataset183": "dataset_183filter.csv",
    "Dataset275": "dataset_275filter.csv",
}
datasets = {name: pd.read_csv(path) for name, path in dataset_files.items()}

# Filled by save_results() while the strategies are evaluated
results = []
for dataset_name, data in datasets.items():
    # The "target" column is the regression target; all other columns are features.
    y = data["target"].values
    X = data.drop(columns=["target"]).values

    print(f"Dataset {dataset_name}: X shape: {X.shape}, y shape: {y.shape}")

    # Repeat the whole benchmark for seeds 0..9
    for random_state in range(10):
        print(f"Running for {dataset_name} with random state {random_state}")
        train_mapie_models(X, y, dataset_name, random_state)


In [None]:
# Turn the collected result rows into a table and write it to disk (no index column)
results_df = pd.DataFrame(data=results)
results_df.to_csv("results.csv", index=False)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the saved metrics table back from results.csv; this rebinds results_df
# to the on-disk version used by the plots below.
results_df = pd.read_csv("results.csv")

def rename_strategy(strategy):
    """Rename the trailing strategy code to a more descriptive plot label.

    The first key of the mapping (in insertion order) that ``strategy`` ends
    with is swapped for its replacement. Only that trailing suffix is
    rewritten; earlier occurrences of the same text are left untouched.
    Names that match no key are returned unchanged.

    Parameters
    ----------
    strategy : str
        Strategy identifier such as ``"rf_model_cv_plus"``.

    Returns
    -------
    str
        The identifier with its suffix renamed, e.g. ``"rf_model_CCP_plus"``.
    """
    mapping = {
        'split_standard': 'ARS_split',  # ARS: split with AbsoluteConformityScore
        'split_alternate': 'NRS_split',  # NRS: split with ResidualNormalisedScore
        'cv': 'CCP',  # CCP: Cross Conformal Prediction
        'cv_plus': 'CCP_plus',  # CCP_plus: Cross Conformal Prediction Plus
        'cv_minmax': 'CCP_minmax',  # CCP_minmax: Cross Conformal Prediction Minmax
        # Drops the suffix but keeps the preceding underscore, so
        # "jackknife_plus_ab_standard" stays distinct from "jackknife_plus".
        'ab_standard': '',
        'standard': 'CQR'  # CQR: Conformal Quantile Regression
    }
    for key, value in mapping.items():
        if strategy.endswith(key):
            # Slice the suffix off instead of str.replace(), which would also
            # rewrite any earlier occurrence of `key` inside the name.
            return strategy[:len(strategy) - len(key)] + value
    return strategy

# Use the seaborn white-grid theme for every figure
sns.set(style="whitegrid")

# Replace the raw strategy codes with their descriptive names
results_df['Strategy'] = results_df['Strategy'].map(rename_strategy)

# Human-readable names for the dataset codes
dataset_rename_map = dict(
    Dataset165='Concrete Compressive Strength',
    Dataset183='Community and Crimes',
    Dataset275='Bike Sharing',
)
results_df['Dataset'] = results_df['Dataset'].replace(dataset_rename_map)

# One pastel color per dataset, shared by all plots in this cell
n_datasets = len(dataset_rename_map)
dataset_colors = sns.color_palette('pastel', n_colors=n_datasets)

# Function to shorten strategy names for plot labels
def shorten_strategy_for_plot(strategy):
    """Drop the first two underscore-separated parts of a strategy name.

    For example ``"rf_model_CCP_plus"`` becomes ``"CCP_plus"``. Names with
    fewer than three parts have nothing left after the prefix is removed,
    so they are returned unchanged rather than as an empty label.

    Parameters
    ----------
    strategy : str
        Strategy name such as ``"rf_model_CCP_plus"``.

    Returns
    -------
    str
        The shortened label.
    """
    # maxsplit=2 keeps everything after the second underscore in one piece.
    parts = strategy.split('_', 2)
    return parts[2] if len(parts) > 2 else strategy

# Shortened strategy labels live in their own column, used as the x axis below
results_df['Strategy Name'] = results_df['Strategy'].apply(shorten_strategy_for_plot)

# Fixed y-axis range for the average-width plots
average_width_ylim = (0, 10)  # Adjust as needed based on actual data range


def _save_barplot(frame, metric, legend_loc, filename, ylim=None):
    """Draw one grouped bar plot of `metric` per strategy, save it and show it."""
    plt.figure(figsize=(12, 8))
    sns.barplot(data=frame, x='Strategy Name', y=metric, hue='Dataset', palette=dataset_colors)
    if ylim is not None:
        plt.ylim(ylim)
    plt.xticks(rotation=90)
    plt.legend(loc=legend_loc)
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()


# Coverage and average width across all datasets and models
_save_barplot(results_df, 'Coverage', 'lower right', 'coverage_all_datasets.png')
_save_barplot(results_df, 'Average Width', 'upper right', 'average_width_all_datasets.png',
              ylim=average_width_ylim)

# The same two plots, restricted to one base model at a time
base_models = ['rf_model', 'nn_model', 'ridge_model']
for base_model in base_models:
    # Keep the rows of the current base model together with the CQR rows
    is_current_model = results_df['Strategy'].str.contains(base_model)
    is_cqr = results_df['Strategy'].str.contains('CQR')
    subset = results_df[is_current_model | is_cqr]

    _save_barplot(subset, 'Coverage', 'lower right', f'coverage_{base_model}.png')
    _save_barplot(subset, 'Average Width', 'upper right', f'average_width_{base_model}.png',
                  ylim=average_width_ylim)


In [None]:
# Grouped bar plot of the R2 score of every strategy, one bar per dataset
r2_figsize = (12, 8)
plt.figure(figsize=r2_figsize)
sns.barplot(x='Strategy', y='R2', hue='Dataset', data=results_df)
plt.title('R² Scores by Strategy and Dataset')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
# Fit the rotated tick labels inside the figure before saving
plt.tight_layout()
plt.savefig('r2_scores.png')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Prefixes that identify the base model inside a strategy name
base_models = ['rf_model', 'nn_model', 'ridge_model']

# Human-readable names for the dataset codes
dataset_rename_map = dict(
    Dataset165='Concrete Compressive Strength',
    Dataset183='Community and Crimes',
    Dataset275='Bike Sharing',
)

# Read the combined results table and relabel its datasets
results_df = pd.read_csv("combined_results.csv")
results_df['Dataset'] = results_df['Dataset'].replace(dataset_rename_map)

# Helper that assigns each strategy name to its base model type
def classify_strategy(strategy):
    """Return the base-model key for a strategy name.

    The first entry of the module-level ``base_models`` list that prefixes
    ``strategy`` wins. Otherwise names containing "CQR" map to ``'CQR'``
    and everything else to ``'Other'``.
    """
    matching_model = next(
        (model for model in base_models if strategy.startswith(model)),
        None,
    )
    if matching_model is not None:
        return matching_model
    return 'CQR' if 'CQR' in strategy else 'Other'

sns.set(style="whitegrid")

# Display names for the model-type keys returned by classify_strategy
full_model_names = {
    'rf_model': 'Random Forest',
    'nn_model': 'Neural Network',
    'ridge_model': 'Ridge Regression',
    'CQR': 'Quantile Regression'
}

# Fixed color per dataset
dataset_colors = {
    'Concrete Compressive Strength': '#1f77b4',  # blue
    'Community and Crimes': '#ff7f0e',           # orange
    'Bike Sharing': '#2ca02c'                    # green
}

# Classify every row by base model, then swap the key for its display name;
# keys without a display name (e.g. 'Other') become NaN
model_keys = results_df['Strategy'].apply(classify_strategy)
results_df['Model Type'] = model_keys.map(full_model_names)

# Keep only the rows whose model type has a display name
has_known_model = results_df['Model Type'].isin(full_model_names.values())
filtered_results_df = results_df[has_known_model]

# R² scores per model type, grouped by dataset
plt.figure(figsize=(12, 8))
sns.barplot(x='Model Type', y='R2', hue='Dataset', data=filtered_results_df, palette=dataset_colors)
plt.title('R² Scores by Base Model and Dataset')
plt.xticks(rotation=45)  # Adjust rotation for better label visibility
plt.legend(title='Dataset', loc='upper right')
plt.tight_layout()
plt.savefig('filtered_r2_scores.png')
plt.show()
