# Calibration Playground

### Prepare Workspace

In [72]:
# Import system libraries
import os
import sys
import warnings
warnings.filterwarnings("ignore")

# Import data manipulation librariaes
import pandas as pd
import numpy as np

# Import statistics libraries
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.isotonic import IsotonicRegression

# Set working directory
os.chdir('/Users/jessicarapson/Documents/GitHub/water-supply-forecast')

### Perform Isotonic Regression

In [292]:
# Import predictions
final_val = pd.read_csv('models/calibration_data/final_val.csv')
val_gt = pd.read_csv('models/calibration_data/val_gt.csv')

# Assuming you have predictions for the 10th, 50th, and 90th percentiles separately
quantiles = [0.1, 0.5, 0.9]
predictions_10th = final_val['volume_10']
predictions_50th = final_val['volume_50']
predictions_90th = final_val['volume_90']
ground_truth = val_gt['volume']

# Fit isotonic regression separately for each quantile
iso_reg_10th = IsotonicRegression(out_of_bounds='clip')
iso_reg_50th = IsotonicRegression(out_of_bounds='clip')
iso_reg_90th = IsotonicRegression(out_of_bounds='clip')

iso_reg_10th.fit(predictions_10th, ground_truth)
iso_reg_50th.fit(predictions_50th, ground_truth)
iso_reg_90th.fit(predictions_90th, ground_truth)

# Calibrate predictions for each quantile separately
calibrated_predictions_10th = iso_reg_10th.predict(predictions_10th)
calibrated_predictions_50th = iso_reg_50th.predict(predictions_50th)
calibrated_predictions_90th = iso_reg_90th.predict(predictions_90th)

### Calculate Pinball Loss

In [282]:
# Define a function to compute quantile loss for a single quantile
def quantile_loss(y_true, y_pred, q):
    residual = y_true - y_pred
    return np.mean(2 * np.maximum(q * residual, (q - 1) * residual))

# Calculate average mean quantile loss across quantiles of interest
average_mean_quantile_loss = np.mean([
    quantile_loss(ground_truth, predictions_10th, quantiles[0]),
    quantile_loss(ground_truth, predictions_50th, quantiles[1]),
    quantile_loss(ground_truth, predictions_90th, quantiles[2])
])

print("Average Mean Quantile Loss:", average_mean_quantile_loss)

# Calculate average mean quantile loss across quantiles of interest
average_mean_quantile_loss = np.mean([
    quantile_loss(ground_truth, calibrated_predictions_10th, quantiles[0]),
    quantile_loss(ground_truth, calibrated_predictions_50th, quantiles[1]),
    quantile_loss(ground_truth, calibrated_predictions_90th, quantiles[2])
])

print("Average Mean Quantile Loss:", average_mean_quantile_loss)

Average Mean Quantile Loss: 109.10394376162246
Average Mean Quantile Loss: 162.84570497572648


### Perform Isotonic Regression With Cross Validation

In [210]:
# Load data
final_val = pd.read_csv('models/calibration_data/final_val.csv')
val_gt = pd.read_csv('models/calibration_data/val_gt.csv')

# List of quantiles
quantiles = [10, 50, 90]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    final_val, val_gt, test_size=0.2, random_state=42)

# Number of folds for cross-validation
num_folds = 5

# Dictionary to store calibrated predictions for each quantile across folds
calibrated_predictions_mean = {}

for quantile in quantiles:
    col_name = f'volume_{quantile}'
    
    # Fit isotonic regression on training data for each quantile
    iso_reg = IsotonicRegression(out_of_bounds='clip')
    iso_reg.fit(X_train[col_name], y_train['volume'])
    
    # Initialize an empty list to store predictions for each fold
    predictions_per_fold = []
    
    # Perform manual 5-fold cross-validation and predict on the test set
    fold_size = len(X_test) // num_folds
    remainder = len(X_test) % num_folds
    start = 0
    
    for i in range(num_folds):
        end = start + fold_size + (1 if i < remainder else 0)
        
        # Predict on the test subset for this fold
        fold_predictions = iso_reg.predict(X_test.iloc[start:end][[col_name]])
        predictions_per_fold.append(fold_predictions)
        start = end
    
    # Pad predictions of shorter folds with the mean of others to make them uniform length
    max_len = max(len(pred) for pred in predictions_per_fold)
    for i in range(len(predictions_per_fold)):
        if len(predictions_per_fold[i]) < max_len:
            diff = max_len - len(predictions_per_fold[i])
            mean_to_pad = np.mean(predictions_per_fold[:i] + predictions_per_fold[i+1:], axis=0)[-diff:]
            predictions_per_fold[i] = np.concatenate((predictions_per_fold[i], mean_to_pad))
    
    # Store the predictions from each fold in the dictionary
    calibrated_predictions_mean[quantile] = np.mean(predictions_per_fold, axis=0)

# Create a DataFrame with the mean of 5 predictions for each quantile across 5 folds
X_test_mean = pd.DataFrame(calibrated_predictions_mean, columns=quantiles)

In [223]:
# Load data
final_val = pd.read_csv('models/calibration_data/final_val.csv')
val_gt = pd.read_csv('models/calibration_data/val_gt.csv')

# List of quantiles
quantiles = [0.1, 0.5, 0.9]

# Assuming you have predictions for 10th, 50th, and 90th quantiles
predictions_10th = final_val['volume_10']
predictions_50th = final_val['volume_50']
predictions_90th = final_val['volume_90']

# Assuming you have ground truth values
ground_truth = val_gt['volume']

quantiles = [0.1, 0.5, 0.9]

# Split data into training and validation sets for each quantile
X_train_10, X_val_10, y_train, y_val = train_test_split(predictions_10th, ground_truth, test_size=0.2, random_state=42)
X_train_50, X_val_50, _, _ = train_test_split(predictions_50th, ground_truth, test_size=0.2, random_state=42)
X_train_90, X_val_90, _, _ = train_test_split(predictions_90th, ground_truth, test_size=0.2, random_state=42)

# Initialize variables to store best y_min and minimum quantile loss for each quantile
best_y_min = {}
min_quantile_loss = {}

# Hyperparameter tuning for y_min for each quantile
for idx, X_train, quantile in zip([0, 1, 2], [X_train_10, X_train_50, X_train_90], quantiles):
    best_y_min[quantile] = None
    min_quantile_loss[quantile] = float('inf')
    
    for y_min_candidate in np.linspace(np.min(X_train), np.max(X_train), num=100):
        # Fit isotonic regression model with current y_min candidate
        isotonic_model = IsotonicRegression(y_min=y_min_candidate)
        isotonic_model.fit(X_train, y_train)
        
        # Select the appropriate validation set for the current quantile
        if idx == 0:
            X_val = X_val_10
        elif idx == 1:
            X_val = X_val_50
        else:
            X_val = X_val_90
        
        # Predict on the validation set
        calibrated_predictions = isotonic_model.transform(X_val)
        
        # Calculate quantile loss for the current quantile
        average_mean_quantile_loss = quantile_loss(y_val, calibrated_predictions, quantile)
        
        # Update best y_min and minimum quantile loss for the current quantile
        if average_mean_quantile_loss < min_quantile_loss[quantile]:
            min_quantile_loss[quantile] = average_mean_quantile_loss
            best_y_min[quantile] = y_min_candidate

print("Best y_min for 10th quantile:", best_y_min[0.1])
print("Minimum Quantile Loss for 10th quantile:", min_quantile_loss[0.1])
print("Best y_min for 50th quantile:", best_y_min[0.5])
print("Minimum Quantile Loss for 50th quantile:", min_quantile_loss[0.5])
print("Best y_min for 90th quantile:", best_y_min[0.9])
print("Minimum Quantile Loss for 90th quantile:", min_quantile_loss[0.9])

Best y_min for 10th quantile: -196.2016
Minimum Quantile Loss for 10th quantile: 184.24769699486214
Best y_min for 50th quantile: 32.552246
Minimum Quantile Loss for 50th quantile: 165.17173469323774
Best y_min for 90th quantile: 128.88217098989898
Minimum Quantile Loss for 90th quantile: 163.0222207465406


In [213]:
# Define a function to compute quantile loss for a single quantile
def quantile_loss(y_true, y_pred, q):
    residual = y_true - y_pred
    return np.mean(2 * np.maximum(q * residual, (q - 1) * residual))

# Calculate average mean quantile loss across quantiles of interest
average_mean_quantile_loss = np.mean([
    quantile_loss(ground_truth, predictions_10th, quantiles[0]),
    quantile_loss(ground_truth, predictions_50th, quantiles[1]),
    quantile_loss(ground_truth, predictions_90th, quantiles[2])
])

print("Average Mean Quantile Loss:", average_mean_quantile_loss)

# Calculate average mean quantile loss across quantiles of interest
average_mean_quantile_loss = np.mean([
    quantile_loss(X_test_mean[10], calibrated_predictions_10th, quantiles[0]),
    quantile_loss(X_test_mean[50], calibrated_predictions_50th, quantiles[1]),
    quantile_loss(X_test_mean[90], calibrated_predictions_90th, quantiles[2])
])

print("Average Mean Quantile Loss:", average_mean_quantile_loss)

Average Mean Quantile Loss: -10363.269924041899


ValueError: operands could not be broadcast together with shapes (145,) (3620,) 

In [151]:
# Load your data
final_val = pd.read_csv('models/calibration_data/final_val.csv')
val_gt = pd.read_csv('models/calibration_data/val_gt.csv')

# List of quantiles
quantiles = [10, 50, 90]

# Initialize arrays to store fold-wise predictions for each quantile
fold_wise_predictions = {quantile: np.full((len(val_gt), 5), np.nan) for quantile in quantiles}

# Define number of splits for k-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over each fold and perform isotonic regression for each quantile
for i, (train_idx, val_idx) in enumerate(kfold.split(final_val)):
    X_tr, X_val = final_val.iloc[train_idx], final_val.iloc[val_idx]
    y_tr, y_val = val_gt.iloc[train_idx], val_gt.iloc[val_idx]

    for quantile in quantiles:
        col_name = f'volume_{quantile}'

        # Instantiate an Isotonic Regression model
        iso_reg = IsotonicRegression(out_of_bounds='clip')

        # Fit the isotonic regression model
        iso_reg.fit(X_tr[col_name], y_tr['volume'])

        # Make predictions on the entire dataset
        fold_preds = iso_reg.predict(final_val[col_name])

        # Store fold-wise predictions for each quantile only in validation split
        fold_wise_predictions[quantile][val_idx, i] = fold_preds[val_idx]

# Calculate the average of predictions from each fold for each quantile
final_predictions = {quantile: np.nanmean(fold_wise_predictions[quantile], axis=1) for quantile in quantiles}

In [162]:
# Define a function to compute quantile loss for a single quantile
def quantile_loss(y_true, y_pred, q):
    residual = y_true - y_pred
    return np.mean(2 * np.maximum(q * residual, (q - 1) * residual))

# Calculate average mean quantile loss across quantiles of interest
average_mean_quantile_loss = np.mean([
    quantile_loss(ground_truth, final_val['volume_10'], quantiles[0] / 100),
    quantile_loss(ground_truth, final_val['volume_50'], quantiles[1] / 100),
    quantile_loss(ground_truth, final_val['volume_90'], quantiles[2] / 100)
])

print("Average Mean Quantile Loss:", average_mean_quantile_loss)

# Calculate average mean quantile loss across quantiles of interest
average_mean_quantile_loss = np.mean([
    quantile_loss(ground_truth, pd.Series(final_predictions[10]), quantiles[0] / 100),
    quantile_loss(ground_truth, pd.Series(final_predictions[50]), quantiles[1] / 100),
    quantile_loss(ground_truth, pd.Series(final_predictions[90]), quantiles[2] / 100)
])

print("Average Mean Quantile Loss:", average_mean_quantile_loss)

Average Mean Quantile Loss: 109.10394376162246
Average Mean Quantile Loss: 169.62822662551142


In [196]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np

# Assuming 'final_val' contains predictions and 'val_gt' contains ground truth values
# Load your data and ensure the columns are in the correct format

# Extract predictions from the 'final_val' DataFrame
predictions_10 = final_val['volume_10'].values
predictions_50 = final_val['volume_50'].values
predictions_90 = final_val['volume_90'].values

# Extract ground truth values from the 'val_gt' DataFrame
ground_truth_values = val_gt['volume'].values

quantiles = [10, 50, 90]
calibrated_predictions = []

# Apply Platt scaling separately for each quantile
for idx, quantile in enumerate(quantiles):
    # Select predictions for the current quantile
    if idx == 0:
        predictions = predictions_10
    elif idx == 1:
        predictions = predictions_50
    else:
        predictions = predictions_90

    # Sort the predictions and ground truth values
    sorted_indices = np.argsort(predictions)
    sorted_predictions = predictions[sorted_indices]
    sorted_ground_truth = ground_truth_values[sorted_indices]

    threshold_index = int(len(sorted_predictions) * quantile / 100)

    # Treat as binary classification (above or below quantile threshold)
    labels = np.where(np.arange(len(sorted_predictions)) < threshold_index, 0, 1)

    # Fit a logistic regression model for Platt scaling
    platt_model = LogisticRegression(solver='liblinear')
    platt_model.fit(sorted_predictions.reshape(-1, 1), labels)

    # Use the model to predict probabilities
    calibrated_probs = platt_model.predict_proba(sorted_predictions.reshape(-1, 1))[:, 1]

    # Calculate the transformed predictions based on calibrated probabilities
    calibrated_prediction = np.interp(predictions, sorted_predictions, calibrated_probs)
    calibrated_predictions.append(calibrated_prediction)

# 'calibrated_predictions' now contains Platt scaled predictions for each quantile
# You can use these calibrated predictions as needed

# Assuming min_volume and max_volume are the minimum and maximum values in your original "volume" column
min_volume = val_gt['volume'].min()
max_volume = val_gt['volume'].max()

# Rescale the calibrated predictions to the original volume scale
rescaled_predictions = []
for calibrated_prediction in calibrated_predictions:
    # Map the calibrated predictions from [0, 1] back to the original volume scale
    rescaled_prediction = calibrated_prediction * (max_volume - min_volume) + min_volume
    rescaled_predictions.append(rescaled_prediction)