# Imports

In [None]:
import pandas as pd
import numpy as np
import pickle
import json
from itertools import product
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from itertools import product, combinations
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
import os
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the modelling dataset (presumably the multiply-imputed table, judging by the file name).
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
df = pd.read_csv('/Users/ormeiri/Desktop/predictive_models_git/data/chapter 4/data/multiple_imputation.csv')

In [None]:
# Separate the outcome column from the predictor columns.
target_variable = 'hospital_death'
y = df[target_variable]
X = df.drop(columns=[target_variable])

In [None]:
def preprocess_data(X, categorical_cols=None, is_training=True, scaler=None, dummy_cols=None):
    """
    One-hot encode the categorical columns and standardize every feature.

    Args:
        X (pandas.DataFrame): Raw feature frame. It is copied, never mutated.
        categorical_cols (list | None): Columns to one-hot encode. When None,
            every ``object``/``category`` column of ``X`` is used.
        is_training (bool): When True a new StandardScaler is fitted and the
            encoded column layout is recorded. When False the supplied
            ``scaler`` and ``dummy_cols`` are reused.
        scaler: Fitted scaler to reuse when ``is_training`` is False.
        dummy_cols (list | None): Column layout recorded during training; the
            inference frame is aligned to it (missing columns filled with 0,
            extra columns dropped).

    Returns:
        tuple: ``(X_scaled, scaler, dummy_cols)`` where ``X_scaled`` is a
        DataFrame with the same index as ``X``.

    Raises:
        ValueError: If ``is_training`` is False and no ``scaler`` is given.
    """
    X_processed = X.copy()

    # Identify categorical columns if not provided
    if categorical_cols is None:
        categorical_cols = X_processed.select_dtypes(include=['object', 'category']).columns.tolist()

    # Create dummy variables for categorical columns
    if categorical_cols:
        X_processed = pd.get_dummies(X_processed, columns=categorical_cols, drop_first=True)

    if is_training:
        # Always record the layout, even without categoricals, so that the
        # inference call can align its columns to the training frame.
        dummy_cols = X_processed.columns.tolist()
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(X_processed)
    else:
        if scaler is None:
            raise ValueError("A fitted scaler is required when is_training=False")
        if dummy_cols is not None:
            # Align to the training layout regardless of whether this frame
            # had categorical columns: absent dummies become 0, order matches.
            X_processed = X_processed.reindex(columns=dummy_cols, fill_value=0)
        scaled_values = scaler.transform(X_processed)

    X_scaled = pd.DataFrame(
        scaled_values,
        columns=X_processed.columns,
        index=X_processed.index
    )

    return X_scaled, scaler, dummy_cols

In [None]:
# Every object/category-typed predictor is treated as a categorical feature
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)

In [None]:
# Hold out 20% of the rows for testing, stratified on the outcome, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
 # Preprocess data for LogReg and KNN (dummy variables + scaling)
# Standardized one-hot matrices: fit on the training split, reuse on the test split
X_train_scaled, scaler, dummy_cols = preprocess_data(
    X_train, categorical_cols, is_training=True
)
X_test_scaled, _, _ = preprocess_data(
    X_test, categorical_cols, is_training=False, scaler=scaler, dummy_cols=dummy_cols
)

# Preprocess data for XGBoost (dummy variables only, no scaling)
if not categorical_cols:
    # Nothing to encode: the raw splits are used as they are
    X_train_xgb = X_train
    X_test_xgb = X_test
else:
    X_train_xgb = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
    X_test_xgb = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)
    # Dummy columns seen only in training are added to the test frame as zeros,
    # then the test frame adopts the training column order
    train_columns = X_train_xgb.columns
    for col in train_columns:
        if col in X_test_xgb.columns:
            continue
        X_test_xgb[col] = 0
    X_test_xgb = X_test_xgb[train_columns]

In [None]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Locations of the fitted models (absolute, machine-specific paths)
knn_path = "/Users/ormeiri/Desktop/predictive_models_git/models/final_models/best_knn.pkl"
lr_path = "/Users/ormeiri/Desktop/predictive_models_git/models/final_models/best_logistic_regression.pkl"
xgboost_path = "/Users/ormeiri/Desktop/predictive_models_git/models/final_models/best_xgboost.pkl"

knn = _load_pickle(knn_path)
lr = _load_pickle(lr_path)
xgboost = _load_pickle(xgboost_path)

# Fairness

In [None]:
def generate_subgroups(query_lists):
    """
    Build intersection subgroups by crossing two, three and four of the input lists.

    For every way of choosing 2, 3 or 4 lists (in their original order), the
    Cartesian product of the chosen lists is appended to the result.

    Args:
        query_lists (list of lists): One list of elements per attribute.

    Returns:
        list: Tuples of length 2 to 4, pairs first, then triples, then quadruples.
    """
    subgroups = []
    for size in (2, 3, 4):
        for chosen_lists in combinations(query_lists, size):
            subgroups.extend(product(*chosen_lists))
    return subgroups

In [None]:
def find_empty_subgroups(subpopulation_queries, X):
  """
    Return the queries that select no rows of ``X``.

    Args:
        subpopulation_queries (list): Subgroup definitions. A string is used as a
            ``DataFrame.query`` expression directly; a tuple of strings is joined
            with " and " first.
        X (pandas.DataFrame): The data the queries are evaluated against.

    Returns:
        list: The queries (in input order) whose result has zero rows.
  """
  def _as_expression(query):
    # Tuples describe an intersection of conditions
    return " and ".join(query) if isinstance(query, tuple) else query

  return [
    query
    for query in subpopulation_queries
    if len(X.query(_as_expression(query)).index) == 0
  ]

In [None]:
# The one-hot frames are built with drop_first=True, so the dropped level of a
# categorical is expressed as "every remaining dummy equals 0".
gender_queries = ['gender_M == 1', 'gender_M == 0']
# Ten-year age bands: [10, 20), [20, 30), ..., [80, 90)
age_queries = [f'age >= {a} and age < {a+10}' for a in range(10, 90, 10)]
# List of ethnicity columns that, if all are 0, imply 'African American'
other_ethnicities = ['Caucasian', 'Native American', 'Other/Unknown', 'Asian']

# Queries for each ethnicity
ethnicity_queries = [f'`ethnicity_{e}` == 1' for e in other_ethnicities]

# African American: if all others are 0
african_american_query = ' and '.join([f'`ethnicity_{e}` == 0' for e in other_ethnicities])
ethnicity_queries += [african_american_query]

all_query_lists = [gender_queries, age_queries, ethnicity_queries]
all_subgroups = generate_subgroups(all_query_lists)

subpopulation_queries = [
  *gender_queries,
  *age_queries,
  *ethnicity_queries,
  *all_subgroups
]

print(f'Total subgroups = {len(subpopulation_queries)}')

empty_subgroups = find_empty_subgroups(subpopulation_queries, X_test_xgb) # 6 minutes to run (pickled for later use)

# Deduplicate and drop the empty subgroups while keeping first-seen order.
# A plain set difference would make the order depend on string hashing, which
# changes between interpreter runs; multi-calibration applies its corrections
# sequentially, so the query order must be reproducible.
empty_lookup = set(empty_subgroups)
final_subgroup_queries = [
  query for query in dict.fromkeys(subpopulation_queries) if query not in empty_lookup
]

# Single-attribute queries (strings) first, then tuples by number of conditions;
# the sort is stable, so ties keep the order established above.
sorted_queries = sorted(final_subgroup_queries, key=lambda x: 0 if isinstance(x, str) else len(x))
print(f'Valid subgroups = {len(sorted_queries)}')

In [None]:
class ClibratedPredictor:
  """
    Wrap a fitted binary classifier and shift its probabilities subgroup by subgroup
    so that the mean prediction in each subgroup matches the mean of ``y_test``.

    The most recent calibrated probability matrix is cached in ``self.proba``;
    ``predict`` reuses that cache when it exists, whatever ``X`` it is given.
  """

  def __init__(self, model, subgroups, y_test):
    self.model, self.subgroups, self.y_test = model, subgroups, y_test
    # Filled by multi_calibrate_predictor: array of shape (n_rows, 2)
    self.proba = None

  def predict(self, X, y=None):
    """Return 0/1 labels, thresholding the calibrated positive-class probability at 0.2."""
    if self.proba is None:
      self.predict_proba(X, y)

    return np.where(self.proba[:, 1] >= 0.2, 1, 0)

  def predict_proba(self, X, y=None):
    """Run multi-calibration on ``X`` with the stored subgroups and return the result."""
    return self.multi_calibrate_predictor(X, y, self.subgroups)

  def multi_calibrate_predictor(self, X, y, subpopulation_queries, alpha=1e-3, max_iter=1):
    """
      Perform multi-calibration to ensure fairness across subgroups.

      :param X: DataFrame containing the input features.
      :param y: Ignored; the labels stored in ``self.y_test`` are used instead.
      :param subpopulation_queries: Query strings (or tuples of query strings) defining each subgroup.
      :param alpha: The violation parameter; smaller absolute gaps are left untouched.
      :param max_iter: Maximum number of passes over the subgroups.
      :return: Array with two columns: P(class 0) and P(class 1) after calibration.
    """
    labels = pd.Series(self.y_test, index=X.index)
    raw_scores = self.model.predict_proba(X)[:, 1]
    scores = pd.Series(raw_scores.copy(), index=X.index)

    for _ in range(max_iter):
      any_violation = False
      for query in subpopulation_queries:
        expression = query if not isinstance(query, tuple) else " and ".join(query)
        members = X.query(expression).index

        # Mean residual of the subgroup; NaN for an empty subgroup, which fails the test below
        gap = (labels.loc[members] - scores.loc[members]).mean()
        if abs(gap) > alpha:
          scores.loc[members] += gap
          any_violation = True

      if not any_violation:
        print("calibrated")
        break

    # Keep the shifted values inside the probability range
    positive = scores.clip(lower=0, upper=1).values
    result = np.column_stack([1 - positive, positive])

    self.proba = result

    return result

In [None]:
def compare_performance_measures(X_test, y_test, model, subgroup_conds, model_name):
    """
    Compare AUC, Sensitivity, PPV, Specificity and NPV across subgroups for one
    model and show them as a grouped bar chart.

    :param X_test: DataFrame containing the test features.
    :param y_test: Series containing the test labels (indexed like ``X_test``).
    :param model: A single model to evaluate; must expose ``predict`` and,
                  for the AUC, ``predict_proba``.
    :param subgroup_conds: Mapping of subgroup query (string, or tuple of strings
                           joined with " and ") to the label shown on the plot.
    :param model_name: Name used in the plot title.
    :return: None.
    """

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as missing instead of nan/inf
        return numerator / denominator if denominator != 0 else None

    has_proba = hasattr(model, "predict_proba")
    metrics_data = []

    # Iterate over each subgroup condition
    for cond, label in subgroup_conds.items():
        query_str = " and ".join(cond) if isinstance(cond, tuple) else cond
        sub_df = X_test.query(query_str)
        if sub_df.empty:
            continue
        true_labels = y_test.loc[sub_df.index]

        predictions = model.predict(sub_df)
        pred_proba = model.predict_proba(sub_df)[:, 1] if has_proba else None

        # labels=[0, 1] keeps the matrix 2x2 even when a small subgroup contains
        # a single class; otherwise ravel() yields one value and unpacking fails.
        # Assumes the outcome is coded 0/1.
        tn, fp, fn, tp = confusion_matrix(true_labels, predictions, labels=[0, 1]).ravel()

        # AUC is undefined unless both classes are present in the subgroup
        auc = None
        if has_proba and true_labels.nunique() > 1:
            auc = roc_auc_score(true_labels, pred_proba)

        metrics_data.extend([
            {'Subgroup': label, 'Metric': 'AUC', 'Value': auc},
            {'Subgroup': label, 'Metric': 'Sensitivity', 'Value': _ratio(tp, tp + fn)},
            {'Subgroup': label, 'Metric': 'PPV', 'Value': _ratio(tp, tp + fp)},
            {'Subgroup': label, 'Metric': 'Specificity', 'Value': _ratio(tn, tn + fp)},
            {'Subgroup': label, 'Metric': 'NPV', 'Value': _ratio(tn, tn + fn)},
        ])

    # Explicit columns so the plot call still works when every subgroup was empty
    df_metrics = pd.DataFrame(metrics_data, columns=['Subgroup', 'Metric', 'Value'])

    # Plotting each metric for each subgroup using Plotly Express
    color_sequence = ['red', 'green', 'blue', 'orange', 'purple']
    fig = px.bar(df_metrics, x='Subgroup', y='Value', color='Metric', barmode='group', title=f"Performance Metrics by Subgroup with {model_name}", color_discrete_sequence=color_sequence)
    fig.update_xaxes(tickfont=dict(size=16))

    fig.update_layout(xaxis_title='Subgroup', yaxis_title='Metric Value')
    fig.show()


def model_comparison(model, model_name):
  """
    Plot subgroup performance for ``model`` at increasing levels of intersection
    (single attribute, then 2-, 3- and 4-way intersections).

    Args:
        model: Fitted model passed to ``compare_performance_measures`` for every plot.
        model_name (str): Name shown in the plot titles.

    Uses the module-level ``X_test`` and ``y_test``.

    NOTE(review): the queries reference one-hot columns such as ``gender_F`` and
    ``ethnicity_African American``; the encoding visible in this file uses
    drop_first=True, which may not produce those columns - confirm against the
    frame actually held in ``X_test``.
  """
  print("Let's compare basic models")

  males_vs_females = {
    'age > 0' : 'All data',
    'gender_M == 1': 'Males',
    'gender_F == 1': 'Females',
  }
  compare_performance_measures(X_test, y_test, model, males_vs_females, model_name)

  age_subgroups = {
    'age > 0' : 'All data',
    'age < 30': 'Younger than 30',
    'age >= 30 and age < 40': 'Aged 30 to 39',
    'age >= 40 and age < 50': 'Aged 40 to 49',
    'age >= 50 and age < 60': 'Aged 50 to 59',
    'age >= 60 and age < 70': 'Aged 60 to 69',
    'age >= 70 and age < 80': 'Aged 70 to 79',
    'age >= 80': 'Aged 80 and older',
  }
  # Evaluate the model that was passed in (not the global xgboost)
  compare_performance_measures(X_test, y_test, model, age_subgroups, model_name)

  ethnicity_subgroups = {
    'age > 0' : 'All data',
    'ethnicity_Caucasian == 1': 'Caucasian',
    '`ethnicity_Native American` == 1': 'Native American',
    '`ethnicity_Other/Unknown` == 1': 'Other/Unknown Ethnicity',
    '`ethnicity_African American` == 1': 'African American',
    'ethnicity_Asian == 1': 'Asian',
  }
  compare_performance_measures(X_test, y_test, model, ethnicity_subgroups, model_name)

  basic_subgroups_dict = {
    'age > 0' : 'All data',
    '`hospital_id_30.0` == 1': 'Hospital #30 Patients',
    '`hospital_id_70.0` == 1': 'Hospital #70 Patients',
    '`hospital_id_100.0` == 1': 'Hospital #100 Patients',
    '`hospital_id_118.0` == 1': 'Hospital #118 Patients',
  }
  compare_performance_measures(X_test, y_test, model, basic_subgroups_dict, model_name)

  print("Let's go deeper")

  intersecting_2_subgroups_dict = {
    'age > 0' : 'All data',
    ('age >= 80', 'gender_F == 1'): 'Females Aged 80 and older',
    ('ethnicity_Caucasian == 1', 'gender_M == 1'): 'Caucasian Males',
    ('ethnicity_Asian == 1', 'gender_F == 1'): 'Asian Females',
    ('`hospital_id_19.0` == 1', '`ethnicity_African American` == 1'): 'Hospital #19 African American Patients',
    ('gender_F == 1', 'ethnicity_Caucasian == 1'): 'Caucasian Females',
    ('`hospital_id_188.0` == 1', 'gender_F == 1'): 'Hospital #188 Female Patients',
  }

  compare_performance_measures(X_test, y_test, model, intersecting_2_subgroups_dict, model_name)

  print("Even deeper")
  intersecting_3_subgroups_dict = {
    'age > 0' : 'All data',
    ('gender_M == 1', 'age < 30', '`hospital_id_30.0` == 1'): 'Young Males at Hospital #30',
    ('gender_F == 1', 'ethnicity_Caucasian == 1', '`hospital_id_70.0` == 1'): 'Caucasian Females at Hospital #70',
    ('age >= 80', '`ethnicity_Native American` == 1', '`hospital_id_100.0` == 1'): 'Native American Aged 80+ at Hospital #100',
    ('gender_F == 1', '`ethnicity_African American` == 1', 'age >= 50 and age < 60'): 'African American Females Aged 50 to 59',
    ('gender_M == 1', 'age >= 70 and age < 80', '`hospital_id_70.0` == 1'): 'Males Aged 70 to 79 at Hospital #70',
    ('ethnicity_Caucasian == 1', 'age >= 80', '`hospital_id_118.0` == 1'): 'Caucasian Aged 80+ at Hospital #118',
    ('age >= 60 and age < 70', '`ethnicity_Other/Unknown` == 1', '`hospital_id_70.0` == 1'): 'Unknown Ethnicity Aged 60 to 69 at Hospital #70',
    ('gender_F == 1', 'age >= 70', '`ethnicity_Caucasian` == 1'): 'Caucasian Females Aged 70+',
    ('age >= 80', 'gender_F == 1', '`hospital_id_118.0` == 1'): 'Females Aged 80+ at Hospital #118',
  }

  compare_performance_measures(X_test, y_test, model, intersecting_3_subgroups_dict, model_name)

  # NOTE(review): the third entry queries hospital_id_118.0 but its label says
  # "Hospital #161" - one of the two is presumably wrong; confirm which.
  intersecting_4_subgroups_dict = {
    'age > 0' : 'All data',
    ('age >= 20 and age < 40', 'gender_M == 1', '`hospital_id_118.0` == 1', 'ethnicity_Caucasian == 1'): "Caucasian Males Aged 20 to 39 at Hospital #118",
    ('age >= 60 and age < 70', 'gender_M == 1', '`hospital_id_118.0` == 0 and `hospital_id_19.0` == 0 and `hospital_id_188.0` == 0', '`ethnicity_Other/Unknown` == 1'): "Other/Unknown Ethnicity Males, 60-69, Outside Top 3 Hospitals",
    ('age >= 50 and age < 60', 'gender_F == 1', '`hospital_id_118.0` == 1', '`ethnicity_African American` == 1'): "African American Females Aged 50 to 59 at Hospital #161",
    ('age >= 20 and age < 50', 'gender_F == 1', '`hospital_id_19.0` == 1', 'ethnicity_Caucasian == 1'): "Caucasian Females Aged 20 to 49 at Hospital #19",
  }

  compare_performance_measures(X_test, y_test, model, intersecting_4_subgroups_dict, model_name)




def calc_calibration_itl(model, X, y, subgroup_queries):
  """
    Compute calibration-in-the-large for every subgroup.

    For each subgroup the value is the mean predicted positive-class probability
    minus the observed event rate among the rows the query selects.

    Args:
        model: Object with a ``predict_proba`` method returning two columns.
        X (pandas.DataFrame): Feature frame the queries are evaluated against.
        y: Labels aligned with ``X`` (re-indexed onto ``X.index``).
        subgroup_queries (list): Query strings, or tuples of strings that are
                                 joined with " and ".

    Returns:
        list: One float per query, in input order (NaN for an empty subgroup).
  """
  outcomes = pd.Series(y, index=X.index)
  scores = pd.Series(model.predict_proba(X)[:, 1], index=X.index)

  def _gap(query):
    expression = query if not isinstance(query, tuple) else " and ".join(query)
    members = X.query(expression).index
    # Positive value: the model over-predicts for this subgroup
    return np.mean(scores[members]) - np.mean(outcomes[members])

  return [_gap(query) for query in subgroup_queries]


def test_mc_across_models(model, sorted_queries, y_test, model_name):
  """
  Measure calibration-in-the-large per subgroup before and after multi-calibration
  and plot both series on one line chart.

  Args:
      model: The fitted model to evaluate.
      sorted_queries: Subgroup queries, in the order they are calibrated and plotted.
      y_test: The true labels for the test set.
      model_name (str): 'KNN' and 'LR' are evaluated on the module-level
          ``X_test_scaled``; any other name uses ``X_test_xgb``. Also used in the title.

  Returns:
      tuple:
         - The calibrated model object (ClibratedPredictor)
         - The original ITL calibration values
         - The ITL calibration values after calibration
  """
  # Scaled features for the distance/linear models, unscaled one-hot features otherwise
  features = X_test_scaled if model_name in ('KNN', 'LR') else X_test_xgb

  calibrated_model = ClibratedPredictor(model, sorted_queries, y_test)
  calibrations_list = calc_calibration_itl(model, features, y_test, sorted_queries) # 1.5 min. to run
  calibrated_calibrations_list = calc_calibration_itl(calibrated_model, features, y_test, sorted_queries)

  wide = pd.DataFrame({
    'Original': calibrations_list,
    'Calibrated': calibrated_calibrations_list,
    'Index': range(len(calibrations_list))
  })

  # Long format: one row per (subgroup, series) so Plotly builds the legend from 'Type'
  tidy = wide.melt(
    id_vars=['Index'],
    value_vars=['Original', 'Calibrated'],
    var_name='Type',
    value_name='Value',
  )

  fig = px.line(
    tidy,
    x='Index',
    y='Value',
    color='Type',
    title=f'Calibration in The Large Across Subgroups ({model_name} model)',
  )
  fig.update_layout(
    xaxis_title="Subgroup Index (Widest to Narrowest)",
    yaxis_title="Calibration in The Large"
  )
  fig.show()

  return calibrated_model, calibrations_list, calibrated_calibrations_list

In [None]:

# One entry per fitted model, keyed by the short name test_mc_across_models dispatches on
calibrated_dict = {
    name: {'model': fitted}
    for name, fitted in (('KNN', knn), ('LR', lr), ('XGB', xgboost))
}

# Keys under which the three return values of test_mc_across_models are stored
result_keys = ('calibrated-model', 'calibrations_list', 'calibrated-calibrations-list')

for model_name, model_dict in calibrated_dict.items():
  model_dict.update(zip(
    result_keys,
    test_mc_across_models(model_dict['model'], sorted_queries, y_test, model_name),
  ))

In [None]:
def plot_calibrated_metrics(model_name):
  """
  Calculates and plots performance metrics for a calibrated model.

  Args:
      model_name (str): The name of the model to retrieve from the 'calibrated_dict'.

  Explanation:
      1. Retrieves the calibrated model from 'calibrated_dict' and obtains its
         predictions on the test features.
      2. Calculates performance metrics (Sensitivity, PPV, Specificity, NPV, AUC).
      3. Creates a DataFrame to store the calculated metric values.
      4. Generates a bar chart using Plotly to visualize the performance metrics.
  """
  calibrated_model = calibrated_dict[model_name]['calibrated-model']

  # Pass the same feature matrix test_mc_across_models used for this model
  # (scaled for KNN/LR, one-hot only otherwise) instead of the raw X_test, so the
  # call also works when the calibrated probabilities are not cached yet.
  features = X_test_scaled if model_name in ('KNN', 'LR') else X_test_xgb
  predictions = calibrated_model.predict(features)
  probas = calibrated_model.proba[:, 1]

  tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
  auc = roc_auc_score(y_test, probas)

  # Insertion order is the bar order on the chart
  metrics_dict = {
    'Sensitivity': tp / (tp + fn),
    'PPV': tp / (tp + fp),
    'Specificity': tn / (tn + fp),
    'NPV': tn / (tn + fn) if (tn + fn) != 0 else None,
    'AUC': auc,
  }

  metrics_df = pd.DataFrame.from_dict(metrics_dict, orient='index', columns=['Value'])
  metrics_df = metrics_df.reset_index().rename(columns={'index': 'Metric'})

  fig = px.bar(metrics_df, x='Metric', y='Value', title=f'Model Calibrated Performance Metrics on {model_name}')

  fig.update_layout(
      xaxis_title=None,
      yaxis_title="Metric Value",
  )

  fig.show()


def plot_regular_metrics(model_name):
  """
    Calculates and plots performance metrics for the original (uncalibrated) model.

    Args:
        model_name (str): The name of the model to retrieve from the 'calibrated_dict'.

    Differences from 'plot_calibrated_metrics':
        *  Retrieves the original, uncalibrated model from 'calibrated_dict'.
        *  The title indicates that these are performance metrics for the original model.

    Explanation:
        1. Retrieves the original model from 'calibrated_dict' and predicts on the
           test features.
        2. Calculates performance metrics (Sensitivity, PPV, Specificity, NPV, AUC).
        3. Creates a DataFrame to store the calculated metric values.
        4. Generates a bar chart using Plotly to visualize the performance metrics.
  """
  model = calibrated_dict[model_name]['model']

  # Score the model on the matrix it is evaluated on in test_mc_across_models
  # (scaled for KNN/LR, one-hot only otherwise); the raw X_test is neither
  # encoded nor scaled.
  features = X_test_scaled if model_name in ('KNN', 'LR') else X_test_xgb
  predictions = model.predict(features)
  probas = model.predict_proba(features)[:, 1]

  tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
  auc = roc_auc_score(y_test, probas)

  # Insertion order is the bar order on the chart
  metrics_dict = {
    'Sensitivity': tp / (tp + fn),
    'PPV': tp / (tp + fp),
    'Specificity': tn / (tn + fp),
    'NPV': tn / (tn + fn) if (tn + fn) != 0 else None,
    'AUC': auc,
  }

  metrics_df = pd.DataFrame.from_dict(metrics_dict, orient='index', columns=['Value'])
  metrics_df = metrics_df.reset_index().rename(columns={'index': 'Metric'})

  fig = px.bar(metrics_df, x='Metric', y='Value', title=f'Model Performance Metrics on {model_name}')
  fig.update_traces(marker_color='#960b0b')  # Dark red bars for the uncalibrated model

  fig.update_layout(
      xaxis_title=None,
      yaxis_title="Metric Value",
  )

  fig.show()


def plot_distribution(model_name):
  """
    Overlay the histograms of calibration-in-the-large values before and after
    multi-calibration for one model.

    Args:
        model_name (str): Key of the model in the module-level 'calibrated_dict'.

    The 'calibrations_list' values are plotted as "Data 1" and the
    'calibrated-calibrations-list' values as "Data 2"; the y-axis is logarithmic.
  """
  entry = calibrated_dict[model_name]
  values = pd.DataFrame({
    'Data 1': entry['calibrations_list'],
    'Data 2': entry['calibrated-calibrations-list'],
  })

  fig = px.histogram(
    values,
    x=values.columns,
    barmode='overlay',
    opacity=0.6,
    title=f'Distribution of Calibration on {model_name} Before and After Multi-Calibration',
  )

  # Log scale so sparsely populated bins stay visible next to the dominant ones
  fig.update_layout(bargap=0.1, yaxis_type='log')

  fig.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

# Frequencies of the raw (un-encoded) demographic columns
gender_counts = df['gender'].value_counts()
ethnicity_counts = df['ethnicity'].value_counts()
age_data = df['age']

# One row, three panels: two bar charts and a histogram
fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "histogram"}]],
    subplot_titles=("Gender Distribution", "Ethnicity Distribution", "Age Distribution"),
    horizontal_spacing=0.1,
)

fig.add_trace(
    go.Bar(x=gender_counts.index, y=gender_counts.values, marker_color='green'),
    row=1, col=1,
)
fig.add_trace(
    go.Bar(x=ethnicity_counts.index, y=ethnicity_counts.values, marker_color='orange'),
    row=1, col=2,
)
fig.add_trace(
    go.Histogram(x=age_data, nbinsx=20, marker_color='red'),
    row=1, col=3,
)

# Shared layout, then per-panel axis titles
fig.update_layout(showlegend=False, height=400, width=1200)
for panel, x_title in enumerate(("Gender", "Ethnicity", "Age"), start=1):
    fig.update_xaxes(title_text=x_title, row=1, col=panel)
    fig.update_yaxes(title_text="Count", row=1, col=panel)

fig.show()

# Multi-Calibration

In [None]:
def generate_subgroups(query_lists):
    """
    Generate every cross-list combination of queries, each exactly once.

    For every choice of two or more of the input lists (kept in their original
    order), the Cartesian product of the chosen lists is produced.

    Args:
        query_lists (list of lists): One list of query strings per attribute.

    Returns:
        list: Unique tuples, ordered by number of conditions (pairs first).
    """
    subgroups = []

    # combinations() picks which attributes take part; product() crosses their
    # queries. This covers every subset of attributes, not only leading prefixes.
    for size in range(2, len(query_lists) + 1):
        for chosen_lists in combinations(query_lists, size):
            subgroups.extend(product(*chosen_lists))

    # Identical queries appearing in more than one list could still collide;
    # keep the first occurrence and preserve order.
    return list(dict.fromkeys(subgroups))

In [None]:
def find_empty_subgroups(subpopulation_queries, X):
    """
    Return the queries that match no rows of ``X``.

    A tuple query is joined with " and " before evaluation. A query that raises
    while being evaluated is reported on stdout and counted as empty.
    """
    empty_subgroups = []

    for query in subpopulation_queries:
        try:
            expression = query if not isinstance(query, tuple) else " and ".join(query)
            matches_nothing = len(X.query(expression)) == 0
        except Exception as e:
            # If query fails, consider it empty
            empty_subgroups.append(query)
            print(f"Query failed: {query}, Error: {e}")
            continue

        if matches_nothing:
            empty_subgroups.append(query)

    return empty_subgroups

In [None]:
# Single-attribute subgroup definitions (hospital queries are not included)
gender_queries = ['gender_M == 1', 'gender_F == 1']
age_queries = [f'age >= {lower} and age < {lower+10}' for lower in range(10, 90, 10)]
ethnicity_levels = ['Caucasian', 'Native American', 'Other/Unknown', 'African American', 'Asian']
ethnicity_queries = [f'`ethnicity_{level}` == 1' for level in ethnicity_levels]

# Intersections across gender, age band and ethnicity
all_query_lists = [gender_queries, age_queries, ethnicity_queries]
all_subgroups = generate_subgroups(all_query_lists)

# Full list: single-attribute queries first, then the intersections
subpopulation_queries = gender_queries + age_queries + ethnicity_queries + all_subgroups

print(f'Total subgroups (without hospital) = {len(subpopulation_queries)}')

In [None]:
class CalibratedPredictor:
    """
    Wrap a fitted binary classifier (and an optional scaler) and multi-calibrate
    its probabilities against the labels stored in ``y_test``.

    The latest calibrated probability matrix is cached in ``self.proba`` and is
    reused by ``predict`` whenever it exists.
    """

    def __init__(self, model, scaler, subgroups, y_test):
        self.model, self.scaler = model, scaler
        self.subgroups, self.y_test = subgroups, y_test
        # Set by multi_calibrate_predictor: array of shape (n_rows, 2)
        self.proba = None

    def predict(self, X, y=None):
        """Return 0/1 labels using a 0.2 threshold on the calibrated probability."""
        if self.proba is None:
            self.predict_proba(X, y)

        return np.where(self.proba[:, 1] >= 0.2, 1, 0)

    def predict_proba(self, X, y=None):
        """Calibrate on ``X`` with the stored subgroups and return the probability matrix."""
        return self.multi_calibrate_predictor(X, y, self.subgroups)

    def _model_inputs(self, X):
        """Return ``X`` transformed by the stored scaler, or ``X`` itself when there is none."""
        if self.scaler is None:
            return X
        return pd.DataFrame(self.scaler.transform(X), columns=X.columns, index=X.index)

    def multi_calibrate_predictor(self, X, y, subpopulation_queries, alpha=1e-3, max_iter=1):
        """
        Perform multi-calibration to ensure fairness across subgroups.

        :param X: DataFrame containing the input features; subgroup queries run on
                  this unscaled frame, the model sees the scaled version.
        :param y: Ignored; the labels stored in ``self.y_test`` are used.
        :param subpopulation_queries: Query strings (or tuples of them) defining each subgroup.
        :param alpha: The violation parameter.
        :param max_iter: Maximum number of iterations for the calibration process.
        :return: Array with columns P(class 0), P(class 1).
        """
        labels = pd.Series(self.y_test, index=X.index)

        initial = self.model.predict_proba(self._model_inputs(X))[:, 1]
        scores = pd.Series(initial.copy(), index=X.index)

        for iteration_number in range(1, max_iter + 1):
            violations_found = 0

            for query in subpopulation_queries:
                try:
                    expression = query if not isinstance(query, tuple) else " and ".join(query)
                    members = X.query(expression).index

                    if len(members) == 0:  # Skip empty subgroups
                        continue

                    gap = (labels.loc[members] - scores.loc[members]).mean()
                    if abs(gap) > alpha:
                        # Shift the whole subgroup by its mean residual
                        scores.loc[members] += gap
                        violations_found += 1

                except Exception as e:
                    print(f"Error processing query {query}: {e}")
                    continue

            print(f"Iteration {iteration_number}: {violations_found} violations found")
            if violations_found == 0:
                print("Calibration completed!")
                break

        # Ensure probabilities are in [0, 1]
        positive = np.clip(scores.values, 0, 1)
        result = np.column_stack([1 - positive, positive])

        self.proba = result
        return result


In [None]:
def test_mc_across_models(model, scaler, sorted_queries, y_test, model_name):
    """
    Build the multi-calibration wrapper for one model.

    Returns a 3-tuple ``(calibrated_model, sorted_queries, sorted_queries)``.
    No calibration values are computed here, so the query list itself fills
    the second and third slots.
    """
    print(f"\n=== Testing Multi-Calibration for {model_name} ===")

    wrapper = CalibratedPredictor(model, scaler, sorted_queries, y_test)

    # Placeholder return shape; further evaluation logic can be added later
    return wrapper, sorted_queries, sorted_queries

In [None]:
def run_multi_calibration(X_test, y_test, models_path="/Users/ormeiri/Desktop/predictive_models_git/models/final_models"):
    """
    Build a multi-calibration wrapper for every model found under ``models_path``.

    Args:
        X_test (pandas.DataFrame): Frame the subgroup queries are checked against.
        y_test: True labels for ``X_test``.
        models_path (str): Directory handed to ``load_models_and_scalers``.

    Returns:
        dict: ``model_name -> {'original_model', 'scaler', 'calibrated_model',
        'calibrations_list', 'calibrated_calibrations_list', 'metadata'}``.

    Relies on the module-level ``subpopulation_queries`` and on
    ``load_models_and_scalers``, which is not defined in this part of the file
    (presumably it returns ``name -> {'model', 'scaler', 'metadata'}`` - confirm).
    """

    # Load models
    models_dict = load_models_and_scalers(models_path)

    # Find empty subgroups
    print("Finding empty subgroups...")
    empty_subgroups = set(find_empty_subgroups(subpopulation_queries, X_test))

    # Deduplicate and filter in first-seen order. A set difference would leave
    # the order to string hashing, which varies between interpreter runs, and
    # the sequential calibration updates depend on the query order.
    final_subgroup_queries = [
        query for query in dict.fromkeys(subpopulation_queries)
        if query not in empty_subgroups
    ]

    # Stable sort: single conditions first, then tuples by number of conditions
    sorted_queries = sorted(final_subgroup_queries, key=lambda x: 0 if isinstance(x, str) else len(x))
    print(f'Valid subgroups = {len(sorted_queries)}')

    # Create calibrated models dictionary
    calibrated_dict = {}

    for model_name, model_data in models_dict.items():
        print(f"\nProcessing {model_name}...")

        calibrated_model, calibrations_list, calibrated_calibrations_list = test_mc_across_models(
            model_data['model'],
            model_data['scaler'],
            sorted_queries,
            y_test,
            model_name
        )

        calibrated_dict[model_name] = {
            'original_model': model_data['model'],
            'scaler': model_data['scaler'],
            'calibrated_model': calibrated_model,
            'calibrations_list': calibrations_list,
            'calibrated_calibrations_list': calibrated_calibrations_list,
            'metadata': model_data['metadata']
        }

    return calibrated_dict

In [None]:
# Build the multi-calibration wrappers for every loaded model.
# NOTE(review): X_test is the raw split from train_test_split (not one-hot encoded);
# queries on dummy columns such as gender_M would fail there and be counted as
# empty subgroups - confirm this is the intended frame.
calibrated_models = run_multi_calibration(X_test, y_test)