# Evaluation Pipeline Construction

## High Level Todos
- Establish guidelines
    - columns for attributes should be named `attribute_{NAME}`; attributes with more than 2 groups can be coded {0,1,2,...}
    - make one function that can process one csv file, evaluate_file
    - make another function that can do batch processing by calling the single-file function on each file
  

In [7]:
%load_ext autoreload
%autoreload 2

In [21]:
# General Imports
import numpy as np
import pandas as pd
import os
import pathlib
import json

from fairlearn.metrics import (
    MetricFrame,
    selection_rate,
    false_positive_rate, # false positive error rate balance
    true_positive_rate, # false negative error rate balance
)

from sklearn.metrics import (
    accuracy_score, # use for both performance and fairness parts
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

## Synthetic CSV Generation, for testing evaluation pipeline

input: file_path, data_size (number of rows, default 1000)
output: none (the CSV is written to file_path)

In [5]:
def generate_binary_prediction_csv(file_path, data_size = 1000):
    """Write a synthetic binary-prediction CSV for exercising the pipeline.

    Each of the three columns ('y_true', 'y_pred', 'attribute_gender') is an
    independent draw of `data_size` random integers from {0, 1}.

    input:
      file_path: destination of the CSV file
      data_size: number of rows to generate
    output: none (the file is written without an index column)
    """
    # Columns are drawn in this fixed order so that a seeded run is reproducible.
    column_names = ('y_true', 'y_pred', 'attribute_gender')
    synthetic = pd.DataFrame(
        {name: np.random.randint(2, size=data_size) for name in column_names}
    )
    synthetic.to_csv(file_path, index=False)

In [6]:
# Write a synthetic test file with the default number of rows (data_size=1000)
# to 'synthetic_data.csv', relative to the current working directory.
generate_binary_prediction_csv('synthetic_data.csv')

## Evaluation Metrics

Check which metrics are only valid for binary attributes

Aggregate Performance:
- accuracy
- f1-score
- precision
- recall

Fairness Metrics:
- Disparity
- predictive value parity
- Equalized Odds (Error Rate balance)
- Accuracy equality
- Treatment Equality


Further Ideas:
- should we process class probability labels?

Functions:
- process file
    - input: file_name; assume it follows the format guidelines above, and that model_name is the file name without the .csv extension
    - output: returns a dict
- process folder (batch processing)

In [51]:
# Quick utility function to get confusion matrix
def _get_conf_mat_values(y_true, y_pred):
  """
  Return the four cells of the binary confusion matrix.

  input:
    y_true: ground-truth labels, coded 0/1
    y_pred: predicted labels, coded 0/1
  returns: tuple (TN, FP, FN, TP)
  """
  # Pin the label set. Without `labels`, confusion_matrix infers the classes
  # from the data, so a slice containing a single class (common for small
  # per-group subsets) yields a 1x1 matrix and the indexing below would fail.
  cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

  # Rows are true labels, columns are predicted labels.
  TN = cm[0, 0]
  FP = cm[0, 1]
  FN = cm[1, 0]
  TP = cm[1, 1]

  return TN, FP, FN, TP

def predictive_parity(y_true, y_pred):
  """
  Positive predictive value: TP / (TP + FP).

  input: y_true, y_pred (binary labels and predictions)
  returns: share of positive predictions that are correct.
  Note: there is no guard for the case TP + FP == 0.
  """
  _, false_pos, _, true_pos = _get_conf_mat_values(y_true, y_pred)
  predicted_pos = true_pos + false_pos
  return true_pos / predicted_pos

def error_rate_ratio(y_true, y_pred):
  """
  Ratio of false negatives to false positives: FN / FP.

  input: y_true, y_pred (binary labels and predictions)
  returns: FN divided by FP.
  Note: there is no guard for the case FP == 0.
  """
  _, false_pos, false_neg, _ = _get_conf_mat_values(y_true, y_pred)
  return false_neg / false_pos

def evaluate_file(path_obj, attribute_name='attribute_gender'):
    """
    Compute aggregate performance and per-group fairness metrics for one CSV.

    input:
      path_obj: str or pathlib.Path of a CSV file containing the columns
                'y_true', 'y_pred' and `attribute_name`
      attribute_name: name of the sensitive-attribute column used to split
                      the rows into groups (default 'attribute_gender')
    returns: dict with two entries
      'model_performance': accuracy, f1_score, precision and recall computed
                           over all rows
      'fairness_performance': 'by_group_data' (every fairness metric per
                              group) and 'difference' (MetricFrame.difference()
                              for every metric)
    """
    # read_csv accepts both str and Path; nothing below needs a Path-only
    # attribute, so plain string paths work as well.
    df = pd.read_csv(path_obj)

    y_true = df['y_true']
    y_pred = df['y_pred']
    sensitive_attribute = df[attribute_name]

    # Fairness measurements processing
    metrics = {
              'selection_rate': selection_rate,
              'ppv': predictive_parity,
              'fp_err_rate_balance': false_positive_rate,
              'tp_error_rate_balance': true_positive_rate,
              'accuracy': accuracy_score,
              'error_rate_ratio': error_rate_ratio,
               }

    # MetricFrame evaluates every metric once per sensitive-attribute group.
    mf = MetricFrame(
                    metrics=metrics,
                    y_true=y_true,
                    y_pred=y_pred,
                    sensitive_features=sensitive_attribute
                    )

    results = {
        'model_performance': {'accuracy': accuracy_score(y_true, y_pred),
                              'f1_score': f1_score(y_true, y_pred),
                              'precision': precision_score(y_true, y_pred),
                              'recall': recall_score(y_true, y_pred),
                              },
        'fairness_performance': {
            'by_group_data': mf.by_group.to_dict(), # raw data
            'difference': mf.difference().to_dict(), # max inter-group diff per stat
            },
    }

    return results

def batch_evaluate(folder_path, write_name = None):
  """
  Evaluate every model-result CSV file found directly inside a folder.

  input:
    folder_path: str or pathlib.Path of the folder to scan for '*.csv' files
    write_name: optional file name; when given, the results are also written
                as JSON to folder_path/write_name
  returns: dict mapping each CSV path (as a string) to the dict returned by
           evaluate_file for that file
  """
  folder = pathlib.Path(folder_path)

  # glob() yields entries in arbitrary, filesystem-dependent order; sorting
  # keeps the returned dict and the JSON file reproducible across runs.
  perf_data = {
      str(path): evaluate_file(path) for path in sorted(folder.glob('*.csv'))
  }

  # Write data to 'write_name' json file
  if write_name is not None:
    with open(folder / write_name, "w", encoding="utf-8") as outfile:
      json.dump(perf_data, outfile, indent=4)

  return perf_data

In [52]:
# Evaluate every CSV in ./model_data and also save the results to
# ./model_data/perf_data.json (the JSON file is written inside the same folder).
data = batch_evaluate('./model_data', 'perf_data.json')

In [53]:
# Show the absolute location of the folder.
# NOTE(review): this inspects './model_path', while the evaluation cell above
# reads './model_data' - confirm which folder name is intended.
pathlib.Path('./model_path').resolve()

PosixPath('/content/model_path')

## Constructing Multiplicity Metrics

In [40]:
import eval as ev

In [41]:
# Folder used for the synthetic model outputs in the cells below.
base_path = './synth_data/'

# NOTE(review): batch_generate_binary_prediction_csv lives in the local `eval`
# module, which is not shown here; presumably it writes several synthetic
# prediction CSVs into base_path - confirm against that module.
ev.batch_generate_binary_prediction_csv(base_path)

In [44]:
def load_data_folder(folder_path):
    """
    Load every '*.csv' file found directly inside `folder_path`.

    Parameters:
    folder_path: str or pathlib.Path of the folder to scan

    Returns:
    list of dataframes, one per CSV file, in the order glob() yields them
    """
    dframes = []
    for csv_path in pathlib.Path(folder_path).glob('*.csv'):
        dframes.append(pd.read_csv(csv_path))
    return dframes


def compute_ambiguity(dframes, group = None, attribute_name = None):
    """
    Fraction of examples on which the models do not all agree.

    Parameters:
    dframes: list of dataframes, one per model, each with a 'y_pred' column
        (and an `attribute_name` column when `group` is given). All
        dataframes are assumed to describe the same examples in the same row
        order, and 'y_pred' is assumed to contain no missing values.
    group: optional attribute value; when given, only rows where
        `attribute_name` equals `group` are considered
    attribute_name: column used for the group filter (required with `group`)

    Returns:
    share of examples that receive more than one distinct prediction across
    the models, or NaN when there are no models or no matching rows
    """
    # If group is specified, then limit view to protected group
    if group is not None:
        data = [df[df[attribute_name] == group] for df in dframes]
    else:
        data = dframes

    # Ambiguity is undefined without any model.
    if len(data) == 0:
        return np.nan

    # Shape (num_models, num_preds): one row per model, one column per example.
    all_preds = np.array([df['y_pred'].to_numpy() for df in data])

    # ... and likewise undefined without any example (e.g. an absent group).
    if all_preds.shape[1] == 0:
        return np.nan

    # An example is ambiguous iff some model differs from the first model;
    # this is equivalent to "more than one unique value in the column".
    is_ambiguous = (all_preds != all_preds[0]).any(axis=0)

    return is_ambiguous.mean()


def compute_discrepancy(dframes, group = None, attribute_name = None):
    """
    Largest pairwise disagreement rate between any two models.

    Parameters:
    dframes: list of dataframes, one per model, each with a 'y_pred' column
        (and an `attribute_name` column when `group` is given). All
        dataframes are assumed to describe the same examples in the same row
        order.
    group: optional attribute value; when given, only rows where
        `attribute_name` equals `group` are considered
    attribute_name: column used for the group filter (required with `group`)

    Returns:
    maximum over all model pairs of the share of examples on which the two
    models predict differently, or NaN when there are no models or no
    matching rows
    """
    # If group is specified, then limit view to protected group
    if group is not None:
        data = [df[df[attribute_name] == group] for df in dframes]
    else:
        data = dframes

    # Discrepancy is undefined without any model.
    if len(data) == 0:
        return np.nan

    # Shape (num_models, num_preds): one row per model, one column per example.
    all_preds = np.array([df['y_pred'].to_numpy() for df in data])
    _, num_preds = all_preds.shape

    # An empty view (e.g. a group that never occurs) would otherwise end in a
    # 0/0 division; report NaN instead of raising.
    if num_preds == 0:
        return np.nan

    max_disc = 0

    # Compare each model with every later model; a model never disagrees
    # with itself, so self-pairs are skipped.
    for i, preds in enumerate(all_preds[:-1]):
        disagreements = (all_preds[i + 1:] != preds).sum(axis=1)
        max_disc = max(max_disc, int(disagreements.max()))

    return max_disc / num_preds

def evaluate_model_multiplicity(folder_path, attr_list, groups=None):
    """
    Compute ambiguity and discrepancy overall and per attribute group.

    Parameters:
    folder_path: folder whose '*.csv' files each hold one model's predictions
    attr_list: list of attribute column names to break the metrics down by
    groups: optional iterable of group values to evaluate for every
        attribute. When None, the groups are the values that actually occur
        in the attribute column, so binary attributes coded {0, 1} still
        yield groups 0 and 1, and attributes coded {0, 1, 2, ...} are fully
        covered.

    Returns:
    dict with
      "aggregate": {"ambiguity", "discrepancy"} over all rows
      "attribute": {attr: {group: {"ambiguity", "discrepancy"}}}
    """
    dframes = load_data_folder(folder_path)

    results = {}

    # compute total ambiguity and discrepancy metrics
    results["aggregate"] = {
        "ambiguity": compute_ambiguity(dframes),
        "discrepancy": compute_discrepancy(dframes)
    }

    # compute group level metrics for each attr
    results["attribute"] = {}

    for attr in attr_list:
        if groups is None:
            # Collect every value observed for this attribute across models;
            # tolist() converts numpy scalars to plain Python values so the
            # keys stay JSON-friendly.
            observed = set()
            for df in dframes:
                observed.update(df[attr].unique().tolist())
            attr_groups = sorted(observed)
        else:
            attr_groups = list(groups)

        results["attribute"][attr] = {}
        for group in attr_groups:
            results["attribute"][attr][group] = {
                "ambiguity": compute_ambiguity(dframes, group, attr),
                "discrepancy": compute_discrepancy(dframes, group, attr)
            }

    return results

In [43]:
# Load one dataframe per CSV file found in base_path.
dfs = load_data_folder(base_path)

In [45]:
# Ambiguity and discrepancy over all rows, plus a per-group breakdown for the
# 'attribute_gender' column.
results = evaluate_model_multiplicity(base_path, ['attribute_gender'])

0 attribute_gender
1 attribute_gender


In [46]:
# Display the nested results dict ("aggregate" and "attribute" entries).
results

{'aggregate': {'ambiguity': 1.0, 'discrepancy': 0.561},
 'attribute': {'attribute_gender': {0: {'ambiguity': 1.0,
    'discrepancy': 0.591182364729459},
   1: {'ambiguity': 1.0, 'discrepancy': 0.5788423153692615}}}}

## Test Gen Perf, Fairness, and Mult

In [None]:
import eval as ev

In [47]:
# Folder holding the synthetic model outputs.
base_path = './synth_data/'

# NOTE(review): this calls batch_evaluate from the local `eval` module (not
# shown here) with a list of attribute names as the second argument; the
# batch_evaluate defined earlier in this notebook takes `write_name` in that
# position - confirm the module's signature.
results = ev.batch_evaluate(base_path, ['attribute_gender'])