In [2]:
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from sklearn.preprocessing import StandardScaler
# define a XGBoost classifier
import xgboost as xgb
import warnings
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from aif360.sklearn.datasets import fetch_compas
from aif360.sklearn.metrics import disparate_impact_ratio, consistency_score, generalized_entropy_error
from aif360.sklearn.detectors import bias_scan
from aif360.sklearn.inprocessing import AdversarialDebiasing
from aif360.datasets import BinaryLabelDataset, StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric
warnings.filterwarnings("ignore")  # Ignore runtime warnings
# Temporarily adjust pandas display settings for large DataFrames
pd.set_option('display.max_rows', 100)  # Ensure 100 rows can be displayed
pd.set_option('display.max_columns', None)  # Ensure all columns can be displayed
pd.set_option('display.width', None)  # Automatically adjust display width to terminal size
pd.set_option('display.max_colwidth', None)  # Ensure full width of column content is shown
pd.set_option('display.float_format', '{:.4f}'.format)  # Format the float numbers for better readability




pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'
pip install 'aif360[OptimalTransport]'
pip install 'aif360[FACTS]'


In [3]:
data = pd.read_csv('C:\\Users\\91948\\Desktop\\SE\\Software-Testing-Project\\data\\investigation_train_large_checked_adjusted.csv')

In [4]:
def zero_features_by_keywords(data, keywords):
    data_copy = data.copy()
    zeroed_columns = []

    for column in data_copy.columns:
        if any(column.startswith(keyword) for keyword in keywords):
            data_copy[column] = 0
            zeroed_columns.append(column)

    return data_copy, zeroed_columns
# Define non-fair keywords (same as before)
non_fair_keywords = [
    "relatie_kind","relatie_partner","ontheffing","belemmering","competentie","persoon","persoonlijke","adres","Ja","Nee"
]

In [5]:
# Let's specify the features and the target
y = data['checked']
X = data.drop(['checked'], axis=1)
X = X.astype(np.float32)

In [6]:
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the dataset
X, y = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

X_train,zeroed_cols = zero_features_by_keywords(X_train,non_fair_keywords)

In [7]:
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.2, max_depth=3, min_samples_leaf=4, subsample=0.9, min_samples_split=10, random_state=0)

# Initialize the classifier
clf = RandomForestClassifier(n_estimators=200)

# Initialize SelectFromModel using the classifier to determine feature importances
sfm = SelectFromModel(clf, threshold='mean')  # Adjust threshold as needed

pipeline = Pipeline([
    ('scaling', StandardScaler()),
    ('feature_selection', sfm),
    ('classification', classifier)
])

# Train the model
pipeline.fit(X_train, y_train)

In [8]:
# Let's evaluate the model
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.4f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.4f}')

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1:.4f}')

# print confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)


Accuracy: 0.9376
Precision: 0.9664
Recall: 0.9067
F1 Score: 0.9356


array([[21404,   696],
       [ 2062, 20037]], dtype=int64)

In [15]:
# Let's convert the model to ONNX
onnx_model = convert_sklearn(
    pipeline, initial_types=[('X', FloatTensorType((None, X.shape[1])))],
    target_opset=12)

onnx.save(onnx_model, "good_model3.onnx")
# Let's check the accuracy of the converted model
sess = rt.InferenceSession(onnx_model.SerializeToString())
y_pred_onnx =  sess.run(None, {'X': X_test.values.astype(np.float32)})

accuracy_onnx_model = accuracy_score(y_test, y_pred_onnx[0])
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9376003981990543


In [20]:
testing_session = rt.InferenceSession("good_model3.onnx")

In [17]:
def evaluate_model_by_group(testing_session, X_test, y_test, group_column, group_mapping=None, bins=None, labels=None):
    """
    Evaluate model performance across demographic groups.

    Parameters:
        testing_session (InferenceSession): The ONNX model inference session.
        X_test (pd.DataFrame): Test dataset features.
        y_test (pd.Series): True labels for the test dataset.
        group_column (str): Column used to define groups.
        group_mapping (dict): Mapping of values in `group_column` to group labels.
        bins (list): Bin edges for numeric grouping.
        labels (list): Labels for bins if numeric grouping is used.

    Returns:
        dict: A dictionary containing performance metrics for each group.
    """
    # Copy test data to avoid modification
    X_test_grouped = X_test.copy()

    # Handle group definitions (mapping or binning)
    if group_mapping:
        X_test_grouped['group'] = X_test_grouped[group_column].map(group_mapping)
    elif bins and labels:
        X_test_grouped['group'] = pd.cut(X_test_grouped[group_column], bins=bins, labels=labels)
    else:
        raise ValueError("Provide either `group_mapping` or `bins` and `labels` to define groups.")

    # Drop rows with NaN in the 'group' column
    X_test_grouped = X_test_grouped.dropna(subset=['group'])

    # Align y_test with X_test_grouped
    y_test_aligned = y_test.loc[X_test_grouped.index]

    # Debug: Check group distribution
    print("Group distribution:\n", X_test_grouped['group'].value_counts())

    # Initialize dictionary to store results
    results = {}

    # Evaluate performance for each group
    for group in X_test_grouped['group'].unique():
        # Filter data for the current group
        X_group = X_test_grouped[X_test_grouped['group'] == group].drop(columns=['group'])
        y_group = y_test_aligned[X_test_grouped['group'] == group]

        # Predict using the model
        y_pred_group = testing_session.run(None, {'X': X_group.values.astype(np.float32)})[0]

        # Calculate evaluation metrics
        accuracy_group = accuracy_score(y_group, y_pred_group)
        precision_group = precision_score(y_group, y_pred_group, zero_division=0)
        recall_group = recall_score(y_group, y_pred_group, zero_division=0)
        f1_score_group = f1_score(y_group, y_pred_group, zero_division=0)

        # Store metrics for the group
        results[group] = {
            'Accuracy': accuracy_group,
            'Precision': precision_group,
            'Recall': recall_group,
            'F1 Score': f1_score_group
        }

    return results


In [18]:
age_bins = [0,30,30, 40, 50, 60, 120]
age_labels = ['young_adult', 'youngish_adult', 'middle_aged_adult', 'older_adult', 'senior']

age_results = evaluate_model_by_group(
    testing_session=testing_session,
    X_test=X_test,
    y_test=y_test,
    group_column='persoon_leeftijd_bij_onderzoek',
    bins=age_bins,
    labels=age_labels
)

print("Age Group Results:")
for group, metrics in age_results.items():
    print(f"{group}: {metrics}")

Group distribution:
 group
middle_aged_adult    15113
older_adult          12158
youngish_adult        9417
senior                5568
young_adult           1943
Name: count, dtype: int64
Age Group Results:
middle_aged_adult: {'Accuracy': 0.946139085555482, 'Precision': 0.9675961786728634, 'Recall': 0.9301315462894019, 'F1 Score': 0.9484940521386991}
older_adult: {'Accuracy': 0.9564895542029939, 'Precision': 0.9342915811088296, 'Recall': 0.9302325581395349, 'F1 Score': 0.9322576514278397}
youngish_adult: {'Accuracy': 0.9306573218647127, 'Precision': 0.9841982701264138, 'Recall': 0.9138223938223938, 'F1 Score': 0.9477056138383919}
senior: {'Accuracy': 0.8987068965517241, 'Precision': 0.9666001994017946, 'Recall': 0.7959770114942529, 'F1 Score': 0.8730301665916254}
young_adult: {'Accuracy': 0.8980957282552754, 'Precision': 0.9748369058713886, 'Recall': 0.8594905505341003, 'F1 Score': 0.9135371179039301}


In [21]:
import numpy as np
import pandas as pd

def calculate_fairness_stats(privileged_total, privileged_checked, unprivileged_total, unprivileged_checked):
    """
    Calculates fairness statistics: Disparate Impact and Statistical Parity Difference.
    """
    unprivileged_unchecked = unprivileged_total - unprivileged_checked
    privileged_unchecked = privileged_total - privileged_checked

    favorable_outcome_given_unprivileged = unprivileged_unchecked / unprivileged_total if unprivileged_total > 0 else 0
    favorable_outcome_given_privileged = privileged_unchecked / privileged_total if privileged_total > 0 else 0

    spd = favorable_outcome_given_unprivileged - favorable_outcome_given_privileged
    di = (
        favorable_outcome_given_unprivileged / favorable_outcome_given_privileged
        if favorable_outcome_given_privileged > 0 else 0
    )

    return {
        "Disparate Impact": di,
        "Statistical Parity Difference": spd
    }

def analyze_age_groups(rt_session, x_test, y_pred, age_col, ranges=None):
    """
    Analyzes fairness for multiple age groups using predictions from an ONNX inference session.
    
    Args:
        rt_session: ONNX Runtime InferenceSession.
        x_test: Test features as a pandas DataFrame.
        y_pred: Predictions as a numpy array.
        age_col: The column name representing age in `x_test`.
        ranges: List of tuples defining age ranges (e.g., [(18, 30), (30, 40), ...]).
    
    Returns:
        pd.DataFrame: Results with counts and fairness metrics.
    """
    if ranges is None:
        ranges = [(18, 30), (30, 40), (40, 50), (50, 80)]

    results = []

    for age_range in ranges:
        min_age, max_age = age_range

        # Privileged group condition
        privileged_condition = (x_test[age_col] >= min_age) & (x_test[age_col] < max_age)

        # Separate privileged and unprivileged groups using conditions
        privileged_indices = privileged_condition.values
        unprivileged_indices = ~privileged_condition.values

        # Calculate total and favorable outcome counts
        privileged_total = privileged_indices.sum()
        privileged_checked = y_pred[privileged_indices].sum()

        unprivileged_total = unprivileged_indices.sum()
        unprivileged_checked = y_pred[unprivileged_indices].sum()

        # Calculate fairness statistics
        fairness_stats = calculate_fairness_stats(
            privileged_total, privileged_checked, unprivileged_total, unprivileged_checked
        )

        # Store results
        results.append({
            'Age Range': f"{min_age}-{max_age}",
            'Privileged Total': privileged_total,
            'Privileged Favorable': privileged_checked,
            'Unprivileged Total': unprivileged_total,
            'Unprivileged Favorable': unprivileged_checked,
            'Disparate Impact': fairness_stats["Disparate Impact"],
            'Statistical Parity Difference': fairness_stats["Statistical Parity Difference"]
        })

    # Convert results to DataFrame and display
    results_df = pd.DataFrame(results)
    print(results_df)
    return results_df


# Example usage:
# Assuming `session` is your ONNX Runtime InferenceSession, and `x_test` is a DataFrame.
ranges = [(18, 25), (25, 40), (40, 50), (50, 80)]

# Get predictions from the ONNX Runtime session
y_pred = testing_session.run(None, {'X': X_test.values.astype(np.float32)})[0].round().astype(int)

# Analyze fairness across age groups
results_df = analyze_age_groups(rt_session=testing_session, x_test=X_test, y_pred=y_pred, age_col='persoon_leeftijd_bij_onderzoek', ranges=ranges)


  Age Range  Privileged Total  Privileged Favorable  Unprivileged Total  \
0     18-30              1723                  1021               42476   
1     30-40              8987                  5962               35212   
2     40-50             14756                  7744               29443   
3     50-80             18733                  6006               25466   

   Unprivileged Favorable  Disparate Impact  Statistical Parity Difference  
0                   19712            1.3154                         0.1285  
1                   14771            1.7246                         0.2439  
2                   12989            1.1760                         0.0836  
3                   14727            0.6207                        -0.2577  
