In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mutual_info_score
from itertools import combinations, chain
import math

def powerset(iterable):
    """Lazily yield every subset of ``iterable`` as a tuple.

    Subsets are produced in order of increasing size, starting with the
    empty tuple and ending with the tuple holding every element. The input
    is materialised into a list first, so one-shot iterators are accepted.
    """
    items = list(iterable)
    subset_sizes = range(len(items) + 1)
    subsets_grouped_by_size = (combinations(items, size) for size in subset_sizes)
    return chain.from_iterable(subsets_grouped_by_size)

def mutual_information(df, features, target):
    """Mutual information between ``target`` and the joint value of ``features``.

    Each row's values in the feature columns are converted to strings and
    joined with ``'_'`` so the whole feature set is treated as one discrete
    variable, which is then passed to ``mutual_info_score`` together with
    the target column.

    Caveat: because the separator is a plain underscore, values that contain
    ``'_'`` themselves can make different rows collapse onto the same label.
    """
    columns = list(features)

    def as_joint_label(row):
        # Stringify every cell of the row and glue them into one label.
        return '_'.join(row.values.astype(str))

    joint_labels = df[columns].apply(as_joint_label, axis=1)
    return mutual_info_score(df[target], joint_labels)


def conditional_mutual_information(df, features, target, conditioned_on):
    """Conditional mutual information I(features; target | conditioned_on).

    The rows are split into strata by the joint value of the
    ``conditioned_on`` columns. Within each stratum the mutual information
    between the joint feature label and the target is computed with
    ``mutual_info_score``, and the per-stratum scores are averaged with
    weights equal to each stratum's share of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is neither copied wholesale nor modified.
    features : iterable of column labels
        Columns whose joint value forms the first variable.
    target : column label
        Column forming the second variable.
    conditioned_on : iterable of column labels
        Columns whose joint value defines the strata.

    Returns
    -------
    The weighted sum of per-stratum mutual information scores (``0`` when
    there are no rows).
    """
    features_list = list(features)
    conditioned_on_list = list(conditioned_on)

    def _row_labels(frame):
        # One string label per row: cell values stringified and joined by '_'.
        return frame.apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

    # Work in a small positional scratch frame instead of adding helper
    # columns to a full copy of ``df``: this avoids copying every column on
    # each call and cannot overwrite a real column that happens to be named
    # like a helper column.
    scratch = pd.DataFrame({
        'x': _row_labels(df[features_list]).to_numpy(),
        'y': df[target].to_numpy(),
        'z': _row_labels(df[conditioned_on_list]).to_numpy(),
    })

    total_rows = len(scratch)
    return sum(
        mutual_info_score(stratum['x'], stratum['y']) * len(stratum) / total_rows
        for _, stratum in scratch.groupby('z')
    )



from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


def calculate_svm_importance(df, features, target):
    """Score a feature set by the coefficient magnitudes of a linear SVM.

    A ``SVC(kernel='linear')`` is fitted on ``df[features]`` against
    ``df[target]`` and the sum of the absolute values of the first row of
    ``coef_`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature and target columns.
    features : iterable of column labels
        Columns used as model inputs. May be empty.
    target : column label
        Column used as the class label.

    Returns
    -------
    float
        ``0.0`` for an empty feature set (there is nothing to fit, and the
        Shapley loop in this notebook evaluates the empty coalition),
        otherwise the summed absolute coefficients.

    Notes
    -----
    NOTE(review): only ``coef_[0]`` is used, so with more than two classes
    the remaining coefficient rows are ignored - confirm the target is binary.
    NOTE(review): features are fitted unscaled, so coefficient magnitudes
    depend on each column's units - confirm this is intended.
    """
    features_list = list(features)

    # An estimator cannot be fitted on zero columns; an empty feature set
    # carries no importance.
    if not features_list:
        return 0.0

    svm = SVC(kernel='linear')
    svm.fit(df[features_list], df[target])

    # The kernel is fixed to 'linear' above, so coef_ is always available.
    importances = svm.coef_[0]
    return abs(importances).sum()




def calculate_unique_information(df, features, target):
    """Proxy for the unique information ``features`` carry about ``target``.

    This is a direct delegation to ``calculate_svm_importance`` with the
    same arguments; its return value is passed through unchanged.
    """
    svm_based_score = calculate_svm_importance(df, features, target)
    return svm_based_score

def calculate_synergistic_information(df, features, target, complement_features):
    """Proxy for the synergy between ``features`` and ``complement_features``.

    Computed as the SVM importance of the two sets combined minus the
    unique-information score of ``features`` alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing all referenced columns.
    features : iterable of column labels
        The feature set of interest.
    target : column label
        Column used as the class label.
    complement_features : iterable of column labels
        The remaining features. Any iterable is accepted (list, tuple, ...).

    Returns
    -------
    The difference ``importance(features + complement) - unique(features)``.
    """
    features_list = list(features)
    # Coerce the complement to a list as well: concatenating a list with a
    # tuple (or any other iterable) would otherwise raise TypeError.
    combined_features = features_list + list(complement_features)
    total_info = calculate_svm_importance(df, combined_features, target)
    unique_info = calculate_unique_information(df, features_list, target)
    return total_info - unique_info


def calculate_accuracy_coefficient(df, feature_set, target, protected_attribute):
    """Accuracy coefficient of ``feature_set``: unique plus synergistic score.

    The complement is every column of ``df`` that is not in ``feature_set``
    and is neither the target nor the protected attribute. The result is
    ``calculate_unique_information(feature_set)`` added to
    ``calculate_synergistic_information(feature_set, complement)``.

    NOTE(review): the synergistic term is defined in this file as
    ``total - unique``, so the sum returned here is algebraically the SVM
    importance of ``feature_set`` together with its complement, i.e. it
    barely depends on which features are in ``feature_set``. Confirm this
    is the intended definition.
    """
    complement_features = []
    for column in df.columns:
        if column in feature_set:
            continue
        if column == target or column == protected_attribute:
            continue
        complement_features.append(column)

    unique_part = calculate_unique_information(df, feature_set, target)
    synergistic_part = calculate_synergistic_information(
        df, feature_set, target, complement_features
    )
    return unique_part + synergistic_part



def calculate_discrimination_coefficient(df, feature_set, target, protected_attribute):
    """Discrimination coefficient of ``feature_set``.

    The coefficient is the product of three quantities:

    * mutual information between the feature set and the target,
    * mutual information between the feature set and the protected attribute,
    * mutual information between the feature set and the protected
      attribute, conditioned on the target.
    """
    info_with_target = mutual_information(df, feature_set, target)
    info_with_protected = mutual_information(df, feature_set, protected_attribute)
    info_with_protected_given_target = conditional_mutual_information(
        df, feature_set, protected_attribute, [target]
    )
    product = info_with_target * info_with_protected
    product = product * info_with_protected_given_target
    return product


def shapley_value(df, features, target, protected_attribute, accuracy_or_discrimination):
    """Compute the Shapley value of every feature for the chosen coefficient.

    For each feature ``i`` and each coalition ``S`` of the other features the
    marginal contribution ``v(S + {i}) - v(S)`` is weighted by
    ``|S|! * (n - |S| - 1)! / n!`` and accumulated, where ``n`` is the number
    of features and ``v`` is the coalition value function.

    Parameters
    ----------
    df : pandas.DataFrame
        Data handed through to the coefficient function.
    features : sequence of column labels
        Features to attribute value to.
    target : column label
        Target column, handed through to the coefficient function.
    protected_attribute : column label
        Protected column, handed through to the coefficient function.
    accuracy_or_discrimination : str
        ``'accuracy'`` selects ``calculate_accuracy_coefficient``; any other
        value selects ``calculate_discrimination_coefficient``.

    Returns
    -------
    dict
        Maps each feature to its Shapley value.
    """
    if accuracy_or_discrimination == 'accuracy':
        coefficient = calculate_accuracy_coefficient
    else:
        coefficient = calculate_discrimination_coefficient

    feature_order = list(features)
    total_features = len(feature_order)
    # De-duplicated features in the caller's order; used to enumerate
    # coalitions deterministically instead of relying on set iteration order.
    unique_features = list(dict.fromkeys(feature_order))

    # Each coalition appears in many (feature, subset) pairs. Evaluating it
    # once matters because every evaluation fits a model or scans all rows.
    coalition_values = {}

    def coalition_value(members):
        key = frozenset(members)
        if key not in coalition_values:
            ordered_members = [name for name in unique_features if name in key]
            coalition_values[key] = coefficient(df, ordered_members, target, protected_attribute)
        return coalition_values[key]

    shapley_values = dict.fromkeys(feature_order, 0.0)
    normaliser = math.factorial(total_features)

    for feature in unique_features:
        others = [name for name in unique_features if name != feature]
        for feature_set in powerset(others):
            size = len(feature_set)
            contribution = coalition_value(feature_set + (feature,)) - coalition_value(feature_set)
            weight_numerator = math.factorial(size) * math.factorial(total_features - size - 1)
            shapley_values[feature] += (contribution * weight_numerator) / normaliser

    return shapley_values


def generate_feature_comparison_table(df, features, target, protected_attribute):
    """Build a per-feature table of marginal accuracy and discrimination.

    ``shapley_value`` is run once in ``'accuracy'`` mode and once in
    ``'discrimination'`` mode; the resulting per-feature values are laid out
    next to the feature names, one row per entry of ``features``.

    Returns a ``pandas.DataFrame`` with the columns ``'Feature'``,
    ``'Marginal Accuracy Coefficient'`` and
    ``'Marginal Discrimination Coefficient'``.
    """
    shapley_by_mode = {
        mode: shapley_value(df, features, target, protected_attribute, mode)
        for mode in ('accuracy', 'discrimination')
    }

    table_columns = {'Feature': features}
    table_columns['Marginal Accuracy Coefficient'] = [
        shapley_by_mode['accuracy'][name] for name in features
    ]
    table_columns['Marginal Discrimination Coefficient'] = [
        shapley_by_mode['discrimination'][name] for name in features
    ]
    return pd.DataFrame(table_columns)

In [2]:
# Disabled example driver: the code below is wrapped in a bare triple-quoted
# string, so running this cell executes none of it and only echoes the text.
# It shows the intended call: build the comparison table for the listed
# features with 'two_year_recid' as target and 'race' as protected attribute.
# NOTE(review): `compas_data_final` and `columns_to_convert` are not defined in
# the visible cells, and `features` duplicates `feature_list` without being
# used - resolve these before re-enabling the cell.
'''
compas_data_final_copy = compas_data_final.copy()

features = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']
for col in columns_to_convert:
    compas_data_final_copy[col] = compas_data_final_copy[col].astype(int)

feature_list = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']
sensitive_feature = 'race'
feature_comparison_table = generate_feature_comparison_table(compas_data_final_copy, feature_list, 'two_year_recid', sensitive_feature)

print(feature_comparison_table)
'''

"\ncompas_data_final_copy = compas_data_final.copy()\n\nfeatures = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']\nfor col in columns_to_convert:\n    compas_data_final_copy[col] = compas_data_final_copy[col].astype(int)\n\nfeature_list = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']\nsensitive_feature = 'race'\nfeature_comparison_table = generate_feature_comparison_table(compas_data_final_copy, feature_list, 'two_year_recid', sensitive_feature)\n\nprint(feature_comparison_table)\n"