### Import relative modules and the dataset COMPAS

In [16]:
import pandas as pd
import os, sys
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import log_loss
from datetime import datetime
from google.colab import drive
drive.mount("/content/drive")

#upload 6 files to google drive
compas_data = pd.read_csv('/content/drive/MyDrive/ADSProj4/compas-scores-two-years.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
compas_data

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,...,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0
7211,10999,winston gregory,winston,gregory,2014-01-14,Male,1958-10-01,57,Greater than 45,Other,...,1,Low,2014-01-14,2014-01-13,2014-01-14,0,0,808,0,0
7212,11000,farrah jean,farrah,jean,2014-03-09,Female,1982-11-17,33,25 - 45,African-American,...,2,Low,2014-03-09,2014-03-08,2014-03-09,3,0,754,0,0


### Encoding COMPAS dataset

*   age_cat: 2 for > 45, 1 for 25 - 45 and 0 for < 25
*   charge_degree: 1 for F and 0 for M
*   sex: 1 for male and 0 for female
*   priors_count_cat: 0 for 0, 1 for 1 to 3, 2 for larger than 3
*   length_of_stay: 0 for ≤ 1 week, 1 for 1 week to 3 months, 2 for > 3 months
*   race: 1 for caucasian and 0 for african-american

In [18]:
# Encode variables with dummy variables
def map_age_category(age):
    if age < 25:
        return '0'
    elif 25 <= age < 45:
        return '1'
    else:
        return '2'

def map_charge_degree(charge):
    return {'F': '1', 'M': '0'}.get(charge[0], 'Other')

def map_prior_counts(priors):
    if priors == 0:
        return '0'
    elif 1 <= priors <= 3:
        return '1'
    else:
        return '2'

# We don't have direct data for Length of Stay, but we can compute it if 'in_custody' and 'out_custody' are dates
def calculate_length_of_stay(in_custody, out_custody):
    if pd.isnull(in_custody) or pd.isnull(out_custody):
        return None  # If either date is missing, return None
    in_date = datetime.strptime(in_custody, "%Y-%m-%d")
    out_date = datetime.strptime(out_custody, "%Y-%m-%d")
    return (out_date - in_date).days

def map_length_of_stay(length):
    if length < 7:
        return '0'
    elif 7 <= length < 84:
        return '1'
    else:
        return '2'

In [19]:
# Process the COMPAS dataset
compas_data_processed = compas_data.copy()

# Map the age to categories
compas_data_processed['age_cat'] = compas_data_processed['age'].apply(map_age_category)

# Map the charge degree to categories
compas_data_processed['charge_degree'] = compas_data_processed['c_charge_degree'].apply(map_charge_degree)

# Map the priors count to categories
compas_data_processed['priors_counts'] = compas_data_processed['priors_count.1'].apply(map_prior_counts)

# Calculate the length of stay
compas_data_processed['length_of_stay'] = compas_data_processed.apply(
    lambda row: calculate_length_of_stay(row['in_custody'], row['out_custody']), axis=1)

# Map the length of stay to categories
compas_data_processed['length_of_stay'] = compas_data_processed['length_of_stay'].apply(map_length_of_stay)

# Encode gender as binary
compas_data_processed['sex'] = compas_data_processed['sex'].map({'Male': 0, 'Female': 1})

# Restrict the dataset to African American and Caucasian only
compas_data_processed = compas_data_processed[compas_data_processed['race'].isin(['African-American', 'Caucasian'])]
compas_data_processed['race'] = compas_data_processed['race'].map({'African-American': 0, 'Caucasian': 1})

# Select the relevant columns for the feature vector and the label
feature_columns = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']
label_column = 'two_year_recid'
compas_data_final = compas_data_processed[feature_columns + [label_column, 'race']]
compas_data_final.dropna(subset=['length_of_stay'], inplace=True)


compas_data_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6150 entries, 1 to 7212
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age_cat         6150 non-null   object
 1   charge_degree   6150 non-null   object
 2   sex             6150 non-null   int64 
 3   priors_counts   6150 non-null   object
 4   length_of_stay  6150 non-null   object
 5   two_year_recid  6150 non-null   int64 
 6   race            6150 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 384.4+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compas_data_processed['race'] = compas_data_processed['race'].map({'African-American': 0, 'Caucasian': 1})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  compas_data_final.dropna(subset=['length_of_stay'], inplace=True)


In [20]:
compas_data_final

Unnamed: 0,age_cat,charge_degree,sex,priors_counts,length_of_stay,two_year_recid,race
1,1,1,0,0,1,1,0
2,0,1,0,2,0,1,0
3,0,1,0,1,2,0,0
6,1,1,0,2,1,1,1
8,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...
7207,1,0,0,0,0,1,0
7208,0,1,0,0,1,0,0
7209,0,1,0,0,0,0,0
7210,0,1,0,0,0,0,0


### Feature Selection and Split data

In [21]:
# Store features
features = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']
sensitive = 'race'
target = 'two_year_recid'

# Function to process and shuffle data
def process_df(df):
    y_label = df[target]
    protected_attr = df[sensitive]
    df_new = df[features]
    y_label, protected_attr, df_new = shuffle(y_label, protected_attr, df_new, random_state = 617)

    return y_label.to_numpy(), protected_attr.to_numpy(), df_new.to_numpy()

# Split data into train and test
y_label, protected_attr, compas_new =  process_df(compas_data_final)
train_index = int(len(compas_new) * 0.7)
x_train, y_train, race_train = compas_new[:train_index], y_label[:train_index], protected_attr[:train_index]
x_test, y_test, race_test = compas_new[train_index:], y_label[train_index:],protected_attr[train_index:]

Functions to determine the p-rule (p%) and a function to compute calibration.

*   Protected: Caucasians (i.e., race == 1)
*   Not protected: African-Americans (i.e., race == 0)

In [22]:
# Function to compute p-rule
def p_rule(sensitive_var, y_pred):
    protected = np.where(sensitive_var == 1)[0]
    not_protected = np.where(sensitive_var == 0)[0]
    protected_pred = np.where(y_pred[protected] == 1)
    not_protected_pred = np.where(y_pred[not_protected] == 1)
    protected_percent = protected_pred[0].shape[0]/protected.shape[0]
    not_protected_percent = not_protected_pred[0].shape[0]/not_protected.shape[0]
    ratio = min(protected_percent/not_protected_percent, not_protected_percent/protected_percent)

    return ratio, protected_percent, not_protected_percent

In [23]:
# Function to compute calibration
def calibration(sensitive_var, y_pred, y_true):
    protected_point = np.where(sensitive_var == 1)[0]
    y_predcau = y_pred[protected_point]
    y_truecau = y_true[protected_point]
    pcau = sum(y_predcau==y_truecau)/len(y_truecau)
    not_protected_point = np.where(sensitive_var == 0)[0]
    y_predafa = y_pred[not_protected_point]
    y_trueafa = y_true[not_protected_point]
    pafa = sum(y_predafa==y_trueafa)/len(y_trueafa)
    calibration = abs(pcau-pafa)
    return(calibration)

### Train model using Logistic Regression and SVM

In [24]:
# Train model and print results
clf = LogisticRegression(random_state = 0).fit(x_train, y_train)
coeff = clf.coef_
intercept = clf.intercept_
optimal_loss = log_loss(y_train, clf.predict_proba(x_train))
results_lr = {"Classifier": ["LR", "LR"],
              "Set": ["Train", "Test"],
              "Accuracy (%)": [clf.score(x_train, y_train)*100, clf.score(x_test, y_test)*100],
              "P-rule (%)": [p_rule(race_train, clf.predict(x_train))[0]*100, p_rule(race_test, clf.predict(x_test))[0]*100],
              "Protected (%)": [p_rule(race_train, clf.predict(x_train))[1]*100, p_rule(race_test, clf.predict(x_test))[1]*100],
              "Not protected (%)": [p_rule(race_train, clf.predict(x_train))[2]*100, p_rule(race_test, clf.predict(x_test))[2]*100],
              "Calibration (%)": [calibration(race_train, clf.predict(x_train), y_train)*100, calibration(race_test, clf.predict(x_test), y_test)*100]}
pd.DataFrame(results_lr)

Unnamed: 0,Classifier,Set,Accuracy (%),P-rule (%),Protected (%),Not protected (%),Calibration (%)
0,LR,Train,65.598142,54.395537,27.970012,51.419681,2.658277
1,LR,Test,65.582656,58.145944,29.305556,50.4,1.638889


In [25]:
# Train model and print results
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

svm_model = SVC(kernel = 'linear', probability = True)

# Train model and print results
clf = svm_model.fit(x_train, y_train)
optimal_loss = log_loss(y_train, clf.predict_proba(x_train))
results_svm = {"Classifier": ["SVM", "SVM"],
               "Set": ["Train", "Test"],
               "Accuracy (%)": [clf.score(x_train, y_train)*100, clf.score(x_test, y_test)*100],
               "P-rule (%)": [p_rule(race_train, clf.predict(x_train))[0]*100, p_rule(race_test, clf.predict(x_test))[0]*100],
               "Protected (%)": [p_rule(race_train, clf.predict(x_train))[1]*100, p_rule(race_test, clf.predict(x_test))[1]*100],
               "Not protected (%)": [p_rule(race_train, clf.predict(x_train))[2]*100, p_rule(race_test, clf.predict(x_test))[2]*100],
               "Calibration (%)": [calibration(race_train, clf.predict(x_train), y_train)*100, calibration(race_test, clf.predict(x_test), y_test)*100]}
pd.DataFrame(results_svm)

Unnamed: 0,Classifier,Set,Accuracy (%),P-rule (%),Protected (%),Not protected (%),Calibration (%)
0,SVM,Train,63.344948,67.683001,46.885813,69.272657,1.795984
1,SVM,Test,61.95122,71.548151,49.861111,69.688889,6.161111


### Train LR and SVM implementing FFS

In [32]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, log_loss
from sklearn.feature_selection import mutual_info_classif
from itertools import combinations
import numpy as np

selected_features = ['sex', 'age_cat', 'priors_counts', 'charge_degree', 'race', 'two_year_recid']
data_subset = compas_data_final[selected_features].copy()


label_encoders = {}
for column in data_subset.select_dtypes(include=['object']).columns:
  le = LabelEncoder()
  data_subset[column] = le.fit_transform(data_subset[column])
  label_encoders[column] = le


X = data_subset.drop('two_year_recid', axis=1)
y = data_subset['two_year_recid']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


parameters = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svc = svm.SVC(probability=True)
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(X_train, y_train)


best_svm_model = clf.best_estimator_


y_pred = best_svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


y_pred_proba = best_svm_model.predict_proba(X_test)
calibration_score = log_loss(y_test, y_pred_proba)

print("Best SVM parameter:", clf.best_params_)
print("Accuracy:", accuracy)
print("Summary:\n", report)
print("Calibration score:", calibration_score)


mutual_info_scores = mutual_info_classif(X, y)
feature_info_scores = dict(zip(X.columns, mutual_info_scores))
sorted_features = sorted(feature_info_scores.items(), key=lambda x: x[1], reverse=True)

print("Feature score:", sorted_features)


def shapley_value_estimation(X, y, model, feature_names):
  shapley_values = dict.fromkeys(feature_names, 0.0)
  n_features = len(feature_names)

Best SVM parameter: {'C': 0.1, 'kernel': 'rbf'}
Accuracy: 0.6552845528455284
Summary:
               precision    recall  f1-score   support

           0       0.67      0.70      0.68       974
           1       0.64      0.61      0.63       871

    accuracy                           0.66      1845
   macro avg       0.65      0.65      0.65      1845
weighted avg       0.65      0.66      0.65      1845

Calibration score: 0.6379364978970188
Feature score: [('priors_counts', 0.03221193075869433), ('age_cat', 0.01746151011506969), ('sex', 0.015288095365355137), ('charge_degree', 0.014988736610985098), ('race', 0.007064207134711564)]


In [26]:
# Store features (Removed 'Charge Degree')
features = ['sex', 'age_cat', 'priors_counts', 'length_of_stay']
sensitive = 'race'
target = 'two_year_recid'

# Split data into train and test
y_label, protected_attr, compas_new =  process_df(compas_data_final)
train_index = int(len(compas_new) * 0.7)
x_train, y_train, race_train = compas_new[:train_index], y_label[:train_index], protected_attr[:train_index]
x_test, y_test, race_test = compas_new[train_index:], y_label[train_index:],protected_attr[train_index:]

In [27]:
# Train model and print results
clf = LogisticRegression(random_state = 0).fit(x_train, y_train)
coeff = clf.coef_
intercept = clf.intercept_
optimal_loss = log_loss(y_train, clf.predict_proba(x_train))
results_ffs_lr = {"Classifier": ["FFS-LR", "FFS-LR"],
                  "Set": ["Train", "Test"],
                  "Accuracy (%)": [clf.score(x_train, y_train)*100, clf.score(x_test, y_test)*100],
                  "P-rule (%)": [p_rule(race_train, clf.predict(x_train))[0]*100, p_rule(race_test, clf.predict(x_test))[0]*100],
                  "Protected (%)": [p_rule(race_train, clf.predict(x_train))[1]*100, p_rule(race_test, clf.predict(x_test))[1]*100],
                  "Not protected (%)": [p_rule(race_train, clf.predict(x_train))[2]*100, p_rule(race_test, clf.predict(x_test))[2]*100],
                  "Calibration (%)": [calibration(race_train, clf.predict(x_train), y_train)*100, calibration(race_test, clf.predict(x_test), y_test)*100]}
pd.DataFrame(results_ffs_lr)

Unnamed: 0,Classifier,Set,Accuracy (%),P-rule (%),Protected (%),Not protected (%),Calibration (%)
0,FFS-LR,Train,65.737515,54.328746,28.316032,52.119798,2.231773
1,FFS-LR,Test,65.636856,58.184003,29.583333,50.844444,1.272222


In [28]:
svm_model = SVC(kernel = 'linear', probability = True)

# Train model and print results
clf = svm_model.fit(x_train, y_train)
optimal_loss = log_loss(y_train, clf.predict_proba(x_train))
results_ffs_svm = {"Classifier": ["FFS-SVM", "FFS-SVM"],
                   "Set": ["Train", "Test"],
                   "Accuracy (%)": [clf.score(x_train, y_train)*100, clf.score(x_test, y_test)*100],
                   "P-rule (%)": [p_rule(race_train, clf.predict(x_train))[0]*100, p_rule(race_test, clf.predict(x_test))[0]*100],
                   "Protected (%)": [p_rule(race_train, clf.predict(x_train))[1]*100, p_rule(race_test, clf.predict(x_test))[1]*100],
                   "Not protected (%)": [p_rule(race_train, clf.predict(x_train))[2]*100, p_rule(race_test, clf.predict(x_test))[2]*100],
                   "Calibration (%)": [calibration(race_train, clf.predict(x_train), y_train)*100, calibration(race_test, clf.predict(x_test), y_test)*100]}
pd.DataFrame(results_ffs_svm)

Unnamed: 0,Classifier,Set,Accuracy (%),P-rule (%),Protected (%),Not protected (%),Calibration (%)
0,FFS-SVM,Train,63.391405,67.99308,45.32872,66.666667,2.007889
1,FFS-SVM,Test,61.680217,72.050938,47.777778,66.311111,4.577778


### Summary

In [29]:
# Results summary
results_summary = pd.concat([pd.DataFrame(results_lr).iloc[1,],pd.DataFrame(results_ffs_lr).iloc[1,],
                pd.DataFrame(results_svm).iloc[1,],pd.DataFrame(results_ffs_svm).iloc[1,]], axis=1)
results_summary.columns = results_summary.iloc[0]
results_summary = results_summary.iloc[1:,:]
results_summary

Classifier,LR,FFS-LR,SVM,FFS-SVM
Set,Test,Test,Test,Test
Accuracy (%),65.582656,65.636856,61.95122,61.680217
P-rule (%),58.145944,58.184003,71.548151,72.050938
Protected (%),29.305556,29.583333,49.861111,47.777778
Not protected (%),50.4,50.844444,69.688889,66.311111
Calibration (%),1.638889,1.272222,6.161111,4.577778


In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics import mutual_info_score
from itertools import combinations, chain
import math

def powerset(iterable):
    """Create a powerset of the given iterable."""
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

def mutual_information(df, features, target):
    """Calculate mutual information between features and target."""
    features_list = list(features)
    return mutual_info_score(df[target], df[features_list].apply(lambda row: '_'.join(row.values.astype(str)), axis=1))


def conditional_mutual_information(df, features, target, conditioned_on):
    """Calculate conditional mutual information between features and target, conditioned on other features."""
    features_list = list(features)
    conditioned_on_list = list(conditioned_on)

    df_combined = df.copy()
    df_combined['combined_features'] = df_combined[features_list].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    df_combined['combined_conditioned'] = df_combined[conditioned_on_list].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

    grouped = df_combined.groupby('combined_conditioned')
    cmi = sum(mutual_info_score(group['combined_features'], group[target]) * len(group) / len(df_combined) for name, group in grouped)

    return cmi



from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


def calculate_svm_importance(df, features, target):
    """Estimate feature importance using SVM."""
    features_list = list(features)

    svm = SVC(kernel='linear')
    print(df[features_list])
    print(df[target])
    svm.fit(df[features_list], df[target])


    if svm.kernel == 'linear':
        importances = svm.coef_[0]
        return abs(importances).sum()
    else:
        raise ValueError("SVM kernel is not linear. Feature importance can't be extracted.")




def calculate_unique_information(df, features, target):
    """Estimate unique information provided by a set of features with respect to the target using SVM."""
    return calculate_svm_importance(df, features, target)

def calculate_synergistic_information(df, features, target, complement_features):
    """Estimate synergistic information provided by a set of features and its complement with respect to the target using SVM."""
    features_list = list(features)
    combined_features = features_list + complement_features
    total_info = calculate_svm_importance(df, combined_features, target)
    unique_info = calculate_unique_information(df, features_list, target)
    synergistic_info = total_info - unique_info
    return synergistic_info


def calculate_accuracy_coefficient(df, feature_set, target, protected_attribute):
    """Calculate the accuracy coefficient for a given set of features using SVM."""
    complement_features = [col for col in df.columns if col not in feature_set and col != target and col != protected_attribute]

    unique_info = calculate_unique_information(df, feature_set, target)

    synergistic_info = calculate_synergistic_information(df, feature_set, target, complement_features)

    accuracy_coefficient = unique_info + synergistic_info
    return accuracy_coefficient



def calculate_discrimination_coefficient(df, feature_set, target, protected_attribute):
    """Calculate the discrimination coefficient for a given set of features."""
    shared_info = mutual_information(df, feature_set, target)
    mutual_info_xs_a = mutual_information(df, feature_set, protected_attribute)
    conditional_mutual_info_xs_a_y = conditional_mutual_information(df, feature_set, protected_attribute, [target])
    return shared_info * mutual_info_xs_a * conditional_mutual_info_xs_a_y


def shapley_value(df, features, target, protected_attribute, accuracy_or_discrimination):
    """Calculate the Shapley value for each feature."""
    shapley_values = dict.fromkeys(features, 0)
    total_features = len(features)

    for feature in features:
        for feature_set in powerset(set(features) - {feature}):
            feature_set_with = set(feature_set).union({feature})
            if accuracy_or_discrimination == 'accuracy':
                contribution = calculate_accuracy_coefficient(df, feature_set_with, target, protected_attribute) - calculate_accuracy_coefficient(df, feature_set, target, protected_attribute)
            else:
                contribution = calculate_discrimination_coefficient(df, feature_set_with, target, protected_attribute) - calculate_discrimination_coefficient(df, feature_set, target, protected_attribute)
            shapley_values[feature] += (contribution * math.factorial(len(feature_set)) * math.factorial(total_features - len(feature_set) - 1)) / math.factorial(total_features)

    return shapley_values


def generate_feature_comparison_table(df, features, target, protected_attribute):
    """Generate a table comparing features with their accuracy and discrimination coefficients."""
    accuracy_shapley = shapley_value(df, features, target, protected_attribute, 'accuracy')
    discrimination_shapley = shapley_value(df, features, target, protected_attribute, 'discrimination')

    comparison_table = pd.DataFrame({
        'Feature': features,
        'Marginal Accuracy Coefficient': [accuracy_shapley[feature] for feature in features],
        'Marginal Discrimination Coefficient': [discrimination_shapley[feature] for feature in features]
    })

    return comparison_table

In [37]:
'''
compas_data_final_copy = compas_data_final.copy()

features = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']
for col in columns_to_convert:
    compas_data_final_copy[col] = compas_data_final_copy[col].astype(int)

feature_list = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']
sensitive_feature = 'race'
feature_comparison_table = generate_feature_comparison_table(compas_data_final_copy, feature_list, 'two_year_recid', sensitive_feature)

print(feature_comparison_table)
'''

"\ncompas_data_final_copy = compas_data_final.copy()\n\nfeatures = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']\nfor col in columns_to_convert:\n    compas_data_final_copy[col] = compas_data_final_copy[col].astype(int)\n\nfeature_list = ['age_cat', 'charge_degree', 'sex', 'priors_counts', 'length_of_stay']\nsensitive_feature = 'race'\nfeature_comparison_table = generate_feature_comparison_table(compas_data_final_copy, feature_list, 'two_year_recid', sensitive_feature)\n\nprint(feature_comparison_table)\n"