In [78]:
import pandas as pd
import numpy as np
import onnxruntime as rt
import joblib
from sklearn.metrics import classification_report, accuracy_score
import random
from collections import defaultdict

In [2]:
model_1 = rt.InferenceSession('For Group 28/model_1.onnx')

In [3]:
def build_input_data(sess, data):
    """
    Builds the input_data dictionary for ONNX inference.

    Parameters:
    - sess: onnxruntime.InferenceSession, the ONNX model session.
    - data: pandas.DataFrame or dict-like, the input data.

    Returns:
    - input_data: dict, formatted input data for the ONNX model.
    """
    input_data = {}

    for input_tensor in sess.get_inputs():
        name = input_tensor.name
        shape = input_tensor.shape  # Shape of the expected input (e.g., [None, 1])
        data_type = input_tensor.type  # Expected data type (e.g., 'tensor(float)', 'tensor(string)')

        # Extract the column from the input data
        column_data = data[name]

        # Ensure it's in the correct format
        if "float" in data_type.lower():
            input_data[name] = np.array(column_data).astype(np.float32).reshape(-1, shape[1] if len(shape) > 1 else 1)
        elif "string" in data_type.lower():
            input_data[name] = np.array(column_data).astype(str).reshape(-1, shape[1] if len(shape) > 1 else 1)
        elif "int64" in data_type.lower():
            input_data[name] = np.array(column_data).astype(np.int64).reshape(-1, shape[1] if len(shape) > 1 else 1)
        else:
            raise ValueError(f"Unsupported data type for input '{name}': {data_type}")

    return input_data

In [4]:
data = pd.read_csv('investigation_train_large_checked.csv')
y_data = data['checked']
data = data.drop(['Ja', 'Nee', 'checked'], axis=1)

In [5]:
input_data = build_input_data(model_1, data)

In [6]:
result = model_1.run(None, input_data)[0]

In [7]:
accuracy_onnx_model = accuracy_score(y_data, result)
print('Accuracy of the ONNX model: ', accuracy_onnx_model)

Accuracy of the ONNX model:  0.9112692307692307


In [8]:
model_2 = rt.InferenceSession('For Group 28/model_2.onnx')

In [9]:
input_data2 = build_input_data(model_2, data)

In [30]:
result2 = model_2.run(None, input_data2)[0]

In [31]:
accuracy_onnx_model2 = accuracy_score(y_data, result2)
print('Accuracy of the ONNX model[2]: ', accuracy_onnx_model2)

Accuracy of the ONNX model[2]:  0.8952769230769231


In [10]:
sensitive_features = [
    "adres_dagen_op_adres",
    "adres_recentste_buurt_groot_ijsselmonde",
    "adres_recentste_buurt_nieuwe_westen",
    "adres_recentste_buurt_other",
    "adres_recentste_buurt_oude_noorden",
    "adres_recentste_buurt_vreewijk",
    "adres_recentste_plaats_other",
    "adres_recentste_plaats_rotterdam",
    "adres_recentste_wijk_charlois",
    "adres_recentste_wijk_delfshaven",
    "adres_recentste_wijk_feijenoord",
    "adres_recentste_wijk_ijsselmonde",
    "adres_recentste_wijk_kralingen_c",
    "adres_recentste_wijk_noord",
    "adres_recentste_wijk_other",
    "adres_recentste_wijk_prins_alexa",
    "adres_recentste_wijk_stadscentru",
    "afspraak_aantal_woorden",
    "afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_na_12_mnd_n_a_v__taa04_____geen_maatregel",
    "afspraak_afgelopen_jaar_ontheffing_taaleis",
    "afspraak_laatstejaar_aantal_woorden",
    "afspraak_verzenden_beschikking_i_v_m__niet_voldoen_aan_wet_taaleis",
    "belemmering_dagen_lichamelijke_problematiek",
    "belemmering_dagen_psychische_problemen",
    "belemmering_hist_lichamelijke_problematiek",
    "belemmering_hist_psychische_problemen",
    "belemmering_hist_taal",
    "belemmering_hist_verslavingsproblematiek",
    "belemmering_niet_computervaardig",
    "belemmering_psychische_problemen",
    "beschikbaarheid_aantal_historie_afwijkend_wegens_medische_omstandigheden",
    "beschikbaarheid_huidig_afwijkend_wegens_medische_omstandigheden",
    "beschikbaarheid_recent_afwijkend_wegens_medische_omstandigheden",
    "competentie_vakdeskundigheid_toepassen",
    "contacten_onderwerp_beoordelen_taaleis",
    "contacten_onderwerp_boolean_beoordelen_taaleis",
    "contacten_onderwerp_boolean_taaleis___voldoet",
    "contacten_onderwerp_boolean_ziek__of_afmelding",
    "contacten_onderwerp_boolean_zorg",
    "contacten_onderwerp_ziek__of_afmelding",
    "contacten_onderwerp_zorg",
    "instrument_aantal_laatstejaar",
    "ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden",
    "ontheffing_reden_hist_medische_gronden",
    "persoon_geslacht_vrouw",
    "persoon_leeftijd_bij_onderzoek",
    "persoonlijke_eigenschappen_dagen_sinds_taaleis",
    "persoonlijke_eigenschappen_nl_begrijpen3",
    "persoonlijke_eigenschappen_nl_lezen3",
    "persoonlijke_eigenschappen_nl_lezen4",
    "persoonlijke_eigenschappen_nl_schrijven0",
    "persoonlijke_eigenschappen_nl_schrijven1",
    "persoonlijke_eigenschappen_nl_schrijven2",
    "persoonlijke_eigenschappen_nl_schrijven3",
    "persoonlijke_eigenschappen_nl_schrijvenfalse",
    "persoonlijke_eigenschappen_nl_spreken1",
    "persoonlijke_eigenschappen_nl_spreken2",
    "persoonlijke_eigenschappen_nl_spreken3",
    "persoonlijke_eigenschappen_spreektaal",
    "persoonlijke_eigenschappen_spreektaal_anders",
    "persoonlijke_eigenschappen_taaleis_schrijfv_ok",
    "persoonlijke_eigenschappen_taaleis_voldaan",
    "relatie_kind_basisschool_kind",
    "relatie_kind_heeft_kinderen",
    "relatie_kind_huidige_aantal",
    "relatie_kind_jongvolwassen",
    "relatie_kind_tiener",
    "relatie_kind_volwassen",
    "relatie_overig_actueel_vorm__kostendeler",
    "relatie_overig_actueel_vorm__ouders_verzorgers",
    "relatie_overig_historie_vorm__andere_inwonende",
    "relatie_overig_historie_vorm__kostendeler",
    "relatie_overig_kostendeler",
    "relatie_partner_aantal_partner___partner__gehuwd_",
    "relatie_partner_aantal_partner___partner__ongehuwd_",
    "relatie_partner_huidige_partner___partner__gehuwd_",
    "relatie_partner_totaal_dagen_partner"
]

In [11]:
possible_sensitives = {}
possible_nonsensitives = {}

for feature in data.columns:
    unique_values = data[feature].unique().tolist()
    if feature in sensitive_features:
        possible_sensitives[feature] = unique_values
    else:
        possible_nonsensitives[feature] = unique_values

In [76]:
def get_adj_values(feature, val):
    possible_vals = possible_sensitives.get(feature, [])
    initial_val_index = possible_vals.index(val)
    
    prev_value = possible_vals[initial_val_index - 1] if initial_val_index > 0 else possible_vals[initial_val_index]
    next_value = possible_vals[initial_val_index + 1] if initial_val_index < len(possible_vals)-1 else possible_vals[initial_val_index]
    
    return prev_value, next_value

In [86]:
print("Initial probability value: ")
seed = data.iloc[3]
output_name = model_1.get_outputs()[1].name
input_data = build_input_data(model_1, seed)
ini_result = model_1.run([output_name], input_data)[0]
print(ini_result[0][1])
most_suspicious_data = seed
most_suspicious_val = ini_result[0][1]

feature_impact = defaultdict(float)

for i in range (3000):
    print("Iteration {0}: ".format(i+1))
    
    new_seed_plus = seed.copy()
    new_seed_minus = seed.copy()
    
    col_idx = random.randint(0, len(sensitive_features) - 1)
    selected_sensitive_feature = sensitive_features[col_idx]
    
    prev_value, next_value = get_adj_values(selected_sensitive_feature, seed[selected_sensitive_feature])
    new_seed_minus[selected_sensitive_feature] = prev_value
    new_seed_plus[selected_sensitive_feature] = next_value
    
    input_data_minus = build_input_data(model_1, new_seed_minus)
    minus_result = model_1.run([output_name], input_data_minus)[0][0][1]
    
    input_data_plus = build_input_data(model_1, new_seed_plus)
    plus_result = model_1.run([output_name], input_data_plus)[0][0][1]
    
    vals = [plus_result, minus_result, most_suspicious_val]
    max_idx = np.argmax(vals)
    
    if max_idx == 0:
        most_suspicious_data = input_data_plus
        most_suspicious_val = vals[max_idx]
        feature_impact[selected_sensitive_feature] += abs(plus_result - ini_result[0][1])
        print("Updated probability: {0}".format(most_suspicious_val))
        
    elif max_idx == 1:
        most_suspicious_data = input_data_minus
        most_suspicious_val = vals[max_idx]
        feature_impact[selected_sensitive_feature] += abs(minus_result - ini_result[0][1])
        print("Updated probability: {0}".format(most_suspicious_val))
        
print("------------------------------------------------------")
print("Final Suspicious Val: {0}".format(most_suspicious_val))
print("Features Analysis: ")

print(feature_impact)
    
    

Initial probability value: 
0.46095067262649536
Iteration 1: 
Iteration 2: 
Updated probability: 0.524198591709137
Iteration 3: 
Iteration 4: 
Iteration 5: 
Iteration 6: 
Iteration 7: 
Iteration 8: 
Updated probability: 0.8141757249832153
Iteration 9: 
Iteration 10: 
Iteration 11: 
Iteration 12: 
Iteration 13: 
Iteration 14: 
Iteration 15: 
Iteration 16: 
Iteration 17: 
Iteration 18: 
Iteration 19: 
Iteration 20: 
Iteration 21: 
Iteration 22: 
Iteration 23: 
Iteration 24: 
Iteration 25: 
Iteration 26: 
Iteration 27: 
Iteration 28: 
Iteration 29: 
Iteration 30: 
Iteration 31: 
Iteration 32: 
Iteration 33: 
Iteration 34: 
Iteration 35: 
Iteration 36: 
Iteration 37: 
Iteration 38: 
Iteration 39: 
Iteration 40: 
Iteration 41: 
Iteration 42: 
Iteration 43: 
Iteration 44: 
Iteration 45: 
Iteration 46: 
Iteration 47: 
Iteration 48: 
Iteration 49: 
Iteration 50: 
Iteration 51: 
Iteration 52: 
Iteration 53: 
Iteration 54: 
Iteration 55: 
Iteration 56: 
Iteration 57: 
Iteration 58: 
Iteration 59

In [85]:
print("Initial probability value: ")
seed = data.iloc[3]
output_name = model_2.get_outputs()[1].name
input_data = build_input_data(model_2, seed)
ini_result = model_2.run([output_name], input_data)[0]
print(ini_result[0][1])
most_suspicious_data = seed
most_suspicious_val = ini_result[0][1]

feature_impact = defaultdict(float)

for i in range (3000):
    print("Iteration {0}: ".format(i+1))
    
    new_seed_plus = seed.copy()
    new_seed_minus = seed.copy()
    
    col_idx = random.randint(0, len(sensitive_features) - 1)
    selected_sensitive_feature = sensitive_features[col_idx]
    
    prev_value, next_value = get_adj_values(selected_sensitive_feature, seed[selected_sensitive_feature])
    new_seed_minus[selected_sensitive_feature] = prev_value
    new_seed_plus[selected_sensitive_feature] = next_value
    
    input_data_minus = build_input_data(model_2, new_seed_minus)
    minus_result = model_2.run([output_name], input_data_minus)[0][0][1]
    
    input_data_plus = build_input_data(model_2, new_seed_plus)
    plus_result = model_2.run([output_name], input_data_plus)[0][0][1]
    
    vals = [plus_result, minus_result, most_suspicious_val]
    max_idx = np.argmax(vals)
    
    if max_idx == 0:
        most_suspicious_data = input_data_plus
        most_suspicious_val = vals[max_idx]
        feature_impact[selected_sensitive_feature] += abs(plus_result - ini_result[0][1])
        print("Updated probability: {0}".format(most_suspicious_val))
        
    elif max_idx == 1:
        most_suspicious_data = input_data_minus
        most_suspicious_val = vals[max_idx]
        feature_impact[selected_sensitive_feature] += abs(minus_result - ini_result[0][1])
        print("Updated probability: {0}".format(most_suspicious_val))
        
print("------------------------------------------------------")
print("Final Suspicious Val: {0}".format(most_suspicious_val))
print("Features Analysis: ")

print(feature_impact)
    
    

Initial probability value: 
0.619999885559082
Iteration 1: 
Updated probability: 0.619999885559082
Iteration 2: 
Updated probability: 0.619999885559082
Iteration 3: 
Updated probability: 0.619999885559082
Iteration 4: 
Updated probability: 0.619999885559082
Iteration 5: 
Updated probability: 0.619999885559082
Iteration 6: 
Updated probability: 0.619999885559082
Iteration 7: 
Updated probability: 0.619999885559082
Iteration 8: 
Updated probability: 0.619999885559082
Iteration 9: 
Updated probability: 0.619999885559082
Iteration 10: 
Updated probability: 0.619999885559082
Iteration 11: 
Updated probability: 0.619999885559082
Iteration 12: 
Updated probability: 0.619999885559082
Iteration 13: 
Updated probability: 0.619999885559082
Iteration 14: 
Updated probability: 0.619999885559082
Iteration 15: 
Updated probability: 0.619999885559082
Iteration 16: 
Updated probability: 0.619999885559082
Iteration 17: 
Updated probability: 0.619999885559082
Iteration 18: 
Updated probability: 0.6199998