In [1]:
import copy
import random

import numpy as np
import onnx
import onnxruntime as rt
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
# Load the serialized model from disk and pass it through ONNX's checker so a
# malformed file is noticed in this cell.
# NOTE(review): the inference session created further down is built from the
# file path, not from this `onnx_model` object.
onnx_model = onnx.load("model_2.onnx")
onnx.checker.check_model(onnx_model)

In [4]:
## GENETIC testing pseudocode
# 1. initialize population
#   each individual is a set of values, one value per feature
# 2. selection
# 3. crossover
# 4. mutation
# 5. repeat 2-4 until the step budget is used up OR a failing ("evil") test case is found
# 

# APPROACH:
# 1. Take a set of non-sensitive features. 
# 2. Run two evolutionary searches over the sensitive features: one to find the most "suspect" individual and one to find the least "suspect" individual for those non-sensitive features.
# 

# GOAL: to find test cases where the model does badly. 

In [5]:
# Names of the dataset columns that this notebook treats as "sensitive".
# The genetic search randomises and evolves exactly these features; every other
# dataset column is held fixed per experiment.
sensitive_features = [
    "adres_dagen_op_adres",
    "adres_recentste_buurt_groot_ijsselmonde",
    "adres_recentste_buurt_nieuwe_westen",
    "adres_recentste_buurt_other",
    "adres_recentste_buurt_oude_noorden",
    "adres_recentste_buurt_vreewijk",
    "adres_recentste_plaats_other",
    "adres_recentste_plaats_rotterdam",
    "adres_recentste_wijk_charlois",
    "adres_recentste_wijk_delfshaven",
    "adres_recentste_wijk_feijenoord",
    "adres_recentste_wijk_ijsselmonde",
    "adres_recentste_wijk_kralingen_c",
    "adres_recentste_wijk_noord",
    "adres_recentste_wijk_other",
    "adres_recentste_wijk_prins_alexa",
    "adres_recentste_wijk_stadscentru",
    "afspraak_aantal_woorden",
    "afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_na_12_mnd_n_a_v__taa04_____geen_maatregel",
    "afspraak_afgelopen_jaar_ontheffing_taaleis",
    "afspraak_laatstejaar_aantal_woorden",
    "afspraak_verzenden_beschikking_i_v_m__niet_voldoen_aan_wet_taaleis",
    "belemmering_dagen_lichamelijke_problematiek",
    "belemmering_dagen_psychische_problemen",
    "belemmering_hist_lichamelijke_problematiek",
    "belemmering_hist_psychische_problemen",
    "belemmering_hist_taal",
    "belemmering_hist_verslavingsproblematiek",
    "belemmering_niet_computervaardig",
    "belemmering_psychische_problemen",
    "beschikbaarheid_aantal_historie_afwijkend_wegens_medische_omstandigheden",
    "beschikbaarheid_huidig_afwijkend_wegens_medische_omstandigheden",
    "beschikbaarheid_recent_afwijkend_wegens_medische_omstandigheden",
    "competentie_vakdeskundigheid_toepassen",
    "contacten_onderwerp_beoordelen_taaleis",
    "contacten_onderwerp_boolean_beoordelen_taaleis",
    "contacten_onderwerp_boolean_taaleis___voldoet",
    "contacten_onderwerp_boolean_ziek__of_afmelding",
    "contacten_onderwerp_boolean_zorg",
    "contacten_onderwerp_ziek__of_afmelding",
    "contacten_onderwerp_zorg",
    "instrument_aantal_laatstejaar",
    "ontheffing_dagen_hist_vanwege_uw_medische_omstandigheden",
    "ontheffing_reden_hist_medische_gronden",
    "persoon_geslacht_vrouw",
    "persoon_leeftijd_bij_onderzoek",
    "persoonlijke_eigenschappen_dagen_sinds_taaleis",
    "persoonlijke_eigenschappen_nl_begrijpen3",
    "persoonlijke_eigenschappen_nl_lezen3",
    "persoonlijke_eigenschappen_nl_lezen4",
    "persoonlijke_eigenschappen_nl_schrijven0",
    "persoonlijke_eigenschappen_nl_schrijven1",
    "persoonlijke_eigenschappen_nl_schrijven2",
    "persoonlijke_eigenschappen_nl_schrijven3",
    "persoonlijke_eigenschappen_nl_schrijvenfalse",
    "persoonlijke_eigenschappen_nl_spreken1",
    "persoonlijke_eigenschappen_nl_spreken2",
    "persoonlijke_eigenschappen_nl_spreken3",
    "persoonlijke_eigenschappen_spreektaal",
    "persoonlijke_eigenschappen_spreektaal_anders",
    "persoonlijke_eigenschappen_taaleis_schrijfv_ok",
    "persoonlijke_eigenschappen_taaleis_voldaan",
    "relatie_kind_basisschool_kind",
    "relatie_kind_heeft_kinderen",
    "relatie_kind_huidige_aantal",
    "relatie_kind_jongvolwassen",
    "relatie_kind_tiener",
    "relatie_kind_volwassen",
    "relatie_overig_actueel_vorm__kostendeler",
    "relatie_overig_actueel_vorm__ouders_verzorgers",
    "relatie_overig_historie_vorm__andere_inwonende",
    "relatie_overig_historie_vorm__kostendeler",
    "relatie_overig_kostendeler",
    "relatie_partner_aantal_partner___partner__gehuwd_",
    "relatie_partner_aantal_partner___partner__ongehuwd_",
    "relatie_partner_huidige_partner___partner__gehuwd_",
    "relatie_partner_totaal_dagen_partner"
]

In [6]:
# Build the value pools used by the genetic search: for every dataset column,
# the list of distinct values observed in the CSV, split into two dictionaries
# (feature name -> list of values) depending on whether the column is listed in
# `sensitive_features`.
#
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here.

# NOTE(review): machine-specific absolute path; point this at your local copy
# of the CSV (ideally relative to a configurable data directory).
DATASET_PATH = "C:/Users/kovac/Downloads/investigation_train_large_checked.csv"
# Columns removed before the value pools are built, so they never become features.
# NOTE(review): these look like label/target columns -- confirm.
DROPPED_COLUMNS = ['Ja', 'Nee', 'checked']

dataset = pd.read_csv(DATASET_PATH).drop(DROPPED_COLUMNS, axis=1)

# Set membership is O(1) per column instead of scanning the list each time.
_sensitive_lookup = set(sensitive_features)

possible_sensitives = {}
possible_nonsensitives = {}

for feature in dataset.columns:
    pool = possible_sensitives if feature in _sensitive_lookup else possible_nonsensitives
    pool[feature] = dataset[feature].unique().tolist()

In [7]:
class Individual:
    """One candidate test case, held as two feature-name -> value mappings."""

    def __init__(self, sensitives, nonsensitives):
        # Both mappings are stored by reference; no copy is taken here.
        self.nonsensitives = nonsensitives
        self.sensitives = sensitives

    def fullfeatures(self):
        """Return a new dict holding the sensitive and non-sensitive features.

        Sensitive entries come first; if a key appears in both mappings the
        non-sensitive value is the one that ends up in the result.
        """
        return {**self.sensitives, **self.nonsensitives}

In [8]:
# Build the starting population; every individual shares one non-sensitive profile.
def initialize_population(population_size, nonsensitives):
    """Create `population_size` individuals with random sensitive features.

    Each sensitive feature listed in `sensitive_features` is drawn uniformly
    from `possible_sensitives[feature]`. The `nonsensitives` mapping is passed
    to every individual as-is (the same object, not a copy).

    Returns:
        list of Individual.
    """
    def draw_sensitives():
        return {
            name: random.choice(possible_sensitives[name])
            for name in sensitive_features
        }

    return [
        Individual(draw_sensitives(), nonsensitives)
        for _ in range(population_size)
    ]

In [9]:
# Tournament selection: build a survivor population the same size as the input.
def selection(population, fitness_function):
    """Pick survivors by repeated two-way tournaments.

    For each slot two contestants are drawn at random (with replacement, so a
    contestant may face itself), both are scored with `fitness_function`, and
    the strictly higher score wins; on a tie the second contestant is kept.

    Returns:
        list with `len(population)` winners (the same objects, not copies).
    """
    def run_tournament():
        first = random.choice(population)
        second = random.choice(population)
        first_score = fitness_function(first)
        second_score = fitness_function(second)
        if first_score > second_score:
            return first
        return second

    return [run_tournament() for _ in range(len(population))]

In [10]:
# Given a population, create a new population with single-point crossover.
def crossover(population, crossover_probability):
    """Produce children by single-point crossover over the sensitive features.

    `len(population) // 2` parent pairs are drawn at random (with replacement).
    Each pair yields two children that start as copies of the parents. With
    probability `crossover_probability` a cut point is drawn from
    `0..len(sensitives)` inclusive, and the children swap their parents' values
    for every sensitive feature from the cut point onwards.

    Children are new objects with their own `sensitives` dict, so neither the
    parents nor any other child is modified. The `nonsensitives` mapping is
    still shared by reference. Previously the children were aliases of the
    parents, so crossover overwrote the parents in place, never actually
    swapped the tails, and returned duplicate references.

    Parameters:
    - population: list of individuals exposing a `sensitives` dict; both
      parents of a pair are expected to have the same feature keys.
    - crossover_probability: float in [0, 1], chance that a pair is crossed.

    Returns:
    - list of children. For an odd-sized population this has one element fewer
      than the input.
    """
    def clone(individual):
        # Shallow copy of the object, then a private copy of the dict we mutate.
        duplicate = copy.copy(individual)
        duplicate.sensitives = dict(individual.sensitives)
        return duplicate

    children = []
    for _ in range(len(population) // 2):
        parent1 = random.choice(population)
        parent2 = random.choice(population)
        child1 = clone(parent1)
        child2 = clone(parent2)
        if random.random() < crossover_probability:
            feature_names = list(parent1.sensitives)
            # 0 swaps everything, len(feature_names) swaps nothing.
            crossover_point = random.randint(0, len(feature_names))
            for name in feature_names[crossover_point:]:
                child1.sensitives[name] = parent2.sensitives[name]
                child2.sensitives[name] = parent1.sensitives[name]
        children.append(child1)
        children.append(child2)
    return children

In [11]:
# Mutation step: each sensitive feature of each individual is re-drawn with
# probability 1/(number of sensitive features), i.e. one re-draw per individual
# on average. Open question from the author: is this too aggressive?
def mutation(population):
    """Randomly re-draw sensitive feature values, in place.

    A mutated feature gets a fresh uniform draw from
    `possible_sensitives[feature]`, so it may land on the value it already had.

    Returns:
        the same `population` list that was passed in.
    """
    for candidate in population:
        genes = candidate.sensitives
        for name in genes:
            should_mutate = random.random() < (1 / len(genes))
            if should_mutate:
                genes[name] = random.choice(possible_sensitives[name])
    return population

### EXPERIMENTS

In [12]:
# Draw the fixed non-sensitive profile for the experiments below: one uniformly
# random observed value per non-sensitive feature.
nonsensitives_experiment = {
    name: random.choice(values)
    for name, values in possible_nonsensitives.items()
}
print(nonsensitives_experiment)

{'adres_aantal_brp_adres': 1, 'adres_aantal_verschillende_wijken': 7, 'adres_aantal_verzendadres': 0, 'adres_aantal_woonadres_handmatig': 1, 'adres_recentst_onderdeel_rdam': 1, 'adres_unieke_wijk_ratio': 0, 'afspraak_aanmelding_afgesloten': 7, 'afspraak_afgelopen_jaar_afsprakenplan': 0, 'afspraak_afgelopen_jaar_ontheffing': 2, 'afspraak_afgelopen_jaar_plan_van_aanpak': 2, 'afspraak_afgelopen_jaar_signaal_voor_medewerker': 11, 'afspraak_afgelopen_jaar_vervolgmeting_matchbaarheid_werkzoekende_klant': 1, 'afspraak_afgelopen_jaar_voortgang_aanmelding_en_deelname': 5, 'afspraak_afsprakenplan': 3, 'afspraak_controle_aankondiging_maatregel': 3, 'afspraak_controle_verwijzing': 0, 'afspraak_deelname_compleet_uit_webapplicatie': 3, 'afspraak_galo_gesprek': 3, 'afspraak_gespr__einde_zoekt___galo_gesprek_': 2, 'afspraak_inspanningsperiode': 2, 'afspraak_laatstejaar_resultaat_ingevuld': 5, 'afspraak_laatstejaar_resultaat_ingevuld_uniek': 0, 'afspraak_other': 5, 'afspraak_participatietrede_vervolgme

In [23]:
# Create the ONNX Runtime inference session that the fitness functions below query.
# NOTE(review): the same "model_2.onnx" literal is also passed to onnx.load in
# an earlier cell; keep the two in sync.
model_path = "model_2.onnx"
sess = rt.InferenceSession(model_path)

In [14]:
def build_input_data(sess, data):
    """
    Assemble the feed dictionary for an ONNX Runtime session.

    For every input the session declares, the entry `data[<input name>]` is
    converted to a NumPy array, cast according to the declared tensor type and
    reshaped to two dimensions.

    Parameters:
    - sess: object whose `get_inputs()` yields items with `name`, `shape` and
      `type` attributes (e.g. an onnxruntime.InferenceSession).
    - data: pandas.DataFrame or dict-like, indexable by input name.

    Returns:
    - dict mapping each input name to its prepared array.

    Raises:
    - ValueError: the declared type contains none of "float", "string", "int64".
    - KeyError (or whatever `data[name]` raises): an input is missing from `data`.
    """
    # Ordered on purpose: the first substring found in the type string wins.
    dtype_by_marker = (
        ("float", np.float32),
        ("string", str),
        ("int64", np.int64),
    )

    feeds = {}
    for spec in sess.get_inputs():
        name = spec.name
        dims = spec.shape  # declared shape, e.g. [None, 1]
        data_type = spec.type  # declared type, e.g. 'tensor(float)'

        values = data[name]

        target_dtype = next(
            (dtype for marker, dtype in dtype_by_marker if marker in data_type.lower()),
            None,
        )
        if target_dtype is None:
            raise ValueError(f"Unsupported data type for input '{name}': {data_type}")

        # Use the declared second dimension as row width; 1-D declarations get width 1.
        width = dims[1] if len(dims) > 1 else 1
        feeds[name] = np.array(values).astype(target_dtype).reshape(-1, width)

    return feeds

In [25]:
def highly_suspect(individual:Individual):
    """Fitness for the maximising search: score one individual with the model.

    Feeds the individual's combined features to the global session `sess`,
    requests only the session's second declared output, and returns element
    [0][0] of that output.

    NOTE(review): if the second output holds per-class probabilities, [0][0]
    would presumably be the first class for the single input row -- confirm
    that this is the class meant by "suspect".
    """
    feeds = build_input_data(sess, individual.fullfeatures())
    second_output = sess.get_outputs()[1].name
    scores = sess.run([second_output], feeds)[0]
    return scores[0][0]

# Maximising experiment: start from 1000 random individuals that all share
# `nonsensitives_experiment`, then evolve them with `highly_suspect` as fitness.
population = initialize_population(1000, nonsensitives_experiment)
# NOTE(review): range(1) runs exactly one selection/crossover/mutation pass;
# raise it to evolve for more generations.
for steps in range(1):
    # tournament selection -> crossover with probability 0.9 -> mutation
    survivors = selection(population, highly_suspect)
    children = crossover(survivors, 0.9)
    population = mutation(children)


# Linear scan of the final population for the highest score.
most_suspicious = None
# Sentinel below any score the scan is expected to see.
# NOTE(review): assumes scores are > -1 (e.g. probabilities) -- confirm.
highest_suspicion = -1
for individual in population:
    suspicion_score = highly_suspect(individual)
    # Strict comparison: on equal scores the earliest individual is kept.
    if suspicion_score > highest_suspicion:
        most_suspicious = individual
        highest_suspicion = suspicion_score
print("nonsensitive_features: ", nonsensitives_experiment)
print("highest suspicion: ",highest_suspicion)
print("most suspect individual sensitive values: ",most_suspicious.sensitives)

nonsensitive_features:  {'adres_aantal_brp_adres': 1, 'adres_aantal_verschillende_wijken': 7, 'adres_aantal_verzendadres': 0, 'adres_aantal_woonadres_handmatig': 1, 'adres_recentst_onderdeel_rdam': 1, 'adres_unieke_wijk_ratio': 0, 'afspraak_aanmelding_afgesloten': 7, 'afspraak_afgelopen_jaar_afsprakenplan': 0, 'afspraak_afgelopen_jaar_ontheffing': 2, 'afspraak_afgelopen_jaar_plan_van_aanpak': 2, 'afspraak_afgelopen_jaar_signaal_voor_medewerker': 11, 'afspraak_afgelopen_jaar_vervolgmeting_matchbaarheid_werkzoekende_klant': 1, 'afspraak_afgelopen_jaar_voortgang_aanmelding_en_deelname': 5, 'afspraak_afsprakenplan': 3, 'afspraak_controle_aankondiging_maatregel': 3, 'afspraak_controle_verwijzing': 0, 'afspraak_deelname_compleet_uit_webapplicatie': 3, 'afspraak_galo_gesprek': 3, 'afspraak_gespr__einde_zoekt___galo_gesprek_': 2, 'afspraak_inspanningsperiode': 2, 'afspraak_laatstejaar_resultaat_ingevuld': 5, 'afspraak_laatstejaar_resultaat_ingevuld_uniek': 0, 'afspraak_other': 5, 'afspraak_par

In [24]:
def lowly_suspect(individual:Individual):
    """Fitness for the minimising search: the negated `highly_suspect` score.

    Feeds the individual's combined features to the global session `sess`,
    requests only the session's second declared output, and returns the
    negation of element [0][0], so that tournament selection (which keeps the
    higher fitness) favours individuals with a lower score.

    The earlier version looked up this output name but never used it: it ran
    the session with `None` (all outputs) and negated element 0 of the *first*
    output. The search was therefore optimising a different quantity from the
    one the final scan reports via `highly_suspect`.

    NOTE(review): as in `highly_suspect`, confirm that [0][0] of the second
    output is the intended "suspect" score.
    """
    output_name = sess.get_outputs()[1].name
    value = build_input_data(sess, individual.fullfeatures())
    result = sess.run([output_name], value)[0]
    return -result[0][0]

# Minimising experiment: same setup as the maximising run, but evolved with
# `lowly_suspect` as fitness so that low scores win tournaments.
population = initialize_population(1000, nonsensitives_experiment)
# NOTE(review): range(1) runs exactly one selection/crossover/mutation pass.
for steps in range(1):
    survivors = selection(population, lowly_suspect)
    children = crossover(survivors, 0.9)
    population = mutation(children)

# Linear scan of the final population for the lowest (un-negated) score.
# The result is kept in `least_suspicious` so it no longer overwrites
# `most_suspicious` from the maximising experiment, and the printed label
# now says "least suspect" to match what is actually reported.
least_suspicious = None
# Sentinel above any score the scan is expected to see.
# NOTE(review): assumes scores are < 2 (e.g. probabilities) -- confirm.
lowest_suspicion = 2
for individual in population:
    suspicion_score = highly_suspect(individual)
    # Strict comparison: on equal scores the earliest individual is kept.
    if suspicion_score < lowest_suspicion:
        least_suspicious = individual
        lowest_suspicion = suspicion_score
print("nonsensitive_features: ", nonsensitives_experiment)
print("lowest suspicion: ",lowest_suspicion)
print("least suspect individual sensitive values: ",least_suspicious.sensitives)

nonsensitive_features:  {'adres_aantal_brp_adres': 1, 'adres_aantal_verschillende_wijken': 7, 'adres_aantal_verzendadres': 0, 'adres_aantal_woonadres_handmatig': 1, 'adres_recentst_onderdeel_rdam': 1, 'adres_unieke_wijk_ratio': 0, 'afspraak_aanmelding_afgesloten': 7, 'afspraak_afgelopen_jaar_afsprakenplan': 0, 'afspraak_afgelopen_jaar_ontheffing': 2, 'afspraak_afgelopen_jaar_plan_van_aanpak': 2, 'afspraak_afgelopen_jaar_signaal_voor_medewerker': 11, 'afspraak_afgelopen_jaar_vervolgmeting_matchbaarheid_werkzoekende_klant': 1, 'afspraak_afgelopen_jaar_voortgang_aanmelding_en_deelname': 5, 'afspraak_afsprakenplan': 3, 'afspraak_controle_aankondiging_maatregel': 3, 'afspraak_controle_verwijzing': 0, 'afspraak_deelname_compleet_uit_webapplicatie': 3, 'afspraak_galo_gesprek': 3, 'afspraak_gespr__einde_zoekt___galo_gesprek_': 2, 'afspraak_inspanningsperiode': 2, 'afspraak_laatstejaar_resultaat_ingevuld': 5, 'afspraak_laatstejaar_resultaat_ingevuld_uniek': 0, 'afspraak_other': 5, 'afspraak_par