# Basic Example

First, let's load the libraries and example data:

In [1]:
import pandas as pd
import BlackBoxAuditing as BBA

import pylab
%matplotlib inline

from BlackBoxAuditing.model_factories import SVM, DecisionTree, TensorFlow
# Load each example dataset by its registered name. The returned objects are
# later passed directly to an Auditor instance.
# NOTE(review): in the visible part of this notebook only german_data,
# adult_data, synthetic_data and compass_data are used afterwards — confirm
# whether the remaining loads are still needed.
german_data = BBA.load_data("german")
ricci_data = BBA.load_data("ricci")
adult_data = BBA.load_data("adult")
synthetic_data = BBA.load_data("sample")
dark_data = BBA.load_data("DRP")
compass_data = BBA.load_data("Compass")
hof_data = BBA.load_data("HOF")
housing_data = BBA.load_data("Housing")
disease_data = BBA.load_data("Disease")
mutations_data = BBA.load_data("Mutations")
student_data = BBA.load_data("Student")
student_data_v2 = BBA.load_data("Student-V2")
loan_data = BBA.load_data("Loan")
mhs_data = BBA.load_data("MHS")

In [None]:
# Dump the "sample" dataset object to inspect what load_data returned.
# NOTE(review): this prints the whole object; a small slice may be easier to read.
print(synthetic_data)

In [None]:
import csv

# Destination file for the flattened German credit dataset.
filename = 'german.csv'

# NOTE(review): assumes the object returned by BBA.load_data follows the
# library's (headers, train, test, response_header, features_to_ignore,
# correct_types) layout — confirm against the installed BlackBoxAuditing version.
headers, train_rows, test_rows, *_ = german_data

# newline='' lets the csv module control line endings (prevents blank lines
# between rows on some platforms).
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    # Use the dataset's own column names instead of a placeholder header.
    writer.writerow(headers)

    # One CSV line per example: training rows first, then test rows.
    writer.writerows(train_rows)
    writer.writerows(test_rows)


Next, we create an "Auditor" object, which will run the model with obscured features in order to check for potential influence. The `Auditor` object needs to know how to build a model, so it has a `ModelFactory` attribute. This is set to a model-factory class, and our library provides you with a few predefined choices. (TBD: do we want to add a section about how to create a new `ModelFactory` subclass?)

It takes a bit of time for this to run (a few seconds per attribute on our laptop):

In [None]:
import pandas as pd
import BlackBoxAuditing as BBA

import pylab
%matplotlib inline

from BlackBoxAuditing.model_factories import SVM, NeuralNetwork
# Show what kind of object each imported model factory is, in import order.
for factory in (SVM, NeuralNetwork):
    print(type(factory))

In [None]:
# Audit the adult dataset with the TensorFlow model factory.
auditor = BBA.Auditor()
# The factory class (not an instance) is assigned to the auditor.
auditor.ModelFactory = TensorFlow
# Calling the auditor runs the audit; results are presumably written under output_dir.
auditor(adult_data, output_dir="adult-audit-output-NN")

In [None]:
# Peek at the audit results held on the auditor from its most recent run.
# NOTE(review): _audits_data is a private attribute (leading underscore) and
# may change between library versions.
audits_data = auditor._audits_data
# Show only the entry stored under the "rep_test" key.
print(audits_data["rep_test"])

In [None]:
from BlackBoxAuditing import find_contexts

# Run the context search for the 'Race' feature on the current auditor.
# NOTE(review): the output directory is named after "ricci" although the
# preceding audit cell ran on the adult dataset — confirm which audit this
# is meant to follow.
auditor.find_contexts(
    'Race',
    output_dir="ricci_context_output",
    beam_width=10,
    min_covered_examples=1,
    max_rule_length=5,
    by_original=True,
    epsilon=0.05,
)

Our auditing technique always works relative to some existing model and some measure of accuracy. The list of ranked features can differ depending on the measure used, and that's sometimes important. Often, however, the rankings tend to correlate fairly strongly:

In [None]:
# Read the BCR and accuracy data files from an SVM audit of the german dataset.
# NOTE(review): no cell in the visible notebook writes to
# "german-audit-output-SVM" — these files presumably come from an earlier
# run; confirm, or add the audit cell that produces them.
bcr_data = pd.read_csv("german-audit-output-SVM/BCR.png.data")
acc_data = pd.read_csv("german-audit-output-SVM/accuracy.png.data")

def compute_influence(dataset):
    """Return the first row minus the last row, skipping the leading column.

    Args:
        dataset: a pandas DataFrame with at least one row.

    Returns:
        A Series indexed by the column names after the first one, holding
        ``first_row - last_row`` for each of those columns.
    """
    first_row = dataset.iloc[0]
    last_row = dataset.iloc[-1]
    # Position 0 of each row is dropped before subtracting.
    return first_row[1:] - last_row[1:]

# Influence of each column under each measure: first-row value minus last-row value.
bcr_influence = compute_influence(bcr_data)
acc_influence = compute_influence(acc_data)

In [None]:
# Scatter accuracy-based influence (x) against BCR-based influence (y),
# one black dot per column of the audit tables.
pylab.plot(acc_influence, bcr_influence, 'ko')
# Label the axes so the figure can be read on its own.
pylab.xlabel("Influence measured by accuracy")
pylab.ylabel("Influence measured by BCR")
# Render explicitly so the cell output is the figure, not an object repr.
pylab.show()

## Loading your own data

In order to use your own data with our auditing, you'll probably need to make a few conversions. Our code requires only a minimal encoding of the metadata. Specifically, you will need to tell our code the types of your columns and which column is the value to be predicted.

Let's create some synthetic data:

In [None]:
import numpy as np
import random
import pandas as pd
from BlackBoxAuditing.data import load_from_file

# Seed both random sources so the generated dataset is identical on every run.
np.random.seed(42)
random.seed(42)

# Number of synthetic applicants to generate.
n_applicants = 20

# IQ values: standard-normal draws scaled by 20 and shifted to 100.
iq = np.random.randn(n_applicants) * 20 + 100
gender = [random.choice(["man", "woman"]) for _ in range(n_applicants)]
# NOTE(review): both branches of the conditional add 0, so gender currently
# has no effect on the SAT score — confirm whether a non-zero offset was intended.
sat = [score * 10 + (0 if g == "man" else 0) for (score, g) in zip(iq, gender)]
# The response is stored as the strings "True"/"False", not booleans.
admit = ["True" if s > 1100 else "False" for s in sat]

df = pd.DataFrame(
    {"admit": admit,
     "gender": gender,
     "iq": iq,
     "sat": sat})
df.to_csv("/tmp/test.csv",
          index=False,
          columns=['gender', 'admit', 'iq', 'sat']) # Make sure this order matches the order you're loading below
synthetic_data = load_from_file("/tmp/test.csv", correct_types = [str, str, float, float], response_header = 'admit')

In [None]:
# Display the loaded synthetic dataset as the cell's output.
synthetic_data

And now we can audit this dataset with one of the existing classifiers:

In [None]:
# Audit the synthetic dataset built above with the DecisionTree model factory.
auditor = BBA.Auditor()
auditor.ModelFactory = DecisionTree
# The surrounding text introduces this cell as auditing "this dataset", i.e.
# the synthetic one just created, so that is what gets passed here.
auditor(synthetic_data, output_dir="testing")

# Auditing your own model

But what if you want to audit your own model? Here we show a very simple example of a (hard-coded) classifier.

In [None]:
from BlackBoxAuditing.model_factories.AbstractModelFactory import AbstractModelFactory
from BlackBoxAuditing.model_factories.AbstractModelVisitor import AbstractModelVisitor

class SATPredictor(AbstractModelVisitor):
    """Hard-coded classifier that labels a row from a fixed score threshold."""

    def __init__(self):
        # Nothing to set up; the base-class initialiser is not called.
        pass

    def test(self, test_set, test_name=""):
        """Return one ``(row[1], label)`` pair per row of ``test_set``.

        ``label`` is the string "True" when ``row[3]`` exceeds 1100 and
        "False" otherwise. ``test_name`` is accepted but unused.
        NOTE(review): presumably row[1] is the actual 'admit' value and
        row[3] the 'sat' score, matching the CSV column order used for the
        first synthetic dataset — confirm.
        """
        results = []
        for row in test_set:
            label = "True" if row[3] > 1100 else "False"
            results.append((row[1], label))
        return results
class SATPredictorBuilder(AbstractModelFactory):
    """Model factory that always hands back the hard-coded SATPredictor."""

    def __init__(self, *args, **kwargs):
        # Forward everything to the base factory, then set the display name.
        super().__init__(*args, **kwargs)
        self.verbose_factory_name = "SATPredictor"

    def build(self, train_set):
        """Return a new SATPredictor; ``train_set`` is ignored (no training)."""
        return SATPredictor()
    
# Audit the synthetic dataset with the custom hard-coded model factory.
auditor = BBA.Auditor()
auditor.ModelFactory = SATPredictorBuilder
auditor(synthetic_data, output_dir="synthetic-audit-output")

In [None]:
import numpy as np
import random
import pandas as pd
from BlackBoxAuditing.data import load_from_file


np.random.seed(42)  # fixed seed so the "Random" column is repeatable

# Number of rows to generate.
n_samples = 6000

# A is the 1-based row number; B and C are deterministic functions of it,
# stored as plain Python lists.
A = np.arange(1, n_samples + 1)
B = (2 * A).tolist()
C = (-A).tolist()

# Standard-normal noise shifted by a tiny constant, also stored as a list.
Random = (np.random.randn(n_samples) + 0.00001).tolist()

# Labels: first half of the rows False, second half True.
admit = np.repeat([False, True], n_samples // 2)

df = pd.DataFrame({
    "admit": admit,
    "A": A,
    "B": B,
    "C": C,
    "Random": Random,
})

# The column order written here must line up with correct_types below.
df.to_csv("/tmp/test.csv", index=False, columns=['A', 'admit', 'B', 'C', 'Random'])

# Reload the CSV through BlackBoxAuditing, declaring 'admit' as the response.
synthetic_data = load_from_file(
    "/tmp/test.csv",
    correct_types=[float, str, float, float, float],
    response_header='admit',
)



In [None]:
# Dump the freshly loaded synthetic dataset.
# NOTE(review): the source CSV has 6000 rows, so this output is very long;
# a small slice may be easier to read.
print(synthetic_data)

In [None]:
# Audit the Compass dataset with the DecisionTree model factory.
auditor = BBA.Auditor()
auditor.ModelFactory = DecisionTree
auditor(compass_data, output_dir="Test3")

In [None]:
# Audit the synthetic dataset with DecisionTree and the retrain flag enabled.
auditor = BBA.Auditor()
auditor.ModelFactory = DecisionTree
# NOTE(review): going by the name, this presumably retrains the model for
# every repair step instead of reusing the initial model — confirm.
auditor.RETRAIN_MODEL_PER_REPAIR = True
auditor(synthetic_data, output_dir="synthetic-audit-output-DT-Retrain")

In [None]:
import numpy as np
import random
import pandas as pd
from BlackBoxAuditing.data import load_from_file


np.random.seed(42)  # fixed seed so both random columns are repeatable

# Dataset dimensions.
n_samples, n_features = 6000, 5

# A is the 0-based row number; B and C are deterministic functions of it.
A = np.arange(n_samples)
B, C = 2 * A, -A

# Two noisy columns: pure standard-normal noise, and a value close to 1
# (1 plus normal noise with standard deviation 0.01).
Random = np.random.randn(n_samples)
Constant = np.ones(n_samples) + np.random.normal(0, 0.01, n_samples)

# Stack the columns into one matrix and zero out any NaN entries in it.
# Note that the DataFrame below is assembled from the individual arrays,
# not from this matrix.
features = np.column_stack((A, B, C, Random, Constant))

# Labels: first half of the rows False, second half True.
labels = np.repeat([False, True], n_samples // 2)

features[np.isnan(features)] = 0.0

# Column name -> data; the response is stored as the strings 'False'/'True'.
column_names = dict(
    A=A,
    B=B,
    C=C,
    Random=Random,
    Constant=Constant,
    admit=labels.astype(str),
)

df = pd.DataFrame(column_names)
df.to_csv("/tmp/test.csv", index=False)

# Reload the CSV through BlackBoxAuditing; the types follow the column order above.
synthetic_data = load_from_file(
    "/tmp/test.csv",
    correct_types=[int, int, int, float, float, str],
    response_header='admit',
)


In [None]:
# Audit the regenerated synthetic dataset with DecisionTree and the retrain flag enabled.
auditor = BBA.Auditor()
auditor.ModelFactory = DecisionTree 
auditor.RETRAIN_MODEL_PER_REPAIR = True
auditor(synthetic_data, output_dir="Test")

In [None]:
import fileinput
import glob

def search_keyword(keyword, folder_path):
    """Print every line containing ``keyword`` in files under ``folder_path``.

    The search is recursive and considers entries whose name contains a dot
    (pattern ``*.*``). Only regular files are read; directories whose names
    happen to contain a dot are skipped.

    Args:
        keyword: substring to look for in each line.
        folder_path: directory to search.

    Returns:
        None. One message per match is printed, giving the file path, the
        line number within that file, and the stripped line.
    """
    import os  # local import: only this helper needs it

    # Escape the folder so glob metacharacters in the path are taken literally.
    file_pattern = os.path.join(glob.escape(folder_path), '**', '*.*')

    # Keep regular files only, in a stable order. With no files the loop
    # below simply does nothing (no fallback to reading standard input).
    file_paths = sorted(
        path for path in glob.glob(file_pattern, recursive=True)
        if os.path.isfile(path)
    )

    for path in file_paths:
        # errors='replace' keeps binary or oddly encoded files from aborting the search.
        with open(path, 'r', encoding='utf-8', errors='replace') as handle:
            # Number lines per file so the reported location is usable.
            for line_number, line in enumerate(handle, start=1):
                if keyword in line:
                    print(f"Match found in {path} at line {line_number}: {line.strip()}")

# Example usage: search the installed BlackBoxAuditing package for 'load_data'.
import os

# Resolve the package directory from the imported module rather than
# hard-coding a machine-specific absolute path.
folder_path = os.path.dirname(os.path.abspath(BBA.__file__))
keyword = 'load_data'
search_keyword(keyword, folder_path)


Test

In [None]:
import numpy as np
import random
import pandas as pd
from BlackBoxAuditing.data import load_from_file


# Dataset dimensions.
n_samples = 6000
n_features = 5

# A is the 0-based row number; B doubles it and C negates it.
A = np.arange(n_samples)
B = A * 2
C = np.negative(A)

# Noise columns (no seed is set in this cell): standard-normal noise, and a
# value close to 1 (1 plus normal noise with standard deviation 0.01).
Random = np.random.randn(n_samples)
Constant = np.ones(n_samples) + np.random.normal(0, 0.01, n_samples)

# Column-wise matrix of all five features, with any NaN entries zeroed.
# The DataFrame below is assembled from the individual arrays, not from
# this matrix.
features = np.column_stack((A, B, C, Random, Constant))

# Labels: first half of the rows False, second half True.
labels = np.repeat([False, True], n_samples // 2)

features[np.isnan(features)] = 0.0

# Column name -> data; the response is stored as the strings 'False'/'True'.
column_names = dict(
    A=A,
    B=B,
    C=C,
    Random=Random,
    Constant=Constant,
    admit=labels.astype(str),
)

df = pd.DataFrame(column_names)
df.to_csv("/tmp/test2.csv", index=False)

# Reload the CSV through BlackBoxAuditing; the types follow the column order above.
test_data = load_from_file(
    "/tmp/test2.csv",
    correct_types=[int, int, int, float, float, str],
    response_header='admit',
)


In [None]:
# Audit with the TensorFlow model factory.
# NOTE(review): the preceding cell builds `test_data`, but this cell audits
# `synthetic_data` and reuses output_dir="Test" from the earlier DecisionTree
# run — confirm which dataset and output directory are intended.
auditor = BBA.Auditor()
auditor.ModelFactory = TensorFlow
auditor(synthetic_data, output_dir="Test")

In [None]:
data_row = "1 180 48 no 5 no 1 1 0 2 17.26 41.18 55.03 6.89 5.91 21.58 3.07 13.78 17.95 16.56 365.55 365.55 282.95 82.6 269.21 96.34 55.28 0 6 17.26 41.18 55.03 6.89 5.91 21.58 3.07 13.78 17.95 16.56 365.55 365.55 282.95 82.6 269.21 96.34 55.28 0 6 17.26 41.18 55.03 6.89 5.91 21.58 3.07 13.78 17.95 16.56 365.55 365.55 282.95 82.6 269.21 96.34 55.28 0 6 17.26 41.18 55.03 6.89 5.91 21.58 3.07 13.78 17.95 16.56 365.55 365.55 282.95 82.6 269.21 96.34 55.28 0 6 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0.0597 0.0078 0 -0.047 7.6195 0.0676 no no no no no yes no no yes yes yes no no no no no no no no no no no no no no no no no no yes no no no no no no no yes no no no no no no no no no no no no no no no no no no yes no 684.3 71.9 2.16 378.1 306.2 190 684.3 71.9 2.16 378.1 306.2 190 684.3 71.9 2.16 378.1 306.2 190 684.3 71.9 2.16 378.1 306.2 190 2.4502 0.2574 0.0077 1.3538 1.0964 0.6803 2.4502 0.2574 0.0077 1.3538 1.0964 0.6803 2.4502 0.2574 0.0077 1.3538 1.0964 0.6803 2.4502 0.2574 0.0077 1.3538 1.0964 0.6803 1"

def _token_type(token):
    """Return "float" if ``token`` parses with float(), otherwise "str"."""
    try:
        float(token)
    except ValueError:
        return "str"
    return "float"


# Split the sample row on whitespace and classify each token in order.
data_elements = data_row.split()
data_row_types = [_token_type(token) for token in data_elements]

print(data_row_types)

In [None]:
# Force the type recorded at position 164 to 'float'.
# NOTE(review): presumably this column is mis-detected for some other rows —
# confirm why index 164 needs a manual override.
data_row_types[164] = 'float'

In [None]:
# Show the type list again after the manual override.
print(data_row_types)

In [None]:
# Audit the synthetic dataset with the SVM model factory.
auditor = BBA.Auditor()
auditor.ModelFactory = SVM
auditor(synthetic_data, output_dir="RepairTest")

Training initial model. (23:02:52)
Calculating original model statistics on test data:
	Training Set:
		Conf-Matrix: {'A': {'A': 1989, 'B': 4}, 'B': {'B': 2006, 'A': 1}}
		accuracy: 0.99875
		BCR: 0.9987473596551558
	Testing Set:
		Conf-Matrix {'A': {'A': 1006, 'B': 1}, 'B': {'B': 993}}
		accuracy: 0.9995
		BCR: 0.9995034756703078
Auditing: 'Feature A (i)' (1/5). (23:02:52)
repair level: "0.0"
repair level: "0.1"
repair level: "0.2"
repair level: "0.30000000000000004"
repair level: "0.4"
repair level: "0.5"
repair level: "0.6"
repair level: "0.7"
repair level: "0.7999999999999999"
repair level: "0.8999999999999999"
repair level: "0.9999999999999999"
> [0;32m/Users/roccotrinci/Documents/Concordia Bachelor's/Honour's Project/New/auditing-tutorial/venv/lib/python3.10/site-packages/BlackBoxAuditing/repairers/CategoricRepairer.py[0m(118)[0;36mrepair[0;34m()[0m
[0;32m    116 [0;31m    [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32