In [2]:
from arguseyes.retrospective import PipelineRun, FairnessRetrospective

# Create a PipelineRun for this run id; the later cells in this notebook all work off this `run` object.
run = PipelineRun(run_id='1bb75c7a0b3643e6954e5ca15e46f057')

In [3]:
# Show the source code associated with this run; the saved output is the Python listing below.
run.show_source_code()

```Python
# https://www.openml.org/search?type=flow&id=8774
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, label_binarize
from sklearn.compose import ColumnTransformer


def denormalize(records, workclasses, education, occupation, sex, race):
    """Join the lookup tables onto ``records`` and return the combined frame.

    The workclass join uses pandas' default join type (inner), so records
    whose ``workclass_id`` is missing from ``workclasses`` are dropped. The
    remaining lookups are left joins: every surviving record is kept, and
    an id without a match gets missing values in the joined columns.

    Args:
        records: DataFrame with the columns ``workclass_id``,
            ``education_id``, ``occupation_id``, ``sex_id`` and ``race_id``.
        workclasses: lookup DataFrame keyed by ``workclass_id``.
        education: lookup DataFrame keyed by ``education_id``.
        occupation: lookup DataFrame keyed by ``occupation_id``.
        sex: lookup DataFrame keyed by ``sex_id``.
        race: lookup DataFrame keyed by ``race_id``.

    Returns:
        A new DataFrame; ``records`` itself is not modified.
    """
    return (
        records
        # No `how` here: the inner join doubles as the row filter.
        .merge(workclasses, on='workclass_id')
        .merge(education, on='education_id', how='left')
        .merge(occupation, on='occupation_id', how='left')
        .merge(sex, on='sex_id', how='left')
        .merge(race, on='race_id', how='left')
    )


def load_train_and_test_data(data_location, employment_types):
    """Read the income CSV files and return denormalized train and test frames.

    Args:
        data_location: directory that holds ``income_train.csv``,
            ``income_test.csv`` and the lookup files ``workclass.csv``,
            ``education.csv``, ``occupation.csv``, ``sex.csv`` and
            ``race.csv``. A ``/`` is inserted between this value and each
            file name.
        employment_types: values of the ``workclass`` column to keep; the
            workclass lookup is restricted to these before joining.

    Returns:
        A ``(train, test)`` tuple of DataFrames produced by ``denormalize``.
    """
    train_records = pd.read_csv(f'{data_location}/income_train.csv')
    test_records = pd.read_csv(f'{data_location}/income_test.csv')

    all_workclasses = pd.read_csv(f'{data_location}/workclass.csv')
    # Remaining lookups, in the positional order denormalize expects:
    # education, occupation, sex, race.
    other_lookups = (
        pd.read_csv(f'{data_location}/education.csv'),
        pd.read_csv(f'{data_location}/occupation.csv'),
        pd.read_csv(f'{data_location}/sex.csv'),
        pd.read_csv(f'{data_location}/race.csv'),
    )

    # Only the requested employment types stay in the workclass lookup.
    is_selected = all_workclasses.workclass.isin(employment_types)
    selected_workclasses = all_workclasses[is_selected]

    return (
        denormalize(train_records, selected_workclasses, *other_lookups),
        denormalize(test_records, selected_workclasses, *other_lookups),
    )


def extract_labels(train, test):
    """Binarize the ``income-per-year`` column of the train and test frames.

    Both calls list the '<=50K' class first and the '>50K' class second.
    The test data spells the class names with a trailing dot, so it gets
    its own class list.

    Args:
        train: frame with an ``income-per-year`` column using the values
            ``'<=50K'`` / ``'>50K'``.
        test: frame with an ``income-per-year`` column using the values
            ``'<=50K.'`` / ``'>50K.'``.

    Returns:
        A ``(train_labels, test_labels)`` tuple of ``label_binarize`` results.
    """
    return (
        label_binarize(train['income-per-year'], classes=['<=50K', '>50K']),
        # Same two classes, but written with a trailing dot in the test data.
        label_binarize(test['income-per-year'], classes=['<=50K.', '>50K.']),
    )


# https://www.openml.org/search?type=flow&id=8774
def openmlflow(numerical_columns, categorical_columns):
    """Build the unfitted preprocessing + decision-tree pipeline.

    Numerical columns are imputed (``SimpleImputer`` with its default
    strategy and ``add_indicator=True``) and then standard-scaled.
    Categorical columns are imputed with the most frequent value and then
    one-hot encoded. Both branches feed a ``DecisionTreeClassifier`` with
    ``random_state=0``.

    Args:
        numerical_columns: columns routed to the numerical branch.
        categorical_columns: columns routed to the categorical branch.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'columntransformer'`` and
        ``'decisiontreeclassifier'``.
    """
    numeric_branch = Pipeline([
        ('imputer', SimpleImputer(add_indicator=True)),
        ('standardscaler', StandardScaler()),
    ])
    categorical_branch = Pipeline([
        ('simpleimputer', SimpleImputer(strategy='most_frequent')),
        ('onehotencoder', OneHotEncoder()),
    ])

    preprocessing = ColumnTransformer([
        ('num', numeric_branch, numerical_columns),
        ('cat', categorical_branch, categorical_columns),
    ])
    classifier = DecisionTreeClassifier(random_state=0)

    return Pipeline([
        ('columntransformer', preprocessing),
        ('decisiontreeclassifier', classifier),
    ])


# Directory containing the CSV files. NOTE: the trailing slash here combines with
# the '/' inside the loader's f-strings, so the resulting paths contain '//'.
data_location = 'datasets/income/'

# Workclass values to keep; load_train_and_test_data restricts the workclass
# lookup to these before joining it onto the records.
government_employed = ['Federal-gov', 'State-gov', 'Local-gov']

train, test = load_train_and_test_data(data_location, employment_types=government_employed)


# Binarized 'income-per-year' labels, one result per split.
train_labels, test_labels = extract_labels(train, test)

# Columns handed to the pipeline as features. 'sex' and 'race' are joined in by
# denormalize but are not listed here.
categorical_columns = ['workclass', 'education', 'occupation']
numerical_columns = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']

openml_pipeline = openmlflow(numerical_columns, categorical_columns)

# The whole frame is passed in; the ColumnTransformer built in openmlflow is
# given only the columns listed above.
model = openml_pipeline.fit(train, train_labels)

# Score on the test split; the value is printed below under the label "Accuracy".
score = model.score(test, test_labels)

print("Accuracy", score)

```

In [3]:
# Show the run's plan; the saved output is a Cytoscape widget using the 'dagre' layout.
run.show_plan()

CytoscapeWidget(cytoscape_layout={'name': 'dagre'}, cytoscape_style=[{'selector': 'node', 'css': {'content': '…

In [None]:
# load_input(1) returns a pair, unpacked here as (data, provenance).
# NOTE(review): what input 1 refers to is not visible in this notebook — presumably
# an entry of the plan shown by run.show_plan(); confirm.
data, provenance = run.load_input(1)
# Bare expression so the notebook displays `data` as the cell output.
data

In [None]:
# Same call for input 5. This rebinds `data` and `provenance`, so the values
# loaded by the load_input(1) cell are no longer reachable under these names.
data, provenance = run.load_input(5)
# Bare expression so the notebook displays `data` as the cell output.
data

In [None]:
# Build a FairnessRetrospective from the run; the remaining cells call methods on it.
retrospective = FairnessRetrospective(run)

In [None]:
# The cell output is whatever fairness_criteria() returns or displays (no output is saved here).
retrospective.fairness_criteria()

In [None]:
# The cell output is whatever confusion_matrices_for_groups() returns or displays (no output is saved here).
# NOTE(review): which groups are covered is not visible in this notebook — confirm against the arguseyes API.
retrospective.confusion_matrices_for_groups()

In [None]:
# NOTE(review): the arguments are presumably (attribute column, reference group value) — confirm
# against the FairnessRetrospective API. 'sex' is a column joined in by the pipeline's denormalize step.
retrospective.fairness_metrics('sex', 'male')

In [None]:
# Same call for 'race'.
# NOTE(review): the arguments are presumably (attribute column, reference group value) — confirm
# against the FairnessRetrospective API.
retrospective.fairness_metrics('race', 'white')

In [None]:
# Plotting counterpart of fairness_metrics, called with the same ('sex', 'male') arguments.
# NOTE(review): presumably (attribute column, reference group value) — confirm against the API.
retrospective.plot_fairness_metrics('sex', 'male')

In [None]:
# Plotting counterpart of fairness_metrics, called with the same ('race', 'white') arguments.
# NOTE(review): presumably (attribute column, reference group value) — confirm against the API.
retrospective.plot_fairness_metrics('race', 'white')