# Statistical parity test

In [None]:
import os
import warnings
from pathlib import Path

import pandas as pd
from sklearn.metrics import roc_auc_score

from src.utils.models_pkl import load_pickle
from src.modeling.create_data_split import split_data
from src.fairness.statistical_parity import statistical_parity_test
from config.config_modeling import CAT_COLS, TRAIN_SIZE, TEST_FROM_VAL, RANDOM_STATE

## Notebook settings

In [None]:
# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# raised by any library are hidden as well — confirm that is intended.
warnings.filterwarnings("ignore")

## Define constants

In [None]:
# Input dataset and pickled model locations, expressed relative to the working directory.
IN_PATH = Path("..", "..", "data", "processed", "data_plain.csv")
MODEL_PATH = Path("..", "..", "models", "XGB.pkl")

## Load data

In [None]:
# Read the processed CSV and split it with the project's split_data helper,
# using the split sizes and seed from the modeling config.
data = split_data(
    df=pd.read_csv(IN_PATH),
    cols=CAT_COLS,
    train_size=TRAIN_SIZE,
    test_size=TEST_FROM_VAL,
    random_state=RANDOM_STATE,
)

# Only the "test" entry is used in this notebook. Both parts get a fresh
# 0..n-1 index (in place) so they line up row by row later on.
X_test, Y_test = data["test"]
for test_part in (X_test, Y_test):
    test_part.reset_index(drop=True, inplace=True)

## Load model

In [None]:
# Restore the pickled model from disk.
model = load_pickle(MODEL_PATH)

# Sanity check that model and data fit together: ROC AUC computed from the
# second column of predict_proba against the test labels.
test_scores = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(Y_test, test_scores))

## Statistical parity testing

### Define parameters

In [None]:
# Name of the outcome column that is appended to the test features when the
# population dataframe is built.
TARGET_COLUMN = "Citation"
# Value of TARGET_COLUMN passed to statistical_parity_test as the positive outcome.
# NOTE(review): presumably 0 means "no citation issued" (the favourable result) —
# confirm against the dataset encoding.
POSITIVE_OUTCOME = 0

# First sensitive attribute: column name and the value identifying the protected group.
# NOTE(review): presumably a one-hot gender indicator where 1 = female — confirm.
SENSITIVE_ATTRIBUTE_1 = "Gender_F"
SENSITIVE_ATTRIBUTE_1_PROTECTED = 1

# Second sensitive attribute: column name and the value identifying the protected group.
# NOTE(review): presumably a one-hot race indicator where 1 = Black — confirm.
SENSITIVE_ATTRIBUTE_2 = "Race_BLACK"
SENSITIVE_ATTRIBUTE_2_PROTECTED = 1

# Attribute used to condition the parity test on.
# NOTE(review): presumably the prefix of the "VehicleType_*" indicator columns
# that define the subpopulations — confirm.
GROUPING_ATTRIBUTE = "VehicleType"

### Prepare dataframe

In [None]:
# Pick what is audited: the ground-truth labels (fairness of the data) or the
# model's predictions (fairness of the model). Swap the active line to switch.
# target = Y_test  # data
target = model.predict(X_test)  # model

# Attach the chosen outcome to the test features as an extra column.
outcome_frame = pd.DataFrame({TARGET_COLUMN: target})
population = pd.concat([X_test, outcome_frame], axis=1)

### Statistical parity

#### Sensitive attribute #1 (Gender)

In [None]:
# Unconditional test: the whole test population, split by the gender attribute.
whole_population = [population]
test_result = statistical_parity_test(
    dataset_list=whole_population,
    target_column=TARGET_COLUMN,
    positive_outcome=POSITIVE_OUTCOME,
    sensitive_attribute=SENSITIVE_ATTRIBUTE_1,
    protected_group=SENSITIVE_ATTRIBUTE_1_PROTECTED,
)
print(test_result)

In [None]:
# Raw outcome counts per gender value; the last expression is the cell's output.
outcomes_by_gender = population.groupby(SENSITIVE_ATTRIBUTE_1)[TARGET_COLUMN]
outcomes_by_gender.value_counts()

#### Sensitive attribute #2 (Race)

In [None]:
# Unconditional test: the whole test population, split by the race attribute.
whole_population = [population]
test_result = statistical_parity_test(
    dataset_list=whole_population,
    target_column=TARGET_COLUMN,
    positive_outcome=POSITIVE_OUTCOME,
    sensitive_attribute=SENSITIVE_ATTRIBUTE_2,
    protected_group=SENSITIVE_ATTRIBUTE_2_PROTECTED,
)
print(test_result)

In [None]:
# Raw outcome counts per race value; the last expression is the cell's output.
outcomes_by_race = population.groupby(SENSITIVE_ATTRIBUTE_2)[TARGET_COLUMN]
outcomes_by_race.value_counts()

### Conditional statistical parity

#### Prepare the subpopulations based on the "VehicleType" attribute.

In [None]:
# One subpopulation per vehicle-type indicator column: a row belongs to a
# subpopulation when its indicator equals 1. The column names are derived from
# GROUPING_ATTRIBUTE so the "VehicleType" prefix is defined in a single place
# instead of being repeated in every filter.
sub_population_1 = population[population[f"{GROUPING_ATTRIBUTE}_Standard Vehicles"] == 1]
sub_population_2 = population[population[f"{GROUPING_ATTRIBUTE}_Trucks"] == 1]
sub_population_3 = population[population[f"{GROUPING_ATTRIBUTE}_other"] == 1]
sub_population_4 = population[population[f"{GROUPING_ATTRIBUTE}_Motorcycles"] == 1]

#### Sensitive attribute #1 (Gender)

In [None]:
# Conditional test for the gender attribute: all four vehicle-type
# subpopulations are handed to the test together.
vehicle_subpopulations = [
    sub_population_1,
    sub_population_2,
    sub_population_3,
    sub_population_4,
]
test_result = statistical_parity_test(
    dataset_list=vehicle_subpopulations,
    target_column=TARGET_COLUMN,
    positive_outcome=POSITIVE_OUTCOME,
    sensitive_attribute=SENSITIVE_ATTRIBUTE_1,
    protected_group=SENSITIVE_ATTRIBUTE_1_PROTECTED,
)
print(test_result)

In [None]:
# Run the gender parity test on each vehicle-type subpopulation on its own,
# in the same order as before, printing one result per subpopulation.
for vehicle_subset in (
    sub_population_1,
    sub_population_2,
    sub_population_3,
    sub_population_4,
):
    test_result = statistical_parity_test(
        dataset_list=[vehicle_subset],
        sensitive_attribute=SENSITIVE_ATTRIBUTE_1,
        protected_group=SENSITIVE_ATTRIBUTE_1_PROTECTED,
        target_column=TARGET_COLUMN,
        positive_outcome=POSITIVE_OUTCOME,
    )
    print(test_result, "\n")

Note: The main source of unfairness is `sub_population_1` (VehicleType == "Standard Vehicles").

#### Sensitive attribute #2 (Race)

In [None]:
# Conditional test for the race attribute: all four vehicle-type
# subpopulations are handed to the test together.
vehicle_subpopulations = [
    sub_population_1,
    sub_population_2,
    sub_population_3,
    sub_population_4,
]
test_result = statistical_parity_test(
    dataset_list=vehicle_subpopulations,
    target_column=TARGET_COLUMN,
    positive_outcome=POSITIVE_OUTCOME,
    sensitive_attribute=SENSITIVE_ATTRIBUTE_2,
    protected_group=SENSITIVE_ATTRIBUTE_2_PROTECTED,
)
print(test_result)

In [None]:
# Run the race parity test on each vehicle-type subpopulation on its own,
# in the same order as before, printing one result per subpopulation.
for vehicle_subset in (
    sub_population_1,
    sub_population_2,
    sub_population_3,
    sub_population_4,
):
    test_result = statistical_parity_test(
        dataset_list=[vehicle_subset],
        sensitive_attribute=SENSITIVE_ATTRIBUTE_2,
        protected_group=SENSITIVE_ATTRIBUTE_2_PROTECTED,
        target_column=TARGET_COLUMN,
        positive_outcome=POSITIVE_OUTCOME,
    )
    print(test_result, "\n")