In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data and Preprocessing

In [None]:
# Names given to the selected CSV columns, in the same order as `usecols` below.
feature_names = [
    "Age", "Workclass", "Education",
    "Occupation", "Relationship", "Race",
    "Sex", "Capital Gain", "Capital Loss",
    "Hours per week", "Country", "label"
]

# Read the selected columns as strings; fields in the file are separated by ", ".
df = pd.DataFrame(
    np.genfromtxt(
        'dataset/adult.csv',
        delimiter=', ',
        dtype=str,
        usecols=(0, 1, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14),
    ),
    columns=feature_names
)

# Drop rows with missing values (denoted as "?").
# The index is rebuilt afterwards so it is a gap-free 0..n-1 range: pandas aligns
# column assignments on index labels, so a Series built with a default index and
# assigned into (a copy of) this frame would otherwise produce NaNs and values
# attached to the wrong rows.
cols = list(df.columns)
df[cols] = df[cols].replace(['?'], np.nan)
df = df.dropna().reset_index(drop=True)


# Declare categorical columns
categorical_columns = [
    "Workclass", "Education", "Occupation", "Relationship", "Race", "Sex", "Country", "label"
]

# Assign a dtype to every column: "category" for the names listed above, int otherwise
for feature in feature_names:
    if feature in categorical_columns:
        df[feature] = df[feature].astype("category")
    else:
        df[feature] = df[feature].astype("int")

# Bin age into two groups: (16, 35] -> "Young", (35, 90] -> "Aged"
df['Age'] = pd.cut(df['Age'], bins=[16, 35, 90], labels=['Young', 'Aged'])

# Bin weekly hours: (0.9, 25], (25, 39], (39, 40], (40, 55], (55, 100]
df["Hours per week"] = pd.cut(
    x=df["Hours per week"],
    bins=[0.9, 25, 39, 40, 55, 100],
    labels=["PartTime", "MidTime", "FullTime", "OverTime", "BrainDrain"])

# Replace "Husband" and "Wife" values with the unisex "Married"
df["Relationship"] = df["Relationship"].replace('Husband', 'Married')
df["Relationship"] = df["Relationship"].replace('Wife', 'Married')

# Encode the target: 1 for income >50K, 0 for income <=50K
df["label"] = df["label"].replace('>50K', 1)
df["label"] = df["label"].replace('<=50K', 0)

# Show a few random rows of the preprocessed frame
df.sample(5)

In [None]:
# Print a summary of the preprocessed frame (columns, dtypes, non-null counts)
df.info()

## Model Creation

In [None]:
# Data preprocessing to train model
from omnixai.data.tabular import Tabular
from omnixai.preprocessing.tabular import TabularTransform
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Wrap the preprocessed frame for omnixai. The binned "Age" and "Hours per week"
# columns are declared categorical here together with the original categorical
# features; "label" is declared as the target column.
tabular_data = Tabular(
   df,
   categorical_columns=[
  "Age", "Workclass", "Education", "Occupation", "Relationship", "Race", "Sex", "Hours per week", "Country"
],
   target_column='label'
)
# Fit the omnixai transform on the whole dataset and encode it into the array `x`
transformer = TabularTransform().fit(tabular_data)
class_names = transformer.class_names
x = transformer.transform(tabular_data)

# Split data into training (70%) and a held-out remainder (validation + test).
# Every column of `x` except the last is used as features and the last column
# as the label.
train, X_temp, train_labels, y_temp  = \
    train_test_split(x[:, :-1], x[:, -1], train_size=0.70, random_state = 123)

# Split the held-out remainder in half into validation and test sets
val, test, val_labels, test_labels = train_test_split(X_temp, y_temp, test_size=0.5, random_state=123)

# Only the test labels are cast to int; the other label arrays are left as returned by the split
test_labels = test_labels.astype(int)

print('Training data shape:   {}'.format(train.shape))
print('Validation data shape:  {}'.format(val.shape))
print('Test data shape:        {}'.format(test.shape))

# Train a Random Forest model (fixed random_state for reproducibility)
model = RandomForestClassifier(random_state=123)
model.fit(train, train_labels)

# Prediction helper: applies the fitted transform to its input, then returns class probabilities.
# NOTE(review): not called in the cells visible here — presumably consumed later; confirm.
predict_function=lambda z: model.predict_proba(transformer.transform(z))

# Convert the transformed data back to Tabular instances
train_data = transformer.invert(train)
test_data = transformer.invert(test)

# Quick sanity check: show the Tabular's target_column and the first two training labels
display(tabular_data.target_column)
display(train_labels[:2])

In [None]:
# Model Predictions
# Build a readable test frame from the inverted test data, then attach the
# true labels and the model's predicted labels as extra columns.
test_df = test_data.to_pd()
test_df["label"] = test_labels
predictions = model.predict(test)
test_df["prediction"] = predictions

# Show a few random test rows with their label and prediction
test_df.sample(5)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(test_labels, predictions)
print(f'Test: {accuracy=:.4f}')

cm = confusion_matrix(test_labels, predictions)

# Rows of the matrix are true classes and columns are predicted classes, so a
# binary matrix flattened row by row reads TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()
print(f"Test: {TP=}, {TN=}, {FP=}, {FN=}")

# Plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

## Fairness

## Privacy

In [None]:
# Helper Functions for local differential privacy

import random
import math

def rand_resp(x, p=0.75, q=0.75):
    """Randomized response for a single binary value.

    One uniform draw from ``random.random()`` decides the report:

    * ``x == 0``: report 0 when the draw is <= ``q``, otherwise report 1.
    * any other ``x``: report 1 when the draw is <= ``p``, otherwise report 0.

    Returns:
        The reported value, 0 or 1.
    """
    draw = random.random()
    if x == 0:
        return 0 if draw <= q else 1
    return 1 if draw <= p else 0

def estimate(column, p=0.75, q=0.75):
    """Estimate the true number of 1s behind a randomized-response column.

    Args:
        column: array-like with ``astype`` (NumPy array / pandas Series) of
            reported 0/1 values.
        p: probability that a true 1 was reported as 1.
        q: probability that a true 0 was reported as 0.

    Returns:
        The bias-corrected count of 1s: the observed rate of 1s is shifted by
        ``q - 1``, scaled by ``1 / (p + q - 1)``, and multiplied by the number
        of entries.
    """
    total = len(column)
    reported_ones = np.sum(column.astype(int))
    observed_rate = reported_ones / total
    corrected_rate = (observed_rate + q - 1) / (p + q - 1)
    return corrected_rate * total

def privatize_attribute(column, true_label, false_label, p, q):
    """Apply randomized response to a two-valued attribute.

    Args:
        column: pandas Series holding the attribute values.
        true_label: value encoded as 1 before perturbation; every other value
            is encoded as 0.
        false_label: value written back for a reported 0.
        p: probability that a 1 is reported as 1 (passed to ``rand_resp``).
        q: probability that a 0 is reported as 0 (passed to ``rand_resp``).

    Returns:
        A Series of ``true_label`` / ``false_label`` values that carries the
        same index as ``column``. Keeping the index matters: pandas aligns on
        index labels when a Series is assigned to a DataFrame column, so a
        result with a fresh 0..n-1 index would be attached to the wrong rows
        (and leave NaNs) whenever ``column`` has a non-default index, e.g.
        after rows were dropped.
    """
    # Convert labels to binary values
    binary_values = [1 if value == true_label else 0 for value in column]

    # Apply randomized response, one draw per row, in row order
    reported = [rand_resp(bit, p, q) for bit in binary_values]

    # Convert back to original labels, positionally matched to the input rows
    labels = [true_label if bit == 1 else false_label for bit in reported]
    return pd.Series(labels, index=column.index)

def get_epsilon(p=0.75, q=0.75):
    """Return the epsilon of the randomized-response mechanism with parameters p, q.

    ``p`` is the probability that a 1 is reported as 1 and ``q`` the probability
    that a 0 is reported as 0. Epsilon is the largest absolute log-ratio between
    the probabilities of the same report under the two possible true values:
    ``q / (1 - p)`` for a reported 0 and ``p / (1 - q)`` for a reported 1.

    Taking the absolute value keeps epsilon non-negative when p or q is below
    0.5 (answers are mostly flipped). If a report is impossible for one true
    value but possible for the other (for example p == 1), the mechanism gives
    no privacy and ``math.inf`` is returned instead of raising
    ``ZeroDivisionError``.

    Returns:
        float: epsilon >= 0, possibly ``math.inf``.
    """
    epsilon = 0.0
    for numerator, denominator in ((q, 1 - p), (p, 1 - q)):
        if numerator == 0 and denominator == 0:
            # This report never occurs, so it cannot leak anything.
            continue
        if numerator == 0 or denominator == 0:
            # The report reveals the true value with certainty.
            return math.inf
        epsilon = max(epsilon, abs(math.log(numerator / denominator)))
    return epsilon

def get_p_q(epsilon):
    """Return symmetric randomized-response parameters for a given epsilon.

    Both returned values equal ``exp(epsilon) / (1 + exp(epsilon))``.

    Returns:
        tuple: ``(p, q)`` with ``p == q``.
    """
    odds = math.exp(epsilon)
    truth_probability = odds / (1 + odds)
    return truth_probability, truth_probability

In [None]:
# Create a copy of the original data
df_private = df.copy(deep=True)

# Seed the `random` module (the source of randomness used by rand_resp) so that
# Restart & Run All reproduces the same privatized dataset; 123 matches the
# random_state used for the data splits and models in this notebook.
# Remove the fixed seed if the noise has to be unpredictable.
random.seed(123)

# Set values of p and q
p_age, q_age = 0.9, 0.9
p_sex, q_sex = 0.9, 0.9

# Privacy level implied by the chosen p and q for each attribute
epsilon_age = get_epsilon(p_age, q_age)
epsilon_sex = get_epsilon(p_sex, q_sex)
print(f"We will apply {epsilon_age:.3f}-LDP setting p={p_age}, q={q_age} for age \
AND {epsilon_sex:.3f}-LDP setting p={p_sex}, q={q_sex} for sex.")

# Apply randomized response to Age and Sex
df_private['Age'] = privatize_attribute(df_private['Age'], 'Aged', 'Young', p_age, q_age)
df_private['Sex'] = privatize_attribute(df_private['Sex'], 'Male', 'Female', p_sex, q_sex)

# Display the new DataFrame
df_private.head()

In [None]:
# Print the Age x Sex contingency table before and after privatization
for crosstab_title, crosstab_frame in (
    ("\nOriginal Cross-tabulation:", df),
    ("\nPrivatized Cross-tabulation:", df_private),
):
    print(crosstab_title)
    print(pd.crosstab(crosstab_frame['Age'], crosstab_frame['Sex']))

In [None]:
# Quantify the Error    

In [None]:
# Implement Private Classifier
# Same pipeline as the baseline classifier, but fitted on the privatized frame.

# Wrap the privatized frame for omnixai with the same categorical / target declaration
tabular_data_private = Tabular(
   df_private,
   categorical_columns=[
  "Age", "Workclass", "Education", "Occupation", "Relationship", "Race", "Sex", "Hours per week", "Country"
],
   target_column='label'
)
# Fit a separate transform on the privatized data and encode it into `x_private`
transformer_private = TabularTransform().fit(tabular_data_private)
# NOTE(review): this rebinds the notebook-level `class_names` set by the baseline
# model cell instead of using a `_private` name like the other variables here.
class_names = transformer_private.class_names
x_private = transformer_private.transform(tabular_data_private)

# Split data into training (70%) and a held-out remainder (validation + test).
# Every column of `x_private` except the last is used as features and the last
# column as the label.
train_private, X_private_temp, train_labels_private, y_private_temp  = \
    train_test_split(x_private[:, :-1], x_private[:, -1], train_size=0.70, random_state = 123)

# Split the held-out remainder in half into validation and test sets
val_private, test_private, val_labels_private, test_labels_private = train_test_split(X_private_temp, y_private_temp, test_size=0.5, random_state=123)

# Only the test labels are cast to int; the other label arrays are left as returned by the split
test_labels_private = test_labels_private.astype(int)

print('Private Classfier')
print('Training data shape:   {}'.format(train_private.shape))
print('Validation data shape:  {}'.format(val_private.shape))
print('Test data shape:        {}'.format(test_private.shape))

# Train a Random Forest model (fixed random_state for reproducibility)
model_private = RandomForestClassifier(random_state=123)
model_private.fit(train_private, train_labels_private)

# Prediction helper: applies the private transform to its input, then returns class probabilities.
# NOTE(review): not called in the cells visible here — presumably consumed later; confirm.
predict_function_private=lambda z: model_private.predict_proba(transformer_private.transform(z))

# Convert the transformed data back to Tabular instances
train_data_private = transformer_private.invert(train_private)
test_data_private = transformer_private.invert(test_private)

# Quick sanity check: show the Tabular's target_column and the first two training labels
display(tabular_data_private.target_column)
display(train_labels_private[:2])

In [None]:
# Model Predictions
# Build a readable test frame from the inverted privatized test data, then
# attach the true labels and the private model's predicted labels as extra columns.
test_df_private = test_data_private.to_pd()
test_df_private["label"] = test_labels_private
predictions_private = model_private.predict(test_private)
test_df_private["prediction"] = predictions_private

# Show a few random test rows with their label and prediction
test_df_private.sample(5)

In [None]:
# Share of privatized test rows whose predicted label matches the true label
accuracy_private = accuracy_score(test_labels_private, predictions_private)
print(f'Test: {accuracy_private=:.4f}')

cm_private = confusion_matrix(test_labels_private, predictions_private)

# Rows of the matrix are true classes and columns are predicted classes, so a
# binary matrix flattened row by row reads TN, FP, FN, TP.
TN, FP, FN, TP = cm_private.ravel()
print(f"Test: {TP=}, {TN=}, {FP=}, {FN=}")

# Plot the confusion matrix of the private classifier
disp = ConfusionMatrixDisplay(confusion_matrix=cm_private)
disp.plot()