In [4]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import joblib
import re
import sys
sys.path.insert(0, '../')
import numpy as np

# Datasets
from aif360.datasets import MEPSDataset19
from aif360.datasets import MEPSDataset20
from aif360.datasets import MEPSDataset21
from aif360.datasets import GermanDataset
# Fairness metrics
from aif360.metrics import BinaryLabelDatasetMetric

# Explainers
from aif360.explainers import MetricTextExplainer

# Scalers
from sklearn.preprocessing import StandardScaler


# Bias mitigation techniques
from aif360.algorithms.preprocessing import Reweighing,DisparateImpactRemover
from aif360.algorithms.preprocessing import LFR
from aif360.algorithms.preprocessing import OptimPreproc
from sklearn.model_selection import train_test_split

  warn_deprecated('vmap', 'torch.vmap')


In [5]:
# Load MEPS panel 19 through AIF360 with its default configuration.
# NOTE(review): despite the `_train` suffix, no train/test split is applied
# to this object in this cell.
dataset_orig_panel19_train = MEPSDataset19()

# Pick the dataset's first protected attribute and build the group
# specifications: each group is a list of {attribute_name: value} dicts,
# one dict per value the dataset marks as (un)privileged.
sens_ind = 0
sens_attr = dataset_orig_panel19_train.protected_attribute_names[sens_ind]
unprivileged_groups = [
    {sens_attr: value}
    for value in dataset_orig_panel19_train.unprivileged_protected_attributes[sens_ind]
]
privileged_groups = [
    {sens_attr: value}
    for value in dataset_orig_panel19_train.privileged_protected_attributes[sens_ind]
]

In [6]:
# Dataset-level fairness metric object for the panel-19 data, comparing the
# privileged and unprivileged groups defined for the selected protected attribute.
metric_orig_panel19_train = BinaryLabelDatasetMetric(
    dataset_orig_panel19_train,
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups,
)

In [7]:
from aif360.datasets import MEPSDataset19

# Fresh MEPS panel-19 instance used for the modelling cells.
# NOTE(review): this loads the same dataset class a second time, and the
# `_sex` suffix may not match the protected attribute actually used -- confirm.
data_orig_sex = MEPSDataset19()

In [8]:
# Rebuild the group specifications from `data_orig_sex`, using the first
# protected attribute the dataset declares. Each group is a list of
# {attribute_name: value} dicts.
sens_ind = 0
sens_attr = data_orig_sex.protected_attribute_names[sens_ind]

unprivileged_groups = [
    {sens_attr: group_value}
    for group_value in data_orig_sex.unprivileged_protected_attributes[sens_ind]
]
privileged_groups = [
    {sens_attr: group_value}
    for group_value in data_orig_sex.privileged_protected_attributes[sens_ind]
]

In [9]:
import pandas as pd

# Read the MEPS table from a CSV located in the notebook's working directory.
# NOTE(review): presumably this file already holds features and labels side by
# side for correlation analysis -- nothing is combined in this cell; confirm
# how MEPS_FINAL.csv was produced.
full_df = pd.read_csv('MEPS_FINAL.csv')

In [10]:
import numpy as np

# Seed NumPy's global RNG before the shuffled split so the partition is
# repeatable across runs (assumes AIF360's `split` shuffles via NumPy's
# global RNG -- confirm against the installed version).
np.random.seed(42)

# The first 70% of the shuffled rows become the training set; the rest is the
# held-out test set.
data_orig_sex_train, data_orig_sex_test = data_orig_sex.split([0.7], shuffle=True)

# Label each shape with the split it belongs to so the two lines can be told
# apart in the output.
print("Train features shape:", data_orig_sex_train.features.shape)
print("Test features shape:", data_orig_sex_test.features.shape)

Perpetrator Sex : (11081, 138)
Perpetrator Sex : (4749, 138)


In [19]:
import warnings

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

# Pull the arrays out once. Labels are flattened to 1-D so that fitting and
# every metric receive the same shape.
X_train = data_orig_sex_train.features
y_train = data_orig_sex_train.labels.ravel()
w_train = data_orig_sex_train.instance_weights
X_test = data_orig_sex_test.features
y_test = data_orig_sex_test.labels.ravel()
w_test = data_orig_sex_test.instance_weights

# Arrays to store metrics for plotting
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []

# Convergence curve.
# Calling `fit` repeatedly on a liblinear LogisticRegression does NOT continue
# training: every call starts from scratch, so refitting the same estimator on
# the same data yields the same metrics each time. To get a curve that really
# changes with the amount of optimisation, a separate model is fitted for each
# solver-iteration budget 1..iterations.
iterations = 30
iteration_caps = np.arange(1, iterations + 1)
for max_iter in iteration_caps:
    capped_model = LogisticRegression(solver='liblinear', max_iter=int(max_iter))
    with warnings.catch_warnings():
        # Stopping before convergence is the whole point of the capped fits,
        # so the resulting ConvergenceWarning is expected noise here.
        warnings.simplefilter('ignore', category=ConvergenceWarning)
        capped_model.fit(X_train, y_train, sample_weight=w_train)

    # Weighted log loss (same instance weights as used for fitting).
    # `labels=` keeps the call valid even if a split lacks one class.
    train_loss = log_loss(y_train, capped_model.predict_proba(X_train),
                          sample_weight=w_train, labels=capped_model.classes_)
    test_loss = log_loss(y_test, capped_model.predict_proba(X_test),
                         sample_weight=w_test, labels=capped_model.classes_)

    # Accuracy is left unweighted, as in the original analysis.
    train_accuracy = accuracy_score(y_train, capped_model.predict(X_train))
    test_accuracy = accuracy_score(y_test, capped_model.predict(X_test))

    # Store metrics
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

# Final model: default solver budget. `model` and the prediction variables
# below are what later cells consume.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train, sample_weight=w_train)
y_pred_train_prob = model.predict_proba(X_train)
y_pred_test_prob = model.predict_proba(X_test)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Visualization of the metrics over solver-iteration budgets
fig = go.Figure()
fig.add_trace(go.Scatter(x=iteration_caps, y=train_losses, mode='lines+markers', name='Train Loss'))
fig.add_trace(go.Scatter(x=iteration_caps, y=test_losses, mode='lines+markers', name='Test Loss'))
fig.update_layout(title='Training and Test Loss Over Iterations',
                  xaxis_title='Max solver iterations (max_iter)', yaxis_title='Log Loss')
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=iteration_caps, y=train_accuracies, mode='lines+markers', name='Train Accuracy'))
fig.add_trace(go.Scatter(x=iteration_caps, y=test_accuracies, mode='lines+markers', name='Test Accuracy'))
fig.update_layout(title='Training and Test Accuracy Over Iterations',
                  xaxis_title='Max solver iterations (max_iter)', yaxis_title='Accuracy')
fig.show()

# Coefficients of the final model, labelled with the dataset's feature names.
# NOTE(review): the features are not standardised in this cell, so coefficient
# magnitudes are not directly comparable across features.
importances = model.coef_[0]
feature_labels = list(data_orig_sex_train.feature_names)
if len(feature_labels) != len(importances):
    # Fall back to positional labels if names and coefficients do not line up.
    feature_labels = [f'Feature {i}' for i in range(len(importances))]
fig = go.Figure([go.Bar(x=feature_labels, y=importances)])
fig.update_layout(title='Feature Importance', xaxis_title='Feature', yaxis_title='Coefficient Value')
fig.show()


In [20]:
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff

# Confusion matrix of the fitted model's hard predictions (`y_pred_test`)
# against the true labels of the held-out split.
# Passing `labels=model.classes_` pins the row/column order to the same class
# list used for the axis tick labels, and keeps the matrix square even if a
# class is absent from the test labels or predictions.
class_labels = [str(c) for c in model.classes_]
cm = confusion_matrix(data_orig_sex_test.labels.ravel(), y_pred_test,
                      labels=model.classes_)

# Plotting using Plotly's Figure Factory (rows = true label, columns = predicted label)
fig = ff.create_annotated_heatmap(z=cm, x=class_labels, y=class_labels,
                                  colorscale='Viridis', showscale=True)
fig.update_layout(title='Confusion Matrix', xaxis_title='Predicted Label', yaxis_title='True Label')
fig.show()
