<a href="https://colab.research.google.com/github/sireesha2021/fairness_metric_tool/blob/main/Loan_Eligibility_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

# For fairness metrics and visualizations (we'll use this later)
!pip install fairlearn
import fairlearn.metrics as fl_metrics
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries loaded successfully and Fairlearn installed!")

# --- Generate Synthetic Data ---
np.random.seed(42)
n_samples = 5000

# Sensitive Attributes
genders = np.random.choice(['Male', 'Female', 'Non-binary'], n_samples, p=[0.48, 0.50, 0.02])
races = np.random.choice(['White', 'Black', 'Asian', 'Hispanic'], n_samples, p=[0.6, 0.2, 0.1, 0.1])
ages = np.random.randint(18, 70, n_samples) # Age between 18 and 69

# Financial Attributes (correlated with eligibility)
income = np.random.normal(loc=50000, scale=15000, size=n_samples)
credit_score = np.random.normal(loc=680, scale=50, size=n_samples)
loan_amount_requested = np.random.normal(loc=15000, scale=5000, size=n_samples)

# Introduce some simulated bias in 'true' eligibility (Y) for demo purposes
# For example, let's make Black individuals and Non-binary individuals slightly less 'truly' eligible (Y=1)
true_eligibility = np.zeros(n_samples)

# Base eligibility rate
base_prob = 0.6

# Adjust based on income and credit score (more likely to be truly eligible with higher income/score)
prob_income = (income - np.min(income)) / (np.max(income) - np.min(income)) * 0.2
prob_credit = (credit_score - np.min(credit_score)) / (np.max(credit_score) - np.min(credit_score)) * 0.3

# Introduce simulated bias for 'true' eligibility
for i in range(n_samples):
    current_prob = base_prob + prob_income[i] + prob_credit[i]

    if races[i] == 'Black':
        current_prob -= 0.2 # Lower true eligibility for Black individuals
    if genders[i] == 'Non-binary':
        current_prob -= 0.15 # Lower true eligibility for Non-binary individuals
    if ages[i] > 60:
        current_prob -= 0.1 # Lower true eligibility for older individuals

    true_eligibility[i] = 1 if np.random.rand() < current_prob else 0

# Create DataFrame
data = pd.DataFrame({
    'Gender': genders,
    'Race': races,
    'Age': ages,
    'Income': income,
    'CreditScore': credit_score,
    'LoanAmount': loan_amount_requested,
    'True_Eligibility': true_eligibility # This is our 'ground truth' or 'true label' (Y)
})

# Display first few rows and check true eligibility distribution
print("\nSynthetic Dataset Head:")
print(data.head())
print("\nTrue Eligibility Distribution:")
print(data['True_Eligibility'].value_counts(normalize=True))

# --- Prepare data for a simple model (features X) ---
X = data[['Income', 'CreditScore', 'LoanAmount']] # Numerical features for simplicity
y = data['True_Eligibility']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Split data (we'll use test set for audit)
X_train, X_test, y_train, y_test = train_train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# --- Train a simple Logistic Regression model ---
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train, y_train)

# Get model predictions on the test set
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] # Probability of being eligible

# Add sensitive attributes and true labels back to the test set for audit
test_df = data.loc[y_test.index].copy() # Ensure original indices are used to get sensitive attrs
test_df['y_true'] = y_test
test_df['y_pred'] = y_pred
test_df['y_proba'] = y_proba

print("\nModel Training Complete. Test set prepared for audit.")
print(f"Model Accuracy on Test Set: {accuracy_score(y_test, y_pred):.2f}")

Collecting fairlearn
  Downloading fairlearn-0.12.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.12.0-py3-none-any.whl (240 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fairlearn
Successfully installed fairlearn-0.12.0
Libraries loaded successfully and Fairlearn installed!

Synthetic Dataset Head:
   Gender   Race  Age        Income  CreditScore    LoanAmount  \
0    Male  White   49  52029.026137   719.223476  18476.829110   
1  Female  White   53  14072.956715   688.526257  14857.509276   
2  Female  Asian   67  53324.620577   660.622824   9860.402742   
3  Female  White   54  24007.105356   700.445533  19450.146388   
4    Male  Asian   65  45704.022777   642.490659  -1323.179143   

   True_Eligibility  
0               1.0  
1               1.0  
2               1.0  
3               1.0  
4               0.0  

True Eligibility Distribution:
True_Eligi

NameError: name 'train_train_test_split' is not defined