### Bias & Fairness in Data: Bias Mitigation Techniques
**Question**: Use the Adult Income dataset and apply reweighing technique to balance the
class weights based on sensitive attributes (e.g., gender).

In [2]:
pip install fairlearn

Collecting fairlearn
  Downloading fairlearn-0.12.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.12.0-py3-none-any.whl (240 kB)
Installing collected packages: fairlearn
Successfully installed fairlearn-0.12.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from fairlearn.preprocessing import LabelEncoder
from collections import defaultdict

# Load the Adult Income dataset
try:
    df = pd.read_csv('adult.csv')
    print("Adult Income dataset loaded successfully.\n")
except FileNotFoundError:
    print("Error: adult.csv not found. Please make sure the file is in the correct directory.")
    exit()

# Rename columns for easier analysis
df.rename(columns={'sex': 'gender', 'income': 'income_level'}, inplace=True)

# Drop rows with missing values (for simplicity in this example)
df.dropna(inplace=True)

# Convert categorical features to numerical using Label Encoding
categorical_features = ['workclass', 'education', 'marital-status', 'occupation',
                        'relationship', 'race', 'gender', 'native-country']
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Convert income level to binary labels (0 and 1)
df['income_level'] = df['income_level'].apply(lambda x: 1 if x == '>50K' else 0)

# Separate features (X) and target (y)
X = df.drop('income_level', axis=1)
y = df['income_level']
sensitive_features = df['gender']  # Our sensitive attribute

# Split data into training and testing sets
X_train, X_test, y_train, y_test, sensitive_train, sensitive_test = train_test_split(
    X, y, sensitive_features, test_size=0.2, random_state=42
)

# --- Reweighing Technique ---
print("\n--- Applying Reweighing Technique ---")

def calculate_reweighting(y, sensitive_features):
    """
    Calculates weights for each sample based on the combination of target and sensitive feature.
    Weight = 1 / (N * P(y=yi, s=si))
    where:
    N is the total number of samples
    P(y=yi, s=si) is the probability of the combination of target yi and sensitive feature si
    """
    weights = defaultdict(float)
    total_samples = len(y)
    joint_probabilities = (
        pd.concat([y, sensitive_features], axis=1)
        .value_counts(normalize=True)
        .to_dict()
    )

    for index, row in pd.concat([y, sensitive_features], axis=1).iterrows():
        target = row['income_level']
        sensitive = row['gender']
        joint_prob = joint_probabilities.get((target, sensitive), 0)
        if joint_prob > 0:
            weights[index] = 1 / (total_samples * joint_prob)
        else:
            weights[index] = 0  # Avoid division by zero

    return pd.Series(weights)

train_weights = calculate_reweighting(y_train, sensitive_train)

print("\nCalculated Reweighting for the Training Data (first 10 weights):")
print(train_weights.head(10))

# --- Train a Logistic Regression Model with Reweighting ---
print("\n--- Training Logistic Regression with Reweighting ---")

model_reweighted = LogisticRegression(solver='liblinear', random_state=42)
model_reweighted.fit(X_train, y_train, sample_weight=train_weights)

# --- Evaluate the Model on the Test Set ---
print("\n--- Evaluating Model with Reweighting on Test Set ---")

y_pred_reweighted = model_reweighted.predict(X_test)
accuracy_reweighted = accuracy_score(y_test, y_pred_reweighted)
print(f"Accuracy with Reweighing: {accuracy_reweighted:.4f}")
print("\nClassification Report with Reweighing:")
print(classification_report(y_test, y_pred_reweighted))

# --- Train a Baseline Logistic Regression Model (without reweighting) for Comparison ---
print("\n--- Training Baseline Logistic Regression (without Reweighting) ---")

model_baseline = LogisticRegression(solver='liblinear', random_state=42)
model_baseline.fit(X_train, y_train)

# --- Evaluate the Baseline Model on the Test Set ---
print("\n--- Evaluating Baseline Model on Test Set ---")

y_pred_baseline = model_baseline.predict(X_test)
accuracy_baseline = accuracy_score(y_test, y_pred_baseline)
print(f"Baseline Accuracy: {accuracy_baseline:.4f}")
print("\nBaseline Classification Report:")
print(classification_report(y_test, y_pred_baseline))

print("\nReweighing technique applied to balance class weights based on gender.")
print("The performance of the model trained with reweighting can be compared to the baseline model.")
print("Ideally, reweighting helps to mitigate bias without significantly sacrificing overall accuracy.")

ImportError: cannot import name 'LabelEncoder' from 'fairlearn.preprocessing' (/workspaces/AI_DATA_ANALYSIS_/.venv/lib/python3.10/site-packages/fairlearn/preprocessing/__init__.py)