In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.inspection import permutation_importance
from fairlearn.metrics import MetricFrame
from sklearn.linear_model import LogisticRegression
from fairlearn.metrics import equalized_odds_difference, demographic_parity_difference, demographic_parity_ratio



In [2]:
# Load the raw car-insurance records from a CSV file located via a relative path.
df = pd.read_csv(filepath_or_buffer='car_insurance.csv')

In [5]:
# Summary statistics for the numeric columns. In the recorded output, credit_score
# (9018) and annual_mileage (9043) have counts below 10000, i.e. they contain
# missing values that need handling before modelling.
df.describe()

Unnamed: 0,id,age,gender,credit_score,vehicle_ownership,married,children,postal_code,annual_mileage,speeding_violations,duis,past_accidents,outcome
count,10000.0,10000.0,10000.0,9018.0,10000.0,10000.0,10000.0,10000.0,9043.0,10000.0,10000.0,10000.0,10000.0
mean,500521.9068,1.4895,0.499,0.515813,0.697,0.4982,0.6888,19864.5484,11697.003207,1.4829,0.2392,1.0563,0.3133
std,290030.768758,1.025278,0.500024,0.137688,0.459578,0.500022,0.463008,18915.613855,2818.434528,2.241966,0.55499,1.652454,0.463858
min,101.0,0.0,0.0,0.053358,0.0,0.0,0.0,10238.0,2000.0,0.0,0.0,0.0,0.0
25%,249638.5,1.0,0.0,0.417191,0.0,0.0,0.0,10238.0,10000.0,0.0,0.0,0.0,0.0
50%,501777.0,1.0,0.0,0.525033,1.0,0.0,1.0,10238.0,12000.0,0.0,0.0,0.0,0.0
75%,753974.5,2.0,1.0,0.618312,1.0,1.0,1.0,32765.0,14000.0,2.0,0.0,2.0,1.0
max,999976.0,3.0,1.0,0.960819,1.0,1.0,1.0,92101.0,22000.0,22.0,6.0,15.0,1.0


In [9]:
# Preview the first two rows to see the raw columns, including the string-valued
# ones (driving_experience, education, income, vehicle_year, vehicle_type).
df.head(2)

Unnamed: 0,id,age,gender,driving_experience,education,income,credit_score,vehicle_ownership,vehicle_year,married,children,postal_code,annual_mileage,vehicle_type,speeding_violations,duis,past_accidents,outcome
0,569520,3,0,0-9y,high school,upper class,0.629027,1.0,after 2015,0.0,1.0,10238,12000.0,sedan,0,0,0,0.0
1,750365,0,1,0-9y,none,poverty,0.357757,0.0,before 2015,0.0,0.0,10238,16000.0,sedan,0,0,0,1.0


In [10]:
# Remove the `id` column, which is not among the model features listed further down.
# DataFrame.drop returns a new frame rather than modifying `df`, so the result has
# to be assigned back; without the assignment the column silently stays in `df`.
# errors='ignore' keeps the cell re-runnable once `id` is already gone.
df = df.drop(columns='id', errors='ignore')
df

Unnamed: 0,age,gender,driving_experience,education,income,credit_score,vehicle_ownership,vehicle_year,married,children,postal_code,annual_mileage,vehicle_type,speeding_violations,duis,past_accidents,outcome
0,3,0,0-9y,high school,upper class,0.629027,1.0,after 2015,0.0,1.0,10238,12000.0,sedan,0,0,0,0.0
1,0,1,0-9y,none,poverty,0.357757,0.0,before 2015,0.0,0.0,10238,16000.0,sedan,0,0,0,1.0
2,0,0,0-9y,high school,working class,0.493146,1.0,before 2015,0.0,0.0,10238,11000.0,sedan,0,0,0,0.0
3,0,1,0-9y,university,working class,0.206013,1.0,before 2015,0.0,1.0,32765,11000.0,sedan,0,0,0,0.0
4,1,1,10-19y,none,working class,0.388366,1.0,before 2015,0.0,0.0,32765,12000.0,sedan,2,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,0,10-19y,university,upper class,0.582787,1.0,before 2015,0.0,0.0,10238,16000.0,sedan,0,0,1,0.0
9996,1,0,10-19y,none,middle class,0.522231,1.0,after 2015,0.0,1.0,32765,,sedan,1,0,0,0.0
9997,1,1,0-9y,high school,middle class,0.470940,1.0,before 2015,0.0,1.0,10238,14000.0,sedan,0,0,0,0.0
9998,1,0,10-19y,high school,poverty,0.364185,0.0,before 2015,0.0,1.0,10238,13000.0,sedan,2,0,1,1.0


In [24]:
# Feature lists consumed by the imputation step and the ColumnTransformer below.
# `numerical` columns are mean-imputed and standardised; `categorical` columns are
# one-hot encoded. Columns in neither list are not passed to the model.
numerical = [
    'age',
    'credit_score',
    'annual_mileage',
    'speeding_violations',
    'duis',
    'past_accidents',
]
categorical = [
    'driving_experience',
    'income',
    'vehicle_year',
    'vehicle_type',
]

In [25]:
# Fill gaps in the numeric features with each column's own mean.
# NOTE(review): the mean is computed over the whole frame, before the train/test
# split performed further down, so held-out rows contribute to the fill value.
columns_with_gaps = [name for name in numerical if df[name].isna().any()]
for col in columns_with_gaps:
    column_mean = df[col].mean()
    df[col] = df[col].fillna(column_mean)


In [26]:
# Separate the target from the predictors. X keeps every remaining column
# (including ones the model does not use) so they stay available for the
# group-wise analysis later on.
target_column = 'outcome'
y = df[target_column]
X = df.drop(columns=[target_column])

In [27]:
# Column-wise preprocessing: standardise the numeric features and one-hot encode
# the categorical ones, dropping the first level of each to avoid a redundant column.
numeric_step = ("num", StandardScaler(), numerical)
categorical_step = ("cat", OneHotEncoder(drop="first"), categorical)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [28]:
# Chain preprocessing and the classifier so both are fitted together on the
# training data and applied consistently at prediction time.
classifier = LogisticRegression(random_state=42)
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])

In [29]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# balance the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [30]:
# Fit preprocessing + classifier on the training split only.
pipeline.fit(X_train, y_train)

# Score the held-out split: hard labels feed the classification report,
# second-column probabilities (the 1.0 class) feed ROC-AUC.
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC Score:", auc)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.87      0.85      1373
         1.0       0.69      0.63      0.66       627

    accuracy                           0.80      2000
   macro avg       0.77      0.75      0.76      2000
weighted avg       0.79      0.80      0.79      2000

ROC-AUC Score: 0.8527090586162155


In [32]:
# Attributes whose subgroups are evaluated separately below; `results` collects
# one record per (attribute, value) subgroup.
groups = [
    'gender',
    'education',
    'income',
    'married',
]
results = list()

In [33]:

# Per-subgroup accuracy and ROC-AUC on the held-out split.
#
# `results` is rebuilt from scratch here: the list is created in an earlier cell,
# so appending to it directly would add a second copy of every row each time this
# cell is re-run.
results = []

for group in groups:
    # Candidate values come from the full frame (this fixes the row order of the
    # table); values that do not occur in the test split are skipped.
    for value in df[group].unique():
        mask = (X_test[group] == value).to_numpy()
        if not mask.any():
            continue

        group_y_test = y_test[mask]
        group_y_pred = y_pred[mask]
        group_y_pred_proba = y_pred_proba[mask]

        # ROC-AUC is undefined when a subgroup contains a single class
        # (scikit-learn raises or warns depending on version); record NaN
        # instead of letting one small subgroup abort the whole analysis.
        if group_y_test.nunique() > 1:
            group_auc = roc_auc_score(group_y_test, group_y_pred_proba)
        else:
            group_auc = np.nan

        results.append({
            "Group": group,
            "Value": value,
            "Accuracy": (group_y_test == group_y_pred).mean(),
            "ROC-AUC": group_auc
        })

bias_df = pd.DataFrame(results)
print("\nGroup-wise Performance Metrics:")
print(bias_df)



Group-wise Performance Metrics:
        Group          Value  Accuracy   ROC-AUC
0      gender              0  0.793996  0.845874
1      gender              1  0.798839  0.869390
2   education    high school  0.795824  0.852765
3   education           none  0.736148  0.806027
4   education     university  0.827404  0.851937
5      income    upper class  0.877940  0.832458
6      income        poverty  0.726496  0.700547
7      income  working class  0.715116  0.775023
8      income   middle class  0.747573  0.791860
9     married            0.0  0.750259  0.824651
10    married            1.0  0.839614  0.836043


In [None]:
# Group-fairness summary for each sensitive attribute, computed on the test split.
#   - demographic parity difference / ratio compare selection rates across subgroups
#   - equalized odds difference compares error rates across subgroups
# NOTE(review): the recorded output below this cell shows identical numbers for all
# four attributes and the cell has no execution count, so that output appears to
# come from an earlier version of the code. Re-run before relying on those values.
for group in groups:
    # All three metrics take the same labels, predictions and grouping column.
    metric_inputs = {
        "y_true": y_test,
        "y_pred": y_pred,
        "sensitive_features": X_test[group],
    }

    eod = equalized_odds_difference(**metric_inputs)
    dpd = demographic_parity_difference(**metric_inputs)
    di_ratio = demographic_parity_ratio(**metric_inputs)

    print(f'\n group is {group}')
    print(f"Demographic Parity Ratio: {di_ratio:.4f}")
    print(f"Equalized Odds Difference: {eod:.4f}")
    print(f"Demographic Parity Difference: {dpd:.4f}")


 group is gender
Demographic Parity Ratio: 0.8956
Equalized Odds Difference: 0.0806
Demographic Parity Difference: 0.0313

 group is education
Demographic Parity Ratio: 0.8956
Equalized Odds Difference: 0.0806
Demographic Parity Difference: 0.0313

 group is income
Demographic Parity Ratio: 0.8956
Equalized Odds Difference: 0.0806
Demographic Parity Difference: 0.0313

 group is married
Demographic Parity Ratio: 0.8956
Equalized Odds Difference: 0.0806
Demographic Parity Difference: 0.0313
