In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.inspection import permutation_importance
from fairlearn.metrics import MetricFrame
from sklearn.linear_model import LogisticRegression
from fairlearn.metrics import equalized_odds_difference, demographic_parity_difference, demographic_parity_ratio
from sklearn.impute import SimpleImputer


In [2]:
df = pd.read_csv('car_insurance_claim.csv')

In [3]:
df.describe()

Unnamed: 0,ID,KIDSDRIV,AGE,HOMEKIDS,YOJ,TRAVTIME,TIF,CLM_FREQ,MVR_PTS,CAR_AGE,CLAIM_FLAG
count,10302.0,10302.0,10295.0,10302.0,9754.0,10302.0,10302.0,10302.0,10302.0,9663.0,10302.0
mean,495663100.0,0.169288,44.837397,0.720443,10.474062,33.416424,5.329159,0.800718,1.710153,8.298148,0.26655
std,286467500.0,0.506512,8.606445,1.116323,4.108943,15.869687,4.110795,1.154079,2.159015,5.71445,0.442177
min,63175.0,0.0,16.0,0.0,0.0,5.0,1.0,0.0,0.0,-3.0,0.0
25%,244286900.0,0.0,39.0,0.0,9.0,22.0,1.0,0.0,0.0,1.0,0.0
50%,497004300.0,0.0,45.0,0.0,11.0,33.0,4.0,0.0,1.0,8.0,0.0
75%,739455100.0,0.0,51.0,1.0,13.0,44.0,7.0,2.0,3.0,12.0,1.0
max,999926400.0,4.0,81.0,5.0,23.0,142.0,25.0,5.0,13.0,28.0,1.0


In [4]:
df.head(2)

Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60.0,0,11.0,"$67,349",No,$0,z_No,...,Minivan,yes,"$4,461",2,No,3,$0,18.0,0,Highly Urban/ Urban
1,132761049,0,21JAN56,43.0,0,11.0,"$91,449",No,"$257,252",z_No,...,Minivan,yes,$0,0,No,0,$0,1.0,0,Highly Urban/ Urban


In [5]:
df = df.drop(columns=['ID','BIRTH'],axis=1)

In [None]:
numerical = [
    'KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'INCOME',
    'HOME_VAL', 'TRAVTIME', 'BLUEBOOK', 'TIF', 'OLDCLAIM',
    'CLM_FREQ', 'MVR_PTS', 'CLM_AMT', 'CAR_AGE'
]

categorical = [
  'PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION',
    'OCCUPATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY'
]

for col in categorical:
    if df[col].isna().any():
        # Fill with the most frequent category
        df[col] = df[col].fillna(df[col].mode()[0])
        
def clean_currency(x):
    if isinstance(x, str):
        return float(x.replace('$','').replace(',',''))
    return x

for col in ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']:
    df[col] = df[col].apply(clean_currency)


print("Numerical columns:", numerical)
print("Categorical columns:", categorical)

Numerical columns: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'INCOME', 'HOME_VAL', 'TRAVTIME', 'BLUEBOOK', 'TIF', 'OLDCLAIM', 'CLM_FREQ', 'MVR_PTS', 'CLM_AMT', 'CAR_AGE']
Categorical columns: ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'OCCUPATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY']


In [19]:
X = df.drop('CLAIM_FLAG',axis=1)
y = df['CLAIM_FLAG']

In [20]:
from sklearn.impute import SimpleImputer


preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical),
        ("cat", Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(drop="first", handle_unknown="ignore"))
        ]), categorical)
    ]
)

In [21]:
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(random_state=42))
])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [23]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1512
           1       1.00      0.97      0.98       549

    accuracy                           0.99      2061
   macro avg       0.99      0.98      0.99      2061
weighted avg       0.99      0.99      0.99      2061

ROC-AUC Score: 0.999656662908029


In [24]:
groups = ['GENDER', 'EDUCATION', 'MSTATUS', 'PARENT1', 'OCCUPATION']
results = []

In [25]:
for group in groups:
    for value in df[group].unique():
        mask = X_test[group] == value
        if mask.sum() > 0:
            group_y_test = y_test[mask]
            group_y_pred = y_pred[mask]
            group_y_pred_proba = y_pred_proba[mask]
            try:
                roc_auc = roc_auc_score(group_y_test, group_y_pred_proba)
            except ValueError:
                roc_auc = float('nan')  # or handle it in another appropriate way
            results.append({
                "Group": group,
                "Value": value,
                "Accuracy": (group_y_test == group_y_pred).mean(),
                "ROC-AUC": roc_auc
            })


In [26]:
for result in results:
    print(result)

{'Group': 'GENDER', 'Value': 'M', 'Accuracy': 0.9915878023133544, 'ROC-AUC': 0.9994593056346044}
{'Group': 'GENDER', 'Value': 'z_F', 'Accuracy': 0.990990990990991, 'ROC-AUC': 0.9999214798161801}
{'Group': 'EDUCATION', 'Value': 'PhD', 'Accuracy': 0.9898477157360406, 'ROC-AUC': 0.9981955972573079}
{'Group': 'EDUCATION', 'Value': 'z_High School', 'Accuracy': 0.9880952380952381, 'ROC-AUC': 0.9998858838297386}
{'Group': 'EDUCATION', 'Value': 'Bachelors', 'Accuracy': 0.9944237918215614, 'ROC-AUC': 0.9999805204924419}
{'Group': 'EDUCATION', 'Value': '<High School', 'Accuracy': 0.996742671009772, 'ROC-AUC': 1.0}
{'Group': 'EDUCATION', 'Value': 'Masters', 'Accuracy': 0.988399071925754, 'ROC-AUC': 1.0}
{'Group': 'MSTATUS', 'Value': 'z_No', 'Accuracy': 0.989010989010989, 'ROC-AUC': 0.9999932788471879}
{'Group': 'MSTATUS', 'Value': 'Yes', 'Accuracy': 0.9927536231884058, 'ROC-AUC': 0.9994313611791774}
{'Group': 'PARENT1', 'Value': 'No', 'Accuracy': 0.9916387959866221, 'ROC-AUC': 0.9996118838443245}

In [27]:
for group in groups:
    
    eod = equalized_odds_difference(
        y_true=y_test,
        y_pred=y_pred,
        sensitive_features=X_test[group] 
    )
    
    dpd = demographic_parity_difference(
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=X_test[group]
    )
    
    di_ratio = demographic_parity_ratio(
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=X_test[group]
    )
    print(f'\n group is {group}')
    print(f"Demographic Parity Ratio: {di_ratio:.4f}")
    print(f"Equalized Odds Difference: {eod:.4f}")
    print(f"Demographic Parity Difference: {dpd:.4f}")
    print(f'\nvalue counts for group are ;{df[group].value_counts()}')


 group is GENDER
Demographic Parity Ratio: 0.9848
Equalized Odds Difference: 0.0017
Demographic Parity Difference: 0.0039

value counts for group are ;GENDER
z_F    5545
M      4757
Name: count, dtype: int64

 group is EDUCATION
Demographic Parity Ratio: 0.4776
Equalized Odds Difference: 0.0487
Demographic Parity Difference: 0.1777

value counts for group are ;EDUCATION
z_High School    2952
Bachelors        2823
Masters          2078
<High School     1515
PhD               934
Name: count, dtype: int64

 group is MSTATUS
Demographic Parity Ratio: 0.6720
Equalized Odds Difference: 0.0006
Demographic Parity Difference: 0.1053

value counts for group are ;MSTATUS
Yes     6188
z_No    4114
Name: count, dtype: int64

 group is PARENT1
Demographic Parity Ratio: 0.5384
Equalized Odds Difference: 0.0094
Demographic Parity Difference: 0.1988

value counts for group are ;PARENT1
No     8959
Yes    1343
Name: count, dtype: int64

 group is OCCUPATION
Demographic Parity Ratio: 0.3782
Equalized O