In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.inspection import permutation_importance
from fairlearn.metrics import MetricFrame
from sklearn.linear_model import LogisticRegression
from fairlearn.metrics import equalized_odds_difference, demographic_parity_difference, demographic_parity_ratio 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_sample_weight
from aif360.metrics import ClassificationMetric
from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing import Reweighing
pd.set_option("display.max_columns", None)


  vect_normalized_discounted_cumulative_gain = vmap(
  monte_carlo_vect_ndcg = vmap(vect_normalized_discounted_cumulative_gain, in_dims=(0,))


In [2]:
import warnings

# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [3]:
df = pd.read_csv('car_insurance_claim.csv')

In [4]:
df.describe()

Unnamed: 0,ID,KIDSDRIV,AGE,HOMEKIDS,YOJ,TRAVTIME,TIF,CLM_FREQ,MVR_PTS,CAR_AGE,CLAIM_FLAG
count,10302.0,10302.0,10295.0,10302.0,9754.0,10302.0,10302.0,10302.0,10302.0,9663.0,10302.0
mean,495663100.0,0.169288,44.837397,0.720443,10.474062,33.416424,5.329159,0.800718,1.710153,8.298148,0.26655
std,286467500.0,0.506512,8.606445,1.116323,4.108943,15.869687,4.110795,1.154079,2.159015,5.71445,0.442177
min,63175.0,0.0,16.0,0.0,0.0,5.0,1.0,0.0,0.0,-3.0,0.0
25%,244286900.0,0.0,39.0,0.0,9.0,22.0,1.0,0.0,0.0,1.0,0.0
50%,497004300.0,0.0,45.0,0.0,11.0,33.0,4.0,0.0,1.0,8.0,0.0
75%,739455100.0,0.0,51.0,1.0,13.0,44.0,7.0,2.0,3.0,12.0,1.0
max,999926400.0,4.0,81.0,5.0,23.0,142.0,25.0,5.0,13.0,28.0,1.0


In [5]:
df.head(2)

Unnamed: 0,ID,KIDSDRIV,BIRTH,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,EDUCATION,OCCUPATION,TRAVTIME,CAR_USE,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,63581743,0,16MAR39,60.0,0,11.0,"$67,349",No,$0,z_No,M,PhD,Professional,14,Private,"$14,230",11,Minivan,yes,"$4,461",2,No,3,$0,18.0,0,Highly Urban/ Urban
1,132761049,0,21JAN56,43.0,0,11.0,"$91,449",No,"$257,252",z_No,M,z_High School,z_Blue Collar,22,Commercial,"$14,940",1,Minivan,yes,$0,0,No,0,$0,1.0,0,Highly Urban/ Urban


In [6]:
a = sum(df['CLAIM_FLAG'] == False)
print(a)

7556


In [7]:
b = sum(df['CLAIM_FLAG'] == True)
print(b)

2746


In [8]:
df = df.drop(columns=['ID','BIRTH'],axis=1)
df = df.applymap(lambda x: x.replace('z_', '') if isinstance(x, str) else x)

print(df.head())

   KIDSDRIV   AGE  HOMEKIDS   YOJ   INCOME PARENT1  HOME_VAL MSTATUS GENDER  \
0         0  60.0         0  11.0  $67,349      No        $0      No      M   
1         0  43.0         0  11.0  $91,449      No  $257,252      No      M   
2         0  48.0         0  11.0  $52,881      No        $0      No      M   
3         0  35.0         1  10.0  $16,039      No  $124,191     Yes      F   
4         0  51.0         0  14.0      NaN      No  $306,251     Yes      M   

      EDUCATION    OCCUPATION  TRAVTIME     CAR_USE BLUEBOOK  TIF CAR_TYPE  \
0           PhD  Professional        14     Private  $14,230   11  Minivan   
1   High School   Blue Collar        22  Commercial  $14,940    1  Minivan   
2     Bachelors       Manager        26     Private  $21,970    1      Van   
3   High School      Clerical         5     Private   $4,010    4      SUV   
4  <High School   Blue Collar        32     Private  $15,440    7  Minivan   

  RED_CAR OLDCLAIM  CLM_FREQ REVOKED  MVR_PTS CLM_AMT  C

In [9]:
numerical = [
    'KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'INCOME',
    'HOME_VAL', 'TRAVTIME', 'BLUEBOOK', 'TIF', 'OLDCLAIM',
    'CLM_FREQ', 'MVR_PTS', 'CLM_AMT', 'CAR_AGE'
]

categorical = [
  'PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION',
    'OCCUPATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY'
]

df[categorical] = df[categorical].apply(lambda x: x.fillna(x.mode()[0]))

        
def clean_currency(x):
    if isinstance(x, str):
        return float(x.replace('$','').replace(',',''))
    return x

for col in ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']:
    df[col] = df[col].apply(clean_currency)


print("Numerical columns:", numerical)
print("Categorical columns:", categorical)

Numerical columns: ['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'INCOME', 'HOME_VAL', 'TRAVTIME', 'BLUEBOOK', 'TIF', 'OLDCLAIM', 'CLM_FREQ', 'MVR_PTS', 'CLM_AMT', 'CAR_AGE']
Categorical columns: ['PARENT1', 'MSTATUS', 'GENDER', 'EDUCATION', 'OCCUPATION', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'URBANICITY']


In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical),
        ("cat", Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder())
        ]), categorical)
    ]
)

In [11]:
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(random_state=42))
])

In [None]:
X = df.drop('CLAIM_FLAG', axis=1)
y = df['CLAIM_FLAG']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='roc_auc')
print(f'Cross-validation ROC-AUC: {scores.mean()} ± {scores.std()}')

Cross-validation ROC-AUC: 0.9995566701209562 ± 0.0008796994578040058


In [13]:
print(df.columns)

Index(['KIDSDRIV', 'AGE', 'HOMEKIDS', 'YOJ', 'INCOME', 'PARENT1', 'HOME_VAL',
       'MSTATUS', 'GENDER', 'EDUCATION', 'OCCUPATION', 'TRAVTIME', 'CAR_USE',
       'BLUEBOOK', 'TIF', 'CAR_TYPE', 'RED_CAR', 'OLDCLAIM', 'CLM_FREQ',
       'REVOKED', 'MVR_PTS', 'CLM_AMT', 'CAR_AGE', 'CLAIM_FLAG', 'URBANICITY'],
      dtype='object')


In [14]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1512
           1       1.00      0.97      0.98       549

    accuracy                           0.99      2061
   macro avg       0.99      0.98      0.99      2061
weighted avg       0.99      0.99      0.99      2061

ROC-AUC Score: 0.9995940189473888


In [15]:
groups = ['GENDER', 'EDUCATION', 'MSTATUS', 'PARENT1', 'OCCUPATION', 'URBANICITY']
results = []
privileged = {}
for group in groups:
    privileged[group] = {}

In [16]:
def evaluate_accuracy(X_test, y_test, y_pred, y_pred_proba, group):
    results = []  # Ensure results list is initialized
    for value in X_test[group].unique():
        mask = X_test[group] == value
        if mask.sum() > 0:
            group_y_test = y_test[mask]
            group_y_pred = y_pred[mask]
            group_y_pred_proba = y_pred_proba[mask]
            try:
                roc_auc = roc_auc_score(group_y_test, group_y_pred_proba)
            except ValueError:
                roc_auc = float('nan')
            tn, fp, fn, tp = confusion_matrix(group_y_test, group_y_pred).ravel()

            results.append({
                "Group": group,
                "Value": value,
                "Accuracy": (group_y_test == group_y_pred).mean(),
                "ROC-AUC": roc_auc,
                "TP": tp,
                "TN": tn,
                "FP": fp,
                "FN": fn
            })

    for result in results:
        print(result)
    
    return results

    
for group in groups:
    results = evaluate_accuracy(X_test, y_test, y_pred, y_pred_proba, group)

{'Group': 'GENDER', 'Value': 'M', 'Accuracy': 0.9915878023133544, 'ROC-AUC': 0.9993340922026182, 'TP': 243, 'TN': 700, 'FP': 0, 'FN': 8}
{'Group': 'GENDER', 'Value': 'F', 'Accuracy': 0.990990990990991, 'ROC-AUC': 0.9999049492511655, 'TP': 288, 'TN': 812, 'FP': 0, 'FN': 10}
{'Group': 'EDUCATION', 'Value': 'High School', 'Accuracy': 0.9880952380952381, 'ROC-AUC': 0.9998858838297386, 'TP': 200, 'TN': 381, 'FP': 0, 'FN': 7}
{'Group': 'EDUCATION', 'Value': 'Bachelors', 'Accuracy': 0.9944237918215614, 'ROC-AUC': 0.9999415614773258, 'TP': 121, 'TN': 414, 'FP': 0, 'FN': 3}
{'Group': 'EDUCATION', 'Value': 'Masters', 'Accuracy': 0.988399071925754, 'ROC-AUC': 1.0, 'TP': 80, 'TN': 346, 'FP': 0, 'FN': 5}
{'Group': 'EDUCATION', 'Value': '<High School', 'Accuracy': 0.996742671009772, 'ROC-AUC': 1.0, 'TP': 98, 'TN': 208, 'FP': 0, 'FN': 1}
{'Group': 'EDUCATION', 'Value': 'PhD', 'Accuracy': 0.9898477157360406, 'ROC-AUC': 0.9985564778058462, 'TP': 32, 'TN': 163, 'FP': 0, 'FN': 2}
{'Group': 'MSTATUS', 'Va

For each group, we calculate its fairness, based on its equalized odds difference, demographic parity difference, demographic parity ratio.

Then, for each group we calculate the privileged and the unprivileged classes, based on the distance each class' value has with the most favoured value in the group.

In [17]:
def evaluate_fairness(y_true, y_pred, sensitive_features, group_name):
    eod = equalized_odds_difference(
        y_true=y_true,
        y_pred=y_pred,
        sensitive_features=sensitive_features
    )
    
    dpd = demographic_parity_difference(
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sensitive_features
    )
    
    di_ratio = demographic_parity_ratio(
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sensitive_features
    )
    
    print(f'\n group is {group_name}')
    print(f"Demographic Parity Ratio: {di_ratio:.4f}")
    print(f"Equalized Odds Difference: {eod:.4f}")
    print(f"Demographic Parity Difference: {dpd:.4f}")
    
    
    positive_rates = {}
    for group_value in sensitive_features.unique():
        mask = sensitive_features == group_value
        group_y_pred = y_pred[mask]
        positive_rate = group_y_pred.mean()
        positive_rates[group_value] = positive_rate
        print(f"Subgroup: {group_value}, Positive Prediction Rate: {positive_rate:.4f}")
    
    max_rate = max(positive_rates.values())
    min_rate = min(positive_rates.values())
    
    positive_rates = dict(sorted(positive_rates.items(), key=lambda x: x[1]))
    
    
    values = list(positive_rates.values())
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    threshold = iqr * 0.5 
        
        
    
    
    for group_value, rate in positive_rates.items():
        
        if rate == min_rate or (rate - min_rate <= threshold):
            print(f"--> Privileged Group: {group_value} (Positive Rate: {rate:.4f})")
            
            if 'privileged' in privileged[group_name]:
                privileged[group_name]['privileged'].append(group_value)
            else:
                privileged[group_name]['privileged'] = [group_value]

        elif rate == max_rate:
            
            print(f"--> Unprivileged Group: {group_value} (Positive Rate: {rate:.4f})")
            if 'unprivileged' in privileged[group_name]:
                privileged[group_name]['unprivileged'].append(group_value)
            else:
                privileged[group_name]['unprivileged'] = [group_value]
            
            
            
            
            
            

In [18]:
for group in groups:
    
    evaluate_fairness(y_test, y_pred, X_test[group], group)


 group is GENDER
Demographic Parity Ratio: 0.9848
Equalized Odds Difference: 0.0017
Demographic Parity Difference: 0.0039
Subgroup: M, Positive Prediction Rate: 0.2555
Subgroup: F, Positive Prediction Rate: 0.2595
--> Privileged Group: M (Positive Rate: 0.2555)
--> Unprivileged Group: F (Positive Rate: 0.2595)

 group is EDUCATION
Demographic Parity Ratio: 0.4776
Equalized Odds Difference: 0.0487
Demographic Parity Difference: 0.1777
Subgroup: High School, Positive Prediction Rate: 0.3401
Subgroup: Bachelors, Positive Prediction Rate: 0.2249
Subgroup: Masters, Positive Prediction Rate: 0.1856
Subgroup: <High School, Positive Prediction Rate: 0.3192
Subgroup: PhD, Positive Prediction Rate: 0.1624
--> Privileged Group: PhD (Positive Rate: 0.1624)
--> Privileged Group: Masters (Positive Rate: 0.1856)
--> Privileged Group: Bachelors (Positive Rate: 0.2249)
--> Unprivileged Group: High School (Positive Rate: 0.3401)

 group is MSTATUS
Demographic Parity Ratio: 0.6720
Equalized Odds Differe

Based on the above fairness metrics, we will remove gender and marital status from our groups list, for we do not deem them biased enough.

In [19]:
display(df.head(2))

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,EDUCATION,OCCUPATION,TRAVTIME,CAR_USE,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,0,60.0,0,11.0,67349.0,No,0.0,No,M,PhD,Professional,14,Private,14230.0,11,Minivan,yes,4461.0,2,No,3,0.0,18.0,0,Highly Urban/ Urban
1,0,43.0,0,11.0,91449.0,No,257252.0,No,M,High School,Blue Collar,22,Commercial,14940.0,1,Minivan,yes,0.0,0,No,0,0.0,1.0,0,Highly Urban/ Urban


We see that the columns occupation, parent1, and occupation indicate the presence of bias in our dataset.

In [20]:
print(groups)

['GENDER', 'EDUCATION', 'MSTATUS', 'PARENT1', 'OCCUPATION', 'URBANICITY']


In [21]:
groups.remove('MSTATUS')
groups.remove('GENDER')

In [22]:
del privileged['GENDER']
del privileged['MSTATUS']
for keys,values in privileged.items():
    print(keys,values)

EDUCATION {'privileged': ['PhD', 'Masters', 'Bachelors'], 'unprivileged': ['High School']}
PARENT1 {'privileged': ['No'], 'unprivileged': ['Yes']}
OCCUPATION {'privileged': ['Manager', 'Lawyer', 'Doctor'], 'unprivileged': ['Student']}
URBANICITY {'privileged': ['Highly Rural/ Rural'], 'unprivileged': ['Highly Urban/ Urban']}


Let us use aif360 for mitigating bias in the education column

In [23]:

for group in groups:
    
    aif_dict = {}

    for element in privileged[group]['privileged']:    
        aif_dict[element] = 1
    
    for element in privileged[group]['unprivileged']:
        aif_dict[element] = 0


    privileged_class = [key for key, value in aif_dict.items() if value == 1]
    unprivileged_class = [key for key, value in aif_dict.items() if value == 0]

    print(f'privileged classes for group {group} are {privileged_class}')
    print(f'unprivileged classes for group {group} are {unprivileged_class}')
    
    #map column values to 0,1 s, based on whether or not the entry is privileged
    #df[group] = df[group].apply(lambda x: 1 if x in(privileged_class) else 0)




encoder = LabelEncoder()

categorical_columns = [
    'MSTATUS', 'GENDER', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'EDUCATION', 'OCCUPATION','PARENT1','URBANICITY'
]

for col in categorical_columns:
    # Fit and transform the column to encode categorical values
    df[col] = encoder.fit_transform(df[col].astype(str))  # Ensure all categories are considered by converting to string

    # Optionally, print the mapping of original values to encoded labels
    print(f"Encoded {col}: {dict(zip(encoder.classes_, range(len(encoder.classes_))))}")





# Collect all unique class values from all columns
all_unique_values = set()

# Collect unique values for each column
for col in categorical_columns:
    all_unique_values.update(df[col].astype(str).unique())

encoder.fit(sorted(all_unique_values)) 


privileged classes for group EDUCATION are ['PhD', 'Masters', 'Bachelors']
unprivileged classes for group EDUCATION are ['High School']
privileged classes for group PARENT1 are ['No']
unprivileged classes for group PARENT1 are ['Yes']
privileged classes for group OCCUPATION are ['Manager', 'Lawyer', 'Doctor']
unprivileged classes for group OCCUPATION are ['Student']
privileged classes for group URBANICITY are ['Highly Rural/ Rural']
unprivileged classes for group URBANICITY are ['Highly Urban/ Urban']
Encoded MSTATUS: {'No': 0, 'Yes': 1}
Encoded GENDER: {'F': 0, 'M': 1}
Encoded CAR_USE: {'Commercial': 0, 'Private': 1}
Encoded CAR_TYPE: {'Minivan': 0, 'Panel Truck': 1, 'Pickup': 2, 'SUV': 3, 'Sports Car': 4, 'Van': 5}
Encoded RED_CAR: {'no': 0, 'yes': 1}
Encoded REVOKED: {'No': 0, 'Yes': 1}
Encoded EDUCATION: {'<High School': 0, 'Bachelors': 1, 'High School': 2, 'Masters': 3, 'PhD': 4}
Encoded OCCUPATION: {'Blue Collar': 0, 'Clerical': 1, 'Doctor': 2, 'Home Maker': 3, 'Lawyer': 4, 'Mana

In [24]:
# # # Initialize a global LabelEncoder
# # encoder = LabelEncoder()

# # # Collect all unique categorical values across all columns
# # all_unique_values = set()
# # for col in df.select_dtypes(include=['object']).columns:  # Select only categorical columns
# #     all_unique_values.update(df[col].astype(str).unique())

# # # Fit the encoder on all unique values
# # encoder.fit(list(all_unique_values))

# # # Dictionary to store mappings for each column
# # encoding_mappings = {}

# # # Encode each categorical column using the global encoder
# # for col in df.select_dtypes(include=['object']).columns:
# #     # Transform the column using the global encoder
# #     df[col] = encoder.transform(df[col].astype(str))

# #     # Store the mapping for this column
# #     encoding_mappings[col] = dict(zip(encoder.classes_, range(len(encoder.classes_))))

# #     # Print the mapping for verification
# #     print(f"Encoded {col}: {encoding_mappings[col]}")

# # # Display the updated DataFrame
# # print("\nDataFrame after encoding:")
# # print(df)
# class_mapping={}
# def encode_df(df):
#     # Initialize LabelEncoder
#     encoder = LabelEncoder()

#     # List of categorical columns to encode
#     categorical_columns = [
#         'MSTATUS', 'GENDER', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED', 'EDUCATION', 'OCCUPATION', 'URBANICITY','PARENT1'
#     ]

#     # Create a dictionary to map all unique values across all categorical columns to unique labels
#     unique_classes = set()

#     # Collect all unique class values across the categorical columns
#     for col in categorical_columns:
#         unique_classes.update(df[col].astype(str).unique())

#     # Create a mapping for all unique classes
#     class_mapping = {value: idx for idx, value in enumerate(unique_classes)}

#     # Encode each categorical column using the mapping
#     for col in categorical_columns:
#         df[col] = df[col].astype(str).map(class_mapping)  # Apply the mapping

#     # Optionally, print the class mapping for each column
#     for col in categorical_columns:
#         print(f"Encoded {col}: {dict(zip(class_mapping.keys(), class_mapping.values()))}")

#     return df

In [25]:
# encoded_df = encode_df(df)
display(df.head(2))

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,EDUCATION,OCCUPATION,TRAVTIME,CAR_USE,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,0,60.0,0,11.0,67349.0,0,0.0,0,1,4,6,14,1,14230.0,11,0,1,4461.0,2,0,3,0.0,18.0,0,1
1,0,43.0,0,11.0,91449.0,0,257252.0,0,1,2,0,22,0,14940.0,1,0,1,0.0,0,0,0,0.0,1.0,0,1


In [26]:
for col in df.columns:
    if df[col].dtype == 'object': 
        print(col)

In [27]:
# # Print the class mapping for the 'EDUCATION' column from the global class_mapping
# education_class_mapping = {k: v for k, v in class_mapping.items() if k in df['EDUCATION'].astype(str).unique()}
# print(f"Class mapping for EDUCATION: {education_class_mapping}")


In [28]:


# Handle missing values (if necessary)
#df.fillna(0, inplace=True)  # Fill missing values with 0 (or use another strategy)

label_name = 'CLAIM_FLAG'
favorable_classes = [0]
protected_attribute_names = ['EDUCATION', 'PARENT1', 'OCCUPATION']
privileged_classes = [[4,3], [0], [5,4,2]]  

# Optional parameters (if needed)
#categorical_features = ['MSTATUS', 'GENDER', 'CAR_USE', 'CAR_TYPE', 'RED_CAR', 'REVOKED']  # Specify categorical features
features_to_drop = ['id', 'address']  # Columns to drop (if any)
na_values = ['NA', '?', '']  # Handle missing values

# Now, create the StandardDataset instance
dataset = StandardDataset(
    df,
    label_name=label_name,
    favorable_classes=favorable_classes,
    protected_attribute_names=protected_attribute_names,
    privileged_classes=privileged_classes,
    # categorical_features=categorical_columns,  # Pass all categorical columns
    # features_to_drop=features_to_drop,
    # na_values=na_values
)



# Thedataset object is now processed and ready to be used for further analysis
#display(dataset)




In [29]:
display(df.head(10))

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,EDUCATION,OCCUPATION,TRAVTIME,CAR_USE,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,0,60.0,0,11.0,67349.0,0,0.0,0,1,4,6,14,1,14230.0,11,0,1,4461.0,2,0,3,0.0,18.0,0,1
1,0,43.0,0,11.0,91449.0,0,257252.0,0,1,2,0,22,0,14940.0,1,0,1,0.0,0,0,0,0.0,1.0,0,1
2,0,48.0,0,11.0,52881.0,0,0.0,0,1,1,5,26,1,21970.0,1,5,1,0.0,0,0,2,0.0,10.0,0,1
3,0,35.0,1,10.0,16039.0,0,124191.0,1,0,2,1,5,1,4010.0,4,3,0,38690.0,2,0,3,0.0,10.0,0,1
4,0,51.0,0,14.0,,0,306251.0,1,1,0,0,32,1,15440.0,7,0,1,0.0,0,0,0,0.0,6.0,0,1
5,0,50.0,0,,114986.0,0,243925.0,1,0,4,2,36,1,18000.0,1,3,0,19217.0,2,1,3,0.0,17.0,0,1
6,0,34.0,1,12.0,125301.0,1,0.0,0,0,1,0,46,0,17430.0,1,4,0,0.0,0,0,0,2946.0,7.0,1,1
7,0,54.0,0,,18755.0,0,,1,0,0,0,33,1,8780.0,1,3,0,0.0,0,0,0,0.0,1.0,0,1
8,1,40.0,1,11.0,50815.0,1,0.0,0,1,2,5,21,1,18930.0,6,0,0,3295.0,1,0,2,6477.0,1.0,1,1
9,0,44.0,2,12.0,43486.0,1,0.0,0,0,2,0,30,0,5900.0,10,3,0,0.0,0,0,0,0.0,10.0,0,0


In [30]:
print(groups)

['EDUCATION', 'PARENT1', 'OCCUPATION', 'URBANICITY']


In [None]:
unprivileged_groups = [{'EDUCATION': 2}]
privileged_groups = [{'EDUCATION': 4}, {'EDUCATION': 3}, {'EDUCATION': 1}] 

unprivileged_groups.append({'PARENT1': 1})
privileged_groups.append({'PARENT1': 0})

unprivileged_groups.append({'OCCUPATION': 7})
privileged_groups.append({'OCCUPATION': 5})
privileged_groups.append({'OCCUPATION' : 4}) 
privileged_groups.append({'OCCUPATION' : 2}) 



reweighing = Reweighing(
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups
)
reweighed_data = reweighing.fit_transform(dataset)

# Step 2: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    reweighed_data.features, reweighed_data.labels.ravel(), test_size=0.3, random_state=42
)

X_test = pd.DataFrame(X_test, columns=X.columns)

# Step 3: Train a logistic regression model
model = LogisticRegression(penalty='l2', C=0.1, random_state=42)
model.fit(X_train, y_train)


scores = cross_val_score(model, X_train, y_train, cv=10, scoring='roc_auc')
print(f'Cross-validation ROC-AUC: {scores.mean()} ± {scores.std()}')


# Step 4: Predict on the test set
y_pred = model.predict(X_test)

# Step 5: Prepare the datasets for fairness evaluation
dataset_test = dataset.copy()  # Original test dataset (without reweighing)
dataset_test.features = X_test
dataset_test.labels = y_test

# For reweighed data, we use the predictions made by the model 
reweighed_dataset_test = dataset.copy()
reweighed_dataset_test.features = X_test
reweighed_dataset_test.labels = y_pred





y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class




# # Step 6: Evaluate fairness using ClassificationMetric
# metric_original = ClassificationMetric(dataset_test, dataset_test.copy())  # Original data without reweighing
# metric_reweighed = ClassificationMetric(reweighed_dataset_test, reweighed_dataset_test.copy())  # Reweighed data

# # Example of fairness metrics (you can print more specific metrics based on your needs)
# print(f"Accuracy on test set: {accuracy_score(y_test, y_pred)}")
# print(f"Disparate Impact: {metric_reweighed.disparate_impact()}")
# print(f"Equal Opportunity Difference: {metric_reweighed.equal_opportunity_difference()}")
# print(f"Average Odds Difference: {metric_reweighed.average_odds_difference()}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
groups.remove('URBANICITY')

for group in groups:
    #evaluate_fairness(y_test, y_pred, X_test[group], group)
    evaluate_accuracy(X_test, y_test, y_pred, y_pred_proba, group)

{'Group': 'EDUCATION', 'Value': 3.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 97, 'TN': 392, 'FP': 0, 'FN': 0}
{'Group': 'EDUCATION', 'Value': 1.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 180, 'TN': 508, 'FP': 0, 'FN': 0}
{'Group': 'EDUCATION', 'Value': 0.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 126, 'TN': 258, 'FP': 0, 'FN': 0}
{'Group': 'EDUCATION', 'Value': 2.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 248, 'TN': 437, 'FP': 0, 'FN': 0}
{'Group': 'EDUCATION', 'Value': 4.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 32, 'TN': 171, 'FP': 0, 'FN': 0}
{'Group': 'PARENT1', 'Value': 0.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 525, 'TN': 1604, 'FP': 0, 'FN': 0}
{'Group': 'PARENT1', 'Value': 1.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 158, 'TN': 162, 'FP': 0, 'FN': 0}
{'Group': 'OCCUPATION', 'Value': 5.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 36, 'TN': 273, 'FP': 0, 'FN': 0}
{'Group': 'OCCUPATION', 'Value': 6.0, 'Accuracy': 1.0, 'ROC-AUC': 1.0, 'TP': 78, 'TN': 241, 'FP': 0, 'FN': 0}
{'Group': 'OCCU