In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.inspection import permutation_importance
from fairlearn.metrics import MetricFrame

In [3]:
data = pd.read_csv('cancer issue.csv')

In [5]:
# Preprocessing
X = data.drop(columns=["PatientID", "Recurrence"])
y = data["Recurrence"].map({"Yes": 1, "No": 0})

# Define categorical and numerical columns
categorical_cols = ["Gender", "Race/Ethnicity", "SmokingStatus", "FamilyHistory", "CancerType", "Stage", "TreatmentType", "GeneticMarker", "HospitalRegion"]
numerical_cols = ["Age", "BMI", "TumorSize", "SurvivalMonths"]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(drop="first"), categorical_cols)
    ]
)

# Create pipeline with Random Forest
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
pipeline.fit(X_train, y_train)

# Predictions and evaluation
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

# Analyze bias: Group-wise metrics
results = []
groups = ["Gender", "Race/Ethnicity"]


Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.55      0.53      1774
           1       0.50      0.46      0.48      1764

    accuracy                           0.50      3538
   macro avg       0.50      0.50      0.50      3538
weighted avg       0.50      0.50      0.50      3538

ROC-AUC Score: 0.5072341225103344


In [6]:

for group in groups:
    for value in data[group].unique():
        mask = X_test[group] == value
        if mask.sum() > 0:
            group_y_test = y_test[mask]
            group_y_pred = y_pred[mask]
            group_y_pred_proba = y_pred_proba[mask]
            results.append({
                "Group": group,
                "Value": value,
                "Accuracy": (group_y_test == group_y_pred).mean(),
                "ROC-AUC": roc_auc_score(group_y_test, group_y_pred_proba)
            })

bias_df = pd.DataFrame(results)
print("\nGroup-wise Performance Metrics:")
print(bias_df)



Group-wise Performance Metrics:
            Group             Value  Accuracy   ROC-AUC
0          Gender            Female  0.500845  0.502630
1          Gender              Male  0.507657  0.513250
2  Race/Ethnicity             Other  0.505185  0.521728
3  Race/Ethnicity         Caucasian  0.486339  0.478156
4  Race/Ethnicity             Asian  0.500682  0.498451
5  Race/Ethnicity          Hispanic  0.514563  0.513515
6  Race/Ethnicity  African American  0.515510  0.527020


In [13]:

from sklearn.linear_model import LogisticRegression


X = data.drop(columns=["PatientID", "Recurrence"])
y = data["Recurrence"].map({"Yes": 1, "No": 0})

X = pd.get_dummies(X, drop_first=False)


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train a RandomForestClassifier
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Define sensitive features
sensitive_features = X_test[['Gender_Male',
       'Race/Ethnicity_Asian', 'Race/Ethnicity_Caucasian',
       'Race/Ethnicity_Hispanic', 'Race/Ethnicity_Other']]

# Map sensitive feature names for clarity
sensitive_features.columns = ['Gender',
       'Race/Ethnicity_Asian', 'Race/Ethnicity_Caucasian',
       'Race/Ethnicity_Hispanic', 'Race/Ethnicity_Other']



# Make predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
# Compute metrics
metrics = {
    "Accuracy": lambda y_true, y_pred: accuracy_score(y_true, y_pred),
    "ROC AUC": lambda y_true, y_pred: roc_auc_score(y_true, y_pred),
}

# MetricFrame calculation
metric_frame = MetricFrame(
    metrics=metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sensitive_features
)

# Print results
print("Overall Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.3f}\n")

print("Group-wise Metrics:")
print(metric_frame.by_group)

Overall Metrics:
Accuracy: 0.501
ROC AUC: 0.509

Group-wise Metrics:
                                                                                                   Accuracy  \
Gender Race/Ethnicity_Asian Race/Ethnicity_Caucasian Race/Ethnicity_Hispanic Race/Ethnicity_Other             
False  False                False                    False                   False                 0.490385   
                                                                             True                  0.498106   
                                                     True                    False                 0.463100   
                                                                             True                       NaN   
                            True                     False                   False                 0.516917   
                                                                             True                       NaN   
                                           

In [20]:
X_test.columns

Index(['Age', 'BMI', 'TumorSize', 'SurvivalMonths', 'Gender_Male',
       'Race/Ethnicity_Asian', 'Race/Ethnicity_Caucasian',
       'Race/Ethnicity_Hispanic', 'Race/Ethnicity_Other',
       'SmokingStatus_Non-Smoker', 'SmokingStatus_Smoker', 'FamilyHistory_Yes',
       'CancerType_Colon', 'CancerType_Leukemia', 'CancerType_Lung',
       'CancerType_Prostate', 'CancerType_Skin', 'Stage_II', 'Stage_III',
       'Stage_IV', 'TreatmentType_Combination Therapy',
       'TreatmentType_Radiation', 'TreatmentType_Surgery',
       'TreatmentResponse_No Response', 'TreatmentResponse_Partial Remission',
       'GeneticMarker_EGFR', 'GeneticMarker_KRAS', 'HospitalRegion_North',
       'HospitalRegion_South', 'HospitalRegion_West'],
      dtype='object')