In [1]:
import pandas as pd
import numpy as np
import itertools
import os
import logging
import time
import gc
import math
import json 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [None]:
datasetPath = '../flask_ml/data/Income_dataset.csv'  
outputExcelFile = 'feature_gridsearch_fairness_results.xlsx'
targetColumn = 'Income' 
protectedAttributesOriginal = ['age', 'race', 'sex']

ageColumn = 'age'
binnedAgeColumnName = 'age_bin'
defaultAgeBins = [0, 24, 34, 44, 54, 64, np.inf]
defaultAgeLabels = ['<25', '25-34', '35-44', '45-54', '55-64', '65+']

In [3]:
testSize = 0.20
randomState = 42
lrParams = {'C': 1.0, 'solver': 'lbfgs', 'max_iter': 1000, 'random_state': randomState, 'class_weight': 'balanced'} 
minFeaturesCombinationSize = 1 
maxFeaturesCombinationSize = 9
includeFullFeatureSet = True

In [4]:
fairnessMetricsSubgroup = ['Demographic Parity', 'Equalized Odds', 'FPR', 'Predictive Parity', 'Group Size']
metricsForOverallDisparity = ['Demographic Parity', 'Equalized Odds', 'Predictive Parity'] 

In [5]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logProgressEveryN = 100

In [6]:
def calculate_fairness_metrics(dfGroup):
    """Calculates fairness metrics for a given subgroup DataFrame."""
    if dfGroup.empty:
        return {'Demographic Parity': np.nan, 'Equalized Odds': np.nan, 'FPR': np.nan, 'Predictive Parity': np.nan, 'Group Size': 0}

    yTrue = dfGroup['y_true'].astype(int)
    yPred = dfGroup['y_pred'].astype(int)
    
    tp = ((yTrue == 1) & (yPred == 1)).sum()
    fp = ((yTrue == 0) & (yPred == 1)).sum()
    tn = ((yTrue == 0) & (yPred == 0)).sum()
    fn = ((yTrue == 1) & (yPred == 0)).sum()

    predictedPositive = tp + fp
    actualPositive = tp + fn
    actualNegative = tn + fp
    totalGroup = tp + fp + tn + fp 

    dpRate = predictedPositive / totalGroup if totalGroup > 0 else np.nan
    tprRate = tp / actualPositive if actualPositive > 0 else np.nan 
    fprRate = fp / actualNegative if actualNegative > 0 else np.nan
    ppRate = tp / predictedPositive if predictedPositive > 0 else np.nan

    return {'Demographic Parity': dpRate,
            'Equalized Odds': tprRate, 
            'FPR': fprRate,           
            'Predictive Parity': ppRate,
            'Group Size': totalGroup}

In [7]:
def apply_age_binning(df, ageCol, binnedColName, bins, labels):
    """Applies binning to the specified age column."""
    dfBinned = df.copy()
    if ageCol in dfBinned.columns and pd.api.types.is_numeric_dtype(dfBinned[ageCol]):
        logging.info(f"Attempting to bin numeric column '{ageCol}' into '{binnedColName}'.")
        try:
             dfBinned[binnedColName] = pd.cut(dfBinned[ageCol], bins=bins, labels=labels, right=False, include_lowest=True)
             dfBinned[binnedColName] = dfBinned[binnedColName].astype(str).fillna('NaN_Age_Bin')
             logging.info(f"Successfully binned '{ageCol}' into '{binnedColName}'. Unique values: {dfBinned[binnedColName].unique()}")
        except Exception as binErr:
             logging.error(f"Error binning age column '{ageCol}': {binErr}", exc_info=True)
             raise ValueError(f"Failed to bin age column '{ageCol}'. Cannot proceed.") from binErr
    elif ageCol in dfBinned.columns:
         logging.warning(f"Column '{ageCol}' exists but is not numeric. Skipping binning.")
         dfBinned[binnedColName] = dfBinned[ageCol].astype(str).fillna('NaN_Age_Str')
         logging.info(f"Converted non-numeric '{ageCol}' to string as '{binnedColName}'.")
    else:
        logging.error(f"Age column '{ageCol}' not found in DataFrame. Cannot perform binning.")
        raise ValueError(f"Required age column '{ageCol}' not found.")

    return dfBinned

In [8]:
def create_preprocessing_pipeline(numericFeatures, categoricalFeatures):
    """Creates a ColumnTransformer pipeline for the given features."""
    transformers = []
    if numericFeatures:
        numPipe = Pipeline([
            ('imp', SimpleImputer(strategy='mean')),
            ('scale', StandardScaler())
        ])
        transformers.append(('num', numPipe, numericFeatures)) 
        
    if categoricalFeatures:
        catPipe = Pipeline([
            ('imp', SimpleImputer(strategy='most_frequent')), 
            ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        transformers.append(('cat', catPipe, categoricalFeatures)) 

    if not transformers:
        logging.warning("create_preprocessing_pipeline called with no features.")
        return ColumnTransformer(transformers=[], remainder='drop') 

    preprocessor = ColumnTransformer(
        transformers=transformers,
        remainder='drop', 
        verbose_feature_names_out=False 
    )
    preprocessor.set_output(transform="pandas") 
    return preprocessor

# --- Data Loading and Preparation ---
logging.info("--- Starting Data Loading and Preparation ---")

try:
    dfFull = pd.read_csv(datasetPath)
    logging.info(f"Successfully loaded dataset: {datasetPath}. Shape: {dfFull.shape}")
except FileNotFoundError:
    logging.error(f"Dataset file not found at: {datasetPath}")
    raise
except Exception as e:
    logging.error(f"Error loading dataset: {e}")
    raise

dfFull.columns = dfFull.columns.str.strip()
dfFull.columns = dfFull.columns.str.replace('-', '_', regex=False).str.replace(' ', '_', regex=False)

for col in dfFull.select_dtypes(include=['object']).columns:
    if col in dfFull.columns: 
        try:
            dfFull[col] = dfFull[col].str.strip()
            dfFull[col] = dfFull[col].replace(['?', 'N/A', '', 'None'], np.nan)
        except AttributeError:
            logging.warning(f"Could not apply string operations to column '{col}'.")

essentialCols = [targetColumn] + protectedAttributesOriginal 
missingEssentials = [col for col in essentialCols if col not in dfFull.columns]
if missingEssentials:
    raise ValueError(f"Essential columns missing from dataset: {missingEssentials}")

uniqueTargets = dfFull[targetColumn].unique()
if len(uniqueTargets) > 2:
    if '<=50K' in uniqueTargets and '>50K' in uniqueTargets:
        targetMap = {'<=50K': 0, '>50K': 1}
        dfFull[targetColumn] = dfFull[targetColumn].map(targetMap)
    else:
         if len(uniqueTargets) == 2:
              logging.warning(f"Target values not '<=50K', '>50K'. Using first unique value '{uniqueTargets[0]}' as 0, second '{uniqueTargets[1]}' as 1.")
              targetMap = {uniqueTargets[0]: 0, uniqueTargets[1]: 1}
              dfFull[targetColumn] = dfFull[targetColumn].map(targetMap)
         else:
             raise ValueError(f"Target column '{targetColumn}' has more than 2 unique values and standard mapping failed. Values: {uniqueTargets}")
elif len(uniqueTargets) == 1:
     raise ValueError(f"Target column '{targetColumn}' has only one unique value: {uniqueTargets[0]}. Cannot train a model.")
else: 
     if not set(uniqueTargets).issubset({0, 1}):
         logging.warning(f"Target values are binary but not 0/1 ({uniqueTargets}). Mapping first to 0, second to 1.")
         targetMap = {uniqueTargets[0]: 0, uniqueTargets[1]: 1}
         dfFull[targetColumn] = dfFull[targetColumn].map(targetMap)

if dfFull[targetColumn].isnull().any():
    logging.warning(f"Target column '{targetColumn}' contains NaN values after mapping. Checking count...")
    nanCount = dfFull[targetColumn].isnull().sum()
    logging.warning(f"{nanCount} rows have NaN target.")
    rowsBefore = len(dfFull)
    dfFull.dropna(subset=[targetColumn], inplace=True)
    logging.info(f"Dropped {rowsBefore - len(dfFull)} rows with missing target values.")
    
dfFull[targetColumn] = dfFull[targetColumn].astype(int)

dfProcessed = apply_age_binning(dfFull, ageColumn, binnedAgeColumnName, defaultAgeBins, defaultAgeLabels)

protectedAttributesAnalysis = [binnedAgeColumnName if p == ageColumn else p for p in protectedAttributesOriginal]
logging.info(f"Protected attributes for analysis (column names): {protectedAttributesAnalysis}")

potentialFeatureCols = [
    col for col in dfProcessed.columns
    if col != targetColumn and col not in protectedAttributesOriginal 
]
if binnedAgeColumnName in potentialFeatureCols and binnedAgeColumnName != ageColumn:
     potentialFeatureCols.remove(binnedAgeColumnName)

logging.info(f"Identified {len(potentialFeatureCols)} potential features for grid search.")

essentialAnalysisCols = [targetColumn] + protectedAttributesAnalysis 
rowsBeforeNa = len(dfProcessed)
dfProcessed.dropna(subset=protectedAttributesAnalysis, inplace=True) 
rowsAfterNa = len(dfProcessed)
if rowsAfterNa < rowsBeforeNa:
    logging.warning(f"Dropped {rowsBeforeNa - rowsAfterNa} rows due to missing values in protected attributes: {protectedAttributesAnalysis}")

if dfProcessed.empty:
    raise ValueError("DataFrame is empty after handling missing values. Cannot proceed.")

logging.info(f"Final dataset shape before split: {dfProcessed.shape}")

x = dfProcessed[potentialFeatureCols]
y = dfProcessed[targetColumn]
p = dfProcessed[protectedAttributesAnalysis] 

stratifyOption = y if y.nunique() > 1 and y.value_counts().min() >= 2 else None
if stratifyOption is None:
    logging.warning("Cannot stratify train/test split.")
    
xTrainOrig, xTestOrig, yTrain, yTest, pTrain, pTest = train_test_split(
    x, y, p,
    test_size=testSize,
    random_state=randomState,
    stratify=stratifyOption
)

logging.info(f"Data split complete: Train size={len(yTrain)}, Test size={len(yTest)}")
del dfFull, dfProcessed, x, y, p 
gc.collect()

# --- Grid Search ---
logging.info("--- Starting Feature Subset Grid Search ---")
startTimeGridsearch = time.time()

allResults = []
processedCombinationsCount = 0

featureCombinations = []
logging.info(f"Generating feature combinations (Min Size: {minFeaturesCombinationSize}, Max Size: {maxFeaturesCombinationSize})...")
for k in range(minFeaturesCombinationSize, maxFeaturesCombinationSize + 1):
    for combo in itertools.combinations(potentialFeatureCols, k): 
        featureCombinations.append(list(combo))

if includeFullFeatureSet:
    if tuple(potentialFeatureCols) not in [tuple(sorted(c)) for c in featureCombinations]: 
         featureCombinations.append(list(potentialFeatureCols))
         logging.info(f"Added the full feature set ({len(potentialFeatureCols)} features).")
         
totalCombinations = len(featureCombinations)
logging.info(f"Total feature combinations to process: {totalCombinations}")

# --- Loop Through Combinations ---
for i, selectedFeatures in enumerate(featureCombinations):
    iterationStartTime = time.time()
    if (i + 1) % logProgressEveryN == 0 or (i + 1) == totalCombinations:
         logging.info(f"--- Processing Combination {i+1}/{totalCombinations} ({len(selectedFeatures)} features) ---")

    if not all(f in xTrainOrig.columns for f in selectedFeatures): 
        logging.error(f"Skipping combination {i+1}: Features not found in xTrainOrig. Features: {selectedFeatures}")
        continue
        
    xTrainSubset = xTrainOrig[selectedFeatures].copy()
    xTestSubset = xTestOrig[selectedFeatures].copy()

    numericFeaturesSubset = xTrainSubset.select_dtypes(include=np.number).columns.tolist()
    categoricalFeaturesSubset = xTrainSubset.select_dtypes(exclude=np.number).columns.tolist()
    
    resultsRow = {
        'Features Used': ", ".join(selectedFeatures), 
        'Num Features': len(selectedFeatures),      
        'Accuracy': np.nan,                        
    }
    
    for paColName in protectedAttributesAnalysis: 
        for metric in metricsForOverallDisparity:
             key = f"{metric.replace(' ', '_')}_Disparity_{paColName}" 
             resultsRow[key] = np.nan

    for paColName in protectedAttributesAnalysis: 
        for group in pTest[paColName].unique(): 
             groupStr = str(group) if pd.notna(group) else "NaN_Group" 
             groupStrKey = groupStr.replace('<','lt').replace('+','plus').replace(' ','_').replace('=','eq')
             groupStrKey = ''.join(filter(lambda x: x.isalnum() or x == '_', groupStrKey))
             for fm in fairnessMetricsSubgroup:
                fmKeyPart = fm.replace(' ', '_')
                key = f"{fmKeyPart}_{paColName}_{groupStrKey}" 
                resultsRow[key] = np.nan 
    
    # --- Main Processing Logic ---
    try:
        preprocessor = create_preprocessing_pipeline(numericFeaturesSubset, categoricalFeaturesSubset)
        xTrainProcessed = preprocessor.fit_transform(xTrainSubset)
        xTestProcessed = preprocessor.transform(xTestSubset)

        model = LogisticRegression(**lrParams) 
        if xTrainProcessed.shape[1] == 0:
             # Log warning only if it happens
             logging.warning(f"Combination {i+1} resulted in 0 features after preprocessing. Skipping training. Features: {selectedFeatures}")
             yPred = np.repeat(yTrain.mode()[0], len(yTest)) 
        else:
            model.fit(xTrainProcessed, yTrain) 
            yPred = model.predict(xTestProcessed)

        accuracy = accuracy_score(yTest, yPred) 
        resultsRow['Accuracy'] = round(accuracy, 4) 

        # --- Evaluation: Fairness ---
        resultsDfTemp = pTest.copy() 
        resultsDfTemp['y_true'] = yTest 
        resultsDfTemp['y_pred'] = yPred

        allSubgroupMetrics = {} 

        for paColName in protectedAttributesAnalysis: 
            subgroupMetricsForPa = {} 
            metricsForDisparityCalc = {m: [] for m in ['Demographic Parity', 'Equalized Odds', 'FPR', 'Predictive Parity']} 

            uniqueGroups = resultsDfTemp[paColName].unique() 
            
            for groupName in uniqueGroups:
                 groupNameStr = str(groupName) if pd.notna(groupName) else "NaN_Group" 
                 
                 groupDf = resultsDfTemp[resultsDfTemp[paColName] == groupName]
                 metrics = calculate_fairness_metrics(groupDf) 
                 subgroupMetricsForPa[groupNameStr] = metrics
                 
                 groupStrKey = groupNameStr.replace('<','lt').replace('+','plus').replace(' ','_').replace('=','eq')
                 groupStrKey = ''.join(filter(lambda x: x.isalnum() or x == '_', groupStrKey))
                 for fmKey, fmValue in metrics.items():
                      fmKeyPart = fmKey.replace(' ', '_')
                      colNameKey = f"{fmKeyPart}_{paColName}_{groupStrKey}" 
                      
                      if colNameKey in resultsRow:
                          resultsRow[colNameKey] = round(fmValue, 4) if pd.notna(fmValue) else np.nan
                      else:
                          logging.warning(f"Adding new results column key dynamically (unexpected): {colNameKey}")
                          resultsRow[colNameKey] = round(fmValue, 4) if pd.notna(fmValue) else np.nan

                 if metrics['Group Size'] > 0:
                      for metricKeyInternal in metricsForDisparityCalc.keys():
                           if pd.notna(metrics[metricKeyInternal]): 
                               metricsForDisparityCalc[metricKeyInternal].append(metrics[metricKeyInternal])
            
            allSubgroupMetrics[paColName] = subgroupMetricsForPa 
            
            # --- Calculate Overall Disparity Scores ---
            for metricDisp in metricsForOverallDisparity: 
                overallScore = np.nan
                
                if metricDisp == 'Equalized Odds':
                     tprRates = metricsForDisparityCalc.get('Equalized Odds', []) 
                     fprRates = metricsForDisparityCalc.get('FPR', [])
                     tprDiff = max(tprRates) - min(tprRates) if len(tprRates) > 1 else 0.0
                     fprDiff = max(fprRates) - min(fprRates) if len(fprRates) > 1 else 0.0
                     validDiffs = [d for d in [tprDiff, fprDiff] if pd.notna(d)]
                     overallScore = max(validDiffs) if validDiffs else np.nan
                else:
                     rates = metricsForDisparityCalc.get(metricDisp, [])
                     if len(rates) > 1:
                         overallScore = max(rates) - min(rates)
                     
                disparityKey = f"{metricDisp.replace(' ', '_')}_Disparity_{paColName}" 
                resultsRow[disparityKey] = round(overallScore, 4) if pd.notna(overallScore) else np.nan
                
        processedCombinationsCount += 1

    except MemoryError:
        logging.error(f"Memory Error encountered processing combination {i+1}. Skipping.")
        gc.collect() 
        resultsRow['Accuracy'] = 'Memory Error' 
    except Exception as e:
        logging.error(f"Error processing combination {i+1} ({selectedFeatures}): {e}", exc_info=True) 
        resultsRow['Accuracy'] = f'Error: {type(e).__name__}' 

    finally:
        allResults.append(resultsRow)
        del xTrainSubset, xTestSubset 
        if 'xTrainProcessed' in locals(): del xTrainProcessed
        if 'xTestProcessed' in locals(): del xTestProcessed
        if 'model' in locals(): del model
        if 'preprocessor' in locals(): del preprocessor
        if 'resultsDfTemp' in locals(): del resultsDfTemp 



gridsearchDuration = time.time() - startTimeGridsearch
logging.info(f"--- Grid Search Finished ---")
logging.info(f"Processed {processedCombinationsCount}/{totalCombinations} combinations.")
logging.info(f"Total time: {gridsearchDuration:.2f} seconds ({gridsearchDuration/60:.2f} minutes)")


# --- Formatting and Saving Results ---
logging.info("--- Formatting and Saving Results ---")

if not allResults:
    logging.warning("No results were generated. Skipping saving to Excel.")
else:
    resultsDf = pd.DataFrame(allResults) 
    
    colsOrder = ['Num Features', 'Accuracy'] 
    for paColName in protectedAttributesAnalysis:
        for metric in metricsForOverallDisparity:
            colsOrder.append(f"{metric.replace(' ', '_')}_Disparity_{paColName}")
            
    colsOrder.append('Features Used') 
    
    remainingCols = [col for col in resultsDf.columns if col not in colsOrder]
    remainingCols.sort() 
    
    finalCols = colsOrder + remainingCols
    finalCols = [col for col in finalCols if col in resultsDf.columns]
    
    resultsDf = resultsDf[finalCols]

    try:
        logging.info(f"Attempting to save results to: {outputExcelFile}")
        resultsDf.to_excel(outputExcelFile, index=False, engine='openpyxl')
        logging.info(f"Successfully saved results to {outputExcelFile}")
    except Exception as e:
        logging.error(f"Failed to save results to Excel: {e}", exc_info=True)
        try:
             csvFallbackPath = outputExcelFile.replace('.xlsx', '.csv')
             logging.warning(f"Saving results as CSV fallback: {csvFallbackPath}")
             resultsDf.to_csv(csvFallbackPath, index=False)
             logging.info(f"Successfully saved results to {csvFallbackPath}")
        except Exception as csvE:
             logging.error(f"Failed to save results to CSV as fallback: {csvE}", exc_info=True)

#
print("\n--- Results Summary (First 5 Rows) ---")
print(resultsDf.head())
print("\n--- Results Summary (Last 5 Rows) ---")
print(resultsDf.tail())

2025-05-02 10:44:27,017 - INFO - --- Starting Data Loading and Preparation ---
2025-05-02 10:44:27,090 - INFO - Successfully loaded dataset: flask_ml/data/Income_dataset.csv. Shape: (48842, 14)
2025-05-02 10:44:27,228 - INFO - Attempting to bin numeric column 'age' into 'age_bin'.
2025-05-02 10:44:27,241 - INFO - Successfully binned 'age' into 'age_bin'. Unique values: ['35-44' '45-54' '25-34' '<25' '55-64' '65+']
2025-05-02 10:44:27,241 - INFO - Protected attributes for analysis (column names): ['age_bin', 'race', 'sex']
2025-05-02 10:44:27,242 - INFO - Identified 10 potential features for grid search.
2025-05-02 10:44:27,268 - INFO - Final dataset shape before split: (48842, 15)
2025-05-02 10:44:27,301 - INFO - Data split complete: Train size=39073, Test size=9769
2025-05-02 10:44:27,353 - INFO - --- Starting Feature Subset Grid Search ---
2025-05-02 10:44:27,354 - INFO - Generating feature combinations (Min Size: 1, Max Size: 9)...
2025-05-02 10:44:27,354 - INFO - Added the full fea


--- Results Summary (First 5 Rows) ---
   Num Features  Accuracy  Demographic_Parity_Disparity_age_bin  \
0             1    0.6735                                0.2705   
1             1    0.7130                                0.3002   
2             1    0.7130                                0.3002   
3             1    0.7172                                0.4282   
4             1    0.6200                                0.1621   

   Equalized_Odds_Disparity_age_bin  Predictive_Parity_Disparity_age_bin  \
0                            0.2222                               0.4374   
1                            0.4247                               0.5694   
2                            0.4247                               0.5694   
3                            0.4626                               0.4684   
4                            0.3032                               0.5350   

   Demographic_Parity_Disparity_race  Equalized_Odds_Disparity_race  \
0                            