In [40]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from deepface import DeepFace
import face_recognition

FAIRFACE_CSV = "C:\\Users\\jonat\\Desktop\\dataethicsproj\\data259-project\\analysis\\data\\fairface_label_train.csv"
FAIRFACE_DIR = "C:\\Users\\jonat\\Desktop\\dataethicsproj\\data259-project\\analysis\\data\\fairface-img-margin025-trainval"

# Step 1: Load and prepare the FairFace dataset
def load_fairface_data(csv_path, image_dir, sample_size=500):
    """
    Load the FairFace dataset and create intersectional categories
    """
    # Load CSV with labels
    df = pd.read_csv(csv_path)
    print(f"Columns in CSV file: {df.columns.tolist()}")
    print(f"Total records in CSV: {len(df)}")
    
    # Create intersectional categories
    df['gender_race'] = df['gender'] + '_' + df['race']
    print("Created 'gender_race' column by combining 'gender' and 'race'")
    
    # Take a sample for faster processing if needed
    if sample_size and sample_size < len(df):
        df = df.sample(sample_size, random_state=42)
        print(f"Sampled {sample_size} records")
    
    # Check if files exist - with more detailed debugging
    print(f"Checking image paths in directory: {image_dir}")
    # Check if directory exists
    if not os.path.isdir(image_dir):
        print(f"ERROR: Image directory does not exist: {image_dir}")
        return df.head(0)  # Return empty dataframe
    
    valid_files = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Checking image files"):
        file_path = os.path.join(image_dir, row['file'])
        if os.path.exists(file_path):
            valid_files.append(idx)
        else:
            # Print a few examples of missing files for debugging
            if len(valid_files) == 0 and len(valid_files) + len([i for i in range(len(df)) if i not in valid_files]) < 5:
                print(f"File not found: {file_path}")
    
    # Handle case where no files were found
    if len(valid_files) == 0:
        print("ERROR: No valid image files found!")
        print("Checking first few file paths from CSV:")
        for i, row in df.iloc[:5].iterrows():
            file_path = os.path.join(image_dir, row['file'])
            print(f"- {file_path} (exists: {os.path.exists(file_path)})")
        return df.head(0)  # Return empty dataframe
    
    df_valid = df.loc[valid_files]
    print(f"Loaded {len(df_valid)} images with valid file paths out of {len(df)} total")
    
    return df_valid

In [58]:
BASE_DIR = r"C:\Users\jonat\Desktop\dataethicsproj\data259-project\analysis\data\fairface-img-margin025-trainval"

In [59]:
def analyze_opencv_haar(data_df, sample_size=None):
    """
    Analyze OpenCV Haar Cascades face detection across racial groups.
    Args:
        data_df: DataFrame with image paths and race labels
        sample_size: Number of images to sample (for faster testing)
    Returns:
        DataFrame with detection results
    """
    print("Analyzing OpenCV Haar Cascades...")
    
    # Sample data if requested
    if sample_size and sample_size < len(data_df):
        data_df = data_df.sample(sample_size, random_state=42)
    
    # Load the pre-trained model
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    
    results = []
    
    for idx, row in tqdm(data_df.iterrows(), total=len(data_df)):
        img_path = os.path.join(BASE_DIR, row['file'])
        race = row['race']
        
        try:
            # Read image
            img = cv2.imread(img_path)
            if img is None:
                results.append({
                    'image_path': img_path,
                    'race': race,
                    'detected': False,
                    'confidence': 0,
                    'error': 'Image could not be loaded'
                })
                continue
                
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            
            # Detect faces with confidence scores
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, 
                                                 minNeighbors=5, minSize=(30, 30),
                                                 flags=cv2.CASCADE_SCALE_IMAGE)
            
            # Check if any faces were detected
            if len(faces) > 0:
                # OpenCV doesn't provide confidence scores directly, so we'll use
                # the number of neighbors as a proxy for confidence
                max_neighbors = 0
                for (x, y, w, h) in faces:
                    # Detect with different minNeighbors to get a "confidence" score
                    for neighbors in range(1, 10):
                        test_faces = face_cascade.detectMultiScale(
                            gray[y:y+h, x:x+w], 
                            scaleFactor=1.1, 
                            minNeighbors=neighbors
                        )
                        if len(test_faces) > 0:
                            max_neighbors = max(max_neighbors, neighbors)
                
                results.append({
                    'image_path': img_path,
                    'race': race,
                    'detected': True,
                    'confidence': max_neighbors / 10.0,  # Normalize to 0-1 range
                    'faces_count': len(faces),
                    'error': None
                })
            else:
                results.append({
                    'image_path': img_path,
                    'race': race,
                    'detected': False,
                    'confidence': 0,
                    'faces_count': 0,
                    'error': None
                })
                
        except Exception as e:
            results.append({
                'image_path': img_path,
                'race': race,
                'detected': False,
                'confidence': 0,
                'error': str(e)
            })
    
    results_df = pd.DataFrame(results)
    
    
    return results_df

In [42]:
def test_deepface(df, image_dir):
    """
    Test DeepFace for face detection and demographic classification
    """
    # Handle empty dataframe
    if len(df) == 0:
        print("No images to process for DeepFace")
        return pd.DataFrame(columns=['file', 'gender', 'race', 'age', 'gender_race', 'detected', 
                                    'predicted_gender', 'predicted_race', 'predicted_age', 
                                    'gender_correct', 'race_correct'])
    
    results = []
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Testing DeepFace"):
        # Use the corrected path if available
        if 'corrected_path' in row:
            img_path = row['corrected_path']
        else:
            img_path = os.path.join(image_dir, row['file'])
        
        try:
            # Analyze image with DeepFace
            analysis = DeepFace.analyze(
                img_path=img_path, 
                actions=['age', 'gender', 'race'], 
                enforce_detection=False
            )
            
            # Handle single face or multiple faces
            if isinstance(analysis, list):
                analysis = analysis[0]  # Take first face if multiple
            
            # Debug output for first few results to understand structure
            if len(results) < 2:
                print(f"DeepFace analysis structure for {row['file']}:")
                print(type(analysis))
                print(analysis.keys() if isinstance(analysis, dict) else "Not a dict")
                if isinstance(analysis, dict):
                    for k, v in analysis.items():
                        print(f"  {k}: {type(v)} - {v}")
            
            # Initialize result dict with metadata
            result = {
                'file': row['file'],
                'gender': row['gender'],
                'race': row['race'],
                'age': row['age'],
                'gender_race': row['gender_race'],
                'detected': True,
                'predicted_gender': None,
                'predicted_race': None,
                'predicted_age': None,
                'gender_correct': False,
                'race_correct': False
            }
            
            # Safely extract predictions
            if isinstance(analysis, dict):
                # Gender - handle different possible structures
                if 'gender' in analysis:
                    if isinstance(analysis['gender'], str):
                        result['predicted_gender'] = analysis['gender']
                    elif isinstance(analysis['gender'], dict) and 'dominant_gender' in analysis['gender']:
                        result['predicted_gender'] = analysis['gender']['dominant_gender']
                
                # Race - handle different possible structures
                if 'race' in analysis:
                    if isinstance(analysis['race'], str):
                        result['predicted_race'] = analysis['race']
                    elif isinstance(analysis['race'], dict) and 'dominant_race' in analysis['race']:
                        result['predicted_race'] = analysis['race']['dominant_race']
                    elif 'dominant_race' in analysis:
                        result['predicted_race'] = analysis['dominant_race']
                
                # Age - handle different possible structures
                if 'age' in analysis:
                    if isinstance(analysis['age'], (int, float, str)):
                        result['predicted_age'] = float(analysis['age'])
                    elif isinstance(analysis['age'], dict) and 'apparent_age' in analysis['age']:
                        result['predicted_age'] = float(analysis['age']['apparent_age'])
            
            # The correct way to extract gender prediction
            if 'dominant_gender' in analysis:
                result['predicted_gender'] = analysis['dominant_gender']
            elif 'gender' in analysis and isinstance(analysis['gender'], dict):
                # Get the gender with highest confidence
                result['predicted_gender'] = max(analysis['gender'].items(), key=lambda x: x[1])[0]

            # Then compare with FairFace label
            # Map DeepFace's "Man"/"Woman" to FairFace's "Male"/"Female"
            gender_mapping = {
                'man': 'male',
                'woman': 'female'
            }

            if result['predicted_gender'] is not None:
                predicted_gender_lower = result['predicted_gender'].lower()
                if predicted_gender_lower in gender_mapping:
                    predicted_gender_mapped = gender_mapping[predicted_gender_lower]
                    result['gender_correct'] = predicted_gender_mapped == row['gender'].lower()
            
            # Map race categories between FairFace and DeepFace
            race_mapping = {
                'white': ['White'],
                'black': ['Black'],
                'asian': ['East Asian', 'Southeast Asian'],
                'indian': ['Indian'],
                'middle eastern': ['Middle Eastern'],
                'latino hispanic': ['Latino_Hispanic']
            }
            
            # Check if race prediction is correct
            if result['predicted_race'] is not None:
                predicted_race_lower = result['predicted_race'].lower()
                for df_race, ff_races in race_mapping.items():
                    if predicted_race_lower == df_race and row['race'] in ff_races:
                        result['race_correct'] = True
                        break
            
        except Exception as e:
            print(f"Error processing {img_path} with DeepFace: {e}")
            # Handle failed analysis
            result = {
                'file': row['file'],
                'gender': row['gender'],
                'race': row['race'],
                'age': row['age'],
                'gender_race': row['gender_race'],
                'detected': False,
                'predicted_gender': None,
                'predicted_race': None,
                'predicted_age': None,
                'gender_correct': False,
                'race_correct': False
            }
            
        results.append(result)
    
    results_df = pd.DataFrame(results)
    print(f"DeepFace results shape: {results_df.shape}, columns: {results_df.columns.tolist()}")
    return results_df

In [None]:
def create_comprehensive_visualizations(df_results, output_dir='finalfinalresults'):
    """
    Create a comprehensive set of visualizations for facial analysis models
    
    Args:
        df_results: DataFrame with results (must contain predicted_gender, predicted_race, 
                   predicted_age, gender, race, age, gender_correct, race_correct columns)
        output_dir: Directory to save visualizations (default: current directory)
    """
    import os
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Set up plotting style
    plt.style.use('seaborn-v0_8-whitegrid')
    sns.set_palette("viridis")
    
    # Calculate age difference
    # if all(col in df_results.columns for col in ['predicted_age', 'age']):
    #     # Convert age columns to numeric if they're not already
    #     df_results['age'] = pd.to_numeric(df_results['age'], errors='coerce')
    #     df_results['predicted_age'] = pd.to_numeric(df_results['predicted_age'], errors='coerce')
        
    #     # Calculate age difference (absolute error)
    #     df_results['age_diff'] = (df_results['predicted_age'] - df_results['age']).abs()
        
    #     # Create a copy with non-null age difference values
    #     df_age = df_results.dropna(subset=['age_diff']).copy()
        
    #     # Print summary statistics for age prediction
    #     print("\nAge Prediction Error Statistics:")
    #     print(f"Mean absolute error: {df_age['age_diff'].mean():.2f} years")
    #     print(f"Median absolute error: {df_age['age_diff'].median():.2f} years")
    #     print(f"Min absolute error: {df_age['age_diff'].min():.2f} years")
    #     print(f"Max absolute error: {df_age['age_diff'].max():.2f} years")
    # else:
    #     print("Age columns not found. Skipping age prediction analysis.")
    #     df_age = pd.DataFrame()  # Empty dataframe
    
    # 1. OVERALL CLASSIFICATION ACCURACY
    
    fig, ax = plt.subplots(figsize=(10, 6))
    
    # Calculate overall accuracy rates
    accuracy_data = {}
    
    if 'gender_correct' in df_results.columns:
        accuracy_data['Gender Classification'] = df_results['gender_correct'].mean() * 100
    
    if 'race_correct' in df_results.columns:
        accuracy_data['Race Classification'] = df_results['race_correct'].mean() * 100
    
    if 'detected' in df_results.columns:
        accuracy_data['Face Detection'] = df_results['detected'].mean() * 100
    
    if accuracy_data:
        # Create bar chart
        bars = ax.bar(accuracy_data.keys(), accuracy_data.values())
        
        # Add labels and formatting
        ax.set_ylim(0, 100)
        ax.set_ylabel('Accuracy (%)')
        ax.set_title('Overall Classification Accuracy')
        
        # Add value labels on top of bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'overall_accuracy.png'), dpi=300)
        plt.close()
    
    # 2. GENDER CLASSIFICATION ACCURACY BY RACE
    
    if all(col in df_results.columns for col in ['gender_correct', 'race']):
        gender_by_race = df_results.groupby('race')['gender_correct'].mean() * 100
        
        fig, ax = plt.subplots(figsize=(12, 6))
        bars = ax.bar(gender_by_race.index, gender_by_race.values)
        
        # Add labels and formatting
        ax.set_ylim(0, 100)
        ax.set_xlabel('Race')
        ax.set_ylabel('Gender Classification Accuracy (%)')
        ax.set_title('Gender Classification Accuracy by Race')
        
        # Add value labels on top of bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')
        
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'gender_accuracy_by_race.png'), dpi=300)
        plt.close()
        
        # Print findings
        print("\nGender Classification Accuracy by Race:")
        print(gender_by_race.sort_values())
        print(f"Disparity: {gender_by_race.max() - gender_by_race.min():.2f}% between {gender_by_race.idxmax()} and {gender_by_race.idxmin()}")
    
    # 3. RACE CLASSIFICATION ACCURACY BY GENDER
    
    if all(col in df_results.columns for col in ['race_correct', 'gender']):
        race_by_gender = df_results.groupby('gender')['race_correct'].mean() * 100
        
        fig, ax = plt.subplots(figsize=(8, 6))
        bars = ax.bar(race_by_gender.index, race_by_gender.values)
        
        # Add labels and formatting
        ax.set_ylim(0, 100)
        ax.set_xlabel('Gender')
        ax.set_ylabel('Race Classification Accuracy (%)')
        ax.set_title('Race Classification Accuracy by Gender')
        
        # Add value labels on top of bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'race_accuracy_by_gender.png'), dpi=300)
        plt.close()
        
        # Print findings
        print("\nRace Classification Accuracy by Gender:")
        print(race_by_gender)
        if len(race_by_gender) > 1:
            print(f"Disparity: {race_by_gender.max() - race_by_gender.min():.2f}% between {race_by_gender.idxmax()} and {race_by_gender.idxmin()}")
    
    # 4. RACE CLASSIFICATION ACCURACY BY AGE
    
    if all(col in df_results.columns for col in ['race_correct', 'age']):
        # Create age groups for better visualization
        # df_results['age_group'] = pd.cut(
        #     pd.to_numeric(df_results['age'], errors='coerce'),
        #     bins=[0, 10, 20, 30, 40, 50, 60, 100],
        #     labels=['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60+']
        # )
        
        # Calculate accuracy by age group
        race_by_age = df_results.groupby('age')['race_correct'].mean() * 100
        age_order = ['0-2', '3-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', 'more than 70']
        race_by_age = race_by_age.reindex(age_order)
        fig, ax = plt.subplots(figsize=(12, 6))
        bars = ax.bar(race_by_age.index, race_by_age.values)
        
        # Add labels and formatting
        ax.set_ylim(0, 100)
        ax.set_xlabel('Age Group')
        ax.set_ylabel('Race Classification Accuracy (%)')
        ax.set_title('Race Classification Accuracy by Age Group')
        
        # Add value labels on top of bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'race_accuracy_by_age.png'), dpi=300)
        plt.close()
        
        # Print findings
        print("\nRace Classification Accuracy by Age Group:")
        print(race_by_age)
        if not race_by_age.isna().all():
            print(f"Disparity: {race_by_age.max() - race_by_age.min():.2f}% between {race_by_age.idxmax()} and {race_by_age.idxmin()}")
    
    # 5. GENDER CLASSIFICATION ACCURACY BY AGE
    
    if all(col in df_results.columns for col in ['gender_correct', 'age']):
        gender_by_age = df_results.groupby('age')['gender_correct'].mean() * 100
        age_order = ['0-2', '3-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', 'more than 70']
        gender_by_age = gender_by_age.reindex(age_order)   
        fig, ax = plt.subplots(figsize=(12, 6))
        bars = ax.bar(gender_by_age.index, gender_by_age.values)
        
        # Add labels and formatting
        ax.set_ylim(0, 100)
        ax.set_xlabel('Age Group')
        ax.set_ylabel('Gender Classification Accuracy (%)')
        ax.set_title('Gender Classification Accuracy by Age Group')
        
        # Add value labels on top of bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'gender_accuracy_by_age.png'), dpi=300)
        plt.close()
        
        # Print findings
        print("\nGender Classification Accuracy by Age Group:")
        print(gender_by_age)
        if not gender_by_age.isna().all():
            print(f"Disparity: {gender_by_age.max() - gender_by_age.min():.2f}% between {gender_by_age.idxmax()} and {gender_by_age.idxmin()}")
    
    
    # Gender confusion matrix
    if all(col in df_results.columns for col in ['gender', 'predicted_gender']):
        # Filter out rows with missing predictions
        df_gender = df_results.dropna(subset=['predicted_gender']).copy()
        
        if len(df_gender) > 0:
            # Create confusion matrix
            gender_cm = pd.crosstab(
                df_gender['gender'], 
                df_gender['predicted_gender'],
                normalize='index'
            ) * 100
            
            plt.figure(figsize=(10, 8))
            sns.heatmap(gender_cm, annot=True, fmt='.1f', cmap="viridis", vmin=0, vmax=100)
            plt.title('Gender Classification Confusion Matrix (% of actual)')
            plt.ylabel('True Gender')
            plt.xlabel('Predicted Gender')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'gender_confusion_matrix.png'), dpi=300)
            plt.close()
    
    # Race confusion matrix
    if all(col in df_results.columns for col in ['race', 'predicted_race']):
        # Filter out rows with missing predictions
        df_race = df_results.dropna(subset=['predicted_race']).copy()
        
        if len(df_race) > 0:
            # Create confusion matrix
            race_cm = pd.crosstab(
                df_race['race'], 
                df_race['predicted_race'],
                normalize='index'
            ) * 100
            
            plt.figure(figsize=(14, 12))
            sns.heatmap(race_cm, annot=True, fmt='.1f', cmap="viridis", vmin=0, vmax=100)
            plt.title('Race Classification Confusion Matrix (% of actual)')
            plt.ylabel('True Race')
            plt.xlabel('Predicted Race')
            plt.xticks(rotation=45, ha='right')
            plt.yticks(rotation=0)
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'race_confusion_matrix.png'), dpi=300)
            plt.close()
    
    # 9. INTERSECTIONAL ANALYSIS: GENDER_RACE HEATMAP
    
    if all(col in df_results.columns for col in ['gender_correct', 'race_correct', 'gender_race']):
        # Create a pivot table for heatmap visualization
        gender_race_accuracy = df_results.pivot_table(
            index='gender', 
            columns='race',
            values=['gender_correct', 'race_correct'],
            aggfunc='mean'
        ) * 100
        
        # Gender accuracy heatmap
        if ('gender_correct' in gender_race_accuracy.columns.levels[0]) and (not gender_race_accuracy['gender_correct'].isna().all().all()):
            plt.figure(figsize=(14, 10))
            sns.heatmap(
                gender_race_accuracy['gender_correct'], 
                annot=True, 
                fmt='.1f', 
                cmap="viridis",
                vmin=0,
                vmax=100,
                cbar_kws={'label': 'Accuracy (%)'}
            )
            plt.title('Gender Classification Accuracy by Gender and Race')
            plt.ylabel('Gender')
            plt.xlabel('Race')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'gender_race_heatmap.png'), dpi=300)
            plt.close()
        
        # Race accuracy heatmap
        if ('race_correct' in gender_race_accuracy.columns.levels[0]) and (not gender_race_accuracy['race_correct'].isna().all().all()):
            plt.figure(figsize=(14, 10))
            sns.heatmap(
                gender_race_accuracy['race_correct'], 
                annot=True, 
                fmt='.1f', 
                cmap="viridis",
                vmin=0,
                vmax=100,
                cbar_kws={'label': 'Accuracy (%)'}
            )
            plt.title('Race Classification Accuracy by Gender and Race')
            plt.ylabel('Gender')
            plt.xlabel('Race')
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, 'race_gender_heatmap.png'), dpi=300)
            plt.close()
    

    
    print(f"\nAll visualizations saved to {output_dir}/")
    return

In [44]:
def analyze_opencv_results(results_df):
    """
    Analyze OpenCV Haar Cascades results.
    Args:
        results_df: DataFrame with detection results
    """
    print("\n--- OpenCV Haar Cascades Analysis ---")
    
    # Detection rate by race
    detection_by_race = results_df.groupby('race')['detected'].mean()
    print("\nDetection Rate by Race:")
    print(detection_by_race)
    
    # Average confidence by race (for detected faces)
    confidence_by_race = results_df[results_df['detected']].groupby('race')['confidence'].mean()
    print("\nAverage Confidence by Race (for detected faces):")
    print(confidence_by_race)
    
    # Plot detection rate by race
    plt.figure(figsize=(12, 6))
    sns.barplot(x=detection_by_race.index, y=detection_by_race.values)
    plt.title('Face Detection Rate by Race (OpenCV Haar Cascades)')
    plt.xlabel('Race')
    plt.ylabel('Detection Rate')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join("finalfinalresults", 'opencv_detection_rate.png'))
    
    # Plot confidence by race
    plt.figure(figsize=(12, 6))
    sns.barplot(x=confidence_by_race.index, y=confidence_by_race.values)
    plt.title('Average Confidence by Race (OpenCV Haar Cascades)')
    plt.xlabel('Race')
    plt.ylabel('Confidence Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join("finalfinalresults", 'opencv_confidence.png'))

In [None]:
fairface_df = load_fairface_data(FAIRFACE_CSV, FAIRFACE_DIR, 500)
deepface_results = test_deepface(fairface_df, FAIRFACE_DIR)
opencv_results = analyze_opencv_haar(fairface_df)
analyze_opencv_results(opencv_results)
print(deepface_results)


Columns in CSV file: ['file', 'age', 'gender', 'race', 'service_test']
Total records in CSV: 86744
Created 'gender_race' column by combining 'gender' and 'race'
Sampled 500 records
Checking image paths in directory: C:\Users\jonat\Desktop\dataethicsproj\data259-project\analysis\data\fairface-img-margin025-trainval


Checking image files: 100%|██████████| 500/500 [00:00<00:00, 2099.11it/s]


Loaded 500 images with valid file paths out of 500 total


Action: race: 100%|██████████| 3/3 [00:01<00:00,  1.76it/s]
Testing DeepFace:   0%|          | 1/500 [00:01<15:07,  1.82s/it]

DeepFace analysis structure for train/47414.jpg:
<class 'dict'>
dict_keys(['age', 'region', 'face_confidence', 'gender', 'dominant_gender', 'race', 'dominant_race'])
  age: <class 'int'> - 30
  region: <class 'dict'> - {'x': 0, 'y': 0, 'w': 223, 'h': 223, 'left_eye': (141, 58), 'right_eye': (83, 63)}
  face_confidence: <class 'numpy.float64'> - 0.93
  gender: <class 'dict'> - {'Woman': 1.7560571432113647, 'Man': 98.2439398765564}
  dominant_gender: <class 'str'> - Man
  race: <class 'dict'> - {'asian': 0.09687183764725478, 'indian': 1.859305275595817, 'black': 0.07329193468333002, 'white': 54.116303219354535, 'middle eastern': 33.00789392088579, 'latino hispanic': 10.846338969032642}
  dominant_race: <class 'str'> - white


Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.60it/s]
Testing DeepFace:   0%|          | 2/500 [00:02<10:36,  1.28s/it]

DeepFace analysis structure for train/52264.jpg:
<class 'dict'>
dict_keys(['age', 'region', 'face_confidence', 'gender', 'dominant_gender', 'race', 'dominant_race'])
  age: <class 'int'> - 30
  region: <class 'dict'> - {'x': 0, 'y': 0, 'w': 223, 'h': 223, 'left_eye': (148, 68), 'right_eye': (69, 70)}
  face_confidence: <class 'numpy.float64'> - 0.93
  gender: <class 'dict'> - {'Woman': 5.082449092697061e-05, 'Man': 99.99995231628418}
  dominant_gender: <class 'str'> - Man
  race: <class 'dict'> - {'asian': 0.11518732644617558, 'indian': 4.047111049294472, 'black': 0.23568044416606426, 'white': 25.98211169242859, 'middle eastern': 59.918588399887085, 'latino hispanic': 9.701324999332428}
  dominant_race: <class 'str'> - middle eastern


Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.66it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.78it/s]0s/it]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.97it/s]0it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.94it/s]9it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  4.08it/s]2it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  4.24it/s]6it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.94it/s]1it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  4.05it/s]1it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.99it/s]22it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.72it/s]23it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.49it/s]21it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.72it/s]16it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.85it/s]15it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.75it/s]17it/s]
Action: race: 100%|██████████| 3/3 [00:00<00:00,  3.77it/s]17it/s]
Action: r

DeepFace results shape: (500, 11), columns: ['file', 'gender', 'race', 'age', 'gender_race', 'detected', 'predicted_gender', 'predicted_race', 'predicted_age', 'gender_correct', 'race_correct']
Analyzing OpenCV Haar Cascades...


  0%|          | 0/500 [00:00<?, ?it/s]


KeyError: 'image_path'

In [None]:
create_comprehensive_visualizations(deepface_results, output_dir='finalfinalresults')


Gender Classification Accuracy by Race:
race
Southeast Asian    65.517241
White              65.591398
Black              65.789474
Indian             70.491803
East Asian         70.666667
Latino_Hispanic    71.052632
Middle Eastern     83.606557
Name: gender_correct, dtype: float64
Disparity: 18.09% between Middle Eastern and Southeast Asian

Race Classification Accuracy by Gender:
gender
Female    59.459459
Male      61.870504
Name: race_correct, dtype: float64
Disparity: 2.41% between Male and Female

Race Classification Accuracy by Age Group:
age
0-2             71.428571
3-9             70.491803
10-19           48.437500
20-29           62.264151
30-39           63.888889
40-49           56.000000
50-59           47.058824
60-69           83.333333
more than 70    60.000000
Name: race_correct, dtype: float64
Disparity: 36.27% between 60-69 and 50-59

Gender Classification Accuracy by Age Group:
age
0-2             85.714286
3-9             70.491803
10-19           56.250000
20

In [70]:
def calculate_comprehensive_disparate_impact(df_results, output_dir='finalfinalresults'):
    """
    Calculate disparate impact across all demographic attribute combinations
    
    Args:
        df_results: DataFrame with results (must contain predicted_gender, gender, 
                   predicted_race, race, age columns and their correctness indicators)
        output_dir: Directory to save visualizations
        
    Returns:
        Dictionary containing calculated disparate impact metrics
    """
    import os
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Set up plotting style
    plt.style.use('seaborn-v0_8-whitegrid')
    sns.set_palette("viridis")
    
    # Store all metrics in dictionary
    disparate_impact_metrics = {
        'gender_classification': {},
        'race_classification': {}
    }
    
    # Define demographic attributes to analyze
    demographic_attrs = []
    if 'race' in df_results.columns:
        demographic_attrs.append('race')
    if 'gender' in df_results.columns:
        demographic_attrs.append('gender')
    if 'age' in df_results.columns:
        demographic_attrs.append('age')
    
    # Define standardized age order if needed
    age_order = ['0-2', '3-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', 'more than 70']
    
    print("\n=== COMPREHENSIVE DISPARATE IMPACT ANALYSIS ===")
    
    # Function to calculate and visualize disparate impact for a specific task and attribute
    def analyze_disparate_impact(task, attribute):
        print(f"\n--- {task.capitalize()} Classification by {attribute.capitalize()} ---")
        
        # Determine the correctness column
        correct_col = f"{task}_correct"
        if correct_col not in df_results.columns:
            print(f"Error: {correct_col} column not found")
            return None
        
        # Get unique attribute values
        attr_values = sorted(df_results[attribute].unique())
        if attribute == 'age' and all(age in age_order for age in attr_values):
            # Sort age groups using predefined order
            sorterIndex = dict(zip(age_order, range(len(age_order))))
            attr_values = sorted(attr_values, key=lambda x: sorterIndex.get(x, 999))
        
        # Calculate overall accuracy as baseline
        overall_accuracy = df_results[correct_col].mean()
        print(f"Overall {task} classification accuracy: {overall_accuracy:.4f}")
        
        # Dictionary to store results
        accuracy_by_attr = {}
        
        for value in attr_values:
            # Filter data for this attribute value
            attr_mask = (df_results[attribute] == value)
            df_filtered = df_results[attr_mask].copy()
            
            # Skip if not enough data
            if len(df_filtered) < 10:
                print(f"Skipping {attribute}={value}: insufficient data (n={len(df_filtered)})")
                continue
            
            # Calculate accuracy for this attribute value
            accuracy = df_filtered[correct_col].mean()
            accuracy_by_attr[value] = accuracy
            
            # Calculate and print disparate impact ratio
            di_ratio = accuracy / overall_accuracy
            di_threshold = 0.8  # 80% rule
            di_status = "PASS" if di_ratio >= di_threshold else "FAIL"
            print(f"{task.capitalize()} classification for {attribute}={value}: Accuracy = {accuracy:.4f}, DI Ratio = {di_ratio:.4f} ({di_status})")
        
        # If we have results, create visualization
        if accuracy_by_attr:
            # Determine figure size based on number of attribute values
            fig_width = max(10, len(accuracy_by_attr) * 1.0)
            fig, ax = plt.subplots(figsize=(fig_width, 6))
            
            # Convert to DataFrame for easier plotting
            di_df = pd.DataFrame({
                attribute.capitalize(): list(accuracy_by_attr.keys()),
                'Accuracy': list(accuracy_by_attr.values())
            })
            
            # Plot bars
            bars = ax.bar(di_df[attribute.capitalize()], di_df['Accuracy'])
            
            # Add horizontal line for 80% of max accuracy (disparate impact threshold)
            max_accuracy = di_df['Accuracy'].max()
            threshold = 0.8 * max_accuracy
            ax.axhline(y=threshold, linestyle='--', color='r', 
                      label=f'80% Threshold ({threshold:.4f})')
            
            # Highlight bars below threshold
            for i, (_, row) in enumerate(di_df.iterrows()):
                if row['Accuracy'] < threshold:
                    bars[i].set_color('red')
            
            # Add labels and formatting
            ax.set_ylabel(f'{task.capitalize()} Classification Accuracy')
            ax.set_title(f'Disparate Impact Analysis: {task.capitalize()} Classification by {attribute.capitalize()}')
            ax.set_ylim(0, 1.0)
            
            # Add value labels on top of bars
            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                        f'{height:.3f}', ha='center', va='bottom', fontweight='bold')
            
            # Adjust x-axis formatting based on attribute
            if len(attr_values) > 5 or any(len(str(val)) > 10 for val in attr_values):
                plt.xticks(rotation=45, ha='right')
            
            ax.legend()
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, f'{task}_disparate_impact_by_{attribute}.png'), dpi=300)
            plt.close()
            
            # Store metrics
            disparate_impact_metrics[f'{task}_classification'][f'disparate_impact_by_{attribute}'] = {
                f'{attribute}_accuracies': accuracy_by_attr,
                'overall_accuracy': overall_accuracy,
                'threshold': threshold,
                'di_ratios': {val: acc/overall_accuracy for val, acc in accuracy_by_attr.items()},
                'failing_groups': [val for val, acc in accuracy_by_attr.items() if acc < threshold]
            }
            
            return accuracy_by_attr
        return None

    # Function to calculate and visualize disparate impact for intersectional attributes
    def analyze_intersectional_disparate_impact(task, attr1, attr2):
        print(f"\n--- {task.capitalize()} Classification by {attr1.capitalize()} and {attr2.capitalize()} ---")
        
        # Determine the correctness column
        correct_col = f"{task}_correct"
        if correct_col not in df_results.columns:
            print(f"Error: {correct_col} column not found")
            return None
        
        # Get unique attribute values
        attr1_values = sorted(df_results[attr1].unique())
        attr2_values = sorted(df_results[attr2].unique())
        
        # Sort age groups if applicable
        if attr1 == 'age' and all(age in age_order for age in attr1_values):
            sorterIndex = dict(zip(age_order, range(len(age_order))))
            attr1_values = sorted(attr1_values, key=lambda x: sorterIndex.get(x, 999))
        if attr2 == 'age' and all(age in age_order for age in attr2_values):
            sorterIndex = dict(zip(age_order, range(len(age_order))))
            attr2_values = sorted(attr2_values, key=lambda x: sorterIndex.get(x, 999))
        
        # Calculate overall accuracy as baseline
        overall_accuracy = df_results[correct_col].mean()
        
        # Prepare heatmap data
        heatmap_data = []
        
        for val1 in attr1_values:
            for val2 in attr2_values:
                # Filter for this combination
                mask = (df_results[attr1] == val1) & (df_results[attr2] == val2)
                df_subset = df_results[mask].copy()
                
                # Only include if we have enough data
                if len(df_subset) >= 10:
                    accuracy = df_subset[correct_col].mean()
                    di_ratio = accuracy / overall_accuracy
                    
                    heatmap_data.append({
                        attr1: val1,
                        attr2: val2,
                        'Accuracy': accuracy,
                        'DI_Ratio': di_ratio,
                        'Sample_Size': len(df_subset)
                    })
        
        # Only create visualization if we have enough data
        if len(heatmap_data) > 3:
            heatmap_df = pd.DataFrame(heatmap_data)
            
            # Create pivot tables for heatmaps
            pivot_acc = heatmap_df.pivot_table(
                index=attr1, 
                columns=attr2,
                values='Accuracy'
            )
            
            pivot_di = heatmap_df.pivot_table(
                index=attr1, 
                columns=attr2,
                values='DI_Ratio'
            )
            
            pivot_size = heatmap_df.pivot_table(
                index=attr1, 
                columns=attr2,
                values='Sample_Size'
            )
            
            # Create heatmap for accuracy
            plt.figure(figsize=(max(8, len(attr2_values) * 1.2), max(6, len(attr1_values) * 0.8)))
            ax = sns.heatmap(
                pivot_acc, 
                annot=True, 
                fmt='.3f', 
                cmap='viridis',
                vmin=max(0.5, pivot_acc.values.min() - 0.05),
                vmax=min(1.0, pivot_acc.values.max() + 0.05),
                cbar_kws={'label': 'Accuracy'}
            )
            
            plt.title(f'Intersectional Analysis: {task.capitalize()} Classification Accuracy by {attr1.capitalize()} and {attr2.capitalize()}')
            plt.ylabel(attr1.capitalize())
            plt.xlabel(attr2.capitalize())
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, f'{task}_accuracy_{attr1}_{attr2}_heatmap.png'), dpi=300)
            plt.close()
            
            # Create heatmap for disparate impact ratio
            plt.figure(figsize=(max(8, len(attr2_values) * 1.2), max(6, len(attr1_values) * 0.8)))
            ax = sns.heatmap(
                pivot_di, 
                annot=True, 
                fmt='.3f', 
                cmap='RdYlGn',
                vmin=0.7,  # Below 0.8 is problematic
                vmax=1.3,
                cbar_kws={'label': 'Disparate Impact Ratio'}
            )
            
            # Add reference lines
            ax.axhline(y=0, color='black', linewidth=1)
            ax.axvline(x=0, color='black', linewidth=1)
            
            plt.title(f'Intersectional Analysis: {task.capitalize()} Classification Disparate Impact by {attr1.capitalize()} and {attr2.capitalize()}')
            plt.ylabel(attr1.capitalize())
            plt.xlabel(attr2.capitalize())
            plt.tight_layout()
            plt.savefig(os.path.join(output_dir, f'{task}_disparate_impact_{attr1}_{attr2}_heatmap.png'), dpi=300)
            plt.close()
            
            # Store metrics
            disparate_impact_metrics[f'{task}_classification'][f'disparate_impact_{attr1}_{attr2}'] = {
                'data': heatmap_data,
                'overall_accuracy': overall_accuracy,
                'failing_combinations': [
                    {attr1: row[attr1], attr2: row[attr2], 'accuracy': row['Accuracy'], 'di_ratio': row['DI_Ratio']}
                    for _, row in heatmap_df.iterrows() if row['DI_Ratio'] < 0.8
                ]
            }
            
            return heatmap_data
        else:
            print(f"Not enough data for meaningful intersectional analysis of {attr1} and {attr2}")
            return None
    
    # Tasks we'll analyze
    tasks = []
    if 'gender_correct' in df_results.columns:
        tasks.append('gender')
    if 'race_correct' in df_results.columns:
        tasks.append('race')
    
    # 1. Calculate basic disparate impact for each task and demographic attribute
    for task in tasks:
        for attribute in demographic_attrs:
            # Skip self-comparisons (e.g., race classification by race)
            if not (task == attribute):
                analyze_disparate_impact(task, attribute)
    
    # 2. Calculate intersectional disparate impact for combinations of demographic attributes
    for task in tasks:
        # Get all pairs of demographic attributes
        for i, attr1 in enumerate(demographic_attrs):
            for attr2 in demographic_attrs[i+1:]:
                # Skip any combinations involving the task itself
                if task != attr1 and task != attr2:
                    analyze_intersectional_disparate_impact(task, attr1, attr2)
    
    # 3. Print summary of disparate impact findings
    print("\n=== DISPARATE IMPACT SUMMARY ===")
    for task in tasks:
        print(f"\n{task.capitalize()} Classification:")
        
        # Get all disparate impact metrics for this task
        task_metrics = disparate_impact_metrics[f'{task}_classification']
        
        # Print failures for each demographic attribute
        for metric_name, metric_data in task_metrics.items():
            if 'failing_groups' in metric_data and metric_data['failing_groups']:
                print(f"  {metric_name}: {len(metric_data['failing_groups'])} failing groups")
                for group in metric_data['failing_groups']:
                    ratio = metric_data['di_ratios'].get(group, 'N/A')
                    print(f"    - {group}: Ratio = {ratio:.4f}")
            elif 'failing_combinations' in metric_data and metric_data['failing_combinations']:
                print(f"  {metric_name}: {len(metric_data['failing_combinations'])} failing combinations")
                for combo in metric_data['failing_combinations'][:5]:  # Show at most 5 examples
                    attrs = ', '.join([f"{k}={v}" for k, v in combo.items() if k not in ['accuracy', 'di_ratio']])
                    print(f"    - {attrs}: Ratio = {combo['di_ratio']:.4f}")
                if len(metric_data['failing_combinations']) > 5:
                    print(f"    - ... and {len(metric_data['failing_combinations']) - 5} more")
    
    return disparate_impact_metrics

In [71]:
calculate_comprehensive_disparate_impact(deepface_results)


=== COMPREHENSIVE DISPARATE IMPACT ANALYSIS ===

--- Gender Classification by Race ---
Overall gender classification accuracy: 0.7000
Gender classification for race=Black: Accuracy = 0.6579, DI Ratio = 0.9398 (PASS)
Gender classification for race=East Asian: Accuracy = 0.7067, DI Ratio = 1.0095 (PASS)
Gender classification for race=Indian: Accuracy = 0.7049, DI Ratio = 1.0070 (PASS)
Gender classification for race=Latino_Hispanic: Accuracy = 0.7105, DI Ratio = 1.0150 (PASS)
Gender classification for race=Middle Eastern: Accuracy = 0.8361, DI Ratio = 1.1944 (PASS)
Gender classification for race=Southeast Asian: Accuracy = 0.6552, DI Ratio = 0.9360 (PASS)
Gender classification for race=White: Accuracy = 0.6559, DI Ratio = 0.9370 (PASS)

--- Gender Classification by Age ---
Overall gender classification accuracy: 0.7000
Skipping age=0-2: insufficient data (n=7)
Gender classification for age=3-9: Accuracy = 0.7049, DI Ratio = 1.0070 (PASS)
Gender classification for age=10-19: Accuracy = 0.

{'gender_classification': {'disparate_impact_by_race': {'race_accuracies': {'Black': 0.6578947368421053,
    'East Asian': 0.7066666666666667,
    'Indian': 0.7049180327868853,
    'Latino_Hispanic': 0.7105263157894737,
    'Middle Eastern': 0.8360655737704918,
    'Southeast Asian': 0.6551724137931034,
    'White': 0.6559139784946236},
   'overall_accuracy': 0.7,
   'threshold': 0.6688524590163936,
   'di_ratios': {'Black': 0.9398496240601505,
    'East Asian': 1.0095238095238095,
    'Indian': 1.0070257611241218,
    'Latino_Hispanic': 1.0150375939849625,
    'Middle Eastern': 1.1943793911007028,
    'Southeast Asian': 0.935960591133005,
    'White': 0.9370199692780338},
   'failing_groups': ['Black', 'Southeast Asian', 'White']},
  'disparate_impact_by_age': {'age_accuracies': {'3-9': 0.7049180327868853,
    '10-19': 0.5625,
    '20-29': 0.6855345911949685,
    '30-39': 0.7592592592592593,
    '40-49': 0.74,
    '50-59': 0.7352941176470589,
    '60-69': 0.8333333333333334},
   'over