### Bias & Fairness in Data: Distribution Check
**Description**: Load the Adult Income dataset and check for representation bias by analyzing the distribution of gender across different income levels.

In [None]:
# write your code from here

In [1]:
import pandas as pd
import numpy as np

# Location of the raw Adult Income training split on the UCI repository.
adult_income_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# The raw file has no header row, so the column names are supplied explicitly.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

# The only columns the gender-vs-income check depends on.
analysis_columns = ['sex', 'income']


def load_adult_income(source, names=column_names):
    """Read the headerless, comma-separated Adult Income file into a DataFrame.

    Parameters
    ----------
    source : str or file-like
        URL, path, or open text buffer accepted by ``pd.read_csv``.
    names : list of str
        Column names to assign, in file order.

    Returns
    -------
    pandas.DataFrame
        One row per record; fields equal to ``?`` are loaded as NaN.
    """
    # Fields are written as "value, value, ...". Skipping the blank after each
    # comma leaves clean values, so the missing-value marker is exactly "?"
    # and no regex separator (and slower Python engine) is needed.
    return pd.read_csv(
        source,
        sep=",",
        skipinitialspace=True,
        header=None,
        names=names,
        na_values=["?"],
    )


def summarize_missing(frame):
    """Return missing counts and percentages for columns that have any gaps.

    The result is indexed by column name, has the columns 'Missing Count' and
    'Missing Percentage (%)', and is sorted by count in descending order. It
    is empty when no column contains a missing value.
    """
    missing_values = frame.isnull().sum()
    report = pd.DataFrame({
        'Missing Count': missing_values,
        'Missing Percentage (%)': missing_values / len(frame) * 100,
    })
    return report[report['Missing Count'] > 0].sort_values(
        by='Missing Count', ascending=False
    )


def prepare_for_bias_analysis(frame, columns=analysis_columns):
    """Return a cleaned copy of ``frame`` for the gender/income analysis.

    Only rows missing a value in one of ``columns`` are removed. Rows with gaps
    in unrelated columns are kept on purpose: discarding them would change the
    very distribution this notebook is trying to measure. Text values in
    ``columns`` are stripped of surrounding whitespace. ``frame`` itself is
    not modified.
    """
    present = [col for col in columns if col in frame.columns]
    cleaned = frame.dropna(subset=present).copy()
    for col in present:
        values = cleaned[col]
        # Covers both the classic object dtype and pandas' dedicated string dtype.
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            cleaned[col] = values.str.strip()
    return cleaned


def gender_income_tables(frame):
    """Cross-tabulate 'sex' against 'income'.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(counts, pct_within_income, pct_within_gender)`` where the second
        table is normalised per income column and the third per gender row,
        both expressed in percent.
    """
    counts = pd.crosstab(frame['sex'], frame['income'])
    pct_within_income = pd.crosstab(frame['sex'], frame['income'], normalize='columns') * 100
    pct_within_gender = pd.crosstab(frame['sex'], frame['income'], normalize='index') * 100
    return counts, pct_within_income, pct_within_gender


print("Loading the Adult Income dataset...")
try:
    raw_df = load_adult_income(adult_income_url)
except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as e:
    # Only download/parse failures are handled here, so a genuine bug in the
    # analysis below surfaces with its own traceback instead of being
    # misreported as a URL problem.
    print(f"An error occurred: {e}")
    print("Please ensure the dataset URL is correct and accessible.")
else:
    print("Dataset loaded successfully.")
    print("\nFirst 5 rows of the dataset:")
    print(raw_df.head())

    print("\n--- Initial Data Information ---")
    raw_df.info()

    print("\n--- Handling Missing Values (if any) ---")
    missing_info = summarize_missing(raw_df)
    if not missing_info.empty:
        print("Columns with missing values (before cleaning for bias analysis):")
        print(missing_info)
    else:
        print("No missing values found in the dataset.")

    # `df` is the cleaned frame used for the analysis; `raw_df` stays untouched
    # so the output printed above remains an accurate view of the loaded data.
    df = prepare_for_bias_analysis(raw_df)
    rows_dropped = len(raw_df) - len(df)
    print(f"\nRows dropped because 'sex' or 'income' was missing: {rows_dropped} of {len(raw_df)}.")
    print("Rows with gaps only in other columns are kept so the gender/income distribution is not distorted.")

    print("\n--- Analyzing Gender Distribution Across Income Levels ---")

    (
        gender_income_distribution,
        gender_income_percentage_by_income,
        gender_income_percentage_by_gender,
    ) = gender_income_tables(df)

    print("\nAbsolute counts of gender across income levels:")
    print(gender_income_distribution)

    # Share of each gender inside an income bracket (columns sum to 100).
    print("\nPercentage of each gender within each income level:")
    print(gender_income_percentage_by_income.round(2))

    # Share of each income bracket inside a gender (rows sum to 100).
    print("\nPercentage of income levels within each gender:")
    print(gender_income_percentage_by_gender.round(2))

    print("\n--- Interpretation of Potential Bias ---")
    print("By examining the percentages above, we can observe the representation of 'Male' and 'Female' individuals in both income categories ('>50K' and '<=50K').")
    print("A significant difference in these percentages between genders for the '>50K' income level could indicate a representation bias.")
    print("For example, if a much higher percentage of males earn '>50K' compared to females, it suggests a potential bias in income distribution related to gender in this dataset.")



Loading the Adult Income dataset...
Dataset loaded successfully.

First 5 rows of the dataset:
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174  