### Bias & Fairness in Data: Distribution Check
**Description**: Load the Adult Income dataset and check for representation bias by analyzing the distribution of gender across different income levels.

In [None]:
# write your code from here
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Adult Income dataset.
# The path is relative to the current working directory; adjust it if the
# CSV is stored somewhere else.
try:
    df = pd.read_csv('adult.csv')
except FileNotFoundError as err:
    # Re-raise instead of printing and calling exit(): exit() is the
    # interactive-session helper, and in the recorded run the code after it
    # still executed and failed with a misleading "NameError: name 'df' is
    # not defined". Raising halts execution here with the real cause.
    raise FileNotFoundError(
        "Error: adult.csv not found. Please make sure the file is in the correct directory."
    ) from err
else:
    # Only reached when the CSV was read without error.
    print("Adult Income dataset loaded successfully.\n")

# --- Step 1: Explore Gender and Income Columns ---
# Show the distinct labels present in the two columns this analysis relies
# on, so unexpected categories are visible before any counting happens.
print("--- Exploring Gender and Income Columns ---")

print("Unique values in 'sex' column:")
sex_labels = df['sex'].unique()
print(sex_labels)

print("\nUnique values in 'income' column:")
income_labels = df['income'].unique()
print(income_labels)

# Give both columns more descriptive names (optional); the rename is applied
# to the existing DataFrame in place.
column_renames = {'sex': 'gender', 'income': 'income_level'}
df.rename(columns=column_renames, inplace=True)

# --- Step 2: Analyze Gender Distribution Across Income Levels ---
print("\n--- Analyzing Gender Distribution Across Income Levels ---")

# Contingency table: rows are genders, columns are income levels, cells are
# the number of records with that combination.
gender_income_counts = pd.crosstab(df['gender'], df['income_level'])
print("\nCounts of Gender across Income Levels:")
print(gender_income_counts)

# Proportion of each gender WITHIN each income level: divide every column by
# its own total so that each income-level column sums to 1. (Normalising the
# rows instead would give the income split within each gender, which is what
# the grouped chart at the end of Step 3 shows.)
income_level_totals = gender_income_counts.sum(axis=0)
gender_income_proportions = gender_income_counts.div(income_level_totals, axis=1)
print("\nProportion of Gender within each Income Level:")
print(gender_income_proportions)

# --- Step 3: Visualize the Distribution ---
print("\n--- Visualizing the Distribution ---")

# Stacked bar chart of raw counts: one bar per gender, segmented by income level.
gender_income_counts.plot(kind='bar', stacked=True, figsize=(8, 6))
plt.title('Gender Distribution by Income Level (Counts)')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.legend(title='Income Level')
plt.tight_layout()
plt.show()

# Stacked bar chart of the gender make-up of each income level. The table is
# transposed so there is one bar per income level, segmented by gender, and
# every bar stacks to 1.
gender_income_proportions.T.plot(kind='bar', stacked=True, figsize=(8, 6))
plt.title('Gender Distribution by Income Level (Proportions)')
plt.xlabel('Income Level')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.legend(title='Gender')
plt.tight_layout()
plt.show()

# Grouped bar chart of the complementary view: for each gender, the share of
# its records that falls into each income level.
income_share_by_gender = (
    df.groupby('gender')['income_level'].value_counts(normalize=True).unstack()
)
income_share_by_gender.plot(kind='bar', figsize=(8, 6))
plt.title('Income Level Distribution by Gender (Proportions)')
plt.xlabel('Gender')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.legend(title='Income Level')
plt.tight_layout()
plt.show()

# --- Step 4: Interpretation of Potential Representation Bias ---
# Fixed guidance text on how to read the tables and charts above; each entry
# is printed on its own line, in order.
interpretation_lines = (
    "\n--- Interpretation of Potential Representation Bias ---",
    "By analyzing the counts and proportions, we can observe the representation of each gender within different income levels.",
    "If the proportion of one gender is significantly lower in the higher income level compared to the lower income level (or vice versa),",
    "it might indicate a potential representation bias in the dataset.",
    "\nFor example, if the proportion of 'Female' individuals earning '>50K' is considerably lower than the proportion of 'Male' individuals",
    "in the same income bracket, this could suggest a lack of representation or potential bias in the data collection or societal factors reflected in the data.",
    "\nFurther statistical analysis (e.g., Chi-squared test for independence) can be used to determine if the observed differences are statistically significant.",
)
for interpretation_line in interpretation_lines:
    print(interpretation_line)

Error: adult.csv not found. Please make sure the file is in the correct directory.
--- Exploring Gender and Income Columns ---
Unique values in 'sex' column:


NameError: name 'df' is not defined

: 