# The following cell retrieves the current project directory and constructs the absolute path 
# to our raw data file (adult.data). This helps us ensure that our code is running in the correct location.

In [2]:
import os
import sys
import pandas as pd
from aif360.datasets import AdultDataset

# Resolve the directory the notebook is executed from; every data path used
# later is built relative to this location.
project_root = os.getcwd()

# Absolute location of the raw Adult data file, assembled in two steps:
# first the folder that holds it, then the file itself.
raw_data_dir = os.path.join(project_root, "data", "raw", "adult")
raw_data_file = os.path.join(raw_data_dir, "adult.data")

# Echo both paths so that a wrong working directory is easy to spot.
print("Project root directory:", project_root)
print("Looking for raw data file at:", raw_data_file)

Project root directory: /Users/stay-c/Desktop/AI_Fairness_Project
Looking for raw data file at: /Users/stay-c/Desktop/AI_Fairness_Project/data/raw/adult/adult.data


# In this step, if AIF360's AdultDataset cannot convert the data because of non-numeric values 
# (for example, the 'race' column contains the value 'Black'), we load the raw data manually.
# We drop rows with missing values to prevent errors in model training and to ensure that we have complete data.
# Next, categorical variables are converted to numerical codes using pd.Categorical().codes.
# This encoding is necessary because many machine learning algorithms require numerical input.
# Finally, we create a binary target column ('income_binary') and drop the original income column.

In [3]:
# Define paths for train.csv and the cleaned dataset.
train_csv = os.path.join(project_root, "data", "train.csv")
cleaned_csv = os.path.join(project_root, "data", "train_cleaned.csv")

# Build train.csv only when it is missing; later runs reuse the file on disk.
if not os.path.exists(train_csv):
    print("train.csv not found. Creating train.csv from raw Adult dataset...")
    try:
        # Preferred path: let AIF360 load and encode the Adult dataset.
        dataset = AdultDataset(protected_attribute_names=['sex'], features_to_drop=['fnlwgt'])
        data_df, label_names, protected_attribute_names = dataset.convert_to_dataframe()
    except Exception as e:
        # Deliberately broad: whatever makes AIF360 fail, we fall back to
        # reading and encoding the raw file ourselves.
        print("Error using AdultDataset:", e)
        print("Falling back to manual loading and encoding...")
        columns = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income-per-year'
        ]
        # The raw UCI file separates fields with ", ", so without
        # skipinitialspace every string value keeps a leading blank and the
        # missing-value marker is read as " ?" -- which na_values='?' does not
        # match, turning the dropna() below into a no-op.
        data_df = pd.read_csv(
            raw_data_file,
            header=None,
            names=columns,
            na_values='?',
            skipinitialspace=True,
        )
        # Keep only complete rows.
        data_df = data_df.dropna()
        categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                            'relationship', 'race', 'sex', 'native-country', 'income-per-year']
        # Replace each category with its integer code (categories are sorted,
        # so the codes are deterministic for a given set of values).
        for col in categorical_cols:
            data_df[col] = pd.Categorical(data_df[col]).codes
        # Any income code above 0 maps to the positive class.
        data_df['income_binary'] = (data_df['income-per-year'] > 0).astype(int)
        data_df = data_df.drop('income-per-year', axis=1)
        label_names = ['income_binary']
        protected_attribute_names = ['sex']
    data_df.to_csv(train_csv, index=False)
    print("train.csv created and saved at:", train_csv)
else:
    print("train.csv already exists.")


train.csv already exists.


# After creating train.csv (or confirming its existence), we load it and perform additional cleaning:
# - We remove the leakage feature "14_ <=50K" if it exists: it is the complement of the
#   "14_ >50K" target column, so keeping it would reveal the label to the model.
# - We ensure that the target column ('income_binary') and the protected attribute ('sex') are converted 
#   to numeric types.
# The final cleaned dataset is then saved as train_cleaned.csv.

In [4]:
# Load train.csv
data_df = pd.read_csv(train_csv)
print("Loaded train.csv with shape:", data_df.shape)

# Remove leakage feature if it exists (e.g., "14_ <=50K")
if "14_ <=50K" in data_df.columns:
    data_df = data_df.drop("14_ <=50K", axis=1)
    print("Leakage feature '14_ <=50K' removed.")
else:
    print("No leakage feature '14_ <=50K' found.")

# Select target column (use 'income_binary' if it exists, otherwise the
# one-hot style '14_ >50K' column).
target_column = 'income_binary' if 'income_binary' in data_df.columns else '14_ >50K'
if target_column not in data_df.columns:
    # Fail with an explicit message instead of an opaque KeyError raised from
    # the astype() call below.
    # NOTE(review): a train.csv produced by the AIF360 branch may name its
    # label column differently -- confirm and extend the candidates if needed.
    raise KeyError(
        "No target column found in train.csv: expected 'income_binary' or "
        "'14_ >50K', got columns {}".format(list(data_df.columns))
    )
data_df[target_column] = data_df[target_column].astype(int)

# Ensure that the protected attribute 'sex' is numeric.
if 'sex' in data_df.columns:
    data_df['sex'] = data_df['sex'].astype(int)

# Save the final cleaned dataset.
data_df.to_csv(cleaned_csv, index=False)
print("Cleaned dataset saved at:", cleaned_csv)
# Print the first rows as a quick visual check of the result.
print("Data sample:")
print(data_df.head())


Loaded train.csv with shape: (32561, 15)
No leakage feature '14_ <=50K' found.
Cleaned dataset saved at: /Users/stay-c/Desktop/AI_Fairness_Project/data/train_cleaned.csv
Data sample:
   age  workclass  fnlwgt  education  education-num  marital-status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0           1             1     4    1          2174             0   
1           4             0     4    1             0             0   
2           6             1     4    1             0             0   
3           6             0     2    1             0             0   
4          10             5     2    0  

# The final cell prints a sample of the cleaned data to verify that our transformations have been applied correctly.
# This sample provides a quick visual confirmation of the cleaning process.