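Pre-process the Adult dataset

Loads the original Adult CSV, keeps a subset of columns, drops rows with missing values, normalizes the income labels, and saves the result as adult_pp.csv.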

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Root of the privacy-ml repository checkout. Set the PRIVACY_ML_REPO
# environment variable to point at a different location; when it is unset the
# original hard-coded path is used, so existing setups keep working unchanged.
REPOSITORY_PATH = os.environ.get("PRIVACY_ML_REPO", "/home/ramongonze/phd/privacy-ml")

In [2]:
# Read the original Adult CSV from the repository; cells equal to "?" are
# parsed as NaN so they can be handled as missing values.
raw_csv_path = os.path.join(REPOSITORY_PATH, "data/adult/adult_original.csv")
df = pd.read_csv(raw_csv_path, na_values=["?"], encoding="utf-8")
display(df)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [3]:
# Columns kept for the rest of the notebook.
cols = [
    "age",
    "workclass",
    "occupation",
    "race",
    "sex",
    "education",
    "native-country",
    "marital-status",
    "income",
]
df = df[cols]

# Report, for each kept column, how many distinct values it has (NaN counts as
# a value here) followed by the values themselves.
print("Domain sizes")
for column in cols:
    distinct = df[column].unique().tolist()
    print(f"{column}: {len(distinct)}", f"values: {set(distinct)}\n", sep="\n")

Domain sizes
age: 74
values: {17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90}

workclass: 9
values: {'Self-emp-not-inc', 'Federal-gov', 'Without-pay', 'State-gov', 'Self-emp-inc', 'Never-worked', nan, 'Local-gov', 'Private'}

occupation: 15
values: {'Armed-Forces', 'Tech-support', 'Exec-managerial', 'Prof-specialty', 'Transport-moving', 'Adm-clerical', 'Handlers-cleaners', 'Sales', 'Craft-repair', 'Farming-fishing', 'Machine-op-inspct', nan, 'Other-service', 'Priv-house-serv', 'Protective-serv'}

race: 5
values: {'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Black', 'Other', 'White'}

sex: 2
values: {'Female', 'Male'}

education: 16
values: {'5th-6th', '12th', 'Doctorate', 'Bachelors', 'Assoc-voc', 'Assoc-acdm', 'Masters', '7th-8th', '10th', '

In [4]:
# Remove every row that contains at least one missing value, then report how
# many rows remain.
df = df.dropna(axis="index", how="any")
print(f"# rows after dropping nan values: {df.shape[0]}")

# rows after dropping nan values: 45222


In [5]:
# Normalize the income labels: some rows carry a trailing period
# ("<=50K.", ">50K."); map them onto the period-free labels so the column
# ends up with a single spelling per class.
income_label_fixes = {"<=50K.": "<=50K", ">50K.": ">50K"}
df["income"] = df["income"].replace(income_label_fixes)

display(df)

Unnamed: 0,age,workclass,occupation,race,sex,education,native-country,marital-status,income
0,39,State-gov,Adm-clerical,White,Male,Bachelors,United-States,Never-married,<=50K
1,50,Self-emp-not-inc,Exec-managerial,White,Male,Bachelors,United-States,Married-civ-spouse,<=50K
2,38,Private,Handlers-cleaners,White,Male,HS-grad,United-States,Divorced,<=50K
3,53,Private,Handlers-cleaners,Black,Male,11th,United-States,Married-civ-spouse,<=50K
4,28,Private,Prof-specialty,Black,Female,Bachelors,Cuba,Married-civ-spouse,<=50K
...,...,...,...,...,...,...,...,...,...
48836,33,Private,Prof-specialty,White,Male,Bachelors,United-States,Never-married,<=50K
48837,39,Private,Prof-specialty,White,Female,Bachelors,United-States,Divorced,<=50K
48839,38,Private,Prof-specialty,White,Male,Bachelors,United-States,Married-civ-spouse,<=50K
48840,44,Private,Adm-clerical,Asian-Pac-Islander,Male,Bachelors,United-States,Divorced,<=50K


In [6]:
# Summarize the cleaned frame: total row count, then the number of distinct
# values in each kept column.
print(f"# records after pre-processing: {df.shape[0]}")
print("Domain sizes after pre-processing:\n")
for column in cols:
    print(f"{column}: {len(df[column].unique())}")

# records after pre-processing: 45222
Domain sizes after pre-processing:

age: 74
workclass: 7
occupation: 14
race: 5
sex: 2
education: 16
native-country: 41
marital-status: 7
income: 2


In [7]:
# Write the pre-processed frame to the repository's data folder, leaving the
# row index out of the file.
output_csv_path = os.path.join(REPOSITORY_PATH, "data/adult/adult_pp.csv")
df.to_csv(output_csv_path, index=False, encoding="utf-8")