In [None]:
import pandas as pd
# Load the raw dataset; cells containing '?' are parsed as missing values (NaN).
df = pd.read_csv(
    filepath_or_buffer='adult.csv',
    na_values=['?'],
)
df

In [None]:
def _round_to_nearest_ten(value):
    """Return `value` rounded to a multiple of 10 using the built-in ``round``.

    ``round`` resolves exact ties to the even neighbour, so a value of 25 maps
    to 20 while 35 maps to 40.
    """
    return round(value / 10) * 10


# `fnlwgt` is a quasi-identifier, `educational-num` repeats `education`, and
# `capital-gain` / `capital-loss` are continuous columns, so none of them are
# kept in the categorical view.
remove_cols = ['fnlwgt', 'educational-num', 'capital-gain', 'capital-loss']
cols_of_interest = [col for col in df.columns if col not in remove_cols]

# Build `df_cat` as a new frame in a single chain instead of assigning columns
# onto the result of `dropna()`; this avoids pandas' SettingWithCopyWarning and
# never writes into an object derived from `df`.
df_cat = (
    df.dropna()  # drop rows with a missing value in ANY column of `df`
    .assign(**{
        # bin `age` and `hours-per-week` to the nearest 10
        'age': lambda frame: frame['age'].apply(_round_to_nearest_ten),
        'hours-per-week': lambda frame: frame['hours-per-week'].apply(_round_to_nearest_ten),
    })
)[cols_of_interest]

# Persist the categorical view without the index column.
df_cat.to_csv('adult_cat.csv', index=False)

df_cat

## Extract metadata

In [None]:
# sys path hack
import sys; sys.path.insert(0, '../..')
from audit.utils import conv_to_cat
import json

def get_metadata(df):
    """Describe every column of `df` as a categorical variable.

    The frame is first passed through `conv_to_cat`; each column of the result
    is then listed with its name, the fixed type ``'Categorical'`` and its
    distinct values (``i2s``) as returned by ``unique()``.

    Args:
        df: pandas DataFrame to describe.

    Returns:
        dict with a single key ``'columns'`` holding one
        ``{'name', 'type', 'i2s'}`` dict per column, in column order.
    """
    df = conv_to_cat(df)
    return {
        'columns': [
            {
                'name': col,
                'type': 'Categorical',
                # `.tolist()` (instead of `list(...)`) converts the values to
                # native Python scalars, so numeric columns stay serialisable
                # when the metadata is written out as JSON.
                'i2s': df[col].unique().tolist()
            }
            for col in df.columns
        ]
    }

# Derive the column metadata for the categorical frame.
metadata = get_metadata(df_cat)

# Store the metadata as JSON alongside the exported CSV.
with open('adult_cat.json', 'w') as metadata_file:
    metadata_file.write(json.dumps(metadata))

metadata

## Calculate vulnerabilities of each record

In [None]:
import numpy as np
# sys path hack
import sys; sys.path.insert(0, '../..')
from attacks.utils import get_vuln

# Collect, once up front, the distinct values observed in every column.
full_uniq_vals = {
    column: df_cat[column].unique().tolist()
    for column in df_cat.columns
}

# Compute the per-record vulnerabilities and save them as plain text.
vulns = get_vuln(df_cat, full_uniq_vals, show_progress=True)
np.savetxt('vulns.txt', vulns)
vulns