In [1]:
import pandas as pd
import json

def make_categorical(df):
    """Return a copy of ``df`` in which every column holds integer category codes.

    Each column is cast to the pandas ``category`` dtype and then replaced by
    its ``.cat.codes``. The input frame itself is left unmodified because all
    work is done on a copy.

    Note: the codes are whatever pandas assigns for the column's categories;
    per the pandas documentation, missing values receive the code ``-1``.
    """
    encoded = df.copy()

    # one pass per column: cast to 'category' and keep only the integer codes
    for name in encoded.columns:
        encoded[name] = encoded[name].astype('category').cat.codes

    return encoded

In [2]:
# download FIRE dataset from https://github.com/usnistgov/Differential-Privacy-Synthetic-Data-Challenge-assets/tree/master/2018%20DP%20Synthetic%20Data%20Challenge%20Competitor%20Packs/DeID1_Match_2_competitor%20pack
# The file is read from the relative path 'fire_raw.csv', so it has to be
# present in the directory the notebook is run from.
fire = pd.read_csv('fire_raw.csv')
# display the loaded frame as the cell output
fire

Unnamed: 0,ALS Unit,Final Priority,Call Type Group,Original Priority,Priority,City,Unit Type,Fire Prevention District,Battalion,Supervisor District,...,Hospital DtTm,Location - Lng,Number of Alarms,Available DtTm,Unit sequence in call dispatch,Location - Lat,Call Date,Unit ID,Box,Address
0,0,0,0,0,0,0,0,0,0,0,...,,-122.426379,1,1.451636e+09,2,37.776688,1451635200,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1.451639e+09,-122.426379,1,1.451640e+09,1,37.776688,1451635200,1,0,0
2,1,0,1,0,0,0,1,1,1,1,...,,-122.394748,1,1.451636e+09,1,37.794480,1451635200,2,1,1
3,1,1,2,1,1,0,2,2,2,2,...,,-122.409572,1,1.451636e+09,1,37.747553,1451635200,3,2,2
4,0,1,3,1,1,0,3,3,3,3,...,,-122.488371,1,1.451636e+09,3,37.731150,1451635200,4,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305128,1,1,0,1,1,0,1,0,0,2,...,1.483347e+09,-122.420910,1,1.483348e+09,2,37.765792,1483257600,104,630,5023
305129,1,1,0,1,1,0,2,0,0,2,...,,-122.420910,1,1.483344e+09,1,37.765792,1483257600,19,630,5023
305130,1,1,0,1,1,0,2,0,0,2,...,,-122.420910,1,1.483344e+09,5,37.765792,1483257600,61,630,5023
305131,1,1,0,1,1,0,6,0,0,2,...,,-122.420910,1,1.483345e+09,3,37.765792,1483257600,77,630,5023


In [3]:
# restrict the FIRE data to the columns treated as categorical
cat_cols = [
    'Call Type',
    'Call Final Disposition',
    'City',
    'Zipcode of Incident',
    'Battalion',
    'Station Area',
    'Priority',
    'ALS Unit',
    'Call Type Group',
    'Number of Alarms',
]
# integer-encode the selected columns
df_cat = make_categorical(fire[cat_cols])

# persist the encoded dataset; the row index is left out of the CSV
df_cat.to_csv('fire_cat.csv', index=False)

df_cat

Unnamed: 0,Call Type,Call Final Disposition,City,Zipcode of Incident,Battalion,Station Area,Priority,ALS Unit,Call Type Group,Number of Alarms
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,1,1,0,1,1,0
3,1,2,0,2,2,2,1,1,2,0
4,2,3,0,3,3,3,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...
305128,0,0,0,6,0,6,1,1,0,0
305129,0,0,0,6,0,6,1,1,0,0
305130,0,0,0,6,0,6,1,1,0,0
305131,0,0,0,6,0,6,1,1,0,0


## Extract metadata

In [4]:
# sys.path hack: put the directory two levels up at the front of the import path so that `audit` can be imported
import sys; sys.path.insert(0, '../..')
from audit.utils import conv_to_cat

def get_metadata(df):
    """Describe every column of ``df`` as a categorical attribute.

    ``df`` is first passed through ``conv_to_cat`` (imported from
    ``audit.utils``; what exactly it converts is not visible in this notebook).

    Returns a dict with a single ``'columns'`` key. Its value is a list with
    one entry per column, in column order, each entry holding:

    * ``'name'`` - the column label,
    * ``'type'`` - always the string ``'Categorical'``,
    * ``'i2s'``  - the column's distinct values as a list, in the order
      ``Series.unique()`` returns them.
    """
    converted = conv_to_cat(df)

    column_specs = []
    for col in converted.columns:
        distinct_values = list(converted[col].unique())
        column_specs.append({
            'name': col,
            'type': 'Categorical',
            'i2s': distinct_values,
        })

    return {'columns': column_specs}

# describe the encoded columns (name, type and distinct values per column)
metadata = get_metadata(df_cat)

# persist the metadata as JSON in 'fire_cat.json'
with open('fire_cat.json', 'w') as f:
    json.dump(metadata, f)

# display the metadata dict as the cell output
metadata

2023-12-29 01:57:18.715082: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
non-resource variables are not supported in the long term


  from .autonotebook import tqdm as notebook_tqdm




{'columns': [{'name': 'Call Type',
   'type': 'Categorical',
   'i2s': ['0',
    '1',
    '2',
    '3',
    '4',
    '5',
    '6',
    '7',
    '8',
    '9',
    '10',
    '11',
    '12',
    '13',
    '14',
    '15',
    '16',
    '17',
    '18',
    '19',
    '20',
    '21',
    '22',
    '23',
    '24',
    '25',
    '26',
    '27']},
  {'name': 'Call Final Disposition',
   'type': 'Categorical',
   'i2s': ['0',
    '1',
    '2',
    '3',
    '4',
    '5',
    '6',
    '7',
    '8',
    '9',
    '10',
    '11',
    '12',
    '13',
    '14']},
  {'name': 'City',
   'type': 'Categorical',
   'i2s': ['0', '1', '2', '3', '4', '5', '6', '7', '8']},
  {'name': 'Zipcode of Incident',
   'type': 'Categorical',
   'i2s': ['0',
    '1',
    '2',
    '3',
    '4',
    '5',
    '6',
    '7',
    '8',
    '9',
    '10',
    '11',
    '12',
    '13',
    '14',
    '15',
    '16',
    '17',
    '18',
    '19',
    '20',
    '21',
    '22',
    '23',
    '24',
    '25',
    '26',
    '27']},
  {'na

## Calculate vulnerability for each record

In [9]:
import numpy as np
# sys.path hack: put the directory two levels up at the front of the import path so that `attacks` can be imported
import sys; sys.path.insert(0, '../..')
from attacks.utils import onehot_encode

# collect the distinct values of every encoded column up front
full_uniq_vals = {
    col: df_cat[col].unique().tolist()
    for col in df_cat.columns
}

# one-hot encode the frame, then cast it to an integer numpy matrix
df_np = onehot_encode(df_cat, full_uniq_vals).astype(int).to_numpy()

# dump the matrix as tab-separated integers
np.savetxt('df_np.txt', df_np, delimiter='\t', fmt='%d')
df_np.shape

(305133, 157)