In [1]:
import pandas as pd
import numpy as np
import dataset
import lattice
import flash

---
# Examples in paper

In [2]:
columns1 = ['Age', 'Gender', 'ZipCode']
df1 = pd.DataFrame([[34, 'male', 81667], \
                    [45, 'female', 81675], \
                    [66, 'male', 81925], \
                    [70, 'female', 81931], \
                    [34, 'female', 81931], \
                    [70, 'male', 81931], \
                    [45, 'male', 81931]], columns=columns1)


In [3]:
# QID is all the attributes
D1 = dataset.Dataset(df1)

In [4]:
# Hierarchies for Age
h1, d1 = D1.lat.createNumericalHierarchies(0, 3)
D1.lat.addNewHierarchy(0, h1, d1, numerical=True)

# Hierarchies for Gender
D1.lat.addNewHierarchy(1, {0:2, 1:2}, ['*'])

# Hierarchies for ZipCode
D1.lat.addNewHierarchy(2, {0:4, 1:5, 2:6, 3:7}, ['8166*', '8167*', '8192*', '8193*'])
D1.lat.addNewHierarchy(2, {4:8, 5:8, 6:9, 7:9}, ['816**', '819**'])
D1.lat.addNewHierarchy(2, {8:10, 9:10}, ['81***'])
D1.lat.addNewHierarchy(2, {10:11}, ['8****'])
D1.lat.addNewHierarchy(2, {11:12}, ['*'])

In [5]:
D1.lat.hierarchies

[2, 1, 5]

In [6]:
localOptimuns1 = flash.flash(D1, 2)
print('Local optimum nodes: ' + str(localOptimuns1))

Local optimum nodes: [(2, 0, 5), (1, 1, 5), (2, 0, 3), (2, 1, 2), (1, 1, 2)]


In [7]:
genDF1 = D1.generalizeDataset((1,1,2))
genDF1

Unnamed: 0,Age,Gender,ZipCode
0,"[34.00, 52.00)",*,816**
1,"[34.00, 52.00)",*,816**
2,"[52.00, 70.00]",*,819**
3,"[52.00, 70.00]",*,819**
4,"[34.00, 52.00)",*,819**
5,"[52.00, 70.00]",*,819**
6,"[34.00, 52.00)",*,819**


---
# Adults dataset

In [8]:
columns2 = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', \
           'marital-status', 'occupation', 'relationship', 'race', 'sex', \
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', '<=50K']
df2 = pd.read_csv('adult/adult.data', index_col=False, names=columns2)

In [9]:
qid2 = ['age', 'workclass', 'education', 'race', 'sex']
D2 = dataset.Dataset(df2[qid2])

In [10]:
# Hierarchies for age
h2, d2 = D2.lat.createNumericalHierarchies(0, 4)
D2.lat.addNewHierarchy(0,h2,d2,numerical=True)

# Hierarchies for workclass
D2.lat.addNewHierarchy(1, dict([(i,9) for i in np.arange(9)]), ['*'])

# Hierarchies for education
D2.lat.addNewHierarchy(2, dict([(i,9) for i in np.arange(16)]), ['*'])

# Hierarchies for race
D2.lat.addNewHierarchy(3, {0:5, 1:5, 2:5, 3:5, 4:5}, ['*'])

# Hierarchies for sex
D2.lat.addNewHierarchy(4,{0:2, 1:2},['*'])

In [11]:
D2.lat.hierarchies

[3, 1, 1, 1, 1]

In [12]:
localOptimuns2 = flash.flash(D2, 2)
print('Local optimum nodes: ' + str(localOptimuns2))

Local optimum nodes: [(3, 1, 0, 1, 1), (3, 1, 0, 0, 1), (3, 0, 1, 1, 1), (3, 1, 0, 1, 0), (3, 1, 1, 0, 0), (2, 1, 0, 1, 1), (2, 1, 1, 0, 1), (2, 1, 0, 1, 0), (1, 1, 0, 1, 1), (1, 1, 1, 1, 0), (2, 0, 1, 1, 0), (2, 1, 1, 0, 0), (1, 1, 1, 0, 1)]


In [13]:
genDF2 = D2.generalizeDataset((1,1,1,0,1))
genDF2

Unnamed: 0,age,workclass,education,race,sex
0,"[35.00, 53.00)",*,Bachelors,White,*
1,"[35.00, 53.00)",*,Bachelors,White,*
2,"[35.00, 53.00)",*,Bachelors,White,*
3,"[53.00, 71.00)",*,Bachelors,Black,*
4,"[17.00, 35.00)",*,Bachelors,Black,*
...,...,...,...,...,...
32556,"[17.00, 35.00)",*,Bachelors,White,*
32557,"[35.00, 53.00)",*,Bachelors,White,*
32558,"[53.00, 71.00)",*,Bachelors,White,*
32559,"[17.00, 35.00)",*,Bachelors,White,*
