In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from sklearn import preprocessing, pipeline, ensemble, compose
import datasets
import os

# --- dataset locations -------------------------------------------------------
# Real data: a directory whose entries are loaded individually in the next cell.
real_path = '/mnt/data/sonia/datasets/adult/may8.dat'

# Three saved synthetic datasets; they are loaded separately and concatenated.
synth_path = [
    '/mnt/data/sonia/datasets/synthetic/adult/may10.dat',
    '/mnt/data/sonia/datasets/synthetic/adult/may10.dat-1',
    './may10.dat',
]

# Saved dataset that becomes the 'OH llama' frame (relative to the working directory).
oh_path = 'vanilla-adult-clean.dat'

# CSV that becomes the 'gan' frame.
gan_path = '/home/sonia/TabFairGAN/fake_adult.csv'

# Directory with one sub-directory per checkpoint; each may contain a saved
# dataset called `grid_name`.
grid_path = '/mnt/data/sonia/ckpts/grid2/'
grid_name = 'synth.dat'

In [2]:
# Load every entry of the real-data directory into one DatasetDict, keyed by the
# entry name; entries ending in `.json` are skipped.
full_dataset = datasets.DatasetDict({})
for f in os.listdir(real_path):
    if not f.endswith('.json'):
        full_dataset[f] = datasets.load_from_disk(os.path.join(real_path, f))

# The 'train' split, minus its `length` column, is the real reference frame.
real = full_dataset['train'].to_pandas().drop(columns=['length'])

# Load the three synthetic datasets and stack them row-wise.
syntha, synthb, synthc = [
    datasets.load_from_disk(synth_path[i]).to_pandas() for i in range(3)
]
synth = pd.concat([syntha, synthb, synthc])
# Columns are renamed by position — assumes the synthetic frames store their
# columns in the same order as `real`; TODO confirm.
synth.columns = real.columns

# Load the GAN-generated CSV, keeping only the columns present in `real`
# (in `real`'s order), then normalise its string formatting — presumably so the
# values line up with the other datasets; verify against the real data.
gan = pd.read_csv(gan_path)[real.columns]

# Vectorised column operations instead of row-wise `DataFrame.apply(axis=1)`:
# same values, far fewer Python-level calls, and well-defined on an empty frame.
# strip surrounding whitespace and turn hyphens into spaces
gan['native-country'] = gan['native-country'].str.strip().str.replace('-', ' ', regex=False)
# age is handled as a string from here on
gan['age'] = gan['age'].astype(str)
gan['sex'] = gan['sex'].str.strip()
gan['education'] = gan['education'].str.strip()
gan['occupation'] = gan['occupation'].str.strip().str.replace('-', ' ', regex=False)

# Rename the income labels. Anything outside `inc_map` is an error (as it was
# with the previous dict lookup) rather than being silently mapped to NaN.
inc_map = {' <=50K': 'under 50K', ' >50K': 'over 50K'}
unknown_income = set(gan['income'].unique()) - set(inc_map)
if unknown_income:
    raise KeyError(f'unexpected income labels in GAN data: {sorted(unknown_income, key=str)}')
gan['income'] = gan['income'].map(inc_map)

# Load the saved dataset at `oh_path`, discard the `__index_level_0__` column,
# and rename the remaining columns by position to match `real`.
ohl = (
    datasets.load_from_disk(oh_path)
    .to_pandas()
    .drop(columns='__index_level_0__')
)
ohl.columns = real.columns

# Collect the synthetic dataset of every checkpoint found under `grid_path`.
grids = {}
grid_size = 5000
min_grid_size = 1e10  # sentinel; lowered to the smallest accepted grid dataset
for ckpt_name in os.listdir(grid_path):
    ckpt_dataset_dir = os.path.join(grid_path, ckpt_name, grid_name)
    # No dataset_info.json yet -> skip (original note: "since still training").
    if not os.path.exists(os.path.join(ckpt_dataset_dir, 'dataset_info.json')):
        continue
    df = datasets.load_from_disk(ckpt_dataset_dir).to_pandas()
    # skip ones not nearly completed
    if len(df) <= 3000:
        continue
    grids[ckpt_name] = df
    grids[ckpt_name].columns = real.columns
    min_grid_size = min(min_grid_size, len(grids[ckpt_name]))

# Down-sample every source to the row count of the smallest one, using one
# shared seed, so that all datasets end up the same size.
min_dataset_size = min(len(synth), len(gan), len(real), len(ohl), min_grid_size)
print('sampling all datasets to smallest size of', min_dataset_size)
rs = 30
synth, gan, real, ohl = [
    frame.sample(n=min_dataset_size, random_state=rs)
    for frame in (synth, gan, real, ohl)
]
grids = {
    name: frame.sample(n=min_dataset_size, random_state=rs)
    for name, frame in grids.items()
}
# shapes shown as the cell output
synth.shape, real.shape, gan.shape, ohl.shape

sampling all datasets to smallest size of 5000


((5000, 6), (5000, 6), (5000, 6), (5000, 6))

In [3]:
# Every data source under a display name: the four fixed sources first, then
# one entry per grid checkpoint (a grid key equal to a fixed name would win).
datadict = {
    'real': real,
    'llama': synth,
    'gan': gan,
    'OH llama': ohl,
    **grids,
}
datadict.keys()

dict_keys(['real', 'llama', 'gan', 'OH llama', 'r16_a8_lr1e-3_wd1e-3', 'r64_a8_lr1e-3_wd1e-3', 'r64_a8_lr1e-4_wd1e-5', 'r64_a16_lr1e-5_wd1e-5', 'r64_a16_lr1e-5_wd0', 'r64_a8_lr1e-4_wd1e-3', 'r16_a8_lr1e-3_wd0', 'r64_a8_lr1e-5_wd1e-3', 'r64_a32_lr1e-5_wd0', 'r64_a8_lr1e-3_wd1e-5', 'r64_a8_lr1e-4_wd0', 'r64_a32_lr1e-3_wd0', 'r64_a8_lr1e-5_wd1e-5', 'r64_a16_lr1e-4_wd1e-3', 'r64_a16_lr1e-3_wd0', 'r64_a8_lr1e-5_wd0', 'r64_a16_lr1e-3_wd1e-5', 'r64_a32_lr1e-3_wd1e-3', 'r64_a32_lr1e-5_wd1e-5', 'r16_a8_lr1e-3_wd1e-5', 'r64_a8_lr1e-3_wd0', 'r64_a32_lr1e-3_wd1e-5', 'r64_a32_lr1e-4_wd0', 'r64_a16_lr1e-4_wd0', 'r64_a16_lr1e-4_wd1e-5', 'r64_a16_lr1e-5_wd1e-3', 'r64_a16_lr1e-3_wd1e-3'])

In [4]:
# Column groups. `categories` below is built by iterating `ords`, so its i-th
# entry belongs to the i-th column listed here.
ords = ['sex', 'native-country', 'education', 'occupation'] # MUST BE IN ORDER
nums = ['age',]
labs = ['income']

# For each ordinal column, collect every value occurring in ANY dataset that is
# later trained or scored on (all of `datadict`), not just real/synth. The
# encoder is given an explicit category list and keeps the default
# handle_unknown='error', so a value seen only in e.g. the GAN or a grid
# dataset would otherwise raise.
# The values are sorted so the ordinal codes do not depend on set iteration
# order, which for strings changes between interpreter sessions.
categories = []
for name in ords:
    s = set()
    for frame in datadict.values():
        s.update(frame[name].unique().tolist())

    categories.append( sorted(s, key=str) )
# categories

# Shared, unfitted preprocessors used when building the per-dataset pipelines.
ordenc = preprocessing.OrdinalEncoder(categories=categories)
numenc = preprocessing.StandardScaler()
lb = preprocessing.LabelBinarizer()

In [5]:
def create_pipeline(trainset, random_state=None):
    """Fit a preprocessing + random-forest pipeline on one dataset.

    The `ords` columns go through the shared ordinal encoder and the `nums`
    columns through the shared standard scaler; the result feeds a
    RandomForestClassifier. Labels come from the `labs` column, binarized with
    the module-level `lb`.

    Side effect: `lb` is refit on `trainset`'s labels on every call.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Must contain the columns named in `ords`, `nums` and `labs`.
    random_state : int or None, optional
        Seed forwarded to the random forest. The default (None) keeps the
        previous unseeded behaviour; pass an int for repeatable fits.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.
    """
    rfc = ensemble.RandomForestClassifier(random_state=random_state)
    preprocessing_pipeline = compose.ColumnTransformer([
        ("ordinal_preprocessor", ordenc, ords),
        ("numerical_preprocessor", numenc, nums),
    ])
    complete_pipeline = pipeline.Pipeline([
        ("preprocessor", preprocessing_pipeline),
        ("estimator", rfc)
    ])

    preprocessed_labels = lb.fit_transform(trainset[labs])
    # A single-column label matrix is flattened to 1-D: the forest expects
    # shape (n_samples,) and warns about column vectors. A multi-column result
    # is passed through unchanged.
    if preprocessed_labels.ndim == 2 and preprocessed_labels.shape[1] == 1:
        preprocessed_labels = preprocessed_labels.ravel()
    complete_pipeline.fit(trainset[ords+nums], preprocessed_labels)
    return complete_pipeline

In [6]:
%%capture
# Train one random-forest pipeline per data source, stored under the same key
# the source has in `datadict`.
rfdict = {}
for src, trainset in datadict.items():
    rfdict[src] = create_pipeline(trainset)

In [7]:
# Score every fitted model on every dataset and print one line per pair.
# NOTE(review): `lb` is refit on each evaluation dataset, so the 0/1 encoding is
# derived from that dataset's own label values — confirm every source contains
# the same set of income labels.
for data, eval_frame in datadict.items():
    labels = lb.fit_transform(eval_frame[labs])
    features = eval_frame[ords+nums]
    for model, fitted_pipeline in rfdict.items():
        score = fitted_pipeline.score(features, labels)
        print(f'{model} on {data}: \t\t\t{score}')

real on real: 			0.9164
llama on real: 			0.7702
gan on real: 			0.7706
OH llama on real: 			0.771
r16_a8_lr1e-3_wd1e-3 on real: 			0.7264
r64_a8_lr1e-3_wd1e-3 on real: 			0.6926
r64_a8_lr1e-4_wd1e-5 on real: 			0.7642
r64_a16_lr1e-5_wd1e-5 on real: 			0.7708
r64_a16_lr1e-5_wd0 on real: 			0.7646
r64_a8_lr1e-4_wd1e-3 on real: 			0.7674
r16_a8_lr1e-3_wd0 on real: 			0.7192
r64_a8_lr1e-5_wd1e-3 on real: 			0.7694
r64_a32_lr1e-5_wd0 on real: 			0.7596
r64_a8_lr1e-3_wd1e-5 on real: 			0.6688
r64_a8_lr1e-4_wd0 on real: 			0.7662
r64_a32_lr1e-3_wd0 on real: 			0.7324
r64_a8_lr1e-5_wd1e-5 on real: 			0.7696
r64_a16_lr1e-4_wd1e-3 on real: 			0.7614
r64_a16_lr1e-3_wd0 on real: 			0.6934
r64_a8_lr1e-5_wd0 on real: 			0.7702
r64_a16_lr1e-3_wd1e-5 on real: 			0.6288
r64_a32_lr1e-3_wd1e-3 on real: 			0.7162
r64_a32_lr1e-5_wd1e-5 on real: 			0.7718
r16_a8_lr1e-3_wd1e-5 on real: 			0.7508
r64_a8_lr1e-3_wd0 on real: 			0.7462
r64_a32_lr1e-3_wd1e-5 on real: 			0.6264
r64_a32_lr1e-4_wd0 on real: 			0.71