In [14]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from sklearn import preprocessing, pipeline, ensemble, compose
import datasets
import os

# Directory holding the real Adult dataset; its entries are listed and loaded
# one by one in the data-loading cell.
real_path = '/mnt/data/sonia/datasets/adult/may8.dat'

# Three saved synthetic datasets; they are loaded separately and concatenated.
synth_path = [
    '/mnt/data/sonia/datasets/synthetic/adult/may10.dat',
    '/mnt/data/sonia/datasets/synthetic/adult/may10.dat-1',
    './may10.dat',
]

# Saved dataset that becomes the `ohl` frame.
oh_path = 'vanilla-adult-clean.dat'

# CSV file that becomes the `gan` frame.
gan_path = '/home/sonia/TabFairGAN/fake_adult.csv'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# --- Real data -------------------------------------------------------------
# Load every entry saved under real_path (skipping the .json metadata files)
# into a DatasetDict; only the 'train' entry is converted to a DataFrame here.
full_dataset = datasets.DatasetDict({})
for split_name in os.listdir(real_path):
    if split_name.endswith('.json'):
        continue
    full_dataset[split_name] = datasets.load_from_disk(os.path.join(real_path, split_name))
real = full_dataset['train'].to_pandas().drop(['length'], axis=1)

# --- Synthetic data --------------------------------------------------------
# Three saved generations are stacked into one frame and given the real
# frame's column names (this relies on the column order already matching).
syntha = datasets.load_from_disk(synth_path[0]).to_pandas()
synthb = datasets.load_from_disk(synth_path[1]).to_pandas()
synthc = datasets.load_from_disk(synth_path[2]).to_pandas()
synth = pd.concat([syntha, synthb, synthc])
synth.columns = real.columns

# --- GAN baseline ----------------------------------------------------------
# .copy() so the column assignments below write to an independent frame
# rather than to a selection of the frame returned by read_csv.
gan = pd.read_csv(gan_path)[real.columns].copy()

# Vectorised string clean-up (replaces row-wise DataFrame.apply):
# strip surrounding whitespace and turn '-' separators into spaces.
# Missing values pass through as NaN instead of raising.
for col in ('native-country', 'occupation'):
    gan[col] = gan[col].str.strip().str.replace('-', ' ', regex=False)
for col in ('sex', 'education'):
    gan[col] = gan[col].str.strip()
gan['age'] = gan['age'].astype(str)

# Rename the income labels. Fail loudly on any label that has no mapping,
# because Series.map would otherwise silently turn it into NaN.
inc_map = {' <=50K': 'under 50K', ' >50K': 'over 50K'}
unmapped_income = set(gan['income'].unique()) - set(inc_map)
if unmapped_income:
    raise KeyError(f'unexpected income labels in GAN data: {sorted(unmapped_income, key=str)}')
gan['income'] = gan['income'].map(inc_map)

# --- OH llama baseline -----------------------------------------------------
ohl = datasets.load_from_disk(oh_path).to_pandas().drop('__index_level_0__', axis=1)
ohl.columns = real.columns

# --- Down-sample every frame to the size of the smallest one ----------------
# A fixed random_state keeps the drawn rows the same from run to run.
min_dataset_size = min(synth.shape[0], gan.shape[0], real.shape[0], ohl.shape[0])
print('sampling all datasets to smallest size of', min_dataset_size)
synth = synth.sample(min_dataset_size, random_state=42)
gan = gan.sample(min_dataset_size, random_state=42)
real = real.sample(min_dataset_size, random_state=42)
ohl = ohl.sample(min_dataset_size, random_state=42)
synth.shape, real.shape, gan.shape, ohl.shape

sampling all datasets to smallest size of 9895


((9895, 6), (9895, 6), (9895, 6), (9895, 6))

In [24]:
# Evaluation frames keyed by the name used when scores are reported.
datadict = dict(zip(('real', 'llama', 'gan', 'OH llama'), (real, synth, gan, ohl)))
# Preview the first rows of every frame, in the same order as datadict.
tuple(frame.head() for frame in datadict.values())

(      age     sex native-country     education         occupation     income
 10020  40    Male  United States       HS-grad       Craft repair  under 50K
 4265   32  Female  United States   Prof-school              Sales   over 50K
 12013  36    Male  United States       HS-grad       Craft repair  under 50K
 13167  34  Female  United States       HS-grad  Machine op inspct  under 50K
 33559  56    Male  United States  Some-college              Sales   over 50K,
      age     sex native-country     education       occupation     income
 5106  52  Female  United States  Some-college  Farming fishing  under 50K
 1418  25  Female  United States  Some-college     Armed Forces  under 50K
 4828  46  Female  United States  Some-college     Armed Forces  under 50K
 651   44  Female  United States  Some-college     Craft repair  under 50K
 3298  17  Female  United States  Some-college     Adm clerical  under 50K,
         age   sex native-country     education         occupation     income
 1

In [25]:
# Column groups. The order of `ords` must match the order of the per-column
# lists in `categories`, since both are handed to the same OrdinalEncoder.
ords = ['sex', 'native-country', 'education', 'occupation'] # MUST BE IN ORDER
nums = ['age',]
labs = ['income']

# For each ordinal column, collect every value occurring in any of the frames
# that will be passed through the encoder (real, synth, gan, ohl), so that no
# frame contains a category the encoder does not know about.
categories = []
for name in ords:
    seen = set()
    for frame in (real, synth, gan, ohl):
        seen.update(frame[name].unique().tolist())
    # Sort so the ordinal codes do not depend on set iteration order, which
    # for strings can differ between interpreter sessions. key=str keeps the
    # sort from failing if a non-string value (e.g. NaN) is present.
    categories.append(sorted(seen, key=str))
# categories

ordenc = preprocessing.OrdinalEncoder(categories=categories)
numenc = preprocessing.StandardScaler()
lb = preprocessing.LabelBinarizer()

In [26]:
def create_pipeline(trainset, random_state=None):
    """Fit a preprocessing + random-forest pipeline on one dataset.

    The ordinal columns (`ords`) go through the shared `ordenc` encoder, the
    numerical columns (`nums`) through `numenc`, and the result feeds a
    RandomForestClassifier. Labels are taken from the `labs` column(s) and
    binarized with the module-level `lb`, which is refit on every call.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Frame containing the columns listed in `ords`, `nums` and `labs`.
    random_state : int or None, optional
        Seed forwarded to RandomForestClassifier. The default (None) keeps the
        previous unseeded behaviour; pass an int for repeatable models.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps "preprocessor" and "estimator").
    """
    rfc = ensemble.RandomForestClassifier(random_state=random_state)
    preprocessing_pipeline = compose.ColumnTransformer([
        ("ordinal_preprocessor", ordenc, ords),
        ("numerical_preprocessor", numenc, nums),
    ])
    complete_pipeline = pipeline.Pipeline([
        ("preprocessor", preprocessing_pipeline),
        ("estimator", rfc)
    ])

    preprocessed_labels = lb.fit_transform(trainset[labs])
    # For two classes the binarizer returns an (n, 1) column; flatten it so the
    # classifier receives a 1-D target instead of warning about a column vector.
    if preprocessed_labels.ndim == 2 and preprocessed_labels.shape[1] == 1:
        preprocessed_labels = preprocessed_labels.ravel()
    complete_pipeline.fit(trainset[ords+nums], preprocessed_labels)
    return complete_pipeline

In [27]:
%%capture
# Fit one pipeline per dataset, in the order synth, real, gan, ohl.
rf_synth, rf_real, rf_gan, rf_ohl = [
    create_pipeline(frame) for frame in (synth, real, gan, ohl)
]
# Fitted pipelines keyed by the name used when scores are reported.
rfdict = dict(zip(
    ('llama', 'real', 'gan', 'OH llama'),
    (rf_synth, rf_real, rf_gan, rf_ohl),
))

In [7]:
# preprocessed_labels_synth = lb.fit_transform(synth[labs])
# preprocessed_labels_real  = lb.fit_transform(real[labs])
# synth_on_synth = rf_synth.score(synth[ords+nums], preprocessed_labels_synth)
# synth_on_real  = rf_synth.score(real[ords+nums], preprocessed_labels_real)
# real_on_synth  = rf_real.score(synth[ords+nums], preprocessed_labels_synth)
# real_on_real   = rf_real.score(real[ords+nums], preprocessed_labels_real)
# print('synth_on_synth', 'synth_on_real', 'real_on_synth', 'real_on_real', sep='\t\t')
# print(synth_on_synth, synth_on_real, real_on_synth, real_on_real, sep='\t\t')

In [28]:
# Score every fitted model on every dataset.
# The label binarizer is fitted once on the labels of all datasets, so the
# 0/1 encoding is the same for every evaluation set and does not depend on
# which classes happen to be present in the set being scored.
lb.fit(pd.concat([frame[labs] for frame in datadict.values()]))
for data, frame in datadict.items():
    labels = lb.transform(frame[labs])
    # Two-class output is an (n, 1) column; flatten it to a 1-D target.
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()
    features = frame[ords+nums]
    for model, fitted_pipeline in rfdict.items():
        score = fitted_pipeline.score(features, labels)
        print(f'{model} on {data}: \t\t\t{score}')

llama on real: 			0.7574532592218292
real on real: 			0.892066700353714
gan on real: 			0.7727134916624557
OH llama on real: 			0.7649317837291562
llama on llama: 			0.9998989388580091
real on llama: 			0.942900454775139
gan on llama: 			0.9241030823648307
OH llama on llama: 			0.9401718039413846
llama on gan: 			0.7617988883274381
real on gan: 			0.7753410813542193
gan on gan: 			0.8964123294593229
OH llama on gan: 			0.7678625568468924
llama on OH llama: 			0.8304194037392623
real on OH llama: 			0.7633148054573017
gan on OH llama: 			0.7741283476503285
OH llama on OH llama: 			0.8865083375442142
