In [14]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from sklearn import preprocessing, pipeline, ensemble, compose
import datasets
import os

# Input locations. The defaults reproduce the original machine-specific absolute
# paths; set ADULT_DATA_ROOT / TABFAIRGAN_CSV in the environment to run the
# notebook on another machine without editing this cell.
data_root = os.environ.get('ADULT_DATA_ROOT', '/mnt/data/sonia/datasets')

# Directory whose non-.json entries are each loaded as one split of the real data.
real_path = f'{data_root}/adult/may8.dat'
# Saved synthetic datasets; all three are concatenated into one frame when loaded.
synth_path = [
    f'{data_root}/synthetic/adult/may10.dat',
    f'{data_root}/synthetic/adult/may10.dat-1',
    './may10.dat',  # resolved against the notebook's working directory
]
# CSV of generated rows read with pandas (path suggests TabFairGAN output).
gan_path = os.environ.get('TABFAIRGAN_CSV', '/home/sonia/TabFairGAN/fake_adult.csv')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Load every entry under real_path as one split of the real dataset; entries
# ending in .json are skipped rather than loaded.
full_dataset = datasets.DatasetDict({})
for entry in os.listdir(real_path):
    if entry.endswith('.json'):
        continue
    full_dataset[entry] = datasets.load_from_disk(os.path.join(real_path, entry))
real = full_dataset['train'].to_pandas().drop(['length'], axis=1)

# Stack the three synthetic datasets and give them the real data's column names.
# The names are assigned by position, so this assumes the synthetic columns are
# already in the same order as real's -- TODO confirm.
syntha = datasets.load_from_disk(synth_path[0]).to_pandas()
synthb = datasets.load_from_disk(synth_path[1]).to_pandas()
synthc = datasets.load_from_disk(synth_path[2]).to_pandas()
synth = pd.concat([syntha, synthb, synthc])
synth.columns = real.columns
# Down-sample the real data to the synthetic row count so the sets are equal in size.
real = real.sample(synth.shape[0], random_state=42)

gan = pd.read_csv(gan_path)[real.columns].sample(synth.shape[0], random_state=42)
# Bring the GAN output into the same textual format as the other frames. Vectorised
# string methods replace the row-wise apply: same values, no per-row Python call.
gan['native-country'] = gan['native-country'].str.strip().str.replace('-', ' ', regex=False)
gan['age'] = gan['age'].astype(str)
gan['sex'] = gan['sex'].str.strip()
gan['education'] = gan['education'].str.strip()
gan['occupation'] = gan['occupation'].str.strip().str.replace('-', ' ', regex=False)
inc_map = {' <=50K': 'under 50K', ' >50K': 'over 50K'}
# Series.map would silently turn an unknown label into NaN; fail loudly instead,
# as the original dictionary lookup did.
unmapped = set(gan['income'].unique()) - set(inc_map)
if unmapped:
    raise KeyError(f'unexpected income labels in GAN data: {unmapped!r}')
gan['income'] = gan['income'].map(inc_map)
synth.shape, real.shape, gan.shape

((10250, 6), (10250, 6), (10250, 6))

In [23]:
# Evaluation frames keyed by a short source name.
datadict = dict(real=real, llama=synth, gan=gan)
# Preview the first rows of every frame, displayed as a tuple in insertion order.
tuple(frame.head() for frame in datadict.values())

(      age     sex native-country     education         occupation     income
 10020  40    Male  United States       HS-grad       Craft repair  under 50K
 4265   32  Female  United States   Prof-school              Sales   over 50K
 12013  36    Male  United States       HS-grad       Craft repair  under 50K
 13167  34  Female  United States       HS-grad  Machine op inspct  under 50K
 33559  56    Male  United States  Some-college              Sales   over 50K,
   age     sex native-country     education         occupation     income
 0  47  Female  United States  Some-college  Handlers cleaners  under 50K
 1  35  Female  United States  Some-college       Armed Forces  under 50K
 2  65  Female  United States     Doctorate       Armed Forces  under 50K
 3  19  Female  United States          11th    Protective serv  under 50K
 4  54    Male  United States     Assoc-voc       Armed Forces  under 50K,
         age   sex native-country     education         occupation     income
 14160  

In [24]:
# Column roles. categories[i] below describes ords[i], so the order of `ords`
# MUST stay in step with the order in which `categories` is built.
ords = ['sex', 'native-country', 'education', 'occupation'] # MUST BE IN ORDER
nums = ['age',]
labs = ['income']

# For each ordinal column, collect every value occurring in any frame that an
# encoder is later fitted on or applied to (real, synth and gan). OrdinalEncoder
# rejects values missing from an explicit category list, so leaving a frame out
# can make fitting or scoring on it fail.
categories = []
for name in ords:
    values = set()
    for frame in (real, synth, gan):
        values.update(frame[name].unique().tolist())
    # Sort so the value -> integer code assignment is the same on every run;
    # plain set iteration order for strings changes between interpreter sessions.
    categories.append(sorted(values, key=str))
# categories

# Unfitted templates shared by every pipeline built in this notebook.
ordenc = preprocessing.OrdinalEncoder(categories=categories)
numenc = preprocessing.StandardScaler()
lb = preprocessing.LabelBinarizer()

In [25]:
def create_pipeline(trainset, random_state=None):
    """Fit a preprocessing + random-forest pipeline on one dataset.

    The feature columns listed in the module-level ``ords`` and ``nums`` are
    passed through the shared ``ordenc`` / ``numenc`` transformers and the result
    is fed to a fresh ``RandomForestClassifier``. The target is the ``labs``
    column(s) of ``trainset``, binarised with the module-level ``lb``.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Frame containing the ``ords``, ``nums`` and ``labs`` columns.
    random_state : int or None, optional
        Seed forwarded to the random forest. The default ``None`` keeps the
        original unseeded behaviour; pass an integer for repeatable models.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps ``"preprocessor"`` and ``"estimator"``).

    Notes
    -----
    Side effect: ``lb`` is re-fitted on ``trainset`` each time this is called.
    """
    rfc = ensemble.RandomForestClassifier(random_state=random_state)
    preprocessing_pipeline = compose.ColumnTransformer([
        ("ordinal_preprocessor", ordenc, ords),
        ("numerical_preprocessor", numenc, nums),
    ])
    complete_pipeline = pipeline.Pipeline([
        ("preprocessor", preprocessing_pipeline),
        ("estimator", rfc)
    ])

    preprocessed_labels = lb.fit_transform(trainset[labs])
    # With two classes the binarizer yields an (n, 1) column; the forest expects
    # a 1-D target and otherwise emits a DataConversionWarning on every fit.
    # Multi-column outputs (more than two classes) are passed through unchanged.
    if preprocessed_labels.ndim == 2 and preprocessed_labels.shape[1] == 1:
        preprocessed_labels = preprocessed_labels.ravel()
    complete_pipeline.fit(trainset[ords+nums], preprocessed_labels)
    return complete_pipeline

In [29]:
%%capture
# Fit one pipeline per training source, in the order synth -> real -> gan.
rf_synth, rf_real, rf_gan = (create_pipeline(frame) for frame in (synth, real, gan))
# Fitted models keyed by the same source names used for the evaluation frames.
rfdict = dict(llama=rf_synth, real=rf_real, gan=rf_gan)

In [27]:
# preprocessed_labels_synth = lb.fit_transform(synth[labs])
# preprocessed_labels_real  = lb.fit_transform(real[labs])
# synth_on_synth = rf_synth.score(synth[ords+nums], preprocessed_labels_synth)
# synth_on_real  = rf_synth.score(real[ords+nums], preprocessed_labels_real)
# real_on_synth  = rf_real.score(synth[ords+nums], preprocessed_labels_synth)
# real_on_real   = rf_real.score(real[ords+nums], preprocessed_labels_real)
# print('synth_on_synth', 'synth_on_real', 'real_on_synth', 'real_on_real', sep='\t\t')
# print(synth_on_synth, synth_on_real, real_on_synth, real_on_real, sep='\t\t')

synth_on_synth		synth_on_real		real_on_synth		real_on_real
0.9999024390243902		0.7568780487804878		0.9305365853658537		0.8907317073170732


In [1]:
# Score every fitted model on every dataset. Iterate over (name, object) pairs:
# looping over .values() and then indexing the dict with those values fails,
# because a DataFrame is not a valid dictionary key.
for data_name, frame in datadict.items():
    # Labels are re-binarised per evaluation frame; this matches the encoding
    # used at training time only if both frames contain the same set of income
    # classes -- NOTE(review): confirm for each dataset.
    labels = lb.fit_transform(frame[labs])
    for model_name, model in rfdict.items():
        score = model.score(frame[ords+nums], labels)
        print(f'{model_name} on {data_name}: \t\t\t{score}')

NameError: name 'datadict' is not defined