In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from sklearn import preprocessing, pipeline, ensemble, compose
import datasets
import os

# cols = ['age', 'workclass', 'fnlwgt', 'income']

real_path = '/hdd3/sonia/data/adult.csv'
dgpt2_path = '/hdd3/sonia/be_great/ckpts/dgpt2/adult-allcol/samples.csv'
moe_path = '/hdd3/sonia/be_great/ckpts/moe/dgpt2/adult-allcol/jul21/samplesclean.csv'
greatdpt2_path = '/hdd3/sonia/be_great/ckpts/dgpt2-greatclean.csv'
moegreatdgpt2_path = '/hdd3/sonia/be_great/ckpts/great/adult/moegreatdgpt2-aug01.csv'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
real = pd.read_csv(real_path)#[cols]
dgpt2= pd.read_csv(dgpt2_path)
moe  = pd.read_csv(moe_path)
greatdgpt2=pd.read_csv(greatdpt2_path)
moegreatdgpt2=pd.read_csv(moegreatdgpt2_path)
print(real.shape, dgpt2.shape, moe.shape, greatdgpt2.shape, moegreatdgpt2.shape)

rs = 2
min_dataset_size = min(len(real), len(dgpt2), len(moe), len(greatdgpt2), len(moegreatdgpt2))
real = real.sample(min_dataset_size, random_state=rs)
dgpt2=dgpt2.sample(min_dataset_size, random_state=rs)
moe  =  moe.sample(min_dataset_size, random_state=rs)
greatdgpt2 = greatdgpt2.sample(min_dataset_size, random_state=rs)
moegreatdgpt2=moegreatdgpt2.sample(min_dataset_size, random_state=rs)
real.shape, dgpt2.shape, moe.shape, greatdgpt2.shape, moegreatdgpt2.shape

(48842, 15) (5935, 15) (9106, 15) (9815, 15) (3993, 15)


((3993, 15), (3993, 15), (3993, 15), (3993, 15), (3993, 15))

In [3]:
datadict = {'real':real, 'dgpt2':dgpt2, 'moe':moe, 'greatdgpt2':greatdgpt2, 'moegreatdgpt2':moegreatdgpt2}
datadict.keys()

dict_keys(['real', 'dgpt2', 'moe', 'greatdgpt2', 'moegreatdgpt2'])

In [4]:
ords = ['workclass', 'education', 'marital-status', 'occupation', 
        'relationship', 'race', 'sex', 'native-country'] # MUST BE IN ORDER
nums = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', ]
labs = ['income']

for k, v in datadict.items():
    print(k)
    datadict[k] = datadict[k][ords+nums+labs] # ensure they're in order

# for each ord column, get all unique values occurign in real/synth, train/test
categories = []
for name in ords:
    s = set(real[name].unique().tolist())
    s.update(dgpt2[name].unique().tolist())
    s.update(moe[name].unique().tolist())
    s.update(greatdgpt2[name].unique().tolist())
    
    categories.append( list(s) )
categories

ordenc = preprocessing.OrdinalEncoder(categories=categories)
numenc = preprocessing.StandardScaler()
lb = preprocessing.LabelBinarizer()

real
dgpt2
moe
greatdgpt2
moegreatdgpt2


In [5]:
def create_pipeline(trainset):
    rfc = ensemble.RandomForestClassifier(n_estimators=10, max_depth=4)
    preprocessing_pipeline = compose.ColumnTransformer([
        ("ordinal_preprocessor", ordenc, ords),
        ("numerical_preprocessor", numenc, nums),
    ])
    complete_pipeline = pipeline.Pipeline([
        ("preprocessor", preprocessing_pipeline),
        ("estimator", rfc)
    ])
    
    preprocessed_labels = lb.fit_transform(trainset[labs].values.ravel()).ravel()
    complete_pipeline.fit(trainset[ords+nums], preprocessed_labels)
    return complete_pipeline

In [6]:
# %%capture
# create random forest pipelines
rfdict = {}
for src in datadict.keys():
    print(src)
    rfdict[src] = create_pipeline(datadict[src])

real
dgpt2
moe
greatdgpt2
moegreatdgpt2


In [7]:
for data in datadict.keys():
    labels = lb.fit_transform(datadict[data][labs])
    for model in rfdict.keys():
        score = rfdict[model].score(datadict[data][ords+nums], labels)
        print(f'{model} on {data}: \t\t\t{score}')

real on real: 			0.8494866015527173
dgpt2 on real: 			0.8016528925619835
moe on real: 			0.8146756824442775
greatdgpt2 on real: 			0.8196844477836214
moegreatdgpt2 on real: 			0.8254445279238668
real on dgpt2: 			0.639869772101177
dgpt2 on dgpt2: 			0.7751064362634611
moe on dgpt2: 			0.7420485850237917
greatdgpt2 on dgpt2: 			0.7325319308790383
moegreatdgpt2 on dgpt2: 			0.7145003756574004
real on moe: 			0.6378662659654395
dgpt2 on moe: 			0.7738542449286251
moe on moe: 			0.7986476333583772
greatdgpt2 on moe: 			0.7458051590282995
moegreatdgpt2 on moe: 			0.7743551214625595
real on greatdgpt2: 			0.7518156774355121
dgpt2 on greatdgpt2: 			0.8064112196343601
moe on greatdgpt2: 			0.7953919358878037
greatdgpt2 on greatdgpt2: 			0.8131730528424743
moegreatdgpt2 on greatdgpt2: 			0.8029050838968195
real on moegreatdgpt2: 			0.7721011770598547
dgpt2 on moegreatdgpt2: 			0.8564988730277987
moe on moegreatdgpt2: 			0.8692712246431255
greatdgpt2 on moegreatdgpt2: 			0.8407212622088656
moegr