In [1]:
import pandas as pd

# Small demo table with one column per kind of value used in the cells
# below: integers, floats, string categories with gaps, booleans, numbers
# with gaps, and dates written as ISO strings.
raw_columns = {
    'integer': [1, 2, 1, 2, 1, 2, 3, 1],
    'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.3, 0.1],
    'categorical': ['a', 'b', 'a', 'b', 'a', None, 'c', None],
    'bool': [False, True, False, True, False, False, False, False],
    'nullable': [1, None, 3, None, 5, None, 7, None],
    'datetime': [
        '2010-01-01', '2010-02-01', '2010-01-01', '2010-02-01',
        '2010-01-01', '2010-02-01', '2010-03-01', '2010-02-01',
    ],
}

# Build the frame, then swap the date strings for parsed timestamps while
# keeping the column in its original position.
data = pd.DataFrame(raw_columns).assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

data

Unnamed: 0,integer,float,categorical,bool,nullable,datetime
0,1,0.1,a,False,1.0,2010-01-01
1,2,0.2,b,True,,2010-02-01
2,1,0.1,a,False,3.0,2010-01-01
3,2,0.2,b,True,,2010-02-01
4,1,0.1,a,False,5.0,2010-01-01
5,2,0.2,,False,,2010-02-01
6,3,0.3,c,False,7.0,2010-03-01
7,1,0.1,,False,,2010-02-01


In [5]:
from rdt import HyperTransformer

# Configure the transformer; the 'categorical' column is passed in the
# `anonymize` mapping with the 'email' category.
anonymize_spec = {'categorical': 'email'}
ht = HyperTransformer(anonymize=anonymize_spec)

# Fit on the demo table, then transform that same table. The displayed
# result is all-numeric and gains an extra 'nullable#1' column.
ht.fit(data)
trans_test = ht.transform(data)

trans_test

Unnamed: 0,integer,float,categorical,bool,nullable,nullable#1,datetime
0,1,0.1,0.208204,0,1.0,0.0,1.262304e+18
1,2,0.2,0.483121,1,4.0,1.0,1.264982e+18
2,1,0.1,0.178803,0,3.0,0.0,1.262304e+18
3,2,0.2,0.516839,1,4.0,1.0,1.264982e+18
4,1,0.1,0.111023,0,5.0,0.0,1.262304e+18
5,2,0.2,0.716757,0,4.0,1.0,1.264982e+18
6,3,0.3,0.929053,0,7.0,0.0,1.267402e+18
7,1,0.1,0.769039,0,4.0,1.0,1.264982e+18


In [6]:
from copulas.multivariate import GaussianMultivariate

# Fit a GaussianMultivariate model on the transformed (numeric) table.
gm = GaussianMultivariate()
gm.fit(trans_test)

# Draw as many rows as the table the model was fitted on.
# NOTE(review): no random seed is set here; presumably the sampled values
# differ from run to run -- confirm before relying on the saved output.
n_rows = len(trans_test)
sampled_test = gm.sample(n_rows)

sampled_test

Unnamed: 0,integer,float,categorical,bool,nullable,nullable#1,datetime
0,1.106892,0.110689,0.134189,-0.017369,4.431099,-0.353245,1.26244e+18
1,2.179208,0.217921,0.844452,-0.467453,6.898361,-0.197168,1.266269e+18
2,0.919078,0.091908,0.635089,-0.004072,4.251118,0.802128,1.264514e+18
3,1.585359,0.158536,0.347673,-0.048487,4.631074,0.280282,1.263462e+18
4,1.658965,0.165897,0.353263,-0.367389,3.820639,-0.254015,1.263291e+18
5,0.483883,0.048388,-0.161375,-0.178751,3.50881,-0.362637,1.260519e+18
6,2.959793,0.295979,1.03277,0.107743,8.527751,0.573748,1.267872e+18
7,1.412011,0.141201,0.445884,0.370138,2.51477,0.494141,1.26392e+18


In [7]:
# Map the sampled numeric rows back through the fitted transformer; the
# displayed result has the original six columns again.
recovered_test = ht.reverse_transform(sampled_test)
recovered_test

Unnamed: 0,integer,float,categorical,bool,nullable,datetime
0,1,0.110689,a,False,4.431099,2010-01-02 13:53:42.952054784
1,2,0.217921,,False,6.898361,2010-02-15 21:19:12.321053952
2,1,0.091908,,False,,2010-01-26 13:51:29.267797248
3,2,0.158536,a,False,4.631074,2010-01-14 09:33:09.314653440
4,2,0.165897,a,False,3.820639,2010-01-12 10:02:31.169252096
5,0,0.048388,,False,3.50881,2009-12-11 08:13:31.808914432
6,3,0.295979,a,False,,2010-03-06 10:46:18.713844480
7,1,0.141201,b,False,2.51477,2010-01-19 16:59:31.242653184
