In [1]:
import pandas as pd
import numpy as np
from rdt import HyperTransformer
TEST_DATA_INDEX = [4, 6, 3, 8, 'a', 1.0, 2.0, 3.0]
datetimes = pd.to_datetime([
    np.nan,
    '2010-02-01',
    '2010-01-01',
    '2010-01-01',
    '2010-01-01',
    '2010-02-01',
    '2010-01-01',
    '2010-01-01',
])
data = pd.DataFrame({
    'integer': [1, 2, 1, 3, 1, 4, 2, 3],
    'float': [0.1, 0.2, 0.1, np.nan, 0.1, 0.4, np.nan, 0.3],
    'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'],
    'bool': [False, np.nan, False, True, False, np.nan, True, False],
    'datetime': datetimes,
    'names': ['Jon', 'Arya', 'Arya', 'Jon', 'Jon', 'Sansa', 'Jon', 'Jon'],
}, index=TEST_DATA_INDEX)

# Run
ht = HyperTransformer()
ht.detect_initial_config(data)
ht.fit(data)
transformed = ht.transform(data)
reverse_transformed = ht.reverse_transform(transformed)

# Assert
expected_datetimes = [
    1.263069e+18,
    1.264982e+18,
    1.262304e+18,
    1.262304e+18,
    1.262304e+18,
    1.264982e+18,
    1.262304e+18,
    1.262304e+18
]
expected_transformed = pd.DataFrame({
    'integer': [1., 2., 1., 3., 1., 4., 2., 3.],
    'float': [0.1, 0.2, 0.1, 0.2, 0.1, 0.4, 0.2, 0.3],
    'categorical': [
        0.6056724228102, 0.551035999670618, 0.6415811931779333, 0.9497122229547376,
        0.5791232599393884, 0.9713636500947356, 0.5800536682210967, 0.3183267452542666
    ],
    'bool': [
        0.13080626592394468, 0.6433871161873272, 0.013336903148287393, 0.7993073999936193,
        0.4430320785278661, 0.5835819683962835, 0.7772353030710347, 0.3091326939224907
    ],
    'datetime': expected_datetimes,
    'names': [
        0.15112620775650704, 0.857444679914493, 0.7654375186193025, 0.42569016008650984,
        0.30010761543029285, 0.9108473448910603, 0.15922866807030298, 0.3875325513134956
    ]
}, index=TEST_DATA_INDEX)
pd.testing.assert_frame_equal(transformed, expected_transformed)

reversed_datetimes = pd.to_datetime([
    '2010-01-09',
    '2010-02-01',
    '2010-01-01',
    '2010-01-01',
    '2010-01-01',
    '2010-02-01',
    '2010-01-01',
    '2010-01-01',
])
expected_reversed = pd.DataFrame({
    'integer': [1, 2, 1, 3, 1, 4, 2, 3],
    'float': [0.1, 0.2, 0.1, 0.20000000000000004, 0.1, 0.4, 0.20000000000000004, 0.3],
    'categorical': ['a', 'a', np.nan, 'b', 'a', 'b', 'a', 'a'],
    'bool': [False, False, False, True, False, False, True, False],
    'datetime': reversed_datetimes,
    'names': ['Jon', 'Arya', 'Arya', 'Jon', 'Jon', 'Sansa', 'Jon', 'Jon'],
}, index=TEST_DATA_INDEX)
for row in range(reverse_transformed.shape[0]):
    for column in range(reverse_transformed.shape[1]):
        expected = expected_reversed.iloc[row, column]
        actual = reverse_transformed.iloc[row, column]
        assert pd.isna(actual) or expected == actual

assert isinstance(ht.field_transformers['integer'], FloatFormatter)
assert isinstance(ht.field_transformers['float'], FloatFormatter)
assert isinstance(ht.field_transformers['categorical'], UniformEncoder)
assert isinstance(ht.field_transformers['bool'], UniformEncoder)
assert isinstance(ht.field_transformers['datetime'], UnixTimestampEncoder)
assert isinstance(ht.field_transformers['names'], UniformEncoder)

get_default_transformers.cache_clear()
get_default_transformer.cache_clear()

0.0 0.625
0.0 0.625
0.625 nan


OverflowError: Range exceeds valid bounds

In [7]:
import pandas as pd
import numpy as np
data = pd.Series([1, 2, 3])
print(data.value_counts(normalize=True, dropna=False))

3    0.333333
2    0.333333
1    0.333333
dtype: float64


In [9]:
t = data.value_counts(normalize=True, dropna=False)
t.reindex([1, 2, 3], fill_value=None)


1    0.333333
2    0.333333
3    0.333333
dtype: float64

In [1]:
import pandas as pd
data = pd.Series([1, 2, 3, None, None, 1, 2 ])
print(data.value_counts(normalize=True, dropna=False))


1.0    0.285714
2.0    0.285714
NaN    0.285714
3.0    0.142857
Name: proportion, dtype: float64

In [4]:
transformed.dtypes

a    category
dtype: object

In [6]:
r = ht.get_config()
t = r['transformers']['categorical']

In [8]:
t.intervals

{'a': [0.0, 0.625], None: [0.625, nan], 'b': [nan, 1.0]}

In [1]:
from rdt.transformers import UniformEncoder

transformer = UniformEncoder()
transformer.get_input_sdtype()



['categorical', 'boolean']

In [1]:
from rdt.transformers import TRANSFORMERS
print(list(TRANSFORMERS.keys()))

['rdt.transformers.base.BaseMultiColumnTransformer', 'rdt.transformers.boolean.BinaryEncoder', 'rdt.transformers.categorical.UniformEncoder', 'rdt.transformers.categorical.OrderedUniformEncoder', 'rdt.transformers.categorical.FrequencyEncoder', 'rdt.transformers.categorical.OneHotEncoder', 'rdt.transformers.categorical.LabelEncoder', 'rdt.transformers.categorical.OrderedLabelEncoder', 'rdt.transformers.categorical.CustomLabelEncoder', 'rdt.transformers.datetime.UnixTimestampEncoder', 'rdt.transformers.datetime.OptimizedTimestampEncoder', 'rdt.transformers.numerical.FloatFormatter', 'rdt.transformers.numerical.GaussianNormalizer', 'rdt.transformers.numerical.ClusterBasedNormalizer', 'rdt.transformers.pii.anonymizer.AnonymizedFaker', 'rdt.transformers.pii.anonymizer.PseudoAnonymizedFaker', 'rdt.transformers.text.IDGenerator', 'rdt.transformers.text.RegexGenerator', 'rdt.transformers.addons.identity.identity.IdentityTransformer']


In [5]:
test_file

PosixPath('/Users/romain_datacebo/Desktop/RDT/tests/unit/transformers/test_base.py')

In [6]:
module

<module 'rdt.transformers.test_base' from '/Users/romain_datacebo/Desktop/RDT/tests/unit/transformers/test_base.py'>

In [7]:
transformer.get_name()

'BaseMultiColumnTransformer'

In [4]:
from tests.code_style import get_test_location, _load_module_from_path
from rdt.transformers import BaseMultiColumnTransformer
transformer = BaseMultiColumnTransformer
test_file = get_test_location(transformer)
module = _load_module_from_path(test_file)

test_class = getattr(module, f'Test{transformer.get_name()}', None)
assert test_class is not None, 'The expected test class was not found.'

AssertionError: The expected test class was not found.

In [1]:
from rdt import HyperTransformer
from rdt.transformers import AnonymizedFaker

import pandas as pd

ht = HyperTransformer()
data_test = pd.DataFrame({
    'col_1': [1, 2, 3, 4, 5],
    'col_2': [1, 2, 3, 4, 5],
    'col_3': ['1', '2', '3', '4', '5'],
    'col_4': [True, False, True, False, True],
})
ht.detect_initial_config(data_test)
print(ht.get_config())



{
    "sdtypes": {
        "col_1": "numerical",
        "col_2": "numerical",
        "col_3": "categorical",
        "col_4": "boolean"
    },
    "transformers": {
        "col_1": FloatFormatter(),
        "col_2": FloatFormatter(),
        "col_3": UniformEncoder(),
        "col_4": UniformEncoder()
    }
}


In [7]:
ht._multi_column_fields

{'col_1': ('col_1', 'col_2'), 'col_2': ('col_1', 'col_2')}

In [6]:
ht._multi_column_fields = ht._get_multi_column_fields()

In [2]:
ht.update_transformers(
    column_name_to_transformer={
        ('col_1', 'col_2'): None
    }
)
print(ht.get_config())

{
    "sdtypes": {
        "col_1": "numerical",
        "col_2": "numerical",
        "col_3": "categorical",
        "col_4": "boolean"
    },
    "transformers": {
        "col_3": UniformEncoder(),
        "col_4": UniformEncoder(),
        "('col_1', 'col_2')": None
    }
}


In [20]:
from rdt.hyper_transformer import Config, HyperTransformer
from rdt.transformers import UniformEncoder, BaseMultiColumnTransformer
dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'categorical'
  },
  'transformers': {
    ('A', 'B', 'C'): BaseMultiColumnTransformer,
  }
}
config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)

InvalidConfigError: Invalid transformers for columns: [('A', 'B', 'C')]. Please assign an rdt transformer instance to each column name.

In [1]:
from rdt.hyper_transformer import Config, HyperTransformer
from rdt.transformers import UniformEncoder
dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean',
        'D': 'categorical',
        'E': 'categorical' 
},
'transformers': {
    'A': UniformEncoder(),
    ('B', 'C', 'D'): None,
    'E': UniformEncoder()
}
}

config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)

# Run
ht.update_transformers({
    ('A', 'B'): None,
    'D': UniformEncoder()
})
new_config = ht.get_config()

In [2]:
new_config

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "categorical",
        "E": "categorical"
    },
    "transformers": {
        "E": UniformEncoder(),
        "('A', 'B')": None,
        "C": None,
        "D": UniformEncoder()
    }
}

In [3]:
ht.get_config()

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "categorical"
    },
    "transformers": {
        "('A', 'B', 'C')": None
    }
}

In [3]:
# Setup
dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean',
        'D': 'categorical',
        'E': 'categorical' 
},
'transformers': {
    'A': UniformEncoder(),
    ('B', 'C', 'D'): None,
    'E': UniformEncoder()
}
}

config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)

# Run
ht.update_transformers_by_sdtype('boolean', transformer_name='LabelEncoder' )
new_config = ht.get_config()

# Assert
expected_config = Config({
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "categorical",
        "E": "categorical"
    },
    "transformers": {
        "A": UniformEncoder(),
        "E": UniformEncoder(),
        "C": LabelEncoder(),
        "('B', 'D')": None
    }
})

assert new_config == expected_config
    

In [4]:
new_config

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "categorical",
        "E": "categorical"
    },
    "transformers": {
        "A": UniformEncoder(),
        "E": UniformEncoder(),
        "C": LabelEncoder(),
        "('B', 'D')": None
    }
}

In [None]:
# Assert
expected_config = Config({
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "categorical",
        "E": "categorical"
    },
    "transformers": {
        "A": UniformEncoder(),
        "E": UniformEncoder(),
        "C": LabelEncoder(),
        "('B', 'D')": None
    }
})

assert repr(new_config) == repr(expected_config)

In [4]:
ht.update_transformers({
    'A': UniformEncoder(),
    'B': UniformEncoder()})
ht.get_config()

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "categorical"
    },
    "transformers": {
        "A": UniformEncoder(),
        "C": None,
        "B": UniformEncoder()
    }
}

In [2]:
from rdt.hyper_transformer import Config, HyperTransformer
from rdt.transformers import UniformEncoder
dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean'
  },
  'transformers': {
    'A': None,
    'B': UniformEncoder(),
    'C': UniformEncoder()
  }
}
config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)
print(ht.get_config())
ht.update_transformers({
    ('A', 'B'): None,
  })
ht.get_config()


{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean"
    },
    "transformers": {
        "A": None,
        "B": UniformEncoder(),
        "C": UniformEncoder()
    }
}


{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean"
    },
    "transformers": {
        "C": UniformEncoder(),
        "('A', 'B')": None
    }
}

In [25]:
dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean'
},
'transformers': {
    'A': None,
    'B': UniformEncoder(),
    'C': UniformEncoder()
}
}
config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)

# Run
ht.update_transformers({
    ('A', 'B'): None,
})
new_config = ht.get_config()

# Assert
expected_config = Config({
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean'
    },
'transformers': {
    'C': UniformEncoder(),
    ('A', 'B'): None
}
})
assert dict(new_config['sdtypes']) == dict(expected_config['sdtypes'])
assert dict(new_config["('A', 'B')"]) == dict(expected_config["('A', 'B')"])

KeyError: "('A', 'B')"

In [26]:
assert repr(new_config) == repr(expected_config)

In [18]:
expected_config

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean"
    },
    "transformers": {
        "C": UniformEncoder(),
        "('A', 'B')": None
    }
}

In [19]:
dict(new_config)

{'sdtypes': {'A': 'categorical', 'B': 'categorical', 'C': 'boolean'},
 'transformers': {'C': UniformEncoder(), ('A', 'B'): None}}

In [21]:
dict(expected_config) == dict(new_config)

False

In [22]:
dict(expected_config)

{'sdtypes': {'A': 'categorical', 'B': 'categorical', 'C': 'boolean'},
 'transformers': {'C': UniformEncoder(), ('A', 'B'): None}}

In [9]:
assert dict(new_config) == dict(expected_config)

AssertionError: 

In [7]:
new_config

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean"
    },
    "transformers": {
        "C": UniformEncoder(),
        "('A', 'B')": None
    }
}

In [8]:
expected_config

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean"
    },
    "transformers": {
        "C": UniformEncoder(),
        "('A', 'B')": None
    }
}

In [14]:
from rdt.hyper_transformer import Config, HyperTransformer
from rdt.transformers import UniformEncoder, FloatFormatter
dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean',
        'D': 'numerical'
  },
  'transformers': {
    'A': None,
    'B': UniformEncoder(),
    'C': UniformEncoder(),
    'D': FloatFormatter()
  }
}
config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)
print(ht.get_config())
ht.update_transformers({
    ('A', 'D'): None
  })
ht.get_config()


{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "numerical"
    },
    "transformers": {
        "A": None,
        "B": UniformEncoder(),
        "C": UniformEncoder(),
        "D": FloatFormatter()
    }
}


{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "numerical"
    },
    "transformers": {
        "B": UniformEncoder(),
        "C": UniformEncoder(),
        "('A', 'D')": None
    }
}

In [19]:
ht.update_transformers_by_sdtype(sdtype='categorical', transformer_name='LabelEncoder')
print(ht.get_config())

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "numerical"
    },
    "transformers": {
        "B": LabelEncoder(),
        "C": LabelEncoder(),
        "A": LabelEncoder(),
        "D": None
    }
}


In [8]:
print(ht.get_config())

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "numerical"
    },
    "transformers": {
        "B": UniformEncoder(),
        "C": UniformEncoder(),
        "D": FloatFormatter(),
        "A": None
    }
}


In [3]:
from rdt.transformers import BaseMultiColumnTransformer
import pandas as pd
import numpy as np

class AdditionTransformer(BaseMultiColumnTransformer):
            
    INPUT_SDTYPE = 'numerical'
    SUPPORTED_SDTYPES = ['numerical']

    def __init__(self):
        super().__init__()
        self.output_properties = {
            'col_1': {'sdtype': 'numerical'},
            'col_2': {'sdtype': 'numerical'},
            'col_3': {'sdtype': 'numerical'}
        }

    def _fit(self, columns_data, columns_to_sdtypess):
        self.dtypes = columns_data.dtypes

    def _transform(self, data):
        return data.cumsum(axis=1)

    def _reverse_transform(self, data):
        result =  data.diff(axis=1)
        result.iloc[:, 0] = data.iloc[:, 0]

        return result.astype(self.dtypes)

data_test = pd.DataFrame({
    'col_1': [1, 2, 3],
    'col_2': [10, 20, 30],
    'col_3': [100, 200, 300]
})

column_to_sdtype= {
    'col_1': 'numerical',
    'col_2': 'numerical',
    'col_3': 'numerical'
}
transformer = AdditionTransformer()
#transformer.random_states = None

# Run
#transformer.fit(data_test, column_to_sdtype)
transformed = transformer.fit_transform(data_test, column_to_sdtype)
reverse = transformer.reverse_transform(transformed)

# Assert
expected_transform = pd.DataFrame({
    'col_1': [1, 2, 3],
    'col_2': [11, 22, 33],
    'col_3': [111, 222, 333]
})
pd.testing.assert_frame_equal(expected_transform, transformed)
pd.testing.assert_frame_equal(reverse, data_test)

In [2]:
transformer = AdditionTransformer()
#transformer.random_states = None

# Run
transformer.fit(data_test, column_to_sdtype)

In [3]:
transformer.random_states

{'fit': RandomState(MT19937) at 0x15045ED40,
 'transform': RandomState(MT19937) at 0x15045EA40,
 'reverse_transform': RandomState(MT19937) at 0x15045EE40}

In [10]:
data_test.dtypes

col_1    int64
col_2    int64
col_3    int64
dtype: object

In [9]:
reverse.dtypes

col_1      int64
col_2    float64
col_3    float64
dtype: object

In [5]:
transformed.diff(axis=1)

Unnamed: 0,col_1,col_2,col_3
0,,10.0,100.0
1,,20.0,200.0
2,,30.0,300.0


In [6]:
transformed

Unnamed: 0,col_1,col_2,col_3
0,1,11,111
1,2,22,222
2,3,33,333


In [8]:
reverse

Unnamed: 0,col_1,col_2,col_3
0,1,10.0,100.0
1,2,20.0,200.0
2,3,30.0,300.0


In [4]:
transformer._get_output_to_property('sdtype')

{'None.cumsum_col_1': 'numerical',
 'None.cumsum_col_2': 'numerical',
 'None.cumsum_col_3': 'numerical'}

In [3]:
list(transformer._get_output_to_property('sdtype'))

['None.cumsum_col_1', 'None.cumsum_col_2', 'None.cumsum_col_3']

In [5]:
transformer.output_properties

{'Add': {'sdtype': 'numerical'}}

In [6]:
transformer._get_output_to_property('sdtype')

{'None.Add': 'numerical'}

In [16]:
transformer.get_output_columns()

['col_1#col_2#col_3']

In [7]:
transformer.output_columns

['col_1#col_2#col_3.Add']

In [4]:
from rdt.transformers import UniformEncoder
data = pd.DataFrame({
    'bool': [True, False, None, False, True],
    'mycol': ['a', 'b', 'a', None, np.nan],
})
ue = UniformEncoder()

In [5]:
ue.random_states

{'fit': RandomState(MT19937) at 0x1226D8440,
 'transform': None,
 'reverse_transform': None}

In [6]:
ue.fit(data, 'mycol')

In [7]:
ue.random_states

{'fit': RandomState(MT19937) at 0x15045EC40,
 'transform': RandomState(MT19937) at 0x1505D1040,
 'reverse_transform': RandomState(MT19937) at 0x1505D1140}

In [8]:
ue.columns

['mycol']

In [None]:
# Run
ue.fit(data, 'mycol')
transformed = ue.transform(data)
out = ue.reverse_transform(transformed)

# Assert
pd.testing.assert_frame_equal(out, data)

In [32]:
from rdt.transformers import BaseMultiColumnTransformer
import pandas as pd
import numpy as np

class ConcatenateTransformer(BaseMultiColumnTransformer):

    def _fit(self, columns_data, columns_to_sdtypess):
        column_names = list(columns_to_sdtypess.keys())
        self.name_1 = column_names[0] + '#' + column_names[1]
        self.name_2 = column_names[2] + '#' + column_names[3]
        self.output_properties = {
            self.name_1: {'sdtype': 'categorical'},
            self.name_2: {'sdtype': 'categorical'}
        }
        self.dtypes = columns_data.dtypes

    def _transform(self, data):
        data[self.name_1] = data.iloc[:, 0] + '#' + data.iloc[:, 1]
        data[self.name_2] = data.iloc[:, 2] + '#' + data.iloc[:, 3]

        return data.drop(columns=self.columns)

    def _reverse_transform(self, data):
        
        result = data.copy()
        column_names = list(data.columns)

        col1, col2 = column_names[0].split('#')
        result[[col1, col2]] = result[column_names[0]].str.split('#', expand=True)
        
        col3, col4 = column_names[1].split('#')
        result[[col3, col4]] = result[column_names[1]].str.split('#', expand=True)

        return result.astype(self.dtypes).drop(columns=column_names)
    

data_test = pd.DataFrame({
    'col_1': ['A', 'B', 'C'],
    'col_2': ['D', 'E', 'F'],
    'col_3': ['G', 'H', 'I'],
    'col_4': ['J', 'K', 'L']
})

column_to_sdtype= {
    'col_1': 'categorical',
    'col_2': 'categorical',
    'col_3': 'categorical',
    'col_4': 'categorical'
}
transformer = ConcatenateTransformer()

# Run
transformer.fit(data_test, column_to_sdtype)
transformed = transformer.transform(data_test)
reverse = transformer.reverse_transform(transformed)

# Assert
expected_transform = pd.DataFrame({
    'col_1#col_2': ['A#D', 'B#E', 'C#F'],
    'col_3#col_4': ['G#J', 'H#K', 'I#L']
})
pd.testing.assert_frame_equal(expected_transform, transformed)
pd.testing.assert_frame_equal(reverse, data_test)

In [27]:
transformer.name_1

'col_1#col_2'

In [28]:
transformed

Unnamed: 0,col_1#col_2,col_3#col_4
0,A#D,G#J
1,B#E,H#K
2,C#F,I#L


In [23]:
transformer.output_properties

{'col_1#col_2': {'sdtype': 'categorical'},
 'col_3#col_4': {'sdtype': 'categorical'}}

In [19]:
transformer.output_properties

{'col_1#col_2': {'sdtype': 'categorical'},
 'col_3#col_4': {'sdtype': 'categorical'}}

In [11]:
transformed

Unnamed: 0,col_1#col_2,col_3#col_4
0,A#D,G#J
1,B#E,H#K
2,C#F,I#L


In [12]:
reverse

Unnamed: 0,col_1,col_2,col_3,col_4
0,A,D,G,J
1,B,E,H,K
2,C,F,I,L


In [4]:
data_test = pd.DataFrame({
    'col_1': [1, 2, 3],
    'col_2': [10, 20, 30],
    'col_3': [100, 200, 300]
})


In [6]:
list(data_test.columns)

['col_1', 'col_2', 'col_3']

In [33]:
from rdt.transformers import BaseMultiColumnTransformer
import pandas as pd
import numpy as np

class ExpandTransformer(BaseMultiColumnTransformer):

    def _fit(self, columns_data, columns_to_sdtypess):
        name_1 = self.columns[0] + '.first_part'
        name_2 = self.columns[0] + '.second_part'
        name_3 = self.columns[1] + '.first_part'
        name_4 = self.columns[1] + '.second_part'
        self.output_properties = {
            name_1: {'sdtype': 'categorical'},
            name_2: {'sdtype': 'categorical'},
            name_3: {'sdtype': 'categorical'},
            name_4: {'sdtype': 'categorical'}
        }
        self.names = [name_1, name_2, name_3, name_4]
        self.dtypes = columns_data.dtypes

    def _transform(self, data):
        data[self.names[0]] = data[self.columns[0]].str[0]
        data[self.names[1]] = data[self.columns[0]].str[1]
        data[self.names[2]] = data[self.columns[1]].str[0]
        data[self.names[3]] = data[self.columns[1]].str[1]

        return data.drop(columns=self.columns)

    def _reverse_transform(self, data):
        result = data.copy()
        result[self.columns[0]] = result[self.names[0]] + result[self.names[1]]
        result[self.columns[1]] = result[self.names[2]] + result[self.names[3]]

        return result.astype(self.dtypes).drop(columns=self.names)
    

data_test = pd.DataFrame({
    'col_1': ['AB', 'CD', 'EF'],
    'col_2': ['GH', 'IJ', 'KL'],
})

column_to_sdtype= {
    'col_1': 'categorical',
    'col_2': 'categorical',
}
transformer = ExpandTransformer()

# Run
transformer.fit(data_test, column_to_sdtype)
transformed = transformer.transform(data_test)
reverse = transformer.reverse_transform(transformed)

# Assert
expected_transform = pd.DataFrame({
    'col_1.first_part': ['A', 'C', 'E'],
    'col_1.second_part': ['B', 'D', 'F'],
    'col_2.first_part': ['G', 'I', 'K'],
    'col_2.second_part': ['H', 'J', 'L']
})
pd.testing.assert_frame_equal(expected_transform, transformed)
pd.testing.assert_frame_equal(reverse, data_test)

In [34]:
transformed

Unnamed: 0,col_1.first_part,col_1.second_part,col_2.first_part,col_2.second_part
0,A,B,G,H
1,C,D,I,J
2,E,F,K,L


In [37]:
transformer._get_output_to_property('lala')

KeyError: 'lala'

In [35]:
reverse

Unnamed: 0,col_1,col_2
0,AB,GH
1,CD,IJ
2,EF,KL


In [1]:
import numpy as np
import pandas as pd
import pytest

from rdt import get_demo
from rdt.errors import ConfigNotSetError, InvalidConfigError, InvalidDataError, NotFittedError
from rdt.hyper_transformer import Config, HyperTransformer
from rdt.transformers import (
    AnonymizedFaker, BaseTransformer, BinaryEncoder, ClusterBasedNormalizer, FloatFormatter,
    FrequencyEncoder, LabelEncoder, OneHotEncoder, RegexGenerator, UniformEncoder, BaseMultiColumnTransformer,
    UnixTimestampEncoder, get_default_transformer, get_default_transformers)
from rdt.transformers.datetime import OptimizedTimestampEncoder
from rdt.transformers.numerical import GaussianNormalizer
from rdt.transformers.pii.anonymizer import PseudoAnonymizedFaker


class DummyMultiColumnTransformerNumerical(BaseMultiColumnTransformer):
    """Multi column transformer that takes categorical data."""

    SUPPORTED_SDTYPES = ['categorical', 'boolean']

    def _fit(self, columns_data, columns_to_sdtypess):
        self.output_properties = {
            column: {
                'sdtype': 'numerical',
            } for column in self.columns
        }

    def _transform(self, data):
        return data.astype(float)

    def _reverse_transform(self, data):
        return data.astype(str)

dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean',
        'D': 'categorical',
        'E': 'categorical'
    },
    'transformers': {
        'A': UniformEncoder(),
        ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(),
        'E': UniformEncoder()
    }
}

config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)

# Run
ht.remove_transformers_by_sdtype(sdtype='boolean')
new_config = ht.get_config()

# Assert
expected_config = Config({
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "categorical",
        "E": "categorical"
    },
    "transformers": {
        "A": UniformEncoder(),
        "E": UniformEncoder(),
        "('C', 'D')": DummyMultiColumnTransformerNumerical(),
        "B": None
    }
})

assert repr(new_config) == repr(expected_config)


InvalidConfigError: The column names in the 'sdtypes' dictionary must match the column names in the 'transformers' dictionary.

In [2]:
new_config

{
    "sdtypes": {
        "A": "categorical",
        "B": "categorical",
        "C": "boolean",
        "D": "categorical",
        "E": "categorical"
    },
    "transformers": {
        "A": UniformEncoder(),
        "E": UniformEncoder(),
        "('B', 'D')": DummyMultiColumnTransformerNumerical(),
        "C": None
    }
}

In [10]:
ht = HyperTransformer()
ht.field_sdtypes = {
    'column1': 'categorical',
    'column2': 'categorical',
    'column3': 'categorical',
    'column4': 'categorical'
}
ht.field_transformers = {
    'column1': 'transformer',
    ('column2', 'column3'): 'multi_column_transformer',
    'column4': 'transformer'
}
ht._multi_column_fields = {
    'column2': ('column2', 'column3'),
    'column3': ('column2', 'column3')
}

# Run
ht.remove_transformers(column_names=['column3', 'column4'])

In [11]:
ht.get_config()

{
    "sdtypes": {
        "column1": "categorical",
        "column2": "categorical",
        "column3": "categorical",
        "column4": "categorical"
    },
    "transformers": {
        "column1": "'transformer'",
        "column4": None,
        "column2": "'multi_column_transformer'",
        "column3": None
    }
}

In [3]:
ht = HyperTransformer()
ht.field_sdtypes = {
    'column1': 'categorical',
    'column2': 'categorical',
    'column3': 'boolean',
    'column4': 'boolean'
}
ht.field_transformers = {
    'column1': 'transformer',
    ('column2', 'column3'): 'multi_column_transformer',
    'column4': 'transformer'
}
ht._multi_column_fields = {
    'column2': ('column2', 'column3'),
    'column3': ('column2', 'column3')
}

# Run
ht.update_sdtypes(column_name_to_sdtype={
    'column1': 'boolean',
    }
)

# Assert
assert ht.field_transformers == {
    'column1': 'transformer',
    'column2': 'multi_column_transformer',
    'column3': None,
    'column4': None
}

AttributeError: 'str' object has no attribute 'get_supported_sdtypes'

In [4]:
ht.get_config()

{
    "sdtypes": {
        "column1": "categorical",
        "column2": "categorical",
        "column3": "boolean",
        "column4": "boolean"
    },
    "transformers": {
        "column1": "'transformer'",
        "column4": None,
        "column2": "'multi_column_transformer'",
        "column3": None
    }
}

In [13]:
"""Test ``remove_transformer_by_sdtype`` with multi column transformer."""
# Setup
dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean',
        'D': 'categorical',
        'E': 'categorical'
    },
    'transformers': {
        'A': UniformEncoder(),
        ('B', 'C', 'D'): DummyMultiColumnTransformerNumerical(),
        'E': UniformEncoder()
    }
}

config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)

# Run
ht.update_sdtypes({
    'C': 'numerical',
    'A': 'numerical'
})
new_config = ht.get_config()

# Assert
expected_config = Config({
    'sdtypes': {
        'A': 'numerical',
        'B': 'categorical',
        'C': 'numerical',
        'D': 'categorical',
        'E': 'categorical'
    },
    'transformers': {
        'A': FloatFormatter(),
        'E': UniformEncoder(),
        "('B', 'D')": DummyMultiColumnTransformerNumerical(),
        'C': FloatFormatter()
    }
})

assert repr(new_config) == repr(expected_config)

In [14]:
new_config

{
    "sdtypes": {
        "A": "numerical",
        "B": "categorical",
        "C": "numerical",
        "D": "categorical",
        "E": "categorical"
    },
    "transformers": {
        "A": FloatFormatter(),
        "E": UniformEncoder(),
        "('B', 'D')": DummyMultiColumnTransformerNumerical(),
        "C": FloatFormatter()
    }
}

In [15]:
class DummyMultiColumnTransformer(BaseMultiColumnTransformer):
    """Dummy multi column transformer."""

    SUPPORTED_SDTYPES = ['categorical', 'boolean']


ht = HyperTransformer()
ht.field_sdtypes = {
    'column1': 'categorical',
    'column2': 'categorical',
    'column3': 'categorical',
    'column4': 'categorical'
}
ht.field_transformers = {
    'column1': UniformEncoder(),
    ('column2', 'column3'): DummyMultiColumnTransformer(),
    'column4': None
}
ht._multi_column_fields = {
    'column2': ('column2', 'column3'),
    'column3': ('column2', 'column3')
}

# Run
ht.update_sdtypes(column_name_to_sdtype = {
    'column2': 'boolean',
    'column1': 'boolean'}
)

In [16]:
ht.get_config()

{
    "sdtypes": {
        "column1": "boolean",
        "column2": "boolean",
        "column3": "categorical",
        "column4": "categorical"
    },
    "transformers": {
        "column1": UniformEncoder(),
        "('column2', 'column3')": DummyMultiColumnTransformer(),
        "column4": None
    }
}

In [26]:
import pytest
class DummyMultiColumnTransformer(BaseMultiColumnTransformer):
    """Dummy multi column transformer."""

    SUPPORTED_SDTYPES = ['categorical', 'boolean']


ht = HyperTransformer()
ht.field_sdtypes = {
    'column1': 'categorical',
    'column2': 'categorical',
    'column3': 'categorical',
    'column4': 'categorical'
}
ht.field_transformers = {
    'column1': UniformEncoder(),
    ('column2', 'column3'): DummyMultiColumnTransformer(),
    'column4': None
}
ht._multi_column_fields = {
    'column2': ('column2', 'column3'),
    'column3': ('column2', 'column3')
}

# Run
expected_warning = (
    "Sdtype 'numerical' is incompatible with transformer 'DummyMultiColumnTransformer'. Assigning a new transformer to it."
)
with pytest.warns(UserWarning, match=expected_warning):
    ht.update_sdtypes(column_name_to_sdtype = {
        'column2': 'numerical',
        'column1': 'boolean'}
    )

# Assert
expected_field_sdtypes = {
    'column1': 'boolean',
    'column2': 'numerical',
    'column3': 'categorical',
    'column4': 'categorical'
}
expected_field_transformers = {
    'column1': UniformEncoder(),
    'column4': None,
    'column3': DummyMultiColumnTransformer(),
    'column2': FloatFormatter(),
}
assert ht.field_sdtypes == expected_field_sdtypes
assert str(ht.field_transformers) == str(expected_field_transformers)

In [24]:
str(ht.field_transformers)

"{'column1': UniformEncoder(), 'column4': None, 'column3': DummyMultiColumnTransformer(), 'column2': FloatFormatter()}"

In [25]:
str(expected_field_transformers)

"{'column1': UniformEncoder(), 'column4': None, 'column2': FloatFormatter(), 'column3': DummyMultiColumnTransformer()}"

In [19]:
ht.field_sdtypes

{'column1': 'boolean',
 'column2': 'numerical',
 'column3': 'categorical',
 'column4': 'categorical'}

In [30]:
transformer.dtypes

[dtype('int64'), dtype('int64'), dtype('int64')]

In [29]:
class AdditionTransformer(BaseMultiColumnTransformer):
    """This transformer takes 3 columns and return the cumulative sum of each row."""
    def _fit(self, columns_data, columns_to_sdtypess):
        self.output_properties = {
            column: {'sdtype': 'numerical'} for column in self.columns
        }
        self.dtypes = list(columns_data.dtypes)

    def _generate_prefixes(self, data, ordered_columns):
        prefixes = {}
        for idx, column in enumerate(self.output_properties):
            prefixes[column] = '#'.join(ordered_columns[:idx + 1])

        return prefixes

    def _transform(self, data):
        return data.cumsum(axis=1)

    def _reverse_transform(self, data):
        result = data.diff(axis=1)
        result.iloc[:, 0] = data.iloc[:, 0]

        return result.astype(self.dtypes)

data_test = pd.DataFrame({
'col_1': [1, 2, 3],
'col_2': [10, 20, 30],
'col_3': [100, 200, 300]
})

order_columns = ('col_1', 'col_2', 'col_3')
transformer = AdditionTransformer()

# Run
transformed = transformer.fit_transform(data_test, order_columns)
reverse = transformer.reverse_transform(transformed)

TypeError: Field elements must be 2- or 3-tuples, got 'dtype('int64')'

In [31]:
transformer.output_columns

['col_1.col_1', 'col_1#col_2.col_2', 'col_1#col_2#col_3.col_3']

In [27]:
reverse.dtypes

col_1    object
col_2    object
col_3    object
dtype: object

In [19]:
transformer.dtypes.values

array([dtype('int64'), dtype('int64'), dtype('int64')], dtype=object)

In [11]:
transformer.output_columns

['col_1.col_1', 'col_1#col_2.col_2', 'col_1#col_2#col_3.col_3']

In [16]:
t = transformer._get_columns_data(transformed, transformer.output_columns)

In [17]:
transformer._reverse_transform(t)

KeyError: 'Only a column name can be used for the key in a dtype mappings argument.'

In [8]:
transformed

Unnamed: 0,col_1.col_1,col_1#col_2.col_2,col_1#col_2#col_3.col_3
0,1,11,111
1,2,22,222
2,3,33,333


In [6]:
import pandas as pd
import numpy as np
from rdt.transformers import UniformEncoder, BaseMultiColumnTransformer
from rdt.hyper_transformer import Config, HyperTransformer

class DummyMultiColumnTransformerNumerical(BaseMultiColumnTransformer):
    """Multi column transformer that takes categorical data."""

    SUPPORTED_SDTYPES = ['categorical', 'boolean']

    def _fit(self, data, ordered_columns):
        self.output_properties = {
            column: {
                'sdtype': 'numerical',
                'next_transformer': None
            } for column in self.columns
        }

    def _generate_prefixes(self, ordered_columns):
        prefixes = {column: column for column in self.output_properties}
        return prefixes

    def _transform(self, data):
        return data.astype(float)

    def _reverse_transform(self, data):
        return data.astype(str)

dict_config = {
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean'
    },
    'transformers': {
        'A': None,
        'B': UniformEncoder(),
        'C': UniformEncoder()
    }
}
config = Config(dict_config)
ht = HyperTransformer()
ht.set_config(config)

# Run
ht.update_transformers({
    ('A', 'B'): DummyMultiColumnTransformerNumerical(),
})
new_config = ht.get_config()

# Assert
expected_config = Config({
    'sdtypes': {
        'A': 'categorical',
        'B': 'categorical',
        'C': 'boolean'
    },
    'transformers': {
        'C': UniformEncoder(),
        "('A', 'B')": DummyMultiColumnTransformerNumerical()
    }
})

assert repr(new_config) == repr(expected_config)

In [8]:
data_test = pd.DataFrame({
    'A': ['1', '2', '3'],
    'B': ['4', '5', '6'],
    'C': [True, False, True]
})
transformed = ht.fit_transform(data_test)

In [14]:
transformed.to_dict()

{'C': {0: 0.5225768219566304, 1: 0.7797813625043645, 2: 0.31881544039752413},
 'A.A': {0: 1.0, 1: 2.0, 2: 3.0},
 'B.B': {0: 4.0, 1: 5.0, 2: 6.0}}

In [10]:
rt = ht.reverse_transform(transformed)

In [11]:
rt.dtypes

A    object
B    object
C      bool
dtype: object