In [1]:
import copy

from hypex.experiments.preprocessing import PREPROCESSING_DATA
from hypex.dataset import Dataset, ExperimentData, InfoRole, TreatmentRole, TargetRole, FeatureRole

# Test dataset creation 

In [2]:
from hypex.dataset import PreTargetRole

# Role assigned to each column of data.csv. Key order is kept as in the
# original cell, matching the order shown by `data.roles` below.
column_roles = {
    "user_id": InfoRole(),
    "signup_month": FeatureRole(),
    "treat": TreatmentRole(),
    "pre_spends": PreTargetRole(),
    "post_spends": TargetRole(),
    "age": FeatureRole(),
    "gender": FeatureRole(),
    "industry": FeatureRole(),
}
data = Dataset(roles=column_roles, data="data.csv")

# Append a copy of the `post_spends` column as a second float Target column.
# NOTE(review): "poust_spends" looks like a misspelling, but the outputs of the
# later cells are keyed by this exact name, so it is kept unchanged.
post_spends_column = data["post_spends"]
data = data.add_column(post_spends_column, {"poust_spends": TargetRole(float)})
data

      user_id  signup_month  treat  pre_spends  post_spends   age gender  \
0           0             0      0       488.0   414.444444   NaN      M   
1           1             8      1       512.5   462.222222  26.0    NaN   
2           2             7      1       483.0   479.444444  25.0      M   
3           3             0      0       501.5   424.333333  39.0      M   
4           4             1      1       543.0   514.555556  18.0      F   
...       ...           ...    ...         ...          ...   ...    ...   
9995     9995            10      1       538.5   450.444444  42.0      M   
9996     9996             0      0       500.5   430.888889  26.0      F   
9997     9997             3      1       473.0   534.111111  22.0      F   
9998     9998             2      1       495.0   523.222222  67.0      F   
9999     9999             7      1       508.0   475.888889  38.0      F   

        industry  poust_spends  
0     E-commerce    414.444444  
1     E-commerce    4

In [3]:
# Display the column -> role mapping of `data`, including the added
# `poust_spends` column.
data.roles

{'user_id': Info(<class 'int'>),
 'signup_month': Feature(<class 'int'>),
 'treat': Treatment(<class 'int'>),
 'pre_spends': PreTarget(<class 'float'>),
 'post_spends': Target(<class 'float'>),
 'age': Feature(<class 'float'>),
 'gender': Feature(<class 'str'>),
 'industry': Feature(<class 'str'>),
 'poust_spends': Target(<class 'float'>)}

## Preprocessing

In [4]:
# NOTE(review): GroupDifference is imported but not used in the cells visible here.
from hypex.comparators import GroupDifference, KSTest

# Run the test on a deep copy of `data` — presumably so the Dataset built above
# is left untouched by the run; confirm whether `execute` mutates its input.
pl_data = copy.deepcopy(data)
# KSTest configured with compare_by="columns"; the grouping role is Treatment,
# which in this dataset is the role of the `treat` column.
# NOTE(review): exact comparison semantics are defined by HypEx — confirm there.
test = KSTest(compare_by="columns", grouping_role=TreatmentRole())
# Wrap the copied dataset in ExperimentData, the object passed to `execute`.
ed = ExperimentData(pl_data)
# `result` is inspected in the following cells via `.ds` and `.analysis_tables`.
result = test.execute(ed)

In [5]:
# Display the dataset carried by the object returned from `test.execute`.
result.ds

      user_id  signup_month  treat  pre_spends  post_spends   age gender  \
0           0             0      0       488.0   414.444444   NaN      M   
1           1             8      1       512.5   462.222222  26.0    NaN   
2           2             7      1       483.0   479.444444  25.0      M   
3           3             0      0       501.5   424.333333  39.0      M   
4           4             1      1       543.0   514.555556  18.0      F   
...       ...           ...    ...         ...          ...   ...    ...   
9995     9995            10      1       538.5   450.444444  42.0      M   
9996     9996             0      0       500.5   430.888889  26.0      F   
9997     9997             3      1       473.0   534.111111  22.0      F   
9998     9998             2      1       495.0   523.222222  67.0      F   
9999     9999             7      1       508.0   475.888889  38.0      F   

        industry  poust_spends  
0     E-commerce    414.444444  
1     E-commerce    4

In [6]:
# Display the analysis tables stored on `result`; the output below has one
# KSTest table with p-value / statistic / pass for each Target column.
result.analysis_tables

{"KSTest┴┴['post_spends', 'poust_spends']":               p-value  statistic  pass
 post_spends       0.0     0.5913   1.0
 poust_spends      0.0     0.5913   1.0}