In [1]:
from hypex.dataset.dataset import Dataset, ExperimentData
from hypex.dataset.roles import (
    InfoRole,
    FeatureRole,
    TreatmentRole,
    TargetRole,
)
from hypex.experiments.aa_test import AA_TEST
from hypex.reporters.aa import AADictReporter

In [2]:
# Role assigned to each listed column of data.csv.
column_roles = {
    "user_id": InfoRole(),
    "signup_month": FeatureRole(),
    "treat": TreatmentRole(),
    "pre_spends": TargetRole(),
    "post_spends": TargetRole(),
    "age": FeatureRole(),
    "gender": FeatureRole(),
    "industry": FeatureRole(),
}

# Build the Dataset from the CSV file using the mapping above, then display it.
data = Dataset(roles=column_roles, data="data.csv")
data

      user_id  signup_month  treat  pre_spends  post_spends   age gender  \
0           0             0      0       488.0   414.444444   NaN      M   
1           1             8      1       512.5   462.222222  26.0    NaN   
2           2             7      1       483.0   479.444444  25.0      M   
3           3             0      0       501.5   424.333333  39.0      M   
4           4             1      1       543.0   514.555556  18.0      F   
...       ...           ...    ...         ...          ...   ...    ...   
9995     9995            10      1       538.5   450.444444  42.0      M   
9996     9996             0      0       500.5   430.888889  26.0      F   
9997     9997             3      1       473.0   534.111111  22.0      F   
9998     9998             2      1       495.0   523.222222  67.0      F   
9999     9999             7      1       508.0   475.888889  38.0      F   

        industry  
0     E-commerce  
1     E-commerce  
2      Logistics  
3     E-com

In [3]:
# Show the column -> role mapping stored on the Dataset.
data.roles

{'user_id': Info,
 'signup_month': Feature,
 'treat': Treatment,
 'pre_spends': Target,
 'post_spends': Target,
 'age': Feature,
 'gender': Feature,
 'industry': Feature}

In [4]:
# Run the library-provided AA_TEST experiment on the dataset.
# The Dataset is wrapped in ExperimentData before being handed to execute().
test = AA_TEST
ed = ExperimentData(data)
result = test.execute(ed)

In [5]:
# Summarise the AA-test result; the reporter output is displayed as a flat dict of metrics.
AADictReporter().report(result)

{'random_state': None,
 'B pre_spends control mean': 486.9154,
 'B pre_spends difference': 0.3567000000000462,
 'B pre_spends difference %': 0.07325707915586399,
 'B pre_spends test mean': 487.2721,
 'B post_spends control mean': 451.436,
 'B post_spends difference': 1.4571111111112032,
 'B post_spends difference %': 0.32277246633214407,
 'B post_spends test mean': 452.8931111111112,
 'B control size': 5000.0,
 'B control size %': 50.0,
 'B test size': 5000.0,
 'B test size %': 50.0,
 'TTest p-value': 0.20454317633846736,
 'TTest pass': 0.0,
 'KSTest p-value': 0.17680525052673585,
 'KSTest pass': 0.5,
 'mean test score': 0.18605122579731304}

In [6]:
# Inspect the individual analysis tables stored on the AA-test result.
# The keys appear to encode the executor name and its parameters — TODO confirm the exact scheme.
result.analysis_tables

{'GroupSizes╰╰[][A]':                      B
 control size    5000.0
 control size %    50.0
 test size       5000.0
 test size %       50.0,
 "GroupDifference╰╰['pre_spends'][A]":                                   B
 pre_spends control mean  486.915400
 pre_spends difference      0.356700
 pre_spends difference %    0.073257
 pre_spends test mean     487.272100,
 "TTest╰╰['pre_spends'][A]":   group  statistic   p-value   pass
 0     B  -0.945119  0.344621  False,
 "KSTest╰╰['pre_spends'][A]":   group  statistic   p-value   pass
 0     B     0.0186  0.352691  False,
 "GroupDifference╰╰['post_spends'][A]":                                    B
 post_spends control mean  451.436000
 post_spends difference      1.457111
 post_spends difference %    0.322772
 post_spends test mean     452.893111,
 "TTest╰╰['post_spends'][A]":   group  statistic   p-value   pass
 0     B  -1.849151  0.064465  False,
 "KSTest╰╰['post_spends'][A]":   group  statistic  p-value  pass
 0     B     0.0392  0.00092

In [7]:
from hypex.experiments.ab_test import AB_TEST

# Run the library-provided AB_TEST experiment on the same dataset.
# NOTE: this rebinds `test`, `ed` and `result`, replacing the AA-test objects.
test = AB_TEST
ed = ExperimentData(data)
result = test.execute(ed)

In [8]:
# Display the single summary table stored under the ABAnalyzer key of the AB-test result.
result.analysis_tables['ABAnalyzer╰╰']

   TTest p-value  TTest pass  MannWhitney p-value  MannWhitney pass  \
0   1.157524e-30         1.0         3.812131e-12               1.0   

   GroupATE ['pre_spends'][t]  GroupATE ['post_spends'][t]  
0                   487.09375                   452.164556  

In [9]:
from hypex.experiments.homogeneity_test import HOMOGENEITY_TEST

# Run the library-provided HOMOGENEITY_TEST experiment on the same dataset.
# NOTE: this rebinds `test`, `ed` and `result`, replacing the AB-test objects.
test = HOMOGENEITY_TEST
ed = ExperimentData(data)
result = test.execute(ed)

In [10]:
# Inspect every analysis table stored on the homogeneity-test result.
result.analysis_tables

{"GroupDifference╰╰['pre_spends'][t]":                                   1
 pre_spends control mean  484.911973
 pre_spends difference      4.308406
 pre_spends difference %    0.888492
 pre_spends test mean     489.220379,
 "TTest╰╰['pre_spends'][t]":    group  statistic       p-value  pass
 0      1 -11.489293  2.315047e-30  True,
 "KSTest╰╰['pre_spends'][t]":    group  statistic       p-value  pass
 0      1   0.077573  1.559150e-13  True,
 "GroupDifference╰╰['post_spends'][t]":                                    1
 post_spends control mean  420.046619
 post_spends difference     63.424045
 post_spends difference %   15.099287
 post_spends test mean     483.470664,
 "TTest╰╰['post_spends'][t]":    group   statistic  p-value  pass
 0      1 -135.560001      0.0  True,
 "KSTest╰╰['post_spends'][t]":    group  statistic  p-value  pass
 0      1     0.8959      0.0  True,
 'OneAASplitAnalyzer╰╰':    TTest p-value  TTest pass  KSTest p-value  KSTest pass  mean test score
 0   1.157524e-3

In [11]:
import pandas as pd
import warnings
# Ignore pandas PerformanceWarning from here on so it does not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

from hypex.comparators.hypothesis_testing import TTest, KSTest
from hypex.comparators.comparators import GroupSizes, GroupDifference
from hypex.utils.enums import SpaceEnum
from hypex.splitters.aa import AASplitter
from hypex.analyzers.aa import OneAASplitAnalyzer
from hypex.experiments.base import CycledExperiment, Experiment, OnRoleExperiment

# Step 1 of a single AA iteration: the splitter.
splitter = AASplitter()

# Comparators evaluated for each target column. Every one groups rows by the
# treatment role and is configured with the "additional" space.
per_target_executors = [
    comparator(grouping_role=TreatmentRole(), space=SpaceEnum.additional)
    for comparator in (GroupSizes, GroupDifference, TTest, KSTest)
]
on_targets = OnRoleExperiment(executors=per_target_executors, role=TargetRole())

# One AA iteration = split, then run the comparators over the target columns.
aa = Experiment(executors=[splitter, on_targets])

# Repeat the iteration twice and pass OneAASplitAnalyzer as the analyzer.
experiment = CycledExperiment(
    n_iterations=2,
    inner_executor=aa,
    analyzer=OneAASplitAnalyzer(),
)
ed = ExperimentData(data)
res = experiment.execute(ed)

In [12]:
# Show the underlying table held by the Dataset.
# NOTE(review): the output contains an `Unnamed: 0` column that has no role assigned —
# presumably an index column saved into data.csv; confirm whether it should be dropped.
data.data

Unnamed: 0,user_id,signup_month,treat,pre_spends,post_spends,age,gender,industry
0,0,0,0,488.0,414.444444,,M,E-commerce
1,1,8,1,512.5,462.222222,26.0,,E-commerce
2,2,7,1,483.0,479.444444,25.0,M,Logistics
3,3,0,0,501.5,424.333333,39.0,M,E-commerce
4,4,1,1,543.0,514.555556,18.0,F,E-commerce
...,...,...,...,...,...,...,...,...
9995,9995,10,1,538.5,450.444444,42.0,M,Logistics
9996,9996,0,0,500.5,430.888889,26.0,F,Logistics
9997,9997,3,1,473.0,534.111111,22.0,F,E-commerce
9998,9998,2,1,495.0,523.222222,67.0,F,E-commerce


In [13]:
# Show the additional fields of the cycled experiment result:
# one AASplitter column of A/B labels per iteration (two columns for n_iterations=2).
res.additional_fields

     AASplitter╰╰0 AASplitter╰╰1
0                B             A
1                A             B
2                A             B
3                A             A
4                A             A
...            ...           ...
9995             A             B
9996             B             A
9997             B             A
9998             B             B
9999             A             B

[10000 rows x 2 columns]

In [14]:
# Inspect the analysis tables produced by the cycled experiment;
# each key ends with the iteration index (e.g. ...[A]0 for the first iteration).
res.analysis_tables

{"GroupSizes╰╰['pre_spends'][A]0":                      B
 control size    5000.0
 control size %    50.0
 test size       5000.0
 test size %       50.0,
 "GroupDifference╰╰['pre_spends'][A]0":                                   B
 pre_spends control mean  487.220700
 pre_spends difference     -0.253900
 pre_spends difference %   -0.052112
 pre_spends test mean     486.966800,
 "TTest╰╰['pre_spends'][A]0":   group  statistic   p-value   pass
 0     B   0.672723  0.501139  False,
 "KSTest╰╰['pre_spends'][A]0":   group  statistic   p-value   pass
 0     B     0.0136  0.744274  False,
 "GroupSizes╰╰['post_spends'][A]0":                      B
 control size    5000.0
 control size %    50.0
 test size       5000.0
 test size %       50.0,
 "GroupDifference╰╰['post_spends'][A]0":                                    B
 post_spends control mean  452.232889
 post_spends difference     -0.136667
 post_spends difference %   -0.030220
 post_spends test mean     452.096222,
 "TTest╰╰['post_spends']

In [15]:
# Run OneAASplitAnalyzer explicitly on the cycled experiment result.
# NOTE(review): the CycledExperiment above was already constructed with
# analyzer=OneAASplitAnalyzer(); confirm whether this second run is intentional.
res_analyzer = OneAASplitAnalyzer().execute(res)

In [16]:
# Display the summary table stored under the OneAASplitAnalyzer key.
res_analyzer.analysis_tables['OneAASplitAnalyzer╰╰']

   TTest p-value  TTest pass  KSTest p-value  KSTest pass  mean test score
0       0.540555        0.25        0.664089         0.25         0.622911

In [17]:
# Analyser prototype.
#
# For every target column, collect the `pass` flag of each KSTest / TTest
# analysis table produced by the cycled experiment, average the flags per test,
# and add a 0/1 column telling whether that average lies inside the accepted
# band around the significance level.

from hypex.dataset.roles import StatisticRole

ALPHA = 0.05               # significance level the average pass flag is compared with
LOWER_BOUND = 0.8 * ALPHA  # accepted band is [0.8 * ALPHA, 1.2 * ALPHA]
UPPER_BOUND = 1.2 * ALPHA

fields = ['pre_spends', 'post_spends']  # columns carrying TargetRole
tests = ['KSTest', 'TTest']


def _rate_in_band(column):
    """Return a row function giving 1 if row[column] is within the accepted band, else 0."""
    def check(row):
        return int(LOWER_BOUND <= row[column] <= UPPER_BOUND)
    return check


new_res = {}
for field in fields:
    # test name -> list of `pass` flags, one per matching analysis table
    pass_flags = {}
    for table_key, table in res.analysis_tables.items():
        # The part of the key before the separator is the executor (test) name.
        test_name = table_key.split('╰╰')[0]
        if field in table_key and test_name in tests:
            pass_flags.setdefault(test_name, []).append(list(table.data['pass'])[0])

    if not pass_flags:
        # Same exception type as a missing dict key, but with a readable message.
        raise KeyError(f"no {tests} analysis tables found for target {field!r}")

    summary = Dataset.from_dict(pass_flags, roles={}).mean()
    # Column order is kept as before: 'TTest passed' first, then 'KSTest passed'.
    for test_name in ('TTest', 'KSTest'):
        summary.add_column(summary.apply(
            _rate_in_band(test_name), {f'{test_name} passed': StatisticRole()}, axis=1
        ))
    new_res[field] = summary
new_res

{'pre_spends':       KSTest  TTest  TTest passed  KSTest passed
 mean     0.5    0.5             0              0,
 'post_spends':       KSTest  TTest  TTest passed  KSTest passed
 mean     0.0    0.0             0              0}

In [18]:
import pandas as pd

# Load the raw CSV directly with pandas, bypassing the hypex Dataset wrapper.
t = pd.read_csv("data.csv")
# frac=1 returns every row in shuffled order; random_state=None leaves the
# shuffle unseeded, so the order differs between runs.
t.sample(frac=1, random_state=None)

Unnamed: 0,user_id,signup_month,treat,pre_spends,post_spends,age,gender,industry
4567,4567,0,0,486.0,415.000000,68.0,F,Logistics
8542,8542,11,1,491.0,437.666667,24.0,M,Logistics
7857,7857,0,0,487.5,420.666667,21.0,F,Logistics
587,587,8,1,458.5,460.111111,57.0,M,Logistics
7319,7319,9,1,485.5,454.666667,65.0,M,Logistics
...,...,...,...,...,...,...,...,...
1822,1822,4,1,477.0,503.111111,67.0,M,E-commerce
2622,2622,1,1,518.5,511.111111,35.0,M,E-commerce
6519,6519,0,0,486.0,418.333333,68.0,M,Logistics
9054,9054,0,0,483.5,415.888889,54.0,M,Logistics
