In [1]:
import sys

import pandas as pd
import numpy as np

sys.path.insert(0, '..')
from retentioneering.eventstream.schema import RawDataSchema, EventstreamSchema
from scipy import stats
from retentioneering.eventstream import Eventstream, EventstreamSchema
from scipy.stats import chi2_contingency, fisher_exact, ks_2samp, mannwhitneyu

  from .autonotebook import tqdm as notebook_tqdm


In [65]:
df = pd.read_csv('../tests/datasets/tooling/stattests/01_simple_data.csv', index_col=0)
source_stream = Eventstream(df)

In [66]:
df_from_es = source_stream.to_dataframe()
df_from_es

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,d0d4f15a-dabb-4034-a288-d897c7d5ad2c,raw,0,catalog,2022-01-01 00:01:00,1
1,3cc215e9-2c17-42d3-be17-11364e1708ae,raw,1,product1,2022-01-01 00:01:00,3
2,060bea04-a228-4af1-b52d-9eb8b188d33b,raw,2,catalog,2022-01-01 00:01:00,4
3,be612bd2-adcd-4e9f-90b1-6d375c9e1171,raw,3,product1,2022-01-01 00:01:00,6
4,c3872ada-c063-41d9-9db1-dcd77b7a9f36,raw,4,catalog,2022-01-01 00:01:00,7
...,...,...,...,...,...,...
56,6ae61bb0-4a65-4a27-8ac6-2836aea1ed72,raw,56,product1,2022-02-01 00:01:00,2
57,61fe7e9d-3cd4-4223-85f1-b273883e768f,start,57,start,2022-02-01 00:01:00,2
58,174da413-b19e-489b-a703-40d3c1777ca0,raw,58,product2,2022-02-01 00:02:00,2
59,8ff786b8-a938-4bf7-b1c3-5e5378797b69,raw,59,cart,2022-02-01 00:07:00,2


In [56]:
users_by_event_cnt = df_from_es.groupby('user_id')['event_id'].nunique().sort_values(ascending=False)
users_by_event_cnt

user_id
4    9
7    9
8    9
3    8
6    8
5    7
1    6
2    5
Name: event_id, dtype: int64

In [57]:
most_active = users_by_event_cnt.index[:int(users_by_event_cnt.shape[0]/2)].values
least_active = users_by_event_cnt.index.values[~np.isin(users_by_event_cnt.index.values, most_active)]

Let us check whether the most active 50% of users are significantly more active than the least active 50%:

In [62]:
test_results = source_stream.stattests(groups=(most_active, least_active),
                                objective=lambda x: x.shape[0],
                                group_names=('most_active', 'least_active'),
                                test='ttest')
test_results.values()

{'group_one_name': 'most_active',
 'group_one_size': 4,
 'group_one_mean': 8.75,
 'group_one_SD': 0.4330127018922193,
 'group_two_name': 'least_active',
 'group_two_size': 4,
 'group_two_mean': 6.5,
 'group_two_SD': 1.118033988749895,
 'greatest_group_name': 'least_active',
 'is_group_one_greatest': False,
 'p_val': 0.008728594297886161,
 'power_estimated': 0.8873712359977801}

In [68]:
test_results = source_stream.stattests( groups=([1, 2, 3, 4], [5, 6, 7, 8]),
                                       objective=lambda x: x.shape[0],
                                        group_names=("group_1", "group_2"),
                                       test='ttest', alpha=0.1)
test_results.values()

{'group_one_name': 'group_1',
 'group_one_size': 4,
 'group_one_mean': 7.0,
 'group_one_SD': 1.5811388300841898,
 'group_two_name': 'group_2',
 'group_two_size': 4,
 'group_two_mean': 8.25,
 'group_two_SD': 0.82915619758885,
 'greatest_group_name': 'group_1',
 'is_group_one_greatest': True,
 'p_val': 0.13541442939197038,
 'power_estimated': 0.4390361715046176}

Same but for "payment_done" indicator:

In [25]:
source_stream.stattests(groups=(most_active, least_active),
                                objective=lambda x: 'payment_done' in x['event'].values,
                                group_names=('most_active', 'least_active'),
                                test='ttest').values()

  tstat = (value1 - value2 - diff) / std_diff


{'group_one_name': 'most_active',
 'group_one_size': 4,
 'group_one_mean': 1.0,
 'group_one_SD': 0.0,
 'group_two_name': 'least_active',
 'group_two_size': 4,
 'group_two_mean': 1.0,
 'group_two_SD': 0.0,
 'greatest_group_name': 'most_active',
 'is_group_one_greatest': True,
 'p_val': nan,
 'power_estimated': 0.05000000000849336}

In [26]:
source_stream.stattests(groups=(most_active, least_active),
                                objective=lambda x: 'payment_done' in x['event'].values,
                                group_names=('most_active', 'least_active'),
                                test='chi2_contingency').values()

{'group_one_name': 'most_active',
 'group_one_size': 4,
 'group_one_mean': 1.0,
 'group_one_SD': 0.0,
 'group_two_name': 'least_active',
 'group_two_size': 4,
 'group_two_mean': 1.0,
 'group_two_SD': 0.0,
 'greatest_group_name': 'most_active',
 'is_group_one_greatest': True,
 'p_val': 1.0,
 'power_estimated': 0.0}

Same but for first 50% chronologically vs the rest:

In [28]:
users_by_appearance = df.groupby('user_id')['event_timestamp'].min().sort_values()
first_half = users_by_appearance.index[:int(users_by_event_cnt.shape[0]/2)].values
second_half = users_by_appearance.index.values[~np.isin(users_by_appearance.index.values, most_active)]

source_stream.stattests(groups=(first_half, second_half),
                                objective=lambda x: 'payment_done' in x['event_name'].values,
                                group_names=('most_active', 'least_active'),
                                test='mannwhitneyu').values()

KeyError: 'Column not found: event_timestamp'

Same but for random 50%:

In [184]:
users = df['user_id'].unique()
np.random.seed(962)
first_half = np.random.choice(users, int(users.shape[0]/2), replace=False)
second_half = users[~np.isin(users, first_half)]

test_stream.stattests(groups=(first_half, second_half),
                                objective=lambda x: 'payment_done' in x['event'].values,
                                group_names=('most_active', 'least_active'),
                                test='chi2_contingency').values()

{'group_one_name': 'most_active',
 'group_one_size': 4,
 'group_one_mean': 1.0,
 'group_one_SD': 0.0,
 'group_two_name': 'least_active',
 'group_two_size': 4,
 'group_two_mean': 1.0,
 'group_two_SD': 0.0,
 'greatest_group_name': 'most_active',
 'is_group_one_greatest': True,
 'p_val': 1.0,
 'power_estimated': 0.0}

In [None]:
users = df['user_id'].unique()
np.random.seed(962)
first_half = np.random.choice(users, int(users.shape[0]/2), replace=False)
second_half = users[~np.isin(users, first_half)]

test_results = test_stream.stattests(groups=(first_half, second_half),
                                objective=lambda x: 'payment_done' in x['event_name'].values,
                                group_names=('most_active', 'least_active'),
                                test='fisher_exact')
test_results.values()

In [None]:
test_results.plot()

In [32]:
df = pd.read_csv('../tests/datasets/tooling/stattests/01_simple_data.csv', index_col=0)
df

Unnamed: 0,user_id,event,event_type,timestamp
0,1,start,start,2022-01-01 00:01:00
1,1,catalog,raw,2022-01-01 00:01:00
2,1,product1,raw,2022-01-01 00:02:00
3,1,product2,raw,2022-01-01 00:03:00
4,1,cart,raw,2022-01-01 00:07:00
...,...,...,...,...
56,8,catalog,raw,2022-01-01 00:04:00
57,8,product1,raw,2022-01-01 00:06:00
58,8,product18,raw,2022-01-01 00:06:30
59,8,payment_done,raw,2022-01-01 00:07:00


In [31]:
df_cont = pd.read_csv('../tests/datasets/tooling/stattests/02_continuous_data.csv', index_col=0)
df_cont

Unnamed: 0,user_id,event,event_type,timestamp,seconds
0,1,start,start,2022-01-01 00:01:00,0
1,1,catalog,raw,2022-01-01 00:01:00,60
2,1,product1,raw,2022-01-01 00:02:00,60
3,1,product2,raw,2022-01-01 00:03:00,60
4,1,cart,raw,2022-01-01 00:07:00,240
...,...,...,...,...,...
56,8,catalog,raw,2022-01-01 00:04:00,120
57,8,product1,raw,2022-01-01 00:06:00,30
58,8,product18,raw,2022-01-01 00:06:30,30
59,8,payment_done,raw,2022-01-01 00:07:00,60


In [36]:
raw_data_schema = RawDataSchema(
        event_name="event",
        event_timestamp="timestamp",
        user_id="user_id",
        custom_cols=[{"custom_col": "seconds", "raw_data_col": "seconds"}],
    )
source = Eventstream(
        schema=EventstreamSchema(
            custom_cols=["seconds"], event_name="event", event_timestamp="timestamp", user_id="user_id"
        ),
        raw_data_schema=raw_data_schema,
        raw_data=df_cont,
    )


In [37]:
test_results = source.stattests(groups=([1, 2, 3, 4], [5, 6, 7, 8]),
                 objective=lambda x: x['seconds'].mean(),
                 group_names=("group_1", "group_2"),
                 test="ks_2samp")
result = test_results.values()
result

{'group_one_name': 'group_1',
 'group_one_size': 4,
 'group_one_mean': 69.625,
 'group_one_SD': 19.246347056000005,
 'group_two_name': 'group_2',
 'group_two_size': 4,
 'group_two_mean': 56.964285714285715,
 'group_two_SD': 10.111867200306975,
 'greatest_group_name': 'group_2',
 'is_group_one_greatest': False,
 'p_val': 0.4,
 'power_estimated': 0.22734318115885754}

In [38]:
#ks_2samp
df1 = df_cont.loc[:27][['seconds', 'user_id']].groupby('user_id').mean()
sample1 = df1['seconds']
sample1

user_id
1    80.0
2    96.0
3    52.5
4    50.0
Name: seconds, dtype: float64

In [39]:
df2 = df_cont.loc[28:][['seconds', 'user_id']].groupby('user_id').mean()
sample2 = df2['seconds']
sample2

user_id
5    72.857143
6    45.000000
7    56.666667
8    53.333333
Name: seconds, dtype: float64

In [40]:
stats.ks_2samp(sample1, sample2, alternative="less")

KstestResult(statistic=0.5, pvalue=0.4)

In [41]:
sample1

user_id
1    80.0
2    96.0
3    52.5
4    50.0
Name: seconds, dtype: float64

In [42]:
np.array(sample1).mean()

69.625

In [43]:
np.array(sample2).std()

10.111867200306975

In [45]:
source_stream.add_start_end().to_dataframe()

  params_schema: dict[str, Any] = cls.schema()
  params_schema: dict[str, Any] = cls.schema()
  params_schema: dict[str, Any] = cls.schema()
  self.__events = pd.concat([result_left_part, result_right_part, result_deleted_events])


Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,79bc9e69-7ad7-415f-ae5b-770bd8177ef8,path_start,0,path_start,2022-01-01 00:01:00,1.0
1,8ef0b503-3c7e-4aaf-9c48-948b093ae785,path_start,1,path_start,2022-01-01 00:01:00,3.0
2,2d128b49-1f30-4e66-a796-5c364daaacbf,path_start,2,path_start,2022-01-01 00:01:00,4.0
3,1505f549-b72f-407d-b365-7c8debec7ca3,path_start,3,path_start,2022-01-01 00:01:00,5.0
4,1a579bde-2e40-40b2-be7c-749dcabd4855,path_start,4,path_start,2022-01-01 00:01:00,6.0
...,...,...,...,...,...,...
72,1bc580dd-a87b-4f59-ac69-b569c849bea8,start,72,start,2022-02-01 00:01:00,2.0
73,48a5fb68-6119-4568-a752-7815e3928d54,raw,73,product2,2022-02-01 00:02:00,2.0
74,6344cad5-2676-4eb2-945a-8aea23e42e21,raw,74,cart,2022-02-01 00:07:00,2.0
75,b48ad801-f16c-4e44-bde5-86dd047d7e5c,raw,75,payment_done,2022-02-01 00:08:00,2.0


In [196]:
#ztest
test_results = test_stream.stattests(groups=([1, 2, 3, 4], [5, 6, 7, 8]),
                 objective=lambda x: x.shape[0],
                 group_names=("group_1", "group_2"),
                 test="ztest")
result = test_results.values()
result

{'group_one_name': 'group_1',
 'group_one_size': 4,
 'group_one_mean': 7.0,
 'group_one_SD': 1.5811388300841898,
 'group_two_name': 'group_2',
 'group_two_size': 4,
 'group_two_mean': 8.25,
 'group_two_SD': 0.82915619758885,
 'greatest_group_name': 'group_1',
 'is_group_one_greatest': True,
 'p_val': 0.11262645318032655,
 'power_estimated': 0.2852558170575349}

In [14]:
df_cutted = pd.read_csv('../tests/datasets/tooling/stattests/03_сutted_data.csv', index_col=0)
source_stream_cut = Eventstream(df_cutted)

In [26]:
def group_col(raw):
    if raw['user_id'] in [1, 2, 3, 4]:
        return 1
    else:
        return 2
df_cutted['group'] = df_cutted.apply(group_col, axis=1)
df_cutted

Unnamed: 0,user_id,event,event_type,timestamp,group,payment_done
0,1,start,start,2022-01-01 00:01:00,1,False
1,1,catalog,raw,2022-01-01 00:01:00,1,False
2,1,product1,raw,2022-01-01 00:02:00,1,False
3,1,product2,raw,2022-01-01 00:03:00,1,False
4,1,cart,raw,2022-01-01 00:07:00,1,False
5,1,payment_done,raw,2022-01-01 00:08:00,1,True
6,2,start,start,2022-02-01 00:01:00,1,False
7,2,product1,raw,2022-02-01 00:01:00,1,False
8,2,product2,raw,2022-02-01 00:02:00,1,False
9,2,cart,raw,2022-02-01 00:07:00,1,False


In [27]:
df_cutted['payment_done'] = df_cutted['event'] == 'payment_done'
df_cutted

Unnamed: 0,user_id,event,event_type,timestamp,group,payment_done
0,1,start,start,2022-01-01 00:01:00,1,False
1,1,catalog,raw,2022-01-01 00:01:00,1,False
2,1,product1,raw,2022-01-01 00:02:00,1,False
3,1,product2,raw,2022-01-01 00:03:00,1,False
4,1,cart,raw,2022-01-01 00:07:00,1,False
5,1,payment_done,raw,2022-01-01 00:08:00,1,True
6,2,start,start,2022-02-01 00:01:00,1,False
7,2,product1,raw,2022-02-01 00:01:00,1,False
8,2,product2,raw,2022-02-01 00:02:00,1,False
9,2,cart,raw,2022-02-01 00:07:00,1,False


In [28]:
contigency= pd.crosstab(df_cutted['group'], df_cutted['payment_done'])
contigency

payment_done,False,True
group,Unnamed: 1_level_1,Unnamed: 2_level_1
1,24,4
2,29,3


In [29]:
chi2_contingency(contigency)[1]

0.8508071203050924

In [30]:
st = source_stream_cut.stattests(groups=([1, 2, 3, 4], [5, 6, 7, 8]),
    objective = lambda x: 'payment_done' in x['event'].values,
    group_names=("group_1", "group_2"),
    test = 'chi2_contingency')
st.fit()
result = st.values()
result

{'group_one_name': 'group_1',
 'group_one_size': 4,
 'group_one_mean': 1.0,
 'group_one_SD': 0.0,
 'group_two_name': 'group_2',
 'group_two_size': 4,
 'group_two_mean': 0.75,
 'group_two_SD': 0.4330127018922193,
 'greatest_group_name': 'group_1',
 'is_group_one_greatest': True,
 'p_val': 1.0,
 'power_estimated': 0.0}

In [31]:
oddsr, p = stats.fisher_exact(contigency, alternative='greater')
p

0.8394989918875613

In [33]:
#fisher_exact
test_results = source_stream_cut.stattests(groups=([1, 2, 3, 4], [5, 6, 7, 8]),
                 objective=lambda x: 'payment_done' in x['event'].values,
                 group_names=("group_1", "group_2"),
                 test="fisher_exact")
result = test_results.values()
result

{'group_one_name': 'group_1',
 'group_one_size': 4,
 'group_one_mean': 1.0,
 'group_one_SD': 0.0,
 'group_two_name': 'group_2',
 'group_two_size': 4,
 'group_two_mean': 0.75,
 'group_two_SD': 0.4330127018922193,
 'greatest_group_name': 'group_1',
 'is_group_one_greatest': True,
 'p_val': 0.7857142857142857,
 'power_estimated': 0.0}

In [66]:
df

Unnamed: 0,user_id,event,event_type,timestamp,group,payment_done
0,1,start,start,2022-01-01 00:01:00,1,False
1,1,catalog,raw,2022-01-01 00:01:00,1,False
2,1,product1,raw,2022-01-01 00:02:00,1,False
3,1,product2,raw,2022-01-01 00:03:00,1,False
4,1,cart,raw,2022-01-01 00:07:00,1,False
...,...,...,...,...,...,...
56,8,catalog,raw,2022-01-01 00:04:00,2,False
57,8,product1,raw,2022-01-01 00:06:00,2,False
58,8,product18,raw,2022-01-01 00:06:30,2,False
59,8,payment_done,raw,2022-01-01 00:07:00,2,True


In [47]:
contigency_table1 = np.array([[0, 4], [1, 3]])
contigency_table2 = np.array([[1, 3], [0, 4]])

In [49]:
_, p1 = fisher_exact(contigency_table1, alternative='greater')
_, p2 = fisher_exact(contigency_table2, alternative='greater')

In [50]:
p1, p2

(1.0, 0.5)