In [1]:
import sys
# Put the parent directory first on the module search path so the
# `src.*` imports in the following cells can be resolved from this notebook.
sys.path.insert(0, '..')

# Инициализируем граф

In [2]:
import pandas as pd

from src.eventstream.eventstream import Eventstream
from src.eventstream.schema import RawDataSchema, EventstreamSchema
from src.graph.p_graph import PGraph, EventsNode
from src.data_processors_lib.rete import CollapseLoops, CollapseLoopsParams

# Read the raw event log from the CSV file.
raw_data = pd.read_csv('simple-onlineshop.csv')

# Tell the eventstream which raw columns hold the event name,
# the event timestamp and the user identifier.
raw_data_schema = RawDataSchema(
    event_name="event",
    event_timestamp="timestamp",
    user_id="user_id",
)

# Build the source eventstream from the raw data using a default EventstreamSchema.
source = Eventstream(
    raw_data=raw_data, raw_data_schema=raw_data_schema, schema=EventstreamSchema()
)

# Processing graph whose source stream is the eventstream built above.
graph = PGraph(source_stream=source)

In [3]:
# Convert the source eventstream to a DataFrame; it serves as the
# "before" reference for the collapsed results shown later.
source_df = source.to_dataframe()
source_df.head(n=5)

Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,fdb4f7eb-7d31-46c0-82f9-ab91e7c23b20,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
1,e7475129-8d4e-4460-9671-a6e1e8f93002,raw,1,product1,2019-11-01 17:59:28.459271,219483890
2,61f973a2-282e-4a14-a390-66f880dcba36,raw,2,cart,2019-11-01 17:59:29.502214,219483890
3,82618fe2-be72-4e6f-b2b6-155c8e460b66,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
4,ca3ee2de-4d37-48b1-a4fc-7d5cfea6fdf5,raw,4,catalog,2019-11-01 21:38:19.283663,964964743


# collapse_loops

## Описание функции

Функция позиционной группировки

В траектории каждого пользователя находит последовательности одинаковых событий, идущих подряд (2 и больше), и группирует каждую такую последовательность в одно событие с пометкой “_loop”

**Параметры**

`full_collapse: bool = True`

Если `True`, то к event_name raw события добавится “_loop”

Если `False`, то к event_name raw события добавится “_loop_num”, где num - длина последовательности одинаковых событий

`agg: str = 'max'`

Если `'max'`, то для нового события проставится timestamp последнего события в последовательности

Если `'min'` - то timestamp первого события

Если `'mean'` - проставится среднее значение timestamp событий последовательности

##  'full_collapse': False, 'agg': 'min'

In [21]:
%%time
from src.data_processors_lib.rete import CollapseLoops, CollapseLoopsParams

params = {
    'full_collapse': False,
    'agg': 'min'
}

collapsed = EventsNode(
    CollapseLoops(params=CollapseLoopsParams(**params)))

graph.add_node(
    node=collapsed,
    parents=[graph.root]
)

result = graph.combine(
    node=collapsed
)


CPU times: user 604 ms, sys: 87.1 ms, total: 692 ms
Wall time: 745 ms


In [22]:
# Convert the result of the ('full_collapse': False, 'agg': 'min') run to a
# DataFrame so it can be compared with source_df.
df_false_min = result.to_dataframe()

In [23]:
# Row counts after vs. before collapsing: (collapsed, source).
tuple(len(frame) for frame in (df_false_min, source_df))

(29481, 35381)

In [24]:
# Look at one user's trajectory in the collapsed stream.
check_user_id = 219483890
df_false_min.loc[df_false_min['user_id'].eq(check_user_id)]

Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,fdb4f7eb-7d31-46c0-82f9-ab91e7c23b20,raw,0,catalog,2019-11-01 17:59:13.273932,219483890.0
1,e7475129-8d4e-4460-9671-a6e1e8f93002,raw,1,product1,2019-11-01 17:59:28.459271,219483890.0
2,61f973a2-282e-4a14-a390-66f880dcba36,raw,2,cart,2019-11-01 17:59:29.502214,219483890.0
3,82618fe2-be72-4e6f-b2b6-155c8e460b66,raw,3,catalog,2019-11-01 17:59:32.557029,219483890.0
3220,76549528-cca8-42dd-b9d8-7916386ef49a,raw,3220,main,2019-12-06 16:22:57.484842,219483890.0
3221,8a6ac897-c397-492f-9470-d17ba599c348,group_alias,3221,catalog_loop_2,2019-12-06 16:23:01.331109,219483890.0
6934,c263564e-8a4c-45ca-8d84-d328bdcc12fb,raw,6934,main,2020-01-06 22:10:13.635011,219483890.0
6935,2014e572-a73e-448b-b36a-7c7ce7155616,raw,6935,catalog,2020-01-06 22:10:15.228575,219483890.0
6936,bb01810b-6a19-482a-a797-ecce8f398a9b,raw,6936,cart,2020-01-06 22:10:42.309028,219483890.0
6937,14609f7c-cb9f-4c1b-a7e9-0d96aeb79c91,raw,6937,catalog,2020-01-06 22:10:52.255859,219483890.0


In [25]:
# Same user's trajectory in the original (uncollapsed) stream, for comparison.
source_df.loc[source_df['user_id'].eq(check_user_id)]

Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,fdb4f7eb-7d31-46c0-82f9-ab91e7c23b20,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
1,e7475129-8d4e-4460-9671-a6e1e8f93002,raw,1,product1,2019-11-01 17:59:28.459271,219483890
2,61f973a2-282e-4a14-a390-66f880dcba36,raw,2,cart,2019-11-01 17:59:29.502214,219483890
3,82618fe2-be72-4e6f-b2b6-155c8e460b66,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
2245,76549528-cca8-42dd-b9d8-7916386ef49a,raw,2245,main,2019-12-06 16:22:57.484842,219483890
2246,de17f095-ad35-4741-988f-3faec3525cef,raw,2246,catalog,2019-12-06 16:23:01.331109,219483890
2247,26fbf74c-0741-4bcc-9ed0-1cc5efbd0a5e,raw,2247,catalog,2019-12-06 16:23:48.116617,219483890
4836,c263564e-8a4c-45ca-8d84-d328bdcc12fb,raw,4836,main,2020-01-06 22:10:13.635011,219483890
4837,2014e572-a73e-448b-b36a-7c7ce7155616,raw,4837,catalog,2020-01-06 22:10:15.228575,219483890
4838,bb01810b-6a19-482a-a797-ecce8f398a9b,raw,4838,cart,2020-01-06 22:10:42.309028,219483890


##  'full_collapse': False, 'agg': 'max'

In [8]:
%%time
from src.data_processors_lib.rete import CollapseLoops, CollapseLoopsParams

params = {
    'full_collapse': False,
    'agg': 'max'
}

collapsed = EventsNode(
    CollapseLoops(params=CollapseLoopsParams(**params)))

graph.add_node(
    node=collapsed,
    parents=[graph.root]
)

result = graph.combine(
    node=collapsed
)


CPU times: user 598 ms, sys: 36.1 ms, total: 634 ms
Wall time: 705 ms


In [10]:
# Convert the result of the ('full_collapse': False, 'agg': 'max') run to a
# DataFrame so it can be compared with source_df.
df_false_max = result.to_dataframe()

In [15]:
# Look at one user's trajectory in the collapsed stream.
check_user_id = 219483890
df_false_max.loc[df_false_max['user_id'].eq(check_user_id)]

Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,fdb4f7eb-7d31-46c0-82f9-ab91e7c23b20,raw,0,catalog,2019-11-01 17:59:13.273932,219483890.0
1,e7475129-8d4e-4460-9671-a6e1e8f93002,raw,1,product1,2019-11-01 17:59:28.459271,219483890.0
2,61f973a2-282e-4a14-a390-66f880dcba36,raw,2,cart,2019-11-01 17:59:29.502214,219483890.0
3,82618fe2-be72-4e6f-b2b6-155c8e460b66,raw,3,catalog,2019-11-01 17:59:32.557029,219483890.0
3182,76549528-cca8-42dd-b9d8-7916386ef49a,raw,3182,main,2019-12-06 16:22:57.484842,219483890.0
3185,aa4624a3-f158-4c5a-b57e-f72babd8480b,group_alias,3185,catalog_loop_2,2019-12-06 16:23:48.116617,219483890.0
6866,c263564e-8a4c-45ca-8d84-d328bdcc12fb,raw,6866,main,2020-01-06 22:10:13.635011,219483890.0
6867,2014e572-a73e-448b-b36a-7c7ce7155616,raw,6867,catalog,2020-01-06 22:10:15.228575,219483890.0
6868,bb01810b-6a19-482a-a797-ecce8f398a9b,raw,6868,cart,2020-01-06 22:10:42.309028,219483890.0
6869,14609f7c-cb9f-4c1b-a7e9-0d96aeb79c91,raw,6869,catalog,2020-01-06 22:10:52.255859,219483890.0


In [17]:
# Same user's trajectory in the original (uncollapsed) stream, for comparison.
source_df.loc[source_df['user_id'].eq(check_user_id)]

Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,fdb4f7eb-7d31-46c0-82f9-ab91e7c23b20,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
1,e7475129-8d4e-4460-9671-a6e1e8f93002,raw,1,product1,2019-11-01 17:59:28.459271,219483890
2,61f973a2-282e-4a14-a390-66f880dcba36,raw,2,cart,2019-11-01 17:59:29.502214,219483890
3,82618fe2-be72-4e6f-b2b6-155c8e460b66,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
2245,76549528-cca8-42dd-b9d8-7916386ef49a,raw,2245,main,2019-12-06 16:22:57.484842,219483890
2246,de17f095-ad35-4741-988f-3faec3525cef,raw,2246,catalog,2019-12-06 16:23:01.331109,219483890
2247,26fbf74c-0741-4bcc-9ed0-1cc5efbd0a5e,raw,2247,catalog,2019-12-06 16:23:48.116617,219483890
4836,c263564e-8a4c-45ca-8d84-d328bdcc12fb,raw,4836,main,2020-01-06 22:10:13.635011,219483890
4837,2014e572-a73e-448b-b36a-7c7ce7155616,raw,4837,catalog,2020-01-06 22:10:15.228575,219483890
4838,bb01810b-6a19-482a-a797-ecce8f398a9b,raw,4838,cart,2020-01-06 22:10:42.309028,219483890


##  'full_collapse': True, 'agg': 'mean'

In [28]:
%%time
from src.data_processors_lib.rete import CollapseLoops, CollapseLoopsParams

params = {
    'full_collapse': True,
    'agg': 'mean'
}

collapsed = EventsNode(
    CollapseLoops(params=CollapseLoopsParams(**params)))

graph.add_node(
    node=collapsed,
    parents=[graph.root]
)

result = graph.combine(
    node=collapsed
)


CPU times: user 604 ms, sys: 65.1 ms, total: 669 ms
Wall time: 755 ms


In [29]:
# Convert the result of the ('full_collapse': True, 'agg': 'mean') run to a
# DataFrame so it can be compared with source_df.
df_true_mean = result.to_dataframe()

In [31]:
# Look at one user's trajectory in the collapsed stream.
check_user_id = 219483890
df_true_mean.loc[df_true_mean['user_id'].eq(check_user_id)]

Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,fdb4f7eb-7d31-46c0-82f9-ab91e7c23b20,raw,0,catalog,2019-11-01 17:59:13.273932000,219483890.0
1,e7475129-8d4e-4460-9671-a6e1e8f93002,raw,1,product1,2019-11-01 17:59:28.459271000,219483890.0
2,61f973a2-282e-4a14-a390-66f880dcba36,raw,2,cart,2019-11-01 17:59:29.502214000,219483890.0
3,82618fe2-be72-4e6f-b2b6-155c8e460b66,raw,3,catalog,2019-11-01 17:59:32.557029000,219483890.0
3188,76549528-cca8-42dd-b9d8-7916386ef49a,raw,3188,main,2019-12-06 16:22:57.484842000,219483890.0
3191,549fd6a6-123f-4fc3-889c-fc85ec6e8674,group_alias,3191,catalog_loop,2019-12-06 16:23:24.723863040,219483890.0
6888,c263564e-8a4c-45ca-8d84-d328bdcc12fb,raw,6888,main,2020-01-06 22:10:13.635011000,219483890.0
6889,2014e572-a73e-448b-b36a-7c7ce7155616,raw,6889,catalog,2020-01-06 22:10:15.228575000,219483890.0
6890,bb01810b-6a19-482a-a797-ecce8f398a9b,raw,6890,cart,2020-01-06 22:10:42.309028000,219483890.0
6891,14609f7c-cb9f-4c1b-a7e9-0d96aeb79c91,raw,6891,catalog,2020-01-06 22:10:52.255859000,219483890.0


In [32]:
# Same user's trajectory in the original (uncollapsed) stream, for comparison.
source_df.loc[source_df['user_id'].eq(check_user_id)]

Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,fdb4f7eb-7d31-46c0-82f9-ab91e7c23b20,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
1,e7475129-8d4e-4460-9671-a6e1e8f93002,raw,1,product1,2019-11-01 17:59:28.459271,219483890
2,61f973a2-282e-4a14-a390-66f880dcba36,raw,2,cart,2019-11-01 17:59:29.502214,219483890
3,82618fe2-be72-4e6f-b2b6-155c8e460b66,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
2245,76549528-cca8-42dd-b9d8-7916386ef49a,raw,2245,main,2019-12-06 16:22:57.484842,219483890
2246,de17f095-ad35-4741-988f-3faec3525cef,raw,2246,catalog,2019-12-06 16:23:01.331109,219483890
2247,26fbf74c-0741-4bcc-9ed0-1cc5efbd0a5e,raw,2247,catalog,2019-12-06 16:23:48.116617,219483890
4836,c263564e-8a4c-45ca-8d84-d328bdcc12fb,raw,4836,main,2020-01-06 22:10:13.635011,219483890
4837,2014e572-a73e-448b-b36a-7c7ce7155616,raw,4837,catalog,2020-01-06 22:10:15.228575,219483890
4838,bb01810b-6a19-482a-a797-ecce8f398a9b,raw,4838,cart,2020-01-06 22:10:42.309028,219483890
