# Data processors user guide

The full text of the [Data processors](https://doc.retentioneering.com/stable/doc/user_guides/dataprocessors.html) user guide is available on the Retentioneering website.

## Prerequisites

Run this cell to prepare the environment. This step is obligatory.

In [1]:
!pip install retentioneering

## Creating an eventstream

In [2]:
import pandas as pd
from retentioneering import datasets
from retentioneering.eventstream import Eventstream

# Load the simple_shop dataset from retentioneering.datasets; the cells below use it as `stream`.
stream = datasets.load_simple_shop()

## What is a data processor?


## Helpers and chaining usage

In [3]:
# Chain the split_sessions helper with to_dataframe, then preview
# the first 15 rows that belong to user 219483890.
res = (
    stream
    .split_sessions(timeout=(10, 'm'))
    .to_dataframe()
)
res[res['user_id'] == 219483890].head(15)

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id,session_id
0,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890,219483890_1
1,16b77e72-7633-4e4b-8fcd-a066dded8983,session_start,0,session_start,2019-11-01 17:59:13.273932,219483890,219483890_1
2,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,raw,0,catalog,2019-11-01 17:59:13.273932,219483890,219483890_1
3,1a1748eb-0063-4d9d-aaea-ba564ad70b1d,raw,1,product1,2019-11-01 17:59:28.459271,219483890,219483890_1
4,dd691ad2-5ab3-4450-83d8-21485a565abb,raw,2,cart,2019-11-01 17:59:29.502214,219483890,219483890_1
5,13f2b0a9-b2f4-4ffa-b0d1-446f12f6c570,raw,3,catalog,2019-11-01 17:59:32.557029,219483890,219483890_1
6,cd4dbcda-a43d-44e2-900b-6e6db9b2faf0,session_end,3,session_end,2019-11-01 17:59:32.557029,219483890,219483890_1
3392,57f88a71-eb11-486d-9a0f-e23a7a5233ca,session_start,2096,session_start,2019-12-06 16:22:57.484842,219483890,219483890_2
3393,bd14ea53-9688-466d-87b2-0494ace4e011,raw,2096,main,2019-12-06 16:22:57.484842,219483890,219483890_2
3394,7ef1bf49-c88b-4b40-b424-c7ac53a267a6,raw,2097,catalog,2019-12-06 16:23:01.331109,219483890,219483890_2


## Data processors library

### Adding processors

#### AddStartEndEvents

In [4]:
# Call the add_start_end_events helper on the stream and convert the result to a DataFrame.
res = stream.add_start_end_events().to_dataframe()
# Display all rows of user 219483890.
res[res['user_id'] == 219483890]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890
1,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
2,1a1748eb-0063-4d9d-aaea-ba564ad70b1d,raw,1,product1,2019-11-01 17:59:28.459271,219483890
3,dd691ad2-5ab3-4450-83d8-21485a565abb,raw,2,cart,2019-11-01 17:59:29.502214,219483890
4,13f2b0a9-b2f4-4ffa-b0d1-446f12f6c570,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
2566,bd14ea53-9688-466d-87b2-0494ace4e011,raw,2096,main,2019-12-06 16:22:57.484842,219483890
2567,7ef1bf49-c88b-4b40-b424-c7ac53a267a6,raw,2097,catalog,2019-12-06 16:23:01.331109,219483890
2568,e1800047-f1c9-48c7-a49a-80c3a87853e7,raw,2098,catalog,2019-12-06 16:23:48.116617,219483890
5427,d329f192-401c-44c3-8a1e-b0d4da680b6c,raw,4542,main,2020-01-06 22:10:13.635011,219483890
5428,59e82765-c236-4f5f-b05c-73ce6efdc84b,raw,4543,catalog,2020-01-06 22:10:15.228575,219483890


#### SplitSessions

##### timeout delimiter

In [5]:
# Split the stream into sessions with timeout=(10, 'm') and convert the result to a DataFrame.
res = stream.split_sessions(timeout=(10, 'm')).to_dataframe()
# Display all rows of user 219483890.
res[res['user_id'] == 219483890]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id,session_id
0,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890,219483890_1
1,0f3fb872-72b6-4aba-8e1d-80f0d99cbcf5,session_start,0,session_start,2019-11-01 17:59:13.273932,219483890,219483890_1
2,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,raw,0,catalog,2019-11-01 17:59:13.273932,219483890,219483890_1
3,1a1748eb-0063-4d9d-aaea-ba564ad70b1d,raw,1,product1,2019-11-01 17:59:28.459271,219483890,219483890_1
4,dd691ad2-5ab3-4450-83d8-21485a565abb,raw,2,cart,2019-11-01 17:59:29.502214,219483890,219483890_1
5,13f2b0a9-b2f4-4ffa-b0d1-446f12f6c570,raw,3,catalog,2019-11-01 17:59:32.557029,219483890,219483890_1
6,56228b93-4648-4276-931b-0644829cd8f5,session_end,3,session_end,2019-11-01 17:59:32.557029,219483890,219483890_1
3392,685656a2-5c2b-4879-a3ae-cf948aea8271,session_start,2096,session_start,2019-12-06 16:22:57.484842,219483890,219483890_2
3393,bd14ea53-9688-466d-87b2-0494ace4e011,raw,2096,main,2019-12-06 16:22:57.484842,219483890,219483890_2
3394,7ef1bf49-c88b-4b40-b424-c7ac53a267a6,raw,2097,catalog,2019-12-06 16:23:01.331109,219483890,219483890_2


##### single delimiting event

In [6]:
# Toy data for a single user in which the 'session_delimiter' event
# is passed to split_sessions as the only delimiter event.
df = pd.DataFrame(
    data=[
        [111, "session_delimiter", "2023-01-01 00:00:00"],
        [111, "A", "2023-01-01 00:00:01"],
        [111, "B", "2023-01-01 00:00:02"],
        [111, "session_delimiter", "2023-01-01 00:00:04"],
        [111, "C", "2023-01-01 00:00:04"],
    ],
    columns=["user_id", "event", "timestamp"],
)
(
    Eventstream(df)
    .split_sessions(delimiter_events=['session_delimiter'])
    .to_dataframe()
    .sort_values(['user_id', 'event_index'])
    [['user_id', 'event', 'timestamp', 'session_id']]
)

Unnamed: 0,user_id,event,timestamp,session_id
0,111,path_start,2023-01-01 00:00:00,111_0
1,111,session_start,2023-01-01 00:00:00,111_1
2,111,session_end,2023-01-01 00:00:00,111_0
3,111,A,2023-01-01 00:00:01,111_1
4,111,B,2023-01-01 00:00:02,111_1
5,111,session_end,2023-01-01 00:00:02,111_1
6,111,session_start,2023-01-01 00:00:04,111_2
7,111,C,2023-01-01 00:00:04,111_2
8,111,session_end,2023-01-01 00:00:04,111_2
9,111,path_end,2023-01-01 00:00:04,111_2


##### paired delimiting event

In [7]:
# Toy data for a single user in which 'custom_start' / 'custom_end'
# are passed to split_sessions as a pair of delimiter events.
df = pd.DataFrame(
    data=[
        [111, "custom_start", "2023-01-01 00:00:00"],
        [111, "A", "2023-01-01 00:00:01"],
        [111, "B", "2023-01-01 00:00:02"],
        [111, "custom_end", "2023-01-01 00:00:02"],
        [111, "custom_start", "2023-01-01 00:00:04"],
        [111, "C", "2023-01-01 00:00:04"],
        [111, "custom_end", "2023-01-01 00:00:04"],
    ],
    columns=["user_id", "event", "timestamp"],
)
dummy_stream = Eventstream(df)
(
    dummy_stream
    .split_sessions(delimiter_events=['custom_start', 'custom_end'])
    .to_dataframe()
    .sort_values(['user_id', 'event_index'])
    [['user_id', 'event', 'timestamp', 'session_id']]
)

Unnamed: 0,user_id,event,timestamp,session_id
0,111,path_start,2023-01-01 00:00:00,111_0
1,111,session_start,2023-01-01 00:00:00,111_1
2,111,A,2023-01-01 00:00:01,111_1
3,111,B,2023-01-01 00:00:02,111_1
4,111,session_end,2023-01-01 00:00:02,111_1
5,111,session_start,2023-01-01 00:00:04,111_2
6,111,C,2023-01-01 00:00:04,111_2
7,111,session_end,2023-01-01 00:00:04,111_2
8,111,path_end,2023-01-01 00:00:04,111_2


##### custom session column

In [8]:
# Toy data that already carries a session identifier in the
# 'custom_ses_id' column; that column is passed as delimiter_col.
df = pd.DataFrame(
    data=[
        [111, "A", "2023-01-01 00:00:01", "session_1"],
        [111, "B", "2023-01-01 00:00:02", "session_1"],
        [111, "C", "2023-01-01 00:00:03", "session_2"],
        [111, "D", "2023-01-01 00:00:04", "session_2"],
    ],
    columns=["user_id", "event", "timestamp", "custom_ses_id"],
)
# Register 'custom_ses_id' as a custom column of the eventstream.
raw_data_schema = {
    "custom_cols": [
        {"raw_data_col": "custom_ses_id", "custom_col": "custom_ses_id"},
    ],
}
dummy_stream = Eventstream(df, raw_data_schema=raw_data_schema)
(
    dummy_stream
    .split_sessions(delimiter_col="custom_ses_id")
    .to_dataframe()
    .sort_values(["user_id", "event_index"])
    [["user_id", "event", "timestamp", "session_id", "custom_ses_id"]]
)


Unnamed: 0,user_id,event,timestamp,session_id,custom_ses_id
0,111,path_start,2023-01-01 00:00:01,111_1,session_1
1,111,session_start,2023-01-01 00:00:01,111_1,session_1
2,111,A,2023-01-01 00:00:01,111_1,session_1
3,111,B,2023-01-01 00:00:02,111_1,session_1
4,111,session_end,2023-01-01 00:00:02,111_1,session_1
5,111,session_start,2023-01-01 00:00:03,111_2,session_2
6,111,C,2023-01-01 00:00:03,111_2,session_2
7,111,D,2023-01-01 00:00:04,111_2,session_2
8,111,session_end,2023-01-01 00:00:04,111_2,session_2
9,111,path_end,2023-01-01 00:00:04,111_2,session_2


#### LabelNewUsers

In [9]:
# User ids passed to label_new_users as new_users_list.
new_users = [219483890, 964964743, 965024600]
res = stream.label_new_users(new_users_list=new_users).to_dataframe()
# First rows of a user that is listed in new_users.
res[res['user_id'] == 219483890].head()

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890
1,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,new_user,0,new_user,2019-11-01 17:59:13.273932,219483890
2,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
3,1a1748eb-0063-4d9d-aaea-ba564ad70b1d,raw,1,product1,2019-11-01 17:59:28.459271,219483890
4,dd691ad2-5ab3-4450-83d8-21485a565abb,raw,2,cart,2019-11-01 17:59:29.502214,219483890


In [10]:
# First rows of user 501098384, which is not listed in new_users.
res[res['user_id'] == 501098384].head()

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
19089,1a9aaa3d-b99c-4535-9b9e-31dc45c31288,path_start,14768,path_start,2020-04-02 05:36:04.896839,501098384
19090,1a9aaa3d-b99c-4535-9b9e-31dc45c31288,existing_user,14768,existing_user,2020-04-02 05:36:04.896839,501098384
19091,1a9aaa3d-b99c-4535-9b9e-31dc45c31288,raw,14768,main,2020-04-02 05:36:04.896839,501098384
19092,fdf942d2-6963-47a9-a676-33fce9ae9598,raw,14769,catalog,2020-04-02 05:36:05.371141,501098384
19093,dc9b2f40-5473-42d4-990b-fb72f71c6022,raw,14770,main,2020-04-02 05:36:40.814504,501098384


#### LabelLostUsers

In [11]:
# User ids passed to label_lost_users as lost_users_list.
lost_users_list = [219483890, 964964743, 965024600]
res = stream.label_lost_users(lost_users_list=lost_users_list).to_dataframe()
# Last rows of a user that is listed in lost_users_list.
res[res['user_id'] == 219483890].tail()

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
5766,5d7091d0-e8ec-4974-9dcf-f8336bb2b92d,raw,4548,catalog,2020-01-06 22:11:28.271366,219483890
10282,098dbd63-8c5b-4842-bb44-bad07923f13d,raw,8215,main,2020-02-14 21:04:49.450696,219483890
10283,83896884-2fd2-4567-9d29-791aa1d45795,raw,8216,catalog,2020-02-14 21:04:51.717127,219483890
10284,83896884-2fd2-4567-9d29-791aa1d45795,lost_user,8216,lost_user,2020-02-14 21:04:51.717127,219483890
10285,83896884-2fd2-4567-9d29-791aa1d45795,path_end,8216,path_end,2020-02-14 21:04:51.717127,219483890


In [12]:
# Last rows of user 501098384, which is not listed in lost_users_list.
res[res['user_id'] == 501098384].tail()

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
43531,42ff0327-d17e-42c7-b116-66c447f46848,raw,32280,catalog,2020-04-29 12:48:01.809577,501098384
43532,d79171c2-8164-4e74-b306-0c531583716e,raw,32281,main,2020-04-29 12:48:01.938488,501098384
43533,91d3aca3-8031-4782-80ec-0f0abd1d68a1,raw,32282,catalog,2020-04-29 12:48:06.595390,501098384
43534,91d3aca3-8031-4782-80ec-0f0abd1d68a1,absent_user,32282,absent_user,2020-04-29 12:48:06.595390,501098384
43535,91d3aca3-8031-4782-80ec-0f0abd1d68a1,path_end,32282,path_end,2020-04-29 12:48:06.595390,501098384


In [13]:
# Call label_lost_users with timeout=(30, 'D') instead of an explicit list of users.
res = stream.label_lost_users(timeout=(30, 'D')).to_dataframe()

In [14]:
# Latest timestamp present in res.
res['timestamp'].max()

Timestamp('2020-04-29 12:48:06.595390')

In [15]:
# Display all rows of user 495985018.
res[res['user_id'] == 495985018]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
51,999137f1-9316-4d9f-969e-e3781fdedca4,path_start,47,path_start,2019-11-02 01:14:08.664850,495985018
52,999137f1-9316-4d9f-969e-e3781fdedca4,raw,47,catalog,2019-11-02 01:14:08.664850,495985018
53,9e37b313-2536-4a52-8b28-a67da83db18a,raw,48,cart,2019-11-02 01:14:37.435643,495985018
54,9e37b313-2536-4a52-8b28-a67da83db18a,lost_user,48,lost_user,2019-11-02 01:14:37.435643,495985018
55,9e37b313-2536-4a52-8b28-a67da83db18a,path_end,48,path_end,2019-11-02 01:14:37.435643,495985018


In [16]:
# Display all rows of user 819489198.
res[res['user_id'] == 819489198]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
29525,095d2d6e-969d-49c2-b161-05580f7a297a,path_start,22394,path_start,2020-04-15 21:02:36.903678,819489198
29526,095d2d6e-969d-49c2-b161-05580f7a297a,raw,22394,main,2020-04-15 21:02:36.903678,819489198
29527,3ef986a9-1032-4028-a43e-2ef003f744c6,raw,22395,catalog,2020-04-15 21:02:37.658557,819489198
29528,df5c68ca-56af-49b1-bdfb-9c20530da455,raw,22396,catalog,2020-04-15 21:02:48.699804,819489198
29529,512434e5-eb39-4631-8a1f-fffcac3f431e,raw,22397,product2,2020-04-15 21:02:51.173118,819489198
29532,176711f2-f2ce-4c24-91c0-1499418344b0,raw,22399,catalog,2020-04-15 21:03:05.813046,819489198
29534,de5eba5c-bef1-406b-9699-15c28c1bd5b5,raw,22401,cart,2020-04-15 21:03:35.216033,819489198
29540,d60cefd1-df17-4ebb-8344-e70528b50f27,raw,22404,delivery_choice,2020-04-15 21:03:40.745520,819489198
29541,fa3025a2-6148-4d45-a92b-02984b44b85c,raw,22405,delivery_pickup,2020-04-15 21:03:46.448349,819489198
29542,fd4a6b2b-5883-45d6-b891-f800f9de8b6c,raw,22406,payment_choice,2020-04-15 21:03:46.575300,819489198


#### AddPositiveEvents

In [17]:
# Event names passed to add_positive_events as targets.
positive_events = ['cart', 'payment_done']
res = stream.add_positive_events(targets=positive_events).to_dataframe()

In [18]:
# Display all rows of user 219483890.
res[res['user_id'] == 219483890]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,b3cfcc8e-45ea-43da-86b5-f629f066edcd,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890
1,ff9aef51-ac0a-4f28-b180-5bf2594cb316,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
2,11d21f53-5058-4141-8e27-cc7cd1198eb3,raw,1,product1,2019-11-01 17:59:28.459271,219483890
3,a1194898-0973-4190-9561-4dbc00ee8b48,raw,2,cart,2019-11-01 17:59:29.502214,219483890
4,a1194898-0973-4190-9561-4dbc00ee8b48,positive_target,2,positive_target_cart,2019-11-01 17:59:29.502214,219483890
5,63f87bb5-19ce-4482-a877-9210a26499c4,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
2714,36301165-07c7-44b8-8ca1-a393a223c630,raw,2096,main,2019-12-06 16:22:57.484842,219483890
2715,a96b35df-cb62-4172-88eb-c3c3f83d3f07,raw,2097,catalog,2019-12-06 16:23:01.331109,219483890
2716,628246e3-7192-4855-8dc1-649efe6636ac,raw,2098,catalog,2019-12-06 16:23:48.116617,219483890
5705,e98ac714-fbdf-4ea6-ba6f-f5e7708f7280,raw,4542,main,2020-01-06 22:10:13.635011,219483890


In [19]:
# Display all rows of user 24427596.
res[res['user_id'] == 24427596]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
75,4be8a46a-4463-4268-8978-739cb6dbcd09,path_start,60,path_start,2019-11-02 07:28:07.285541,24427596
76,2d23e50c-aea2-4bcc-a316-1f91439e907d,raw,60,main,2019-11-02 07:28:07.285541,24427596
77,dca1467a-511b-4747-8c0d-cff0d9a9b99a,raw,61,catalog,2019-11-02 07:28:14.319850,24427596
78,f7a4cd26-6364-4486-aaac-cf5e5adaedc7,raw,62,catalog,2019-11-02 07:29:08.301333,24427596
79,e1f8a956-f98b-4857-bb4c-0895dc41d828,raw,63,catalog,2019-11-02 07:29:41.848396,24427596
80,4633035e-853e-429c-8721-2b6be2a4fdb2,path_end,63,path_end,2019-11-02 07:29:41.848396,24427596


In [20]:
def custom_func(eventstream, targets) -> pd.DataFrame:
    """Return the eventstream rows whose event name is listed in ``targets``.

    Parameters
    ----------
    eventstream
        Object exposing ``schema.event_name`` (name of the event column)
        and ``to_dataframe()``.
    targets
        Collection of event names to keep.

    Returns
    -------
    pd.DataFrame
        The subset of ``eventstream.to_dataframe()`` matching ``targets``.
    """
    name_column = eventstream.schema.event_name
    events = eventstream.to_dataframe()
    is_target = events[name_column].isin(targets)
    return events[is_target]

# Same targets as before, but the rows are selected by custom_func.
res = stream.add_positive_events(targets=positive_events, func=custom_func).to_dataframe()

In [21]:
# Display all rows of user 219483890.
res[res['user_id'] == 219483890]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,e474f5c7-13ed-4adc-99e7-1ccc79cd8f08,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890
1,4583f92a-e2ce-4dae-8c36-1551713bc387,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
2,b383388a-fcaa-4fe4-b261-b19305be33ef,raw,1,product1,2019-11-01 17:59:28.459271,219483890
3,840eab3d-0ff3-420b-b40a-22ffae6e2288,raw,2,cart,2019-11-01 17:59:29.502214,219483890
4,840eab3d-0ff3-420b-b40a-22ffae6e2288,positive_target,2,positive_target_cart,2019-11-01 17:59:29.502214,219483890
5,ec59652a-45f1-4621-9f2a-0749e5385afe,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
2810,a2107536-9673-470f-ad9d-754990b8f30c,raw,2096,main,2019-12-06 16:22:57.484842,219483890
2811,19785679-dca7-4834-be1c-bb3b7a5a0698,raw,2097,catalog,2019-12-06 16:23:01.331109,219483890
2812,811193b0-05a4-43b7-af58-08e70061d02a,raw,2098,catalog,2019-12-06 16:23:48.116617,219483890
5926,bc01debb-96d9-40d7-b2bb-8c28fb2f6c02,raw,4542,main,2020-01-06 22:10:13.635011,219483890


#### AddNegativeEvents

In [22]:
# Event names passed to add_negative_events as targets.
negative_events = ['delivery_courier']

res = stream.add_negative_events(targets=negative_events).to_dataframe()

In [23]:
# Rows of user 629881394 whose index labels fall between 36 and 48 (.loc slices by label).
res[res['user_id'] == 629881394].loc[36:48]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
37,1ea61a7b-aaaf-4396-a1d6-de1832c34e48,raw,33,product2,2019-11-01 22:35:33.142711,629881394
40,7be57d4e-10cf-4843-b133-66b78dc92c18,raw,36,cart,2019-11-01 22:35:50.437706,629881394
42,e9e5ce4d-6e6e-4ab6-9ff0-e6df52aa7319,raw,38,delivery_choice,2019-11-01 22:35:57.649549,629881394
43,ca50768a-b4db-45dd-bd8a-01fe22a3bfeb,raw,39,delivery_courier,2019-11-01 22:36:02.009271,629881394
44,ca50768a-b4db-45dd-bd8a-01fe22a3bfeb,negative_target,39,negative_target_delivery_courier,2019-11-01 22:36:02.009271,629881394
48,a90e7550-4949-458d-99ec-d52240b2a3d1,raw,42,payment_choice,2019-11-01 22:36:02.243274,629881394


#### LabelCroppedPaths

In [24]:
# Keyword arguments forwarded to label_cropped_paths.
params = dict(
    left_cutoff=(4, 'D'),
    right_cutoff=(3, 'D'),
)

res = stream.label_cropped_paths(**params).to_dataframe()

In [25]:
# Print the earliest and the latest timestamp found in res.
print('Eventstream start: {}'.format(res.timestamp.min()))
print('Eventstream end: {}'.format(res.timestamp.max()))

Eventstream start: 2019-11-01 17:59:13.273932
Eventstream end: 2020-04-29 12:48:06.595390


In [26]:
# Display all rows of user 495985018.
res[res['user_id'] == 495985018]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
51,999137f1-9316-4d9f-969e-e3781fdedca4,path_start,47,path_start,2019-11-02 01:14:08.664850,495985018
52,999137f1-9316-4d9f-969e-e3781fdedca4,cropped_left,47,cropped_left,2019-11-02 01:14:08.664850,495985018
53,999137f1-9316-4d9f-969e-e3781fdedca4,raw,47,catalog,2019-11-02 01:14:08.664850,495985018
54,9e37b313-2536-4a52-8b28-a67da83db18a,raw,48,cart,2019-11-02 01:14:37.435643,495985018
55,9e37b313-2536-4a52-8b28-a67da83db18a,path_end,48,path_end,2019-11-02 01:14:37.435643,495985018


In [27]:
# Display all rows of user 831491833.
res[res['user_id'] == 831491833]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
40028,b23acb30-1775-47d4-a12e-fe9770bb8ce3,path_start,32258,path_start,2020-04-29 12:24:21.538805,831491833
40029,b23acb30-1775-47d4-a12e-fe9770bb8ce3,raw,32258,catalog,2020-04-29 12:24:21.538805,831491833
40030,977a2deb-b62f-4c5e-909d-5d1e1c392fc3,raw,32259,catalog,2020-04-29 12:24:33.841264,831491833
40031,a4b1d5fd-2494-4e0a-b20f-58bf3c6fb9b1,raw,32260,product2,2020-04-29 12:24:39.415424,831491833
40032,4ed1e625-2fff-4978-ab66-ccab5ed2fdfc,raw,32261,cart,2020-04-29 12:24:59.928499,831491833
40033,653ed2cc-e52e-469f-a8c0-57f069fea29f,raw,32262,catalog,2020-04-29 12:25:06.262205,831491833
40034,653ed2cc-e52e-469f-a8c0-57f069fea29f,cropped_right,32262,cropped_right,2020-04-29 12:25:06.262205,831491833
40035,653ed2cc-e52e-469f-a8c0-57f069fea29f,path_end,32262,path_end,2020-04-29 12:25:06.262205,831491833


### Removing processors

#### FilterEvents

In [28]:
def save_specific_users(df, schema):
    """Build a boolean mask that is True for rows of three hard-coded users.

    ``schema.user_id`` names the column of ``df`` that holds the user id.
    """
    kept_ids = [219483890, 964964743, 965024600]
    user_column = df[schema.user_id]
    return user_column.isin(kept_ids)

# Pass save_specific_users as the filtering function and convert the result to a DataFrame.
res = stream.filter_events(func=save_specific_users).to_dataframe()

In [29]:
# Distinct user_id values remaining in res, cast to int.
res['user_id'].unique().astype(int)

array([219483890, 964964743, 965024600])

In [30]:
# Count the 'catalog' and 'main' events in the stream.
(
    stream
    .to_dataframe()
    ['event']
    .value_counts()
    [lambda s: s.index.isin(['catalog', 'main'])]
)

catalog    14518
main        5635
Name: event, dtype: int64

In [31]:
def exclude_events(df, schema):
    """Build a boolean mask that is False for 'catalog' and 'main' events.

    ``schema.event_name`` names the column of ``df`` that holds the event name.
    """
    unwanted = ['catalog', 'main']
    is_unwanted = df[schema.event_name].isin(unwanted)
    return ~is_unwanted

# Pass exclude_events as the filtering function and convert the result to a DataFrame.
res = stream.filter_events(func=exclude_events).to_dataframe()

In [32]:
# Count the 'catalog' and 'main' events left in res.
(
    res['event']
    .value_counts()
    [lambda s: s.index.isin(['catalog', 'main'])]
)

Series([], Name: event, dtype: int64)

#### DropPaths

In [33]:
# Call drop_paths with min_steps=25 and convert the result to a DataFrame.
res = stream.drop_paths(min_steps=25).to_dataframe()

In [34]:
# Display all rows of user 629881394.
res[res['user_id'] == 629881394]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,faab069d-3d78-49b2-aeee-b8ee32646b34,path_start,7,path_start,2019-11-01 22:28:54.791683,629881394
1,faab069d-3d78-49b2-aeee-b8ee32646b34,raw,7,main,2019-11-01 22:28:54.791683,629881394
2,775dd323-bb8e-4d93-89c5-aa9f3879182a,raw,9,catalog,2019-11-01 22:29:01.049513,629881394
3,e38453d7-f7ff-4cb9-8b28-b71854aba7c4,raw,11,catalog,2019-11-01 22:29:32.322458,629881394
4,06a1521b-8449-48fa-aa46-e2c143c4eba9,raw,13,catalog,2019-11-01 22:30:09.450839,629881394
5,70ddfbab-ea29-42cd-bd37-3d6bcfd2b9d3,raw,14,catalog,2019-11-01 22:31:05.565762,629881394
6,d23f9b60-823f-4d5b-ac15-c598a01cf932,raw,15,main,2019-11-01 22:31:08.333560,629881394
7,01afe727-9206-45d7-a132-658278277912,raw,16,catalog,2019-11-01 22:31:09.010626,629881394
8,bfbb6659-eb96-49ec-b9a4-f558f4ad5953,raw,17,product1,2019-11-01 22:31:10.416231,629881394
9,25212bf7-ad65-4b5b-8f59-783d9822ea56,raw,18,catalog,2019-11-01 22:31:43.019527,629881394


In [35]:
# Call drop_paths with min_time=(1, 'M') and convert the result to a DataFrame.
# NOTE(review): 'M' presumably means months here (lowercase 'm' is used for the
# 10-minute session timeout earlier in this notebook) - confirm against the API docs.
res = stream.drop_paths(min_time=(1, 'M')).to_dataframe()

In [36]:
# Display all rows of user 964964743.
res[res['user_id'] == 964964743]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
5,ef3f8502-1bd3-4fb2-a285-a33fc80ad9ae,path_start,4,path_start,2019-11-01 21:38:19.283663,964964743
6,ef3f8502-1bd3-4fb2-a285-a33fc80ad9ae,raw,4,catalog,2019-11-01 21:38:19.283663,964964743
7,1f94de87-e9b6-472e-997c-1dc2a9fee367,raw,5,cart,2019-11-01 21:38:36.761221,964964743
8,9ef1ef56-658f-455e-bdfb-8e6315d151dd,raw,6,delivery_choice,2019-11-01 21:38:37.564693,964964743
1235,83b22098-bcc5-436e-915d-92794a935637,raw,2275,main,2019-12-09 01:42:22.801831,964964743
1236,41f40816-ba2e-4829-8a41-e53415897508,raw,2276,catalog,2019-12-09 01:42:23.617764,964964743
1237,cc22204f-7d6e-43ed-b0a8-4f204cec78c6,raw,2277,product2,2019-12-09 01:42:56.877340,964964743
1238,e0995a5a-549e-4536-bf50-7350a11fccaf,raw,2278,catalog,2019-12-09 01:43:05.436223,964964743
1239,0987e6eb-b194-407d-9058-1c275a30ce99,raw,2279,catalog,2019-12-09 01:43:36.923178,964964743
1240,59542694-71a2-4bba-be0f-f38ee5086b39,raw,2280,product2,2019-12-09 01:43:41.174195,964964743


#### TruncatePaths

In [37]:
# Truncate paths using drop_before='cart' together with shift_before=-2.
res = stream.truncate_paths(drop_before='cart', shift_before=-2).to_dataframe()

In [38]:
# Display all rows of user 219483890.
res[res['user_id'] == 219483890]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890
1,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
2,1a1748eb-0063-4d9d-aaea-ba564ad70b1d,raw,1,product1,2019-11-01 17:59:28.459271,219483890
3,dd691ad2-5ab3-4450-83d8-21485a565abb,raw,2,cart,2019-11-01 17:59:29.502214,219483890
4,13f2b0a9-b2f4-4ffa-b0d1-446f12f6c570,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
1980,bd14ea53-9688-466d-87b2-0494ace4e011,raw,2096,main,2019-12-06 16:22:57.484842,219483890
1981,7ef1bf49-c88b-4b40-b424-c7ac53a267a6,raw,2097,catalog,2019-12-06 16:23:01.331109,219483890
1982,e1800047-f1c9-48c7-a49a-80c3a87853e7,raw,2098,catalog,2019-12-06 16:23:48.116617,219483890
4223,d329f192-401c-44c3-8a1e-b0d4da680b6c,raw,4542,main,2020-01-06 22:10:13.635011,219483890
4224,59e82765-c236-4f5f-b05c-73ce6efdc84b,raw,4543,catalog,2020-01-06 22:10:15.228575,219483890


In [39]:
# Display all rows of user 24427596.
res[res['user_id'] == 24427596]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
37,1e79e27f-5b47-43d1-b542-462e6ec7953b,path_start,60,path_start,2019-11-02 07:28:07.285541,24427596
38,1e79e27f-5b47-43d1-b542-462e6ec7953b,raw,60,main,2019-11-02 07:28:07.285541,24427596
39,8eed4edd-291c-458a-8bc0-28710a7fca6f,raw,61,catalog,2019-11-02 07:28:14.319850,24427596
40,22e66711-1f43-4e34-818d-c814542d5c7a,raw,62,catalog,2019-11-02 07:29:08.301333,24427596
41,632f12a8-d984-47b4-8e3b-68764c694383,raw,63,catalog,2019-11-02 07:29:41.848396,24427596
42,632f12a8-d984-47b4-8e3b-68764c694383,path_end,63,path_end,2019-11-02 07:29:41.848396,24427596


In [40]:
# Truncate paths using drop_after='cart' together with occurrence_after="last".
res = stream.truncate_paths(drop_after='cart', occurrence_after="last").to_dataframe()

In [41]:
# Display all rows of user 219483890.
res[res['user_id'] == 219483890]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
0,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890
1,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,raw,0,catalog,2019-11-01 17:59:13.273932,219483890
2,1a1748eb-0063-4d9d-aaea-ba564ad70b1d,raw,1,product1,2019-11-01 17:59:28.459271,219483890
3,dd691ad2-5ab3-4450-83d8-21485a565abb,raw,2,cart,2019-11-01 17:59:29.502214,219483890
4,13f2b0a9-b2f4-4ffa-b0d1-446f12f6c570,raw,3,catalog,2019-11-01 17:59:32.557029,219483890
2183,bd14ea53-9688-466d-87b2-0494ace4e011,raw,2096,main,2019-12-06 16:22:57.484842,219483890
2184,7ef1bf49-c88b-4b40-b424-c7ac53a267a6,raw,2097,catalog,2019-12-06 16:23:01.331109,219483890
2185,e1800047-f1c9-48c7-a49a-80c3a87853e7,raw,2098,catalog,2019-12-06 16:23:48.116617,219483890
4623,d329f192-401c-44c3-8a1e-b0d4da680b6c,raw,4542,main,2020-01-06 22:10:13.635011,219483890
4624,59e82765-c236-4f5f-b05c-73ce6efdc84b,raw,4543,catalog,2020-01-06 22:10:15.228575,219483890


### Editing processors

#### GroupEvents

With ``GroupEvents``, we can group events based on the event name. Suppose
we need to assign a common name ``product`` to events ``product1`` and
``product2``:

In [42]:
def group_events(df, schema):
    """Build a boolean mask that is True for 'product1' and 'product2' events.

    ``schema.event_name`` names the column of ``df`` that holds the event name.
    """
    grouped_names = ['product1', 'product2']
    event_column = df[schema.event_name]
    return event_column.isin(grouped_names)

# Keyword arguments forwarded to the group_events helper of the stream:
# the new event name and the function selecting the rows to rename.
params = dict(event_name='product', func=group_events)

res = stream.group_events(**params).to_dataframe()

As we can see, user ``456870964`` now has two ``product`` events
(``event_index=132, 135``) with ``event_type='group_alias'``.

In [43]:
# Display all rows of user 456870964 in res.
res[res['user_id'] == 456870964]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
160,ea591a5a-61fd-4f7b-b120-638cfeace55f,path_start,129,path_start,2019-11-03 11:46:55.411714,456870964
161,ea591a5a-61fd-4f7b-b120-638cfeace55f,raw,129,catalog,2019-11-03 11:46:55.411714,456870964
162,c06e61ab-9b12-470d-a8bd-958321a89d23,raw,130,catalog,2019-11-03 11:47:46.131302,456870964
163,f14a959c-d810-4ba5-a8bf-35ef8d30b7a4,raw,131,catalog,2019-11-03 11:47:58.401143,456870964
164,e535652a-6036-4029-bbe8-829e1a6298b5,group_alias,132,product,2019-11-03 11:48:43.243587,456870964
165,4e175e40-dd60-4b03-84dd-21fd1c089c1e,raw,133,cart,2019-11-03 11:49:17.050519,456870964
166,8e7438a2-7ee7-403f-ba77-8c4a495d2787,raw,134,catalog,2019-11-03 11:49:17.516398,456870964
167,2ec87c38-2d08-4465-8a43-a1db673fba35,group_alias,135,product,2019-11-03 11:49:28.927721,456870964
168,64b3b4ed-4e7b-4605-bf18-1d27f38f0938,raw,136,catalog,2019-11-03 11:49:30.788195,456870964
169,64b3b4ed-4e7b-4605-bf18-1d27f38f0938,path_end,136,path_end,2019-11-03 11:49:30.788195,456870964


Previously, these two events were named ``product1`` and ``product2``
and had the ``raw`` event type:

In [44]:
# Rows of user 456870964 taken from `stream` itself rather than from res.
stream.to_dataframe().query('user_id == 456870964')

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
160,ea591a5a-61fd-4f7b-b120-638cfeace55f,path_start,129,path_start,2019-11-03 11:46:55.411714,456870964
161,ea591a5a-61fd-4f7b-b120-638cfeace55f,raw,129,catalog,2019-11-03 11:46:55.411714,456870964
162,c06e61ab-9b12-470d-a8bd-958321a89d23,raw,130,catalog,2019-11-03 11:47:46.131302,456870964
163,f14a959c-d810-4ba5-a8bf-35ef8d30b7a4,raw,131,catalog,2019-11-03 11:47:58.401143,456870964
164,e535652a-6036-4029-bbe8-829e1a6298b5,raw,132,product1,2019-11-03 11:48:43.243587,456870964
165,4e175e40-dd60-4b03-84dd-21fd1c089c1e,raw,133,cart,2019-11-03 11:49:17.050519,456870964
166,8e7438a2-7ee7-403f-ba77-8c4a495d2787,raw,134,catalog,2019-11-03 11:49:17.516398,456870964
167,2ec87c38-2d08-4465-8a43-a1db673fba35,raw,135,product2,2019-11-03 11:49:28.927721,456870964
168,64b3b4ed-4e7b-4605-bf18-1d27f38f0938,raw,136,catalog,2019-11-03 11:49:30.788195,456870964
169,64b3b4ed-4e7b-4605-bf18-1d27f38f0938,path_end,136,path_end,2019-11-03 11:49:30.788195,456870964


#### GroupEventsBulk

Similar to `GroupEvents`, but allows applying multiple grouping rules simultaneously.

In [45]:
# Grouping rules given as a list of dicts; only the first rule sets event_type.
res = stream.group_events_bulk(
    [
        dict(
            event_name='product',
            event_type='group_product',
            func=lambda _df: _df['event'].str.startswith('product'),
        ),
        dict(
            event_name='delivery',
            func=lambda _df: _df['event'].str.startswith('delivery'),
        ),
    ]
).to_dataframe()

In [46]:
# Display all rows of user 83322888.
res[res['user_id'] == 83322888]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
28828,33944c49-501e-4c8a-b653-7d7b487855a9,path_start,23620,path_start,2020-04-17 23:55:39.856717,83322888
28829,33944c49-501e-4c8a-b653-7d7b487855a9,raw,23620,catalog,2020-04-17 23:55:39.856717,83322888
28830,b4a67d8c-3e93-4ca3-afb8-a4753206c640,group_product,23621,product,2020-04-17 23:55:45.482196,83322888
28831,e56ba71e-dad3-4aaa-b7bc-124c8366309e,raw,23622,catalog,2020-04-17 23:55:45.522250,83322888
28832,a7f18a56-91af-4c07-9b87-f1d883721ba9,group_product,23623,product,2020-04-17 23:55:57.458809,83322888
28833,80b47b04-2f15-4ce5-b494-44c3137d4211,raw,23624,cart,2020-04-17 23:56:48.674344,83322888
28834,cf3358c7-d5c5-4106-84ee-1842703ece22,group_alias,23625,delivery,2020-04-17 23:56:49.380728,83322888
28835,24580b02-c897-4f25-b6d0-e3a2f441f437,group_alias,23626,delivery,2020-04-17 23:56:51.312632,83322888
28836,24580b02-c897-4f25-b6d0-e3a2f441f437,path_end,23626,path_end,2020-04-17 23:56:51.312632,83322888


In [47]:
# Grouping rules given as a mapping from the new event name to its selecting function.
res = stream.group_events_bulk(
    dict(
        product=lambda _df: _df['event'].str.startswith('product'),
        delivery=lambda _df: _df['event'].str.startswith('delivery'),
    )
).to_dataframe()

In [48]:
# Display all rows of user 83322888.
res[res['user_id'] == 83322888]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
28828,33944c49-501e-4c8a-b653-7d7b487855a9,path_start,23620,path_start,2020-04-17 23:55:39.856717,83322888
28829,33944c49-501e-4c8a-b653-7d7b487855a9,raw,23620,catalog,2020-04-17 23:55:39.856717,83322888
28830,b4a67d8c-3e93-4ca3-afb8-a4753206c640,group_alias,23621,product,2020-04-17 23:55:45.482196,83322888
28831,e56ba71e-dad3-4aaa-b7bc-124c8366309e,raw,23622,catalog,2020-04-17 23:55:45.522250,83322888
28832,a7f18a56-91af-4c07-9b87-f1d883721ba9,group_alias,23623,product,2020-04-17 23:55:57.458809,83322888
28833,80b47b04-2f15-4ce5-b494-44c3137d4211,raw,23624,cart,2020-04-17 23:56:48.674344,83322888
28834,cf3358c7-d5c5-4106-84ee-1842703ece22,group_alias,23625,delivery,2020-04-17 23:56:49.380728,83322888
28835,24580b02-c897-4f25-b6d0-e3a2f441f437,group_alias,23626,delivery,2020-04-17 23:56:51.312632,83322888
28836,24580b02-c897-4f25-b6d0-e3a2f441f437,path_end,23626,path_end,2020-04-17 23:56:51.312632,83322888


#### CollapseLoops

In [49]:
# Call collapse_loops with suffix='loop' and time_agg='max', then convert the result to a DataFrame.
res = stream.collapse_loops(suffix='loop', time_agg='max').to_dataframe()

In [50]:
# Rows of user 2112338 taken from `stream` itself rather than from res.
stream.to_dataframe().query('user_id == 2112338')

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
4015,167dd3a9-9b87-481a-9cc6-8c7ee9d5917c,path_start,3327,path_start,2019-12-24 12:58:04.891249,2112338
4016,167dd3a9-9b87-481a-9cc6-8c7ee9d5917c,raw,3327,main,2019-12-24 12:58:04.891249,2112338
4017,c8afeb3b-f12d-4f41-941d-81634adf48d8,raw,3328,catalog,2019-12-24 12:58:08.096923,2112338
4018,9c9874fa-5728-4c2c-b9f6-4a9c75027077,raw,3329,catalog,2019-12-24 12:58:16.429552,2112338
4019,5efb4a14-a77b-43b1-b06c-286ea2b0a745,raw,3330,catalog,2019-12-24 12:58:44.965104,2112338
4020,5a01f957-fb07-449e-8121-fdbe1de4e0a5,raw,3331,main,2019-12-24 12:58:52.984853,2112338
4021,5a01f957-fb07-449e-8121-fdbe1de4e0a5,path_end,3331,path_end,2019-12-24 12:58:52.984853,2112338


In [51]:
# Display all rows of user 2112338 in res.
res[res['user_id'] == 2112338]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
3141,167dd3a9-9b87-481a-9cc6-8c7ee9d5917c,path_start,3327,path_start,2019-12-24 12:58:04.891249,2112338
3142,167dd3a9-9b87-481a-9cc6-8c7ee9d5917c,raw,3327,main,2019-12-24 12:58:04.891249,2112338
3143,b3446101-0bf2-4aa7-a275-71b02f0dcd15,group_alias,3330,catalog_loop,2019-12-24 12:58:44.965104,2112338
3144,5a01f957-fb07-449e-8121-fdbe1de4e0a5,raw,3331,main,2019-12-24 12:58:52.984853,2112338
3145,5a01f957-fb07-449e-8121-fdbe1de4e0a5,path_end,3331,path_end,2019-12-24 12:58:52.984853,2112338


In [52]:
# Keyword arguments forwarded to collapse_loops.
params = dict(suffix='count', time_agg='mean')

res = stream.collapse_loops(**params).to_dataframe()
res[res['user_id'] == 2112338]

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id
3151,167dd3a9-9b87-481a-9cc6-8c7ee9d5917c,path_start,3327,path_start,2019-12-24 12:58:04.891249000,2112338
3152,167dd3a9-9b87-481a-9cc6-8c7ee9d5917c,raw,3327,main,2019-12-24 12:58:04.891249000,2112338
3153,90da9bbe-fea4-4959-9d41-69f6b8f7820b,group_alias,3329,catalog_loop_3,2019-12-24 12:58:23.163859712,2112338
3154,5a01f957-fb07-449e-8121-fdbe1de4e0a5,raw,3331,main,2019-12-24 12:58:52.984853000,2112338
3155,5a01f957-fb07-449e-8121-fdbe1de4e0a5,path_end,3331,path_end,2019-12-24 12:58:52.984853000,2112338


## Pipe

In [53]:
# Pass a function to pipe that adds a constant column named new_column.
(
    stream
    .pipe(lambda _df: _df.assign(new_column=100))
    .to_dataframe()
)

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id,new_column
0,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890,100
1,66f48c7c-bb89-4de8-88d9-4c8226a3dfe0,raw,0,catalog,2019-11-01 17:59:13.273932,219483890,100
2,1a1748eb-0063-4d9d-aaea-ba564ad70b1d,raw,1,product1,2019-11-01 17:59:28.459271,219483890,100
3,dd691ad2-5ab3-4450-83d8-21485a565abb,raw,2,cart,2019-11-01 17:59:29.502214,219483890,100
4,13f2b0a9-b2f4-4ffa-b0d1-446f12f6c570,raw,3,catalog,2019-11-01 17:59:32.557029,219483890,100
...,...,...,...,...,...,...,...
39780,1ab7adfe-6b74-42c5-a2ee-b5ef69348719,raw,32279,catalog,2020-04-29 12:47:40.975732,501098384,100
39781,42ff0327-d17e-42c7-b116-66c447f46848,raw,32280,catalog,2020-04-29 12:48:01.809577,501098384,100
39782,d79171c2-8164-4e74-b306-0c531583716e,raw,32281,main,2020-04-29 12:48:01.938488,501098384,100
39783,91d3aca3-8031-4782-80ec-0f0abd1d68a1,raw,32282,catalog,2020-04-29 12:48:06.595390,501098384,100
