In [1]:
import sys
# Put the parent directory at the front of the import path; presumably this is
# what lets the `src.*` packages imported in a later cell resolve — confirm the
# notebook is meant to be run from a subdirectory of the project root.
sys.path.insert(0, '..')

In [2]:
import pandas as pd

# The remote dataset download is currently disabled (kept here for reference):
# download_url = 'https://drive.google.com/uc?id=1tY-4xg6m_dv6IaVPIcd4oC1bK1lW5Tr9&export=download&confirm=t'
# df0 = pd.read_csv(download_url, compression='gzip')
# Instead, an empty frame with the three raw columns mapped by RawDataSchema
# in a later cell ('event', 'timestamp', 'user_id') is used as the input.
# NOTE(review): saved outputs further down show populated rows, so they were
# presumably produced with the download enabled — confirm and re-run.
df0 = pd.DataFrame(data=[], columns=['event', 'timestamp', 'user_id'])
# Bare last expression: display the (empty) frame as the cell output.
df0

Unnamed: 0,event,timestamp,user_id


In [3]:
from src.eventstream.schema import RawDataSchema, EventstreamSchema
from src.eventstream.eventstream import Eventstream
from src.graph.p_graph import PGraph, EventsNode
from src.data_processors_lib.rete import CollapseLoops, CollapseLoopsParams
from src.data_processors_lib.rete import DeleteUsersByPathLength, DeleteUsersByPathLengthParams
from src.data_processors_lib.rete import FilterEvents, FilterEventsParams
from src.data_processors_lib.rete import GroupEvents, GroupEventsParams
from src.data_processors_lib.rete import NewUsersEvents, NewUsersParams
from src.data_processors_lib.rete import SplitSessions, SplitSessionsParams
from src.data_processors_lib.rete import StartEndEvents, StartEndEventsParams
from src.data_processors_lib.rete import TruncatePath, TruncatePathParams
from src.data_processors_lib.rete import TruncatedEvents, TruncatedEventsParams
from src.graph.p_graph import PGraph, EventsNode
import inspect


# Tell the eventstream which raw columns carry the event name, the event
# timestamp and the user id.
raw_data_schema = RawDataSchema(event_name='event', event_timestamp='timestamp', user_id='user_id')

# Wrap the raw frame in an Eventstream, using a default-constructed
# EventstreamSchema for the internal column layout.
stream = Eventstream(raw_data=df0, raw_data_schema=raw_data_schema, schema=EventstreamSchema())

# Processing graph whose source is the eventstream built above.
graph = PGraph(source_stream=stream)

# Event name treated as the conversion target by the filters and the
# TruncatePath node defined in this cell.
TARGET_EVENT = 'finances/deposit/<payment_name>/success'

def users_with_target_event(df, schema) -> pd.Series:
    """Build a row mask selecting every event of users who reached TARGET_EVENT.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the 'event_name' and 'user_id' columns.
    schema
        Unused; accepted because the filter is called with ``(df, schema)``.

    Returns
    -------
    pd.Series
        Boolean Series aligned with ``df``: True for rows whose 'user_id'
        has at least one row with ``event_name == TARGET_EVENT``.
    """
    reached_target = df['event_name'] == TARGET_EVENT
    # .loc with a mask avoids chained indexing (df[mask]['user_id']).
    target_users = df.loc[reached_target, 'user_id'].unique()
    return df['user_id'].isin(target_users)

def first_session_filter(df, schema) -> pd.Series:
    """Build a row mask selecting events whose session id ends with '_1'.

    NOTE(review): this function is defined twice in this cell with the same
    body; the later definition is the one bound when the nodes are created.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a string-valued 'session_id' column.
    schema
        Unused; accepted because the filter is called with ``(df, schema)``.

    Returns
    -------
    pd.Series
        Boolean Series aligned with ``df``. Rows with a missing session id
        are False rather than NaN, so the result is always usable as a mask.
    """
    # na=False: without it, missing session ids propagate as NaN and the
    # result can no longer be used for boolean indexing.
    return df['session_id'].str.endswith('_1', na=False)

def new_and_not_truncated_users(df, schema) -> pd.Series:
    """Build a row mask selecting events of new users that are not right-truncated.

    A user is "new" if they have a 'new_user' event and "truncated" if they
    have a 'truncated_right' event; rows of users that are new and not
    truncated are selected.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the 'event_name' and 'user_id' columns.
    schema
        Unused; accepted because the filter is called with ``(df, schema)``.

    Returns
    -------
    pd.Series
        Boolean Series aligned with ``df``.
    """
    truncated_users = df.loc[df['event_name'] == 'truncated_right', 'user_id'].unique()
    new_users = df.loc[df['event_name'] == 'new_user', 'user_id'].unique()
    # Equivalent to isin(setdiff(new_users, truncated_users)), expressed with
    # pandas only: the original relied on `np`, which is never imported here.
    user_ids = df['user_id']
    return user_ids.isin(new_users) & ~user_ids.isin(truncated_users)

def first_session_filter(df, schema) -> pd.Series:
    """Build a row mask selecting events whose session id ends with '_1'.

    NOTE(review): this is the second definition of this name in the cell; it
    shadows the identical earlier one and is the version the nodes receive.
    One of the two definitions should be removed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a string-valued 'session_id' column.
    schema
        Unused; accepted because the filter is called with ``(df, schema)``.

    Returns
    -------
    pd.Series
        Boolean Series aligned with ``df``. Rows with a missing session id
        are False rather than NaN, so the result is always usable as a mask.
    """
    # na=False: without it, missing session ids propagate as NaN and the
    # result can no longer be used for boolean indexing.
    return df['session_id'].str.endswith('_1', na=False)

# Build one EventsNode per processing step. Each node wraps a data processor
# configured through its matching *Params object.
# node0 / node1: processors constructed with no explicit parameters (`**{}`
# unpacks an empty dict, i.e. no keyword arguments are passed).
node0 = EventsNode(CollapseLoops(params=CollapseLoopsParams(**{})))
node1 = EventsNode(StartEndEvents(params=StartEndEventsParams(**{})))
node2 = EventsNode(NewUsersEvents(params=NewUsersParams(new_users_list="all")))
# Cutoff is given as a (value, unit) pair — presumably 12 days; confirm the unit semantics.
node3 = EventsNode(TruncatedEvents(params=TruncatedEventsParams(right_truncated_cutoff=(12, 'D'))))
node4 = EventsNode(FilterEvents(params=FilterEventsParams(filter=new_and_not_truncated_users)))
# Writes the session identifier to the 'session_id' column that
# first_session_filter reads; cutoff is again a (value, unit) pair.
node5 = EventsNode(SplitSessions(params=SplitSessionsParams(
    session_cutoff=(1, 'h'),
    session_col='session_id'
)))
node6 = EventsNode(FilterEvents(params=FilterEventsParams(filter=first_session_filter)))

node7 = EventsNode(FilterEvents(params=FilterEventsParams(filter=users_with_target_event)))
node8 = EventsNode(TruncatePath(params=TruncatePathParams(drop_after=TARGET_EVENT)))
# Same filter function as node6, applied on the other branch.
node9 = EventsNode(FilterEvents(params=FilterEventsParams(filter=first_session_filter)))

# Wire the nodes into the graph. Resulting topology:
#   root -> node0
#   root -> node1 -> node2 -> node3 -> node4 -> node5
#   node5 -> node6 -> node7
#   node5 -> node8 -> node9
# NOTE(review): node0 has no children, so it is a separate leaf under the root — confirm this is intended.
graph.add_node(node=node0, parents=[graph.root])
graph.add_node(node=node1, parents=[graph.root])
graph.add_node(node=node2, parents=[node1])
graph.add_node(node=node3, parents=[node2])
graph.add_node(node=node4, parents=[node3])
graph.add_node(node=node5, parents=[node4])

# First branch below node5.
graph.add_node(node=node6, parents=[node5])
graph.add_node(node=node7, parents=[node6])

# Second branch below node5.
graph.add_node(node=node8, parents=[node5])
graph.add_node(node=node9, parents=[node8])


# Export the graph, then overwrite two processor parameter values of the
# exported node at index 1 with the placeholder strings below.
# NOTE(review): which node sits at index 1 depends on the export order, and
# whether that processor actually defines 'suffix' /
# 'timestamp_aggregation_type' is not visible here — confirm.
graph_data = graph.export({})
graph_data["nodes"][1]["processor"]["values"]["suffix"] = "invalid_suffix"
graph_data["nodes"][1]["processor"]["values"]["timestamp_aggregation_type"] = "invalid_timestamp_aggregation_type"

# Feed the modified export back into the graph and capture any exception
# instead of letting the cell fail. `errrr` stays None if nothing was raised.
# NOTE(review): the captured error is never displayed or asserted on in this
# cell — presumably it is inspected manually; consider showing it.
errrr = None
try:
    graph._set_graph_handler(graph_data)
except Exception as err:
    errrr = err



  params_schema: dict[str, Any] = cls.schema()
  params_schema: dict[str, Any] = cls.schema()
  params_schema: dict[str, Any] = cls.schema()


In [5]:
# Call the graph's display() method to show the processing graph built above.
graph.display()

In [20]:
# Convert the graph's `combine_result` to a DataFrame and show it as the cell output.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above and its saved output contains rows although `df0` is created empty, so
# the output is presumably stale — re-run with Restart & Run All to confirm.
graph.combine_result.to_dataframe()

Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,283c2130-1418-4600-9a15-bd7cb2c931ba,raw,0,landing/*,2022-06-01 00:00:04,c074e4d8-2d0e-470b-8239-5a9e2c9651e4
3,4cdc890b-a5f7-4296-a743-66426bee39c0,raw,3,landing/*,2022-06-01 00:00:08,6848bf45-cdaa-4e44-849b-a764a36e6d8d
4,af768f93-9225-4793-970b-d3e7e78df245,raw,4,/,2022-06-01 00:00:13,20b148f5-26d6-4d23-8829-e3a165def84f
7,520c252d-4d3e-44c0-8208-5b57e1791404,group_alias,7,login_loop,2022-06-01 00:00:14,20b148f5-26d6-4d23-8829-e3a165def84f
8,1e15fc88-d042-4404-816b-734f297d0750,raw,8,dashboard,2022-06-01 00:00:14,c3f90a2c-5b8a-4564-8365-0c769855b0e2
...,...,...,...,...,...,...
4149261,ce5fbb2e-bdae-4e63-a5dc-34c3893a8212,raw,4149261,/,2022-08-31 23:59:52,4fd6fe14-ed36-454c-8748-20cefc75b74c
4149262,ffc63b0b-7854-4eed-8403-1d3f9c575583,raw,4149262,dashboard,2022-08-31 23:59:52,e81a3ae6-5818-4ff1-897b-a6b9f9d19faf
4149263,aa2b9e1e-4f72-4d91-ac72-8f62a94c3b9d,raw,4149263,profile/*,2022-08-31 23:59:53,adf77d26-4236-4194-8831-a0f276212a26
4149264,23e5f2f6-a131-43e4-b23a-ddc97d9df540,raw,4149264,registration/account/real,2022-08-31 23:59:58,048b58e2-7d4d-4d3c-8c3a-af2aa5192519
