In [2]:
from eventstream.eventstream import Eventstream
import datetime
from eventstream.schema import RawDataSchema, EventstreamSchema
from graph.p_graph import PGraph, MergeNode, SourceNode, EventsNode
from factories.simple_factories import simple_group, delete_events
import pandas as pd
import networkx as nx


# Toy click log: six raw events spread over two users. Timestamps and user ids
# are kept as plain strings here.
source_df = pd.DataFrame(
  [
    ("pageview", "2021-10-26 12:00", "1"),
    ("cart_btn_click", "2021-10-26 12:02", "1"),
    ("pageview", "2021-10-26 12:03", "1"),
    ("trash_event", "2021-10-26 12:03", "1"),
    ("exit_btn_click", "2021-10-26 12:04", "2"),
    ("plus_icon_click", "2021-10-26 12:05", "1"),
  ],
  columns=["event_name", "event_timestamp", "user_id"],
)

# Tell the schema which raw columns carry the event name, the timestamp and
# the user id, then build the Eventstream from the raw frame with that schema.
raw_schema = RawDataSchema(
  event_name="event_name",
  event_timestamp="event_timestamp",
  user_id="user_id",
)
source = Eventstream(raw_data=source_df, raw_data_schema=raw_schema)

def _is_cart_click(df, schema):
  """Flag rows whose event name is "cart_btn_click" or "plus_icon_click"."""
  return df[schema.event_name].isin(["cart_btn_click", "plus_icon_click"])


def _is_exit_click(df, schema):
  """Flag rows whose event name equals "exit_btn_click"."""
  return df[schema.event_name] == "exit_btn_click"


def _is_trash_event(df, schema):
  """Flag rows whose event name equals "trash_event"."""
  return df[schema.event_name] == "trash_event"


# Each EventsNode wraps the result of one factory call; every factory is handed
# one of the row filters defined above.
cart_events = EventsNode(simple_group(event_name="add_to_cart", filter=_is_cart_click))
logout_events = EventsNode(simple_group(event_name="logout", filter=_is_exit_click))
trash_events = EventsNode(delete_events(filter=_is_trash_event))

# Node that the three branches are attached to when the graph is wired up.
merge = MergeNode()

graph = PGraph(source)

# The three event nodes are attached one after another, each with the graph
# root as its only parent (index 0).
for branch in (cart_events, logout_events, trash_events):
  graph.add_node(node=branch, parents=[(graph.root, 0)])

# The merge node lists all three branches as parents, numbered 0..2 in the
# same order they were added.
merge_parents = [
  (cart_events, 0),
  (logout_events, 1),
  (trash_events, 2),
]
graph.add_node(node=merge, parents=merge_parents)

def _is_group_alias(df, schema):
  """Flag rows whose event type equals "group_alias"."""
  return df[schema.event_type] == "group_alias"


# Second grouping stage, attached below the merge node.
# NOTE(review): the output recorded under this cell contains no rows named
# "grouped" -- confirm whether that output is stale or the filter never matches.
groups_after_merge = EventsNode(simple_group(event_name="grouped", filter=_is_group_alias))
graph.add_node(node=groups_after_merge, parents=[(merge, 0)])

# Combine the graph at the final node and show the result as a DataFrame
# (the last expression is what the cell displays).
result = graph.combine(groups_after_merge)
result.to_dataframe()

# nx.draw(graph.get_graph())


Unnamed: 0,event_id,event_type,event_index,event_name,event_timestamp,user_id
0,c5d6c5ec-2b27-48ba-b3ac-0c02554a35b4,raw,0,pageview,2021-10-26 12:00:00,1
1,80dd667b-4148-40bc-95f4-a9dbb66e92c1,group_alias,1,add_to_cart,2021-10-26 12:02:00,1
3,9b7bb48a-35c0-411e-958c-0bb90f01fd75,raw,3,pageview,2021-10-26 12:03:00,1
6,750a403d-fca8-4f54-9e47-4ad6aa3e136c,group_alias,6,logout,2021-10-26 12:04:00,2
8,0656688e-35e0-44e7-9c04-14e8498b2376,group_alias,8,add_to_cart,2021-10-26 12:05:00,1


In [3]:
import pandas as pd

# Parent rows: the original events, keyed by "id".
parent = pd.DataFrame({
  "id": [1, 2, 3, 4],
  "event_name": ["pageview", "cart_btn_click", "pageview", "plus_icon_click"],
})

# Child rows point back at a parent through "ref_id": two reference existing
# parents (2 and 4), one references an id that is absent from `parent`
# (434343), and one carries no reference at all (None).
child = pd.DataFrame({
  "id": [136, 137, 137, 137],
  "event_name": ["add_to_cart", "add_to_cart", "add_to_cart", "add_to_cart123"],
  "ref_id": [2, 4, 434343, None],
})

# Outer join so unmatched rows from both sides survive; indicator=True adds
# the "_merge" column recording which side each row came from.
df = parent.merge(child, how="outer", left_on="id", right_on="ref_id", indicator=True)

# Keep every row that has a parent side, plus rows whose ref_id is missing.
# This drops only right-only rows that reference a non-existent parent.
has_parent_side = df["_merge"].isin(["left_only", "both"])
df = df[has_parent_side | df["ref_id"].isna()]
print(df)

   id_x     event_name_x   id_y    event_name_y  ref_id      _merge
0   1.0         pageview    NaN             NaN     NaN   left_only
1   2.0   cart_btn_click  136.0     add_to_cart     2.0        both
2   3.0         pageview    NaN             NaN     NaN   left_only
3   4.0  plus_icon_click  137.0     add_to_cart     4.0        both
5   NaN              NaN  137.0  add_to_cart123     NaN  right_only


In [5]:
import pandas as pd


# First frame: only columns "a" and "b".
df1 = pd.DataFrame([
  { "a": 1, "b": 2 },
  { "a": 22, "b": 33 },
])


# Second frame: same columns plus an extra column "c".
df2 = pd.DataFrame([
  { "a": 111111, "b": 2111, "c": 1 },
  { "a": 22111, "b": 33111, "c": 33333333333 },
])

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# the frames are stacked with pd.concat instead. As with append, the original
# row labels are kept (0, 1, 0, 1) and rows from df1 get NaN in column "c".
combined = pd.concat([df1, df2])
combined

Unnamed: 0,a,b,c
0,1,2,
1,22,33,
0,111111,2111,1.0
1,22111,33111,33333330000.0


In [2]:
import networkx as nx
import pandas as pd



# Small directed graph on nodes 1..5 with edges 1->2, 1->3, 3->4 and 5->4.
graph = nx.DiGraph()
graph.add_nodes_from(range(1, 6))
graph.add_edges_from([(1, 2), (1, 3), (3, 4), (5, 4)])

# None of the edges above point into node 1, so this loop prints nothing.
for predecessor in graph.predecessors(1):
  print(predecessor)


# Two-row frame whose only columns are "a" and "b".
df = pd.DataFrame({"a": [3, 32], "b": [4, 33]})

# Membership test against the column labels; "dd" is not one of them, so the
# cell evaluates to False.
"dd" in df.columns
  

False

In [1]:

# Ten pairs: the first item counts 0..9, the second is always the string "10".
tuples = list(enumerate(["10"] * 10))

# Unpack each pair and print its two parts separated by a single space.
for index, value in tuples:
  print(f"{index} {value}")

0 10
1 10
2 10
3 10
4 10
5 10
6 10
7 10
8 10
9 10
