In [2]:
import pandas as pd
import os
from glob import glob
import platform
import matplotlib.pyplot as plt
import koreanize_matplotlib
import matplotlib.font_manager as fm
import seaborn as sns
import ast

# Keep the minus sign from rendering as a broken glyph on plots
# (turns off matplotlib's Unicode-minus rendering).
plt.rcParams.update({'axes.unicode_minus': False})

# Notebook-wide pandas display settings, applied in a single call
# (set_option accepts alternating option-name / value pairs).
pd.set_option(
    # Floats: thousands separators, no decimals, instead of scientific notation.
    'display.float_format', '{:,.0f}'.format,
    # Show up to 100 columns before pandas starts eliding them.
    'display.max_columns', 100,
    # Never elide rows when printing a frame or series.
    'display.max_rows', None,
    # Never elide items when printing long sequences.
    'display.max_seq_items', None,
    # Never truncate the contents of a wide cell.
    'display.max_colwidth', None,
)

In [3]:
# Load the Hackle event log from the local parquet file.
hackle_events_df = pd.read_parquet(
    path='./data/hackle/hackle_events.parquet',
)

In [4]:
# Print a structural summary of the loaded frame (index range, column dtypes, memory usage).
hackle_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11441319 entries, 0 to 11441318
Data columns (total 11 columns):
 #   Column          Dtype         
---  ------          -----         
 0   event_id        object        
 1   event_datetime  datetime64[ms]
 2   event_key       object        
 3   session_id      object        
 4   id              object        
 5   item_name       object        
 6   page_name       object        
 7   friend_count    float64       
 8   votes_count     float64       
 9   heart_balance   float64       
 10  question_id     float64       
dtypes: datetime64[ms](1), float64(4), object(6)
memory usage: 960.2+ MB


In [5]:
# Preview the first five rows of the event log.
hackle_events_df.head(n=5)

Unnamed: 0,event_id,event_datetime,event_key,session_id,id,item_name,page_name,friend_count,votes_count,heart_balance,question_id
0,00000533-3f1c-4b3b-81f1-0c8f35754b4e,2023-07-18 19:40:17,$session_start,4OzYh3seq3VKytpSn5pvQkZNQii1,00000533-3f1c-4b3b-81f1-0c8f35754b4e,,,,,,
1,00000716-27e9-4e72-a602-d0ce61784b06,2023-07-18 21:07:24,click_question_open,8QXy31PQxbW9qLzq0Y1dhR8Ypm52,00000716-27e9-4e72-a602-d0ce61784b06,,,64.0,436.0,4830.0,
2,000007c8-68ce-40e6-9b1e-f0e34e8ff9cc,2023-08-06 20:18:03,click_bottom_navigation_profile,6bcea65d-9f40-46fc-888c-700fe707483f,000007c8-68ce-40e6-9b1e-f0e34e8ff9cc,,,26.0,174.0,4729.0,
3,00000981-5e2a-4111-993e-4f1891ad9a53,2023-08-05 01:46:10,view_shop,XVYNT6zfhFWqIg9omwg2AHDjTLx2,00000981-5e2a-4111-993e-4f1891ad9a53,,,61.0,44.0,142.0,
4,00000a7a-ba72-4332-b4a9-7910670aaeb2,2023-07-24 15:03:37,click_bottom_navigation_lab,XFB2SPiGfjbVhvJ3Q3DBsaT3m2B3,00000a7a-ba72-4332-b4a9-7910670aaeb2,,,119.0,545.0,3287.0,


In [23]:
# 특정 session_id에서 event_key가 존재하는지 확인하고 앞 뒤 세 개의 event_key 확인하기

def inspect_event_surroundings(df, session_id, target_event, window=3):
    """Print the events surrounding each occurrence of an event within one session.

    The session's rows are ordered by ``event_datetime``; for every row whose
    ``event_key`` equals ``target_event`` the function prints up to ``window``
    event keys before it, the event itself, and up to ``window`` event keys
    after it.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with at least the columns ``session_id``, ``event_datetime``
        and ``event_key``.
    session_id : str
        Session whose events are inspected.
    target_event : str
        ``event_key`` value to look for.
    window : int, default 3
        Maximum number of neighbouring events shown on each side.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``window`` is negative.
    """
    if window < 0:
        raise ValueError("window must be non-negative")

    # 1. Keep only this session, in chronological order. A stable sort keeps
    #    events that share a timestamp in their original row order, so the
    #    printed neighbours are deterministic.
    session_events = (
        df.loc[df["session_id"] == session_id]
        .sort_values("event_datetime", kind="stable")["event_key"]
        .tolist()
    )

    # 2. Positions (0-based, within the sorted session) of the target event.
    hit_positions = [
        pos for pos, key in enumerate(session_events) if key == target_event
    ]

    # 3. Nothing to show when the session never fired the event.
    if not hit_positions:
        print(f"[{session_id}] 세션에는 '{target_event}' 이벤트가 없습니다.")
        return

    # 4. Print the neighbours. Slices are clamped to the session bounds so a
    #    match near the start or end still shows whatever neighbours exist;
    #    the "none" message is used only when there are truly no neighbours.
    for pos in hit_positions:
        before = session_events[max(0, pos - window):pos]
        after = session_events[pos + 1:pos + 1 + window]

        print(f"\n[이벤트 인덱스 {pos}]")
        print("이전 이벤트:", before if before else "이전 이벤트 없음")
        print("현재 이벤트:", session_events[pos])
        print("다음 이벤트:", after if after else "다음 이벤트 없음")

In [None]:
# Build, for every session, the list of its event keys in chronological order.
# Only the three columns involved are projected before sorting, so the sort does
# not have to copy every column of the full event log, and `session_sequence`
# is bound exactly once (to the final Series) rather than first to a DataFrame.
session_sequence = (
    hackle_events_df[['session_id', 'event_datetime', 'event_key']]
    .sort_values(['session_id', 'event_datetime'])
    .groupby('session_id')['event_key']
    .apply(list)
)

session_id
000137bc-80de-4bb5-b61d-df7f217a4501                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [26]:
# Preview the event sequences of the first ten sessions.
session_sequence.head(n=10)

session_id
000137bc-80de-4bb5-b61d-df7f217a4501                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [32]:
# Usage: inspect_event_surroundings(hackle_events_df, <session_id>, <event_key>)
inspect_event_surroundings(
    df=hackle_events_df,
    session_id="008DAA55-8DF3-4405-9E11-D4A95E76825C",
    target_event="click_random_ask_normal",
)


[이벤트 인덱스 1183]
이전 이벤트: ['view_lab_tap', 'view_timeline_tap', 'click_bottom_navigation_timeline']
현재 이벤트: click_random_ask_normal
다음 이벤트: ['click_bottom_navigation_questions', 'view_questions_tap', 'click_question_open']

[이벤트 인덱스 2044]
이전 이벤트: ['view_timeline_tap', 'click_bottom_navigation_timeline', 'view_timeline_tap']
현재 이벤트: click_random_ask_normal
다음 이벤트: ['$session_end', 'launch_app', '$session_start']
