In [3]:
# pandas is imported in this cell as well so the cell runs on a fresh kernel:
# it sits above the notebook's main import cell, so without this import a
# top-to-bottom run would fail with NameError on `pd`.
import pandas as pd

# Display every row (no row truncation).
pd.set_option('display.max_rows', None)

# Display every column (no column truncation).
pd.set_option('display.max_columns', None)

# Do not force a fixed display width; intended to improve readability of wide frames.
pd.set_option('display.width', None)

# Do not truncate the text shown in each cell (useful for dictionary-valued columns).
pd.set_option('display.max_colwidth', None)


In [1]:
import pandas as pd
import ast
import json
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

# Build a FontProperties object from the NanumGothic font file on disk.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fontprop = fm.FontProperties(fname=font_path)

# Remove the cap on the number of DataFrame rows pandas will display.
pd.options.display.max_rows = None


def read_parquet_from_gcs(file_names, gcs_prefix, key_path):
    """Read several parquet files that share a common location prefix.

    Args:
        file_names: Iterable of file stems (without the ``.parquet`` suffix).
        gcs_prefix: Location prefix the stems are appended to, e.g.
            ``"gs://bucket/folder"``. Trailing slashes are ignored.
        key_path: Value passed to the filesystem as the ``token`` storage
            option (the notebook passes a path to a JSON key file).

    Returns:
        dict mapping each file stem to the DataFrame read from
        ``{gcs_prefix}/{name}.parquet``. Empty when ``file_names`` is empty.
    """
    # Drop trailing slashes so "gs://bucket/dir/" does not yield
    # "gs://bucket/dir//name.parquet", which names a different object.
    prefix = gcs_prefix.rstrip("/")
    storage_options = {"token": key_path}
    return {
        name: pd.read_parquet(f"{prefix}/{name}.parquet", storage_options=storage_options)
        for name in file_names
    }


# File stems to load; each one is read from "<gcs_prefix>/<stem>.parquet".
file_list = ["2023-05-06"]

# Location of the parquet files and the key file used as the access token.
gcs_prefix = "gs://codeit-project/hackle"
key_path = "/home/leesh/codeit_project/codeit-project-docker/config/key.json"

dfs = read_parquet_from_gcs(
    file_names=file_list,
    gcs_prefix=gcs_prefix,
    key_path=key_path,
)

# Convenience handle for the single file loaded above.
df_230506 = dfs[file_list[0]]

In [2]:
# Preview the first rows of the raw frame to inspect its columns and value formats.
df_230506.head()

Unnamed: 0,Asia/Seoul,event_key,session_id,user_id,value,user_properties,hackle_properties,event_properties,id,device_id
0,2023-05-07T00:00:00.014Z,$session_end,OoWMTpTWpUa4m8I1RiKOk75Xnvs1,,0.0,[],"{'language': 'ko-KR', 'isapp': 'true', 'osvers...",{'sessionid': '1683384973159.076d488c'},af49a3ac-cf10-49c9-a846-75c6350a331e,498dbc0f-4537-41dc-93ec-f4406302a3b0
1,2023-05-07T00:00:00.159Z,$session_end,CrFi87nvwiVvHExxP9uSfIHOmBy1,,0.0,[],"{'language': 'ko-KR', 'isapp': 'true', 'osvers...",{'sessionid': '1683377977126.ecbb48be'},ea3608d7-51f7-417f-9560-484896fb265d,7770afa8-e228-4736-aa2a-658ebe2e3d25
2,2023-05-07T00:00:00.244Z,$session_end,W6frWgWJtbWTJwdm6eVWPGKwY2Y2,,0.0,[],"{'language': 'ko-KR', 'isapp': 'true', 'osvers...",{'sessionid': '1683385158861.10bc1ad8'},0d9da666-4047-4d40-a5ac-992876799cf4,9fbdc45d-a74a-4d9f-9d40-0f400b72877e
3,2023-05-07T00:00:00.255Z,$session_start,4EFB4D45-0AAF-424A-8E4E-2F19249C8892,,0.0,[],"{'language': 'ko-KR', 'isapp': 'true', 'osvers...",{'sessionid': '1683385200255.b26bcb8c'},947d1fff-5e4c-44e2-b106-f23cddbe0dc4,4efb4d45-0aaf-424a-8e4e-2f19249c8892
4,2023-05-07T00:00:00.316Z,$session_start,JdxVFvtz6AcxxOZVvl6plHayjYJ3,,0.0,[],"{'language': 'ko', 'isapp': 'true', 'osversion...",{'sessionid': '1683385200316.4c60e55a'},28e3afe4-098a-4bef-a699-4e3fa325b3ab,96348488-4d6a-49a8-8995-452718157fd7


In [2]:
# Extract the key fields of the raw event log into a flat frame.
def _get_property(properties, key):
    """Return ``properties.get(key)``, or None when ``properties`` has no ``get`` (e.g. a null cell)."""
    return properties.get(key) if hasattr(properties, 'get') else None


hackle_props = df_230506['hackle_properties']

# The 'Asia/Seoul' column holds ISO-8601 strings ending in 'Z' (see the preview
# of the raw frame), so parse them as UTC first and then convert to Korean time.
event_datetime = (
    pd.to_datetime(df_230506['Asia/Seoul'], utc=True)
    .dt.tz_convert('Asia/Seoul')
)

# Assemble only the output columns instead of deep-copying the whole raw frame
# and discarding most of it. event_properties['sessionid'] is not extracted
# here because it is not among the output columns.
df_simple = pd.DataFrame(
    {
        'event_datetime': event_datetime,
        'session_id': df_230506['session_id'],
        'device_id': df_230506['device_id'],
        'event_key': df_230506['event_key'],
        'devicemodel': hackle_props.apply(lambda props: _get_property(props, 'devicemodel')),
        'language': hackle_props.apply(lambda props: _get_property(props, 'language')),
    }
)

In [5]:
# Render the timestamps as 'YYYY-MM-DD HH:MM:SS' strings.
# The dtype guard keeps this cell re-runnable: once the column holds strings,
# `.dt` is no longer available and a second run would raise AttributeError.
if pd.api.types.is_datetime64_any_dtype(df_simple['event_datetime']):
    df_simple['event_datetime'] = df_simple['event_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
df_simple.head()

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-07 09:00:00,OoWMTpTWpUa4m8I1RiKOk75Xnvs1,498dbc0f-4537-41dc-93ec-f4406302a3b0,$session_end,"iPhone14,5",ko-KR
1,2023-05-07 09:00:00,CrFi87nvwiVvHExxP9uSfIHOmBy1,7770afa8-e228-4736-aa2a-658ebe2e3d25,$session_end,"iPhone12,8",ko-KR
2,2023-05-07 09:00:00,W6frWgWJtbWTJwdm6eVWPGKwY2Y2,9fbdc45d-a74a-4d9f-9d40-0f400b72877e,$session_end,"iPhone14,8",ko-KR
3,2023-05-07 09:00:00,4EFB4D45-0AAF-424A-8E4E-2F19249C8892,4efb4d45-0aaf-424a-8e4e-2f19249c8892,$session_start,"iPhone12,3",ko-KR
4,2023-05-07 09:00:00,JdxVFvtz6AcxxOZVvl6plHayjYJ3,96348488-4d6a-49a8-8995-452718157fd7,$session_start,SM-A315N,ko


In [6]:
# Write the simplified frame to CSV, leaving the DataFrame index out of the file.
df_simple.to_csv(
    path_or_buf='/home/leesh/codeit_project/codeit-project-docker/notebooks/data/df_230506.csv',
    index=False,
)