In [3]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
import os
from ast import literal_eval  # 문자열로 저장된 딕셔너리를 진짜 딕셔너리로 변환

# GCS 파일 경로에서 데이터 불러오기 위한 라이브러리
from google.cloud import storage
from google.oauth2 import service_account

# 2023-05-06 작업

In [2]:
# Source object in GCS and the service-account key file handed to the reader.
file_path = "gs://part4_project/events/2023-05-06.parquet"
key_path = "../config/codeit_project_vm_key.json"

# Read the parquet object straight from GCS; the key file path is passed
# through as the "token" storage option.
df_230506 = pd.read_parquet(file_path, storage_options={"token": key_path})

# Sanity check: dimensions first, then column dtypes / memory footprint.
print(df_230506.shape)
# DataFrame.info() writes its own report and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
df_230506.info()

(7106588, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7106588 entries, 0 to 7106587
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Asia/Seoul         object 
 1   event_key          object 
 2   session_id         object 
 3   user_id            object 
 4   value              float64
 5   user_properties    object 
 6   hackle_properties  object 
 7   event_properties   object 
 8   id                 object 
 9   device_id          object 
dtypes: float64(1), object(9)
memory usage: 542.2+ MB
None


In [3]:
# Build the slim per-event frame for the 2023-05-06 export.
#
# Only the columns that are actually kept are materialised:
#   * the raw 10-column frame is no longer deep-copied first;
#   * `df_simple` is a freshly constructed DataFrame rather than a column slice,
#     so assigning to its columns later cannot raise SettingWithCopyWarning;
#   * the former `sessionid` lookup in `event_properties` is gone -- its result
#     was never part of the selected output columns, so it was a full
#     Python-level pass over every row whose result was thrown away.
df_simple = pd.DataFrame({
    # Timestamps are parsed as UTC and then converted to Korea time.
    # NOTE(review): the source column is itself named "Asia/Seoul" although it
    # is parsed as UTC here -- confirm the raw values really are UTC, otherwise
    # this conversion adds a spurious +9h.
    'event_datetime': pd.to_datetime(df_230506['Asia/Seoul'], utc=True).dt.tz_convert('Asia/Seoul'),
    'session_id': df_230506['session_id'],
    'device_id': df_230506['device_id'],
    'event_key': df_230506['event_key'],
    # Each hackle_properties value is used as a dict; .get() yields None when
    # the key is absent.
    'devicemodel': df_230506['hackle_properties'].apply(lambda props: props.get('devicemodel')),
    'language': df_230506['hackle_properties'].apply(lambda props: props.get('language')),
})

In [4]:
# Persist the slim 2023-05-06 frame with event_datetime rendered as a
# "YYYY-MM-DD HH:MM:SS" string (the format drops the UTC offset and any
# sub-second part).
#
# The string column is produced on a temporary copy via assign(), so
# `df_simple` keeps its datetime dtype and this cell can be re-run safely.
# Writing the strings back onto `df_simple` made a second run fail, because the
# `.dt` accessor is not available on a string column.
(
    df_simple
    .assign(
        event_datetime=lambda frame: frame['event_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
    )
    .to_parquet("../data/230506.parquet", engine="pyarrow", compression="snappy")
)

In [5]:
# Read-back check for the 2023-05-06 extract.
# Load from the same relative location the export cell writes to
# ("../data/230506.parquet") instead of a machine-specific absolute path, so
# the check always inspects the file this notebook produced.
df_230506_test = pd.read_parquet("../data/230506.parquet")

print(df_230506_test.shape)
df_230506_test.head()

(7106588, 6)


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-07 09:00:00,OoWMTpTWpUa4m8I1RiKOk75Xnvs1,498dbc0f-4537-41dc-93ec-f4406302a3b0,$session_end,"iPhone14,5",ko-KR
1,2023-05-07 09:00:00,CrFi87nvwiVvHExxP9uSfIHOmBy1,7770afa8-e228-4736-aa2a-658ebe2e3d25,$session_end,"iPhone12,8",ko-KR
2,2023-05-07 09:00:00,W6frWgWJtbWTJwdm6eVWPGKwY2Y2,9fbdc45d-a74a-4d9f-9d40-0f400b72877e,$session_end,"iPhone14,8",ko-KR
3,2023-05-07 09:00:00,4EFB4D45-0AAF-424A-8E4E-2F19249C8892,4efb4d45-0aaf-424a-8e4e-2f19249c8892,$session_start,"iPhone12,3",ko-KR
4,2023-05-07 09:00:00,JdxVFvtz6AcxxOZVvl6plHayjYJ3,96348488-4d6a-49a8-8995-452718157fd7,$session_start,SM-A315N,ko


In [12]:
# Parse the reloaded event_datetime column into datetimes, then summarise the
# frame (the timestamp range is the interesting part of the output).
df_230506_test['event_datetime'] = df_230506_test['event_datetime'].pipe(pd.to_datetime)
df_230506_test.describe()

Unnamed: 0,event_datetime
count,7106588
mean,2023-05-10 20:36:47.453259776
min,2023-05-07 09:00:00
25%,2023-05-09 09:51:39
50%,2023-05-11 01:18:50
75%,2023-05-12 06:08:12
max,2023-05-13 08:59:59


# 2023-05-12 작업

In [6]:
# Source object in GCS and the service-account key file handed to the reader.
file_path = "gs://part4_project/events/2023-05-12.parquet"
key_path = "../config/codeit_project_vm_key.json"

# Read the parquet object straight from GCS; the key file path is passed
# through as the "token" storage option.
df_230512 = pd.read_parquet(file_path, storage_options={"token": key_path})

# Sanity check: dimensions first, then column dtypes / memory footprint.
print(df_230512.shape)
# DataFrame.info() writes its own report and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
df_230512.info()

(12188804, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12188804 entries, 0 to 12188803
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Asia/Seoul         object 
 1   event_key          object 
 2   session_id         object 
 3   user_id            object 
 4   value              float64
 5   user_properties    object 
 6   hackle_properties  object 
 7   event_properties   object 
 8   id                 object 
 9   device_id          object 
dtypes: float64(1), object(9)
memory usage: 929.9+ MB
None


In [7]:
# Build the slim per-event frame for the 2023-05-12 export.
#
# Only the columns that are actually kept are materialised:
#   * the raw 10-column frame is no longer deep-copied first;
#   * `df_simple` is a freshly constructed DataFrame rather than a column slice,
#     so assigning to its columns later cannot raise SettingWithCopyWarning;
#   * the former `sessionid` lookup in `event_properties` is gone -- its result
#     was never part of the selected output columns, so it was a full
#     Python-level pass over every row whose result was thrown away.
df_simple = pd.DataFrame({
    # Timestamps are parsed as UTC and then converted to Korea time.
    # NOTE(review): the source column is itself named "Asia/Seoul" although it
    # is parsed as UTC here -- confirm the raw values really are UTC, otherwise
    # this conversion adds a spurious +9h.
    'event_datetime': pd.to_datetime(df_230512['Asia/Seoul'], utc=True).dt.tz_convert('Asia/Seoul'),
    'session_id': df_230512['session_id'],
    'device_id': df_230512['device_id'],
    'event_key': df_230512['event_key'],
    # Each hackle_properties value is used as a dict; .get() yields None when
    # the key is absent.
    'devicemodel': df_230512['hackle_properties'].apply(lambda props: props.get('devicemodel')),
    'language': df_230512['hackle_properties'].apply(lambda props: props.get('language')),
})

In [8]:
# Persist the slim 2023-05-12 frame with event_datetime rendered as a
# "YYYY-MM-DD HH:MM:SS" string (the format drops the UTC offset and any
# sub-second part).
#
# The string column is produced on a temporary copy via assign(), so
# `df_simple` keeps its datetime dtype and this cell can be re-run safely.
# Writing the strings back onto `df_simple` made a second run fail, because the
# `.dt` accessor is not available on a string column.
(
    df_simple
    .assign(
        event_datetime=lambda frame: frame['event_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
    )
    .to_parquet("../data/230512.parquet", engine="pyarrow", compression="snappy")
)

In [9]:
# Read-back check for the 2023-05-12 extract.
# Load from the same relative location the export cell writes to
# ("../data/230512.parquet") instead of a machine-specific absolute path, so
# the check always inspects the file this notebook produced.
df_230512_test = pd.read_parquet("../data/230512.parquet")

print(df_230512_test.shape)
df_230512_test.head()

(12188804, 6)


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-13 09:00:00,6zkImM4PD0MHIbJ9zRVXs6vX6e62,cb7a48f6-6882-4223-98fd-4fe73cb3903b,$session_start,"iPhone14,4",ko-KR
1,2023-05-13 09:00:00,Sq5vui6fg2Nhz2EHeC8e4PWyxo13,b2512274-1161-4fd2-a4e8-b40f1ecc0372,$session_end,"iPhone12,8",ko-KR
2,2023-05-13 09:00:00,RyWeiMNMACUhbawb63ITBMX1c1U2,75f9bcd9-78c8-4dd6-a91e-0faf85a10526,$session_end,"iPhone15,2",ko-KR
3,2023-05-13 09:00:00,3OaNLuqTVDTkMEc8IsZNBxF9hsH2,380ebcac-4f9d-4004-968d-cf6372908adc,$session_start,"iPhone12,1",en-KR
4,2023-05-13 09:00:00,rWBKAsEaG3cpHuAPRyohFr3k6PB2,290b1a00-659d-4d24-a26d-5524d7ac44ce,$session_end,"iPhone14,3",ko-KR


In [11]:
# Parse the reloaded event_datetime column into datetimes, then summarise the
# frame (the timestamp range is the interesting part of the output).
df_230512_test['event_datetime'] = df_230512_test['event_datetime'].pipe(pd.to_datetime)
df_230512_test.describe()

Unnamed: 0,event_datetime
count,12188804
mean,2023-05-16 12:42:39.236488448
min,2023-05-13 09:00:00
25%,2023-05-15 01:42:00
50%,2023-05-16 14:15:06
75%,2023-05-18 01:24:40
max,2023-05-19 08:59:59


# 2023-06-23 작업

In [2]:
# Source object in GCS and the service-account key file handed to the reader.
file_path = "gs://part4_project/events/2023-06-23.parquet"
key_path = "../config/codeit_project_vm_key.json"

# Read the parquet object straight from GCS; the key file path is passed
# through as the "token" storage option.
df_230623 = pd.read_parquet(file_path, storage_options={"token": key_path})

# Sanity check: dimensions, column dtypes / memory footprint, first rows.
print(df_230623.shape)
# DataFrame.info() writes its own report and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
df_230623.info()
display(df_230623.head())

(6820315, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6820315 entries, 0 to 6820314
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Asia/Seoul         object 
 1   event_key          object 
 2   session_id         object 
 3   user_id            object 
 4   value              float64
 5   user_properties    object 
 6   hackle_properties  object 
 7   event_properties   object 
 8   id                 object 
 9   device_id          object 
dtypes: float64(1), object(9)
memory usage: 520.3+ MB
None


Unnamed: 0,Asia/Seoul,event_key,session_id,user_id,value,user_properties,hackle_properties,event_properties,id,device_id
0,2023-06-24T00:00:00.023Z,view_question_question,2b7ad8fc-88ee-409c-9ac0-01e182be56a8,mNLDRM56CjeiREW9NH8dHObLiTM2,0.0,"[(heart_balance, 822)]","{'language': 'ko', 'isapp': 'true', 'osversion...",[],f13f1761-addc-4076-9d2d-14c6f80903bb,2b7ad8fc-88ee-409c-9ac0-01e182be56a8
1,2023-06-24T00:00:00.064Z,view_timeline_tap,XjKd6fh3CCfKJxobCxGssTIcoCp1,XjKd6fh3CCfKJxobCxGssTIcoCp1,0.0,[],"{'language': 'ko-KR', 'isapp': 'true', 'osvers...",[],4d28cfc7-3ea1-45fe-aa2b-a564ef7db359,216d665c-7e79-414d-b9e4-100a9a1d4737
2,2023-06-24T00:00:00.148Z,view_message_tap,QepNLzWEkTNvfHSSkhCldJH5eMw2,QepNLzWEkTNvfHSSkhCldJH5eMw2,0.0,[],"{'language': 'ko-US', 'isapp': 'true', 'osvers...",[],903df3b8-c2dd-4d17-b044-6034ad25adbb,b3de9810-5a82-4192-a69d-a2535307a88f
3,2023-06-24T00:00:00.169Z,$session_start,C03B343F-B566-46BA-9030-8E441B71EC33,WVcHhMtN5uVbI9ZW7OM3iMWeXMl1,0.0,[],"{'language': 'ko-US', 'isapp': 'true', 'osvers...",[],11931083-cc18-4a5a-bcba-a265d6434f8a,c03b343f-b566-46ba-9030-8e441b71ec33
4,2023-06-24T00:00:00.174Z,view_home_tap,C03B343F-B566-46BA-9030-8E441B71EC33,WVcHhMtN5uVbI9ZW7OM3iMWeXMl1,0.0,[],"{'language': 'ko-US', 'isapp': 'true', 'osvers...",[],f2127c1c-b8b6-42d2-9a46-5953df5f2541,c03b343f-b566-46ba-9030-8e441b71ec33


In [4]:
# Build the slim per-event frame for the 2023-06-23 export.
#
# For this file only the timestamp and the three id/key columns are kept; the
# language / devicemodel / sessionid extraction used for the earlier dates was
# already disabled here, so that commented-out code has been removed.
# The frame is constructed directly from the needed columns: the raw 10-column
# frame is not deep-copied, and `df_simple` is a fresh DataFrame rather than a
# column slice, so assigning to its columns later cannot raise
# SettingWithCopyWarning.
df_simple = pd.DataFrame({
    # Timestamps are parsed as UTC and then converted to Korea time.
    # NOTE(review): the raw values end in "Z" (see the preview above) but the
    # source column is named "Asia/Seoul" -- confirm they really are UTC,
    # otherwise this conversion adds a spurious +9h.
    'event_datetime': pd.to_datetime(df_230623['Asia/Seoul'], utc=True).dt.tz_convert('Asia/Seoul'),
    'session_id': df_230623['session_id'],
    'device_id': df_230623['device_id'],
    'event_key': df_230623['event_key'],
})

In [5]:
# Persist the slim 2023-06-23 frame with event_datetime rendered as a
# "YYYY-MM-DD HH:MM:SS" string (the format drops the UTC offset and any
# sub-second part).
#
# The string column is produced on a temporary copy via assign(), so
# `df_simple` keeps its datetime dtype and this cell can be re-run safely.
# Writing the strings back onto `df_simple` made a second run fail, because the
# `.dt` accessor is not available on a string column.
(
    df_simple
    .assign(
        event_datetime=lambda frame: frame['event_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
    )
    .to_parquet("../data/230623.parquet", engine="pyarrow", compression="snappy")
)

In [6]:
# Read-back check for the 2023-06-23 extract.
# Load from the same relative location the export cell writes to
# ("../data/230623.parquet") instead of a machine-specific absolute path, so
# the check always inspects the file this notebook produced.
df_230623_test = pd.read_parquet("../data/230623.parquet")

print(df_230623_test.shape)
df_230623_test.head()

(6820315, 4)


Unnamed: 0,event_datetime,session_id,device_id,event_key
0,2023-06-24 09:00:00,2b7ad8fc-88ee-409c-9ac0-01e182be56a8,2b7ad8fc-88ee-409c-9ac0-01e182be56a8,view_question_question
1,2023-06-24 09:00:00,XjKd6fh3CCfKJxobCxGssTIcoCp1,216d665c-7e79-414d-b9e4-100a9a1d4737,view_timeline_tap
2,2023-06-24 09:00:00,QepNLzWEkTNvfHSSkhCldJH5eMw2,b3de9810-5a82-4192-a69d-a2535307a88f,view_message_tap
3,2023-06-24 09:00:00,C03B343F-B566-46BA-9030-8E441B71EC33,c03b343f-b566-46ba-9030-8e441b71ec33,$session_start
4,2023-06-24 09:00:00,C03B343F-B566-46BA-9030-8E441B71EC33,c03b343f-b566-46ba-9030-8e441b71ec33,view_home_tap


In [7]:
# Parse the reloaded event_datetime column into datetimes, then summarise the
# frame (the timestamp range is the interesting part of the output).
df_230623_test['event_datetime'] = df_230623_test['event_datetime'].pipe(pd.to_datetime)
df_230623_test.describe()

Unnamed: 0,event_datetime
count,6820315
mean,2023-06-26 19:22:55.648187392
min,2023-06-24 09:00:00
25%,2023-06-25 07:10:58
50%,2023-06-26 07:05:57
75%,2023-06-28 05:18:11
max,2023-06-30 08:59:59


# 기존 hackle_event 파일 전처리 (명세서 있는거!)

In [None]:
# Plan for this section: convert session_id to lower case, then
# remove columns: hackle_events.drop(columns=['question_id', 'heart_balance', 'votes_count', 'friend_count', 'page_name', 'item_name', 'id'])

In [4]:
# Source object in GCS and the service-account key file handed to the reader.
file_path = "gs://part4_project/hackle/hackle_events.parquet"
key_path = "../config/codeit_project_vm_key.json"

# Read the parquet object straight from GCS; the key file path is passed
# through as the "token" storage option.
hackle_events_df = pd.read_parquet(file_path, storage_options={"token": key_path})

# Sanity check: dimensions, column dtypes / memory footprint, first rows.
print(hackle_events_df.shape)
# DataFrame.info() writes its own report and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
hackle_events_df.info()
display(hackle_events_df.head())

(11441319, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11441319 entries, 0 to 11441318
Data columns (total 11 columns):
 #   Column          Dtype         
---  ------          -----         
 0   event_id        object        
 1   event_datetime  datetime64[ms]
 2   event_key       object        
 3   session_id      object        
 4   id              object        
 5   item_name       object        
 6   page_name       object        
 7   friend_count    float64       
 8   votes_count     float64       
 9   heart_balance   float64       
 10  question_id     float64       
dtypes: datetime64[ms](1), float64(4), object(6)
memory usage: 960.2+ MB
None


Unnamed: 0,event_id,event_datetime,event_key,session_id,id,item_name,page_name,friend_count,votes_count,heart_balance,question_id
0,00000533-3f1c-4b3b-81f1-0c8f35754b4e,2023-07-18 19:40:17,$session_start,4OzYh3seq3VKytpSn5pvQkZNQii1,00000533-3f1c-4b3b-81f1-0c8f35754b4e,,,,,,
1,00000716-27e9-4e72-a602-d0ce61784b06,2023-07-18 21:07:24,click_question_open,8QXy31PQxbW9qLzq0Y1dhR8Ypm52,00000716-27e9-4e72-a602-d0ce61784b06,,,64.0,436.0,4830.0,
2,000007c8-68ce-40e6-9b1e-f0e34e8ff9cc,2023-08-06 20:18:03,click_bottom_navigation_profile,6bcea65d-9f40-46fc-888c-700fe707483f,000007c8-68ce-40e6-9b1e-f0e34e8ff9cc,,,26.0,174.0,4729.0,
3,00000981-5e2a-4111-993e-4f1891ad9a53,2023-08-05 01:46:10,view_shop,XVYNT6zfhFWqIg9omwg2AHDjTLx2,00000981-5e2a-4111-993e-4f1891ad9a53,,,61.0,44.0,142.0,
4,00000a7a-ba72-4332-b4a9-7910670aaeb2,2023-07-24 15:03:37,click_bottom_navigation_lab,XFB2SPiGfjbVhvJ3Q3DBsaT3m2B3,00000a7a-ba72-4332-b4a9-7910670aaeb2,,,119.0,545.0,3287.0,


In [5]:
# Convert session_id to lower case, in place on the loaded frame.
# NOTE(review): the previewed values include mixed-case, non-UUID ids, so
# lower-casing could in principle merge ids that differ only by case -- confirm
# that is acceptable.
# NOTE(review): the daily extracts written earlier in this notebook keep
# session_id in its original case; confirm before joining on this column.
hackle_events_df['session_id'] = hackle_events_df['session_id'].str.lower()

In [6]:
# Discard the columns that are not needed; the listed names are removed and
# every other column is kept as is.
unused_columns = ['question_id', 'heart_balance', 'votes_count', 'friend_count', 'page_name', 'item_name', 'id']
hackle_events_df = hackle_events_df.drop(columns=unused_columns)

In [7]:
# Show (rows, columns) of the frame after the column drop.
hackle_events_df.shape

(11441319, 4)

In [8]:
# Preview the first five rows of the trimmed frame.
hackle_events_df.head(n=5)

Unnamed: 0,event_id,event_datetime,event_key,session_id
0,00000533-3f1c-4b3b-81f1-0c8f35754b4e,2023-07-18 19:40:17,$session_start,4ozyh3seq3vkytpsn5pvqkznqii1
1,00000716-27e9-4e72-a602-d0ce61784b06,2023-07-18 21:07:24,click_question_open,8qxy31pqxbw9qlzq0y1dhr8ypm52
2,000007c8-68ce-40e6-9b1e-f0e34e8ff9cc,2023-08-06 20:18:03,click_bottom_navigation_profile,6bcea65d-9f40-46fc-888c-700fe707483f
3,00000981-5e2a-4111-993e-4f1891ad9a53,2023-08-05 01:46:10,view_shop,xvynt6zfhfwqig9omwg2ahdjtlx2
4,00000a7a-ba72-4332-b4a9-7910670aaeb2,2023-07-24 15:03:37,click_bottom_navigation_lab,xfb2spigfjbvhvj3q3dbsat3m2b3


In [9]:
# Write the trimmed hackle events frame to a snappy-compressed parquet file
# using the pyarrow engine.
hackle_events_df.to_parquet(
    path="../data/hackle_events_yoochang.parquet",
    engine="pyarrow",
    compression="snappy",
)

In [10]:
# Read-back check for the trimmed hackle events file.
# Load from the same relative location the export cell writes to
# ("../data/hackle_events_yoochang.parquet") instead of a machine-specific
# absolute path, so the check always inspects the file this notebook produced.
hackle_events_yoochang_test = pd.read_parquet("../data/hackle_events_yoochang.parquet")

print(hackle_events_yoochang_test.shape)
hackle_events_yoochang_test.head()

(11441319, 4)


Unnamed: 0,event_id,event_datetime,event_key,session_id
0,00000533-3f1c-4b3b-81f1-0c8f35754b4e,2023-07-18 19:40:17,$session_start,4ozyh3seq3vkytpsn5pvqkznqii1
1,00000716-27e9-4e72-a602-d0ce61784b06,2023-07-18 21:07:24,click_question_open,8qxy31pqxbw9qlzq0y1dhr8ypm52
2,000007c8-68ce-40e6-9b1e-f0e34e8ff9cc,2023-08-06 20:18:03,click_bottom_navigation_profile,6bcea65d-9f40-46fc-888c-700fe707483f
3,00000981-5e2a-4111-993e-4f1891ad9a53,2023-08-05 01:46:10,view_shop,xvynt6zfhfwqig9omwg2ahdjtlx2
4,00000a7a-ba72-4332-b4a9-7910670aaeb2,2023-07-24 15:03:37,click_bottom_navigation_lab,xfb2spigfjbvhvj3q3dbsat3m2b3
