In [1]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
import os
from ast import literal_eval  # 문자열로 저장된 딕셔너리를 진짜 딕셔너리로 변환

# GCS 파일 경로에서 데이터 불러오기 위한 라이브러리
from google.cloud import storage
from google.oauth2 import service_account

# 2023-05-06 작업

In [2]:
# Source parquet on GCS and the service-account key file used to authenticate.
file_path = "gs://part4_project/events/2023-05-06.parquet"
key_path = "../config/codeit_project_vm_key.json"

# Load the raw event log directly from GCS; the key file path is passed as the token.
df_230506 = pd.read_parquet(file_path, storage_options={"token": key_path})

# Sanity check: (rows, columns) followed by the schema.
print(df_230506.shape)
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df_230506.info()

(7106588, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7106588 entries, 0 to 7106587
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Asia/Seoul         object 
 1   event_key          object 
 2   session_id         object 
 3   user_id            object 
 4   value              float64
 5   user_properties    object 
 6   hackle_properties  object 
 7   event_properties   object 
 8   id                 object 
 9   device_id          object 
dtypes: float64(1), object(9)
memory usage: 542.2+ MB
None


In [3]:
# Build a slim frame holding only the columns that are exported.
# It is constructed directly from the raw columns instead of deep-copying the
# whole raw frame and slicing afterwards, which avoids duplicating every raw
# column in memory. The previously computed 'sessionid' column was never part
# of the final selection, so that full-frame pass is dropped.
hackle_props = df_230506['hackle_properties']

# Parse the timestamps as UTC, then convert them to Korean time (Asia/Seoul).
# NOTE(review): the source column is itself named 'Asia/Seoul' and the converted
# values start at exactly 09:00:00, so the raw values may already be KST
# wall-clock times. If that is the case this step shifts every event by +9h and
# should instead be
#     pd.to_datetime(df_230506['Asia/Seoul']).dt.tz_localize('Asia/Seoul')
# Confirm with the data owner before changing; behaviour is left as-is here.
event_datetime = (
    pd.to_datetime(df_230506['Asia/Seoul'], utc=True)
    .dt.tz_convert('Asia/Seoul')
)

df_simple = pd.DataFrame({
    'event_datetime': event_datetime,
    'session_id': df_230506['session_id'],
    'device_id': df_230506['device_id'],
    'event_key': df_230506['event_key'],
    # Each hackle_properties entry is read with .get(), so a missing key yields None.
    'devicemodel': hackle_props.apply(lambda props: props.get('devicemodel')),
    'language': hackle_props.apply(lambda props: props.get('language')),
})

In [4]:
# Serialise the timestamp as 'YYYY-MM-DD HH:MM:SS' text for the export.
# The formatted column goes into a separate frame so that df_simple keeps its
# datetime dtype and this cell can be re-run safely; overwriting the column in
# place made a second run fail because .dt is not available on strings.
# Note: this format has no offset directive, so the stored strings carry no
# timezone information.
df_230506_export = df_simple.assign(
    event_datetime=df_simple['event_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
)

df_230506_export.to_parquet("../data/230506.parquet", engine="pyarrow", compression="snappy")

In [5]:
# Round-trip check for the 2023-05-06 export.
# Reads from the same relative path the export was written to
# ("../data/230506.parquet") instead of a hardcoded absolute home-directory
# path, so the check follows the file wherever the project is checked out.
df_230506_test = pd.read_parquet("../data/230506.parquet")

print(df_230506_test.shape)
df_230506_test.head()

(7106588, 6)


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-07 09:00:00,OoWMTpTWpUa4m8I1RiKOk75Xnvs1,498dbc0f-4537-41dc-93ec-f4406302a3b0,$session_end,"iPhone14,5",ko-KR
1,2023-05-07 09:00:00,CrFi87nvwiVvHExxP9uSfIHOmBy1,7770afa8-e228-4736-aa2a-658ebe2e3d25,$session_end,"iPhone12,8",ko-KR
2,2023-05-07 09:00:00,W6frWgWJtbWTJwdm6eVWPGKwY2Y2,9fbdc45d-a74a-4d9f-9d40-0f400b72877e,$session_end,"iPhone14,8",ko-KR
3,2023-05-07 09:00:00,4EFB4D45-0AAF-424A-8E4E-2F19249C8892,4efb4d45-0aaf-424a-8e4e-2f19249c8892,$session_start,"iPhone12,3",ko-KR
4,2023-05-07 09:00:00,JdxVFvtz6AcxxOZVvl6plHayjYJ3,96348488-4d6a-49a8-8995-452718157fd7,$session_start,SM-A315N,ko


In [12]:
# The export stores timestamps as text; parse them back to datetimes so that
# describe() reports the time range (min / quartiles / max).
parsed_event_times = pd.to_datetime(df_230506_test['event_datetime'])
df_230506_test['event_datetime'] = parsed_event_times
df_230506_test.describe()

Unnamed: 0,event_datetime
count,7106588
mean,2023-05-10 20:36:47.453259776
min,2023-05-07 09:00:00
25%,2023-05-09 09:51:39
50%,2023-05-11 01:18:50
75%,2023-05-12 06:08:12
max,2023-05-13 08:59:59


# 2023-05-12 작업

In [6]:
# Source parquet on GCS and the service-account key file used to authenticate.
file_path = "gs://part4_project/events/2023-05-12.parquet"
key_path = "../config/codeit_project_vm_key.json"

# Load the raw event log directly from GCS; the key file path is passed as the token.
df_230512 = pd.read_parquet(file_path, storage_options={"token": key_path})

# Sanity check: (rows, columns) followed by the schema.
print(df_230512.shape)
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df_230512.info()

(12188804, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12188804 entries, 0 to 12188803
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Asia/Seoul         object 
 1   event_key          object 
 2   session_id         object 
 3   user_id            object 
 4   value              float64
 5   user_properties    object 
 6   hackle_properties  object 
 7   event_properties   object 
 8   id                 object 
 9   device_id          object 
dtypes: float64(1), object(9)
memory usage: 929.9+ MB
None


In [7]:
# Build a slim frame holding only the columns that are exported.
# It is constructed directly from the raw columns instead of deep-copying the
# whole raw frame and slicing afterwards, which avoids duplicating every raw
# column in memory. The previously computed 'sessionid' column was never part
# of the final selection, so that full-frame pass is dropped.
hackle_props = df_230512['hackle_properties']

# Parse the timestamps as UTC, then convert them to Korean time (Asia/Seoul).
# NOTE(review): the source column is itself named 'Asia/Seoul' and the converted
# values start at exactly 09:00:00, so the raw values may already be KST
# wall-clock times. If that is the case this step shifts every event by +9h and
# should instead be
#     pd.to_datetime(df_230512['Asia/Seoul']).dt.tz_localize('Asia/Seoul')
# Confirm with the data owner before changing; behaviour is left as-is here.
event_datetime = (
    pd.to_datetime(df_230512['Asia/Seoul'], utc=True)
    .dt.tz_convert('Asia/Seoul')
)

df_simple = pd.DataFrame({
    'event_datetime': event_datetime,
    'session_id': df_230512['session_id'],
    'device_id': df_230512['device_id'],
    'event_key': df_230512['event_key'],
    # Each hackle_properties entry is read with .get(), so a missing key yields None.
    'devicemodel': hackle_props.apply(lambda props: props.get('devicemodel')),
    'language': hackle_props.apply(lambda props: props.get('language')),
})

In [8]:
# Serialise the timestamp as 'YYYY-MM-DD HH:MM:SS' text for the export.
# The formatted column goes into a separate frame so that df_simple keeps its
# datetime dtype and this cell can be re-run safely; overwriting the column in
# place made a second run fail because .dt is not available on strings.
# Note: this format has no offset directive, so the stored strings carry no
# timezone information.
df_230512_export = df_simple.assign(
    event_datetime=df_simple['event_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
)

df_230512_export.to_parquet("../data/230512.parquet", engine="pyarrow", compression="snappy")

In [9]:
# Round-trip check for the 2023-05-12 export.
# Reads from the same relative path the export was written to
# ("../data/230512.parquet") instead of a hardcoded absolute home-directory
# path, so the check follows the file wherever the project is checked out.
df_230512_test = pd.read_parquet("../data/230512.parquet")

print(df_230512_test.shape)
df_230512_test.head()

(12188804, 6)


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-13 09:00:00,6zkImM4PD0MHIbJ9zRVXs6vX6e62,cb7a48f6-6882-4223-98fd-4fe73cb3903b,$session_start,"iPhone14,4",ko-KR
1,2023-05-13 09:00:00,Sq5vui6fg2Nhz2EHeC8e4PWyxo13,b2512274-1161-4fd2-a4e8-b40f1ecc0372,$session_end,"iPhone12,8",ko-KR
2,2023-05-13 09:00:00,RyWeiMNMACUhbawb63ITBMX1c1U2,75f9bcd9-78c8-4dd6-a91e-0faf85a10526,$session_end,"iPhone15,2",ko-KR
3,2023-05-13 09:00:00,3OaNLuqTVDTkMEc8IsZNBxF9hsH2,380ebcac-4f9d-4004-968d-cf6372908adc,$session_start,"iPhone12,1",en-KR
4,2023-05-13 09:00:00,rWBKAsEaG3cpHuAPRyohFr3k6PB2,290b1a00-659d-4d24-a26d-5524d7ac44ce,$session_end,"iPhone14,3",ko-KR


In [11]:
# The export stores timestamps as text; parse them back to datetimes so that
# describe() reports the time range (min / quartiles / max).
parsed_event_times = pd.to_datetime(df_230512_test['event_datetime'])
df_230512_test['event_datetime'] = parsed_event_times
df_230512_test.describe()

Unnamed: 0,event_datetime
count,12188804
mean,2023-05-16 12:42:39.236488448
min,2023-05-13 09:00:00
25%,2023-05-15 01:42:00
50%,2023-05-16 14:15:06
75%,2023-05-18 01:24:40
max,2023-05-19 08:59:59
