In [1]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
import os
from ast import literal_eval  # 문자열로 저장된 딕셔너리를 진짜 딕셔너리로 변환

# GCS 파일 경로에서 데이터 불러오기 위한 라이브러리
from google.cloud import storage
from google.oauth2 import service_account

In [2]:
# Authenticate with the service-account key file. key_path is also reused by
# the CSV-loading cell as the storage_options token.
key_path = "../config/codeit_project_vm_key.json"
credentials = service_account.Credentials.from_service_account_file(key_path)

# Bucket and prefix are named once so the listing call and the gs:// URLs
# built from its results cannot drift apart.
BUCKET_NAME = "part4_project"
EVENTS_PREFIX = "events/"

# Create the GCS client and a handle to the project bucket.
client = storage.Client(credentials=credentials)
bucket = client.bucket(BUCKET_NAME)

# Collect the gs:// URL of every .csv object stored under the events/ prefix.
blobs = bucket.list_blobs(prefix=EVENTS_PREFIX)
events_file_list = [
    f"gs://{BUCKET_NAME}/{blob.name}"
    for blob in blobs
    if blob.name.endswith(".csv")
]

# Show the resulting list as the cell output.
events_file_list

['gs://part4_project/events/df_230430.csv',
 'gs://part4_project/events/df_230506.csv',
 'gs://part4_project/events/df_230512.csv',
 'gs://part4_project/events/df_230518.csv',
 'gs://part4_project/events/df_230524.csv',
 'gs://part4_project/events/df_230530.csv',
 'gs://part4_project/events/df_230605.csv',
 'gs://part4_project/events/df_230611.csv',
 'gs://part4_project/events/df_230617.csv']

In [3]:
# Load each events CSV into its own notebook-level DataFrame variable.
for title in events_file_list:
    # Variable name = last path segment with the ".csv" text removed
    # (e.g. "gs://part4_project/events/df_230430.csv" -> "df_230430").
    file_name = title.rsplit('/', 1)[-1].replace('.csv', '')

    # Read the CSV directly from GCS, authenticating with the key file.
    df = pd.read_csv(title, storage_options=dict(token=key_path))

    # Bind the frame under its file name so later cells can refer to it
    # directly (e.g. df_230512).
    globals()[file_name] = df

    # Report what was loaded: name, shape, then a preview of the first rows.
    print(
        f"\n {file_name} 불러오기 완료",
        f"→ shape: {df.shape}",
        "→ 첫 5행:",
        sep="\n",
    )
    display(df.head())


 df_230430 불러오기 완료
→ shape: (1200705, 8)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,description,location,devicemodel,language
0,2023-05-01 09:00:00,jgGFnoyc6GWuGIEgjp3nUoKRQbF3,cfc34eab-6930-488b-8109-70d37cec7dae,button,다음,signotherinputscreen,"iPhone10,4",ko-KR
1,2023-05-01 09:00:00,n3PI8GZnRFOKpJslIRgckMeRiln1,0dde2ecb-ff03-440f-98f3-94f20978b796,button,vote 선택,homevotesscreen,"iPhone14,4",ko-CA
2,2023-05-01 09:00:00,L31Dl8vRtdWPJ8XC0gGs1mPwzJH2,b50774ed-24ce-49e1-9383-945ff3aea9ff,button,친구선택,homeenquetescreen,SM-A125N,ko
3,2023-05-01 09:00:00,QjToElcYNkVxszJDHBhqtITlzDJ3,0999c6b6-c579-4c0f-916c-9a113010f84a,button,이름 셔플!,homeenquetescreen,SM-A305N,ko
4,2023-05-01 09:00:00,VRpXf303RGU730fIO4EjWstzAJi1,45985493-ed35-4cc5-935f-1bcf922e213c,button,앱바 뒤로가기,homefriendprofilescreen,"iPhone14,5",ko-KR



 df_230506 불러오기 완료
→ shape: (1266122, 6)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-07 09:00:00,OoWMTpTWpUa4m8I1RiKOk75Xnvs1,498dbc0f-4537-41dc-93ec-f4406302a3b0,$session_end,"iPhone14,5",ko-KR
1,2023-05-07 09:00:00,CrFi87nvwiVvHExxP9uSfIHOmBy1,7770afa8-e228-4736-aa2a-658ebe2e3d25,$session_end,"iPhone12,8",ko-KR
2,2023-05-07 09:00:00,W6frWgWJtbWTJwdm6eVWPGKwY2Y2,9fbdc45d-a74a-4d9f-9d40-0f400b72877e,$session_end,"iPhone14,8",ko-KR
3,2023-05-07 09:00:00,4EFB4D45-0AAF-424A-8E4E-2F19249C8892,4efb4d45-0aaf-424a-8e4e-2f19249c8892,$session_start,"iPhone12,3",ko-KR
4,2023-05-07 09:00:00,JdxVFvtz6AcxxOZVvl6plHayjYJ3,96348488-4d6a-49a8-8995-452718157fd7,$session_start,SM-A315N,ko



 df_230512 불러오기 완료
→ shape: (1097522, 6)
→ 첫 5행:


Unnamed: 0,Asia/Seoul,session_id,device_id,event_key,devicemodel,language
0,2023-05-13T00:00:00.004Z,6zkImM4PD0MHIbJ9zRVXs6vX6e62,cb7a48f6-6882-4223-98fd-4fe73cb3903b,$session_start,"iPhone14,4",ko-KR
1,2023-05-13T00:00:00.027Z,Sq5vui6fg2Nhz2EHeC8e4PWyxo13,b2512274-1161-4fd2-a4e8-b40f1ecc0372,$session_end,"iPhone12,8",ko-KR
2,2023-05-13T00:00:00.027Z,RyWeiMNMACUhbawb63ITBMX1c1U2,75f9bcd9-78c8-4dd6-a91e-0faf85a10526,$session_end,"iPhone15,2",ko-KR
3,2023-05-13T00:00:00.046Z,3OaNLuqTVDTkMEc8IsZNBxF9hsH2,380ebcac-4f9d-4004-968d-cf6372908adc,$session_start,"iPhone12,1",en-KR
4,2023-05-13T00:00:00.070Z,rWBKAsEaG3cpHuAPRyohFr3k6PB2,290b1a00-659d-4d24-a26d-5524d7ac44ce,$session_end,"iPhone14,3",ko-KR



 df_230518 불러오기 완료
→ shape: (995594, 6)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-19 09:00:00,nALWgF7izudSzKXArsiAOaYYsr23,c370e36d-b84c-4c97-af37-db356caf1741,$session_start,"iPhone14,2",ko-KR
1,2023-05-19 09:00:00,E9A0131C-A70A-4917-88D8-257223F0A767,e9a0131c-a70a-4917-88d8-257223f0a767,$session_end,"iPhone11,2",ko-KR
2,2023-05-19 09:00:00,euhK4oLwdGPVT6sPNNN1Rs7xEKz1,2ab9b1b1-41a7-4f6e-af31-85daac4509ea,$session_start,"iPhone14,7",ko-KR
3,2023-05-19 09:00:00,TYAFQPAWozbqfQ3I296QuXJnWrx1,c3236b64-a328-4743-8afd-461f85f1f1e0,$session_start,"iPhone15,3",ko-KR
4,2023-05-19 09:00:00,XhhuxAQvEcXdlD1JjbPC8H3AF602,efe3418d-ad59-4a20-a5f9-43d2195d1a08,$session_start,"iPhone14,7",ko-KR



 df_230524 불러오기 완료
→ shape: (903053, 6)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-25 09:00:00,TeeVVRGrpuXE5q6f7pAu9ULVNvG3,b6afceaa-cc9c-4a66-b92f-9e202f9eb328,$session_start,"iPhone14,7",ko-KR
1,2023-05-25 09:00:00,9tcpOeHvyldCUXMgRyFVQI86rf13,54a8af4d-47bf-4951-87c2-1817b26d1c31,$session_start,"iPhone12,1",ko-KR
2,2023-05-25 09:00:00,aiDrtN8uHiQXRXmLMJhf27hKA4B3,b08ee1df-0bc6-4b38-9474-8cbc55fd6f17,$session_start,"iPhone12,1",ko-KR
3,2023-05-25 09:00:00,Oqmt3cCrCQZAa0TT3B4s2kTOG1k1,ace73d0e-04d6-4d03-9460-a0a1143253f7,$session_start,"iPhone11,8",ko-KR
4,2023-05-25 09:00:00,GA1Nz2ehK2TmsRcqBzXQn6hkDHH2,2adda5ce-1d37-43cd-8cbd-c75c06f84f74,$session_start,"iPhone14,7",ko-KR



 df_230530 불러오기 완료
→ shape: (2313, 6)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-31 09:01:06,VAb6ai91iKPZZQhGKenhHZpAG5B2,6DA6A8DC-1E61-4643-963A-ED3D782BE77A,$session_start,"iPhone15,2",ko-KR
1,2023-05-31 09:02:20,fe556647-de19-43ff-9152-b3c38a8b4d26,fe556647-de19-43ff-9152-b3c38a8b4d26,$session_start,SM-G991N,ko
2,2023-05-31 09:02:46,LsAqqCTVtQPpevfSHXI8oRepVs73,5D47D549-EF22-41F3-A0D2-AD9EC1D21A2B,$session_start,"iPhone15,2",ko-KR
3,2023-05-31 09:03:50,NBfi1jm5FKfB6We7YfVRHFgeJY83,FD419DE8-B3CE-4630-B234-959ADA5F47CF,$session_start,"iPhone9,3",ko-KR
4,2023-05-31 09:04:12,3CBFC179-1BB1-40A2-84EF-C39EFBD40C14,3CBFC179-1BB1-40A2-84EF-C39EFBD40C14,$session_start,"iPhone14,7",ko-KR



 df_230605 불러오기 완료
→ shape: (649, 5)
→ 첫 5행:


Unnamed: 0,event_datetime,device_id,event_key,devicemodel,language
0,2023-06-06 09:01:35,7918DF48-A239-45CE-AC1E-7E2E356454C4,$session_start,"iPhone13,1",ko-KR
1,2023-06-06 09:16:11,02F51978-FDD9-4C66-AE8D-820943E00AEF,$session_start,"iPad13,8",ko-KR
2,2023-06-06 09:27:07,E75FFCB7-5AAB-4674-AE34-7A231136C367,$session_start,"iPad13,4",ko-KR
3,2023-06-06 09:28:38,92D752F2-E7B9-4C01-892B-C9B3F1416B74,$session_start,"iPhone15,2",ko-KR
4,2023-06-06 09:44:06,47955d3d-b77f-47a6-9d3e-3fcb45a350df,$session_start,SM-A235N,ko



 df_230611 불러오기 완료
→ shape: (263, 5)
→ 첫 5행:


Unnamed: 0,event_datetime,device_id,event_key,devicemodel,language
0,2023-06-12 09:00:33,8A5F41F9-D126-453C-8EA8-E0C2484584E3,$session_start,"iPhone13,1",ko-KR
1,2023-06-12 09:10:18,A81A2A45-6260-41C6-B81B-071F006D60BD,$session_start,"iPhone14,2",ko-KR
2,2023-06-12 09:17:21,CD41A81E-8D5C-4E7A-8289-587AD8C306F5,$session_start,"iPhone13,1",ko-KR
3,2023-06-12 09:21:00,5A92997F-362C-400B-8613-893A2FD6197F,$session_start,"iPhone13,2",ko-KR
4,2023-06-12 09:22:44,7FB46B64-B405-49C4-9378-8FABA73D4038,$session_start,"iPad13,16",ko-KR



 df_230617 불러오기 완료
→ shape: (1124983, 5)
→ 첫 5행:


Unnamed: 0,event_datetime,device_id,event_key,devicemodel,language
0,2023-06-18 09:07:08,8b6f9e35-473d-45d7-b8ad-884359609fdd,$session_start,"iPhone15,2",ko-KR
1,2023-06-18 09:07:11,8b6f9e35-473d-45d7-b8ad-884359609fdd,$session_end,"iPhone15,2",ko-KR
2,2023-06-18 09:07:34,4ff18855-0de6-4e51-a00c-7cdfe36bfc64,$session_start,"iPhone8,4",ko-KR
3,2023-06-18 10:01:12,99a6de2d-d222-46ff-81c2-729daff8e688,$session_start,"iPad13,1",ko-KR
4,2023-06-18 11:30:47,4ff18855-0de6-4e51-a00c-7cdfe36bfc64,$session_start,"iPhone8,4",ko-KR


In [None]:
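# 20230430: 컬럼 8개(description, location 포함)로 다른 파일과 스키마가 달라 단독으로 사용
# 20230506 ~ 20230530: 컬럼 6개로 스키마 동일 → concat 가능 (단, df_230512는 'Asia/Seoul' 컬럼을 event_datetime으로 변환한 뒤에 병합)
# 20230605 ~ 20230617: 컬럼 5개(session_id 없음)로 스키마 동일 → concat 가능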

In [4]:

# Basic DataFrame overview helper
def show_df_info(df, df_name):
    """
    Print and display a basic overview of a DataFrame.

    Sections are emitted in this order: shape, first five rows,
    ``df.info()``, ``df.describe()``, ``df.describe(include='O')`` and the
    per-column null counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.
    df_name : str
        Label used in the printed section headings.

    Returns
    -------
    None
        Everything is written to stdout or shown through ``display``.

    Notes
    -----
    ``DataFrame.describe`` raises ``ValueError`` when the frame has no
    columns, or when ``include='O'`` selects no column. Each summary is
    therefore guarded and replaced by a short message when it cannot be
    computed, so the null-count section is always reached.

    ``display`` is the rich-display function provided by the notebook
    environment.
    """
    print(f"\n{df_name} 형태:")
    display(df.shape)
    print(f"\n{df_name} 첫 5행:")
    display(df.head())
    print(f"\n{df_name} 정보:")
    df.info()

    # Default describe(): fails only when the frame has no columns at all.
    print(f"\n{df_name} 통계 요약:")
    if df.shape[1] > 0:
        display(df.describe())
    else:
        print("(컬럼이 없어 통계 요약을 생략합니다)")

    # Object-column describe(): select with the same dtype filter describe()
    # uses, so the guard and the call always agree on what is available.
    print(f"\n{df_name} 통계 요약 (object 컬럼):")
    if df.select_dtypes(include='O').shape[1] > 0:
        display(df.describe(include='O'))
    else:
        print("(object 컬럼이 없어 통계 요약을 생략합니다)")

    print(f"\n{df_name} 결측치 확인:")
    display(df.isnull().sum())

In [5]:
# Normalise df_230512's timestamp column so it matches the other event files:
# its column is named 'Asia/Seoul' and holds ISO-8601 UTC strings
# (e.g. "2023-05-13T00:00:00.004Z"), while the other files use an
# 'event_datetime' column formatted as "YYYY-MM-DD HH:MM:SS".
#
# The guard makes the cell safe to re-run: after the first run the
# 'Asia/Seoul' column no longer exists, and an unguarded second run would
# raise KeyError.
if 'Asia/Seoul' in df_230512.columns:
    event_time_kst = (
        pd.to_datetime(df_230512['Asia/Seoul'], utc=True)  # 1. parse as UTC
        .dt.tz_convert('Asia/Seoul')                       # 2. convert to Korea time
        .dt.strftime('%Y-%m-%d %H:%M:%S')                  # 3. format as a plain string
    )

    # assign() overwrites the column in its original position on a copy, so
    # the loaded frame is not mutated; rename() then gives it the shared name.
    df_230512 = (
        df_230512
        .assign(**{'Asia/Seoul': event_time_kst})
        .rename(columns={'Asia/Seoul': 'event_datetime'})
    )

df_230512.head()

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-13 09:00:00,6zkImM4PD0MHIbJ9zRVXs6vX6e62,cb7a48f6-6882-4223-98fd-4fe73cb3903b,$session_start,"iPhone14,4",ko-KR
1,2023-05-13 09:00:00,Sq5vui6fg2Nhz2EHeC8e4PWyxo13,b2512274-1161-4fd2-a4e8-b40f1ecc0372,$session_end,"iPhone12,8",ko-KR
2,2023-05-13 09:00:00,RyWeiMNMACUhbawb63ITBMX1c1U2,75f9bcd9-78c8-4dd6-a91e-0faf85a10526,$session_end,"iPhone15,2",ko-KR
3,2023-05-13 09:00:00,3OaNLuqTVDTkMEc8IsZNBxF9hsH2,380ebcac-4f9d-4004-968d-cf6372908adc,$session_start,"iPhone12,1",en-KR
4,2023-05-13 09:00:00,rWBKAsEaG3cpHuAPRyohFr3k6PB2,290b1a00-659d-4d24-a26d-5524d7ac44ce,$session_end,"iPhone14,3",ko-KR


In [6]:
# Concatenate the 2023-05-06 ... 2023-05-30 event files into a single frame.

# Names of the per-file DataFrames to merge.
df_names = ['df_230506', 'df_230512', 'df_230518', 'df_230524', 'df_230530']

# Look each frame up by name in the notebook namespace.
dfs_to_concat = [globals()[name] for name in df_names]

# pd.concat unions differing columns and fills the gaps with NaN, so a schema
# mismatch (e.g. df_230512 still carrying its 'Asia/Seoul' column) would go
# unnoticed. Compare every frame against the first one and fail loudly.
expected_columns = list(dfs_to_concat[0].columns)
mismatched_names = [
    name
    for name, frame in zip(df_names, dfs_to_concat)
    if list(frame.columns) != expected_columns
]
if mismatched_names:
    raise ValueError(
        f"컬럼 구성이 {df_names[0]}와 다른 DataFrame이 있습니다: {mismatched_names} "
        f"(기준 컬럼: {expected_columns})"
    )

# Stack the frames row-wise with a fresh 0..n-1 index.
merged_0506_0530_df = pd.concat(dfs_to_concat, ignore_index=True)

# Check the result.
print(f"병합된 DataFrame shape: {merged_0506_0530_df.shape}")
merged_0506_0530_df.head()

병합된 DataFrame shape: (4264604, 6)


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-07 09:00:00,OoWMTpTWpUa4m8I1RiKOk75Xnvs1,498dbc0f-4537-41dc-93ec-f4406302a3b0,$session_end,"iPhone14,5",ko-KR
1,2023-05-07 09:00:00,CrFi87nvwiVvHExxP9uSfIHOmBy1,7770afa8-e228-4736-aa2a-658ebe2e3d25,$session_end,"iPhone12,8",ko-KR
2,2023-05-07 09:00:00,W6frWgWJtbWTJwdm6eVWPGKwY2Y2,9fbdc45d-a74a-4d9f-9d40-0f400b72877e,$session_end,"iPhone14,8",ko-KR
3,2023-05-07 09:00:00,4EFB4D45-0AAF-424A-8E4E-2F19249C8892,4efb4d45-0aaf-424a-8e4e-2f19249c8892,$session_start,"iPhone12,3",ko-KR
4,2023-05-07 09:00:00,JdxVFvtz6AcxxOZVvl6plHayjYJ3,96348488-4d6a-49a8-8995-452718157fd7,$session_start,SM-A315N,ko
