In [1]:
# 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
import os
from ast import literal_eval  # 문자열로 저장된 딕셔너리를 진짜 딕셔너리로 변환

# GCS 파일 경로에서 데이터 불러오기 위한 라이브러리
from google.cloud import storage
from google.oauth2 import service_account

In [4]:

# 기본 정보 확인 함수 
def show_df_info(df, df_name):
    """
    Print a quick exploratory summary of a DataFrame.

    Shows, in order: the shape, the first five rows, ``df.info()``, the
    default ``describe()`` summary, the ``describe()`` summary restricted to
    object-dtype columns, and the per-column count of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    df_name : str
        Label used in the printed section headings.

    Returns
    -------
    None
        Everything is written to the notebook output via ``print`` and
        ``display``.
    """
    print(f"\n{df_name} 형태:")
    display(df.shape)
    print(f"\n{df_name} 첫 5행:")
    display(df.head())
    print(f"\n{df_name} 정보:")
    df.info()
    print(f"\n{df_name} 통계 요약:")
    display(df.describe())
    # Separate heading so this section is not confused with the default
    # describe() output printed just above.
    print(f"\n{df_name} 범주형(object) 통계 요약:")
    # describe(include='O') raises ValueError when the frame has no
    # object-dtype columns, so only call it when there is something to show.
    if df.select_dtypes(include='O').shape[1] > 0:
        display(df.describe(include='O'))
    else:
        print("object 타입 컬럼 없음")
    print(f"\n{df_name} 결측치 확인:")
    display(df.isnull().sum())
    # Duplicate-row check, currently disabled:
    # print(f"\n{df_name} 중복값 확인:")
    # print(df.iloc[:,1:].duplicated().sum())

# 파일 불러오기 및 확인

In [69]:
# Smoke test: read the 230430 events parquet straight from GCS.
gcs_path = "gs://part4_project/events/230430.parquet"

# token=None is a temporary workaround: the original note says key_path
# would normally be passed here but raised an error on the day this was run.
test_df = pd.read_parquet(
    gcs_path,
    storage_options=dict(token=None),
)

# Preview the first rows; the shape is the cell's output value.
display(test_df.head())
test_df.shape

Unnamed: 0,event_datetime,session_id,device_id,event_key,description,location,devicemodel,language
0,2023-05-01 09:00:00,jgGFnoyc6GWuGIEgjp3nUoKRQbF3,cfc34eab-6930-488b-8109-70d37cec7dae,button,다음,signotherinputscreen,"iPhone10,4",ko-KR
1,2023-05-01 09:00:00,n3PI8GZnRFOKpJslIRgckMeRiln1,0dde2ecb-ff03-440f-98f3-94f20978b796,button,vote 선택,homevotesscreen,"iPhone14,4",ko-CA
2,2023-05-01 09:00:00,L31Dl8vRtdWPJ8XC0gGs1mPwzJH2,b50774ed-24ce-49e1-9383-945ff3aea9ff,button,친구선택,homeenquetescreen,SM-A125N,ko
3,2023-05-01 09:00:00,QjToElcYNkVxszJDHBhqtITlzDJ3,0999c6b6-c579-4c0f-916c-9a113010f84a,button,이름 셔플!,homeenquetescreen,SM-A305N,ko
4,2023-05-01 09:00:00,VRpXf303RGU730fIO4EjWstzAJi1,45985493-ed35-4cc5-935f-1bcf922e213c,button,앱바 뒤로가기,homefriendprofilescreen,"iPhone14,5",ko-KR


(8588422, 8)

In [2]:
# Authenticate with the service-account key file.
key_path = "../config/codeit_project_vm_key.json"
credentials = service_account.Credentials.from_service_account_file(key_path)

# Build the GCS client and get a handle on the project bucket.
client = storage.Client(credentials=credentials)
bucket = client.bucket("part4_project")

# Collect the gs:// URI of every .csv object under the 'events/' prefix.
blobs = bucket.list_blobs(prefix="events/")
events_file_list = [
    "gs://part4_project/" + blob.name
    for blob in blobs
    if blob.name.endswith(".csv")
]

# Show the resulting list.
events_file_list

['gs://part4_project/events/df_230430.csv',
 'gs://part4_project/events/df_230506.csv',
 'gs://part4_project/events/df_230512.csv',
 'gs://part4_project/events/df_230518.csv',
 'gs://part4_project/events/df_230524.csv',
 'gs://part4_project/events/df_230530.csv',
 'gs://part4_project/events/df_230605.csv',
 'gs://part4_project/events/df_230611.csv',
 'gs://part4_project/events/df_230617.csv']

In [None]:
# Load every events CSV into a module-level DataFrame named after its file.
for title in events_file_list:
    # Variable name = object name without directory and extension,
    # e.g. "gs://part4_project/events/df_230430.csv" -> "df_230430".
    file_name = os.path.splitext(os.path.basename(title))[0]

    # Read the CSV from GCS, authenticating with the service-account key file.
    df = pd.read_csv(title, storage_options={"token": key_path})

    # Publish the frame as a global so later cells can refer to it by name
    # (e.g. df_230430). The same object is mutated below, so a successful
    # datetime conversion is visible through the global as well.
    globals()[file_name] = df

    # Report what was loaded (shape + first rows).
    print(f"\n {file_name} 불러오기 완료")
    print(f"→ shape: {df.shape}")
    print("→ 첫 5행:")
    display(df.head())

    # Files without an event_datetime column are only reported, not converted.
    if 'event_datetime' not in df.columns:
        print(f"⛔ {file_name}에는 'event_datetime' 컬럼이 존재하지 않음")
        continue

    # Convert event_datetime and print its date range. Only parsing-related
    # errors are caught; on failure the column is left exactly as loaded and
    # the reason is printed so the file can be inspected by hand.
    try:
        df['event_datetime'] = pd.to_datetime(df['event_datetime'])
        min_date = df['event_datetime'].min().date()
        max_date = df['event_datetime'].max().date()
        print(f"📆 {file_name}의 event_datetime 범위: {min_date} ~ {max_date}")
    except (ValueError, TypeError, OverflowError) as e:
        print(f"⚠️ {file_name}의 event_datetime 변환 중 오류 발생: {e}")


 df_230430 불러오기 완료
→ shape: (1200705, 8)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,description,location,devicemodel,language
0,2023-05-01 09:00:00,jgGFnoyc6GWuGIEgjp3nUoKRQbF3,cfc34eab-6930-488b-8109-70d37cec7dae,button,다음,signotherinputscreen,"iPhone10,4",ko-KR
1,2023-05-01 09:00:00,n3PI8GZnRFOKpJslIRgckMeRiln1,0dde2ecb-ff03-440f-98f3-94f20978b796,button,vote 선택,homevotesscreen,"iPhone14,4",ko-CA
2,2023-05-01 09:00:00,L31Dl8vRtdWPJ8XC0gGs1mPwzJH2,b50774ed-24ce-49e1-9383-945ff3aea9ff,button,친구선택,homeenquetescreen,SM-A125N,ko
3,2023-05-01 09:00:00,QjToElcYNkVxszJDHBhqtITlzDJ3,0999c6b6-c579-4c0f-916c-9a113010f84a,button,이름 셔플!,homeenquetescreen,SM-A305N,ko
4,2023-05-01 09:00:00,VRpXf303RGU730fIO4EjWstzAJi1,45985493-ed35-4cc5-935f-1bcf922e213c,button,앱바 뒤로가기,homefriendprofilescreen,"iPhone14,5",ko-KR


📆 df_230430_df의 event_datetime 범위: 2023-05-01 ~ 2023-05-01

 df_230506 불러오기 완료
→ shape: (1266122, 6)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-07 09:00:00,OoWMTpTWpUa4m8I1RiKOk75Xnvs1,498dbc0f-4537-41dc-93ec-f4406302a3b0,$session_end,"iPhone14,5",ko-KR
1,2023-05-07 09:00:00,CrFi87nvwiVvHExxP9uSfIHOmBy1,7770afa8-e228-4736-aa2a-658ebe2e3d25,$session_end,"iPhone12,8",ko-KR
2,2023-05-07 09:00:00,W6frWgWJtbWTJwdm6eVWPGKwY2Y2,9fbdc45d-a74a-4d9f-9d40-0f400b72877e,$session_end,"iPhone14,8",ko-KR
3,2023-05-07 09:00:00,4EFB4D45-0AAF-424A-8E4E-2F19249C8892,4efb4d45-0aaf-424a-8e4e-2f19249c8892,$session_start,"iPhone12,3",ko-KR
4,2023-05-07 09:00:00,JdxVFvtz6AcxxOZVvl6plHayjYJ3,96348488-4d6a-49a8-8995-452718157fd7,$session_start,SM-A315N,ko


📆 df_230506_df의 event_datetime 범위: 2023-05-07 ~ 2023-05-09

 df_230512 불러오기 완료
→ shape: (1097522, 6)
→ 첫 5행:


Unnamed: 0,Asia/Seoul,session_id,device_id,event_key,devicemodel,language
0,2023-05-13T00:00:00.004Z,6zkImM4PD0MHIbJ9zRVXs6vX6e62,cb7a48f6-6882-4223-98fd-4fe73cb3903b,$session_start,"iPhone14,4",ko-KR
1,2023-05-13T00:00:00.027Z,Sq5vui6fg2Nhz2EHeC8e4PWyxo13,b2512274-1161-4fd2-a4e8-b40f1ecc0372,$session_end,"iPhone12,8",ko-KR
2,2023-05-13T00:00:00.027Z,RyWeiMNMACUhbawb63ITBMX1c1U2,75f9bcd9-78c8-4dd6-a91e-0faf85a10526,$session_end,"iPhone15,2",ko-KR
3,2023-05-13T00:00:00.046Z,3OaNLuqTVDTkMEc8IsZNBxF9hsH2,380ebcac-4f9d-4004-968d-cf6372908adc,$session_start,"iPhone12,1",en-KR
4,2023-05-13T00:00:00.070Z,rWBKAsEaG3cpHuAPRyohFr3k6PB2,290b1a00-659d-4d24-a26d-5524d7ac44ce,$session_end,"iPhone14,3",ko-KR


⛔ df_230512_df에는 'event_datetime' 컬럼이 존재하지 않음

 df_230518 불러오기 완료
→ shape: (995594, 6)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-19 09:00:00,nALWgF7izudSzKXArsiAOaYYsr23,c370e36d-b84c-4c97-af37-db356caf1741,$session_start,"iPhone14,2",ko-KR
1,2023-05-19 09:00:00,E9A0131C-A70A-4917-88D8-257223F0A767,e9a0131c-a70a-4917-88d8-257223f0a767,$session_end,"iPhone11,2",ko-KR
2,2023-05-19 09:00:00,euhK4oLwdGPVT6sPNNN1Rs7xEKz1,2ab9b1b1-41a7-4f6e-af31-85daac4509ea,$session_start,"iPhone14,7",ko-KR
3,2023-05-19 09:00:00,TYAFQPAWozbqfQ3I296QuXJnWrx1,c3236b64-a328-4743-8afd-461f85f1f1e0,$session_start,"iPhone15,3",ko-KR
4,2023-05-19 09:00:00,XhhuxAQvEcXdlD1JjbPC8H3AF602,efe3418d-ad59-4a20-a5f9-43d2195d1a08,$session_start,"iPhone14,7",ko-KR


📆 df_230518_df의 event_datetime 범위: 2023-05-19 ~ 2023-05-20

 df_230524 불러오기 완료
→ shape: (903053, 6)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-25 09:00:00,TeeVVRGrpuXE5q6f7pAu9ULVNvG3,b6afceaa-cc9c-4a66-b92f-9e202f9eb328,$session_start,"iPhone14,7",ko-KR
1,2023-05-25 09:00:00,9tcpOeHvyldCUXMgRyFVQI86rf13,54a8af4d-47bf-4951-87c2-1817b26d1c31,$session_start,"iPhone12,1",ko-KR
2,2023-05-25 09:00:00,aiDrtN8uHiQXRXmLMJhf27hKA4B3,b08ee1df-0bc6-4b38-9474-8cbc55fd6f17,$session_start,"iPhone12,1",ko-KR
3,2023-05-25 09:00:00,Oqmt3cCrCQZAa0TT3B4s2kTOG1k1,ace73d0e-04d6-4d03-9460-a0a1143253f7,$session_start,"iPhone11,8",ko-KR
4,2023-05-25 09:00:00,GA1Nz2ehK2TmsRcqBzXQn6hkDHH2,2adda5ce-1d37-43cd-8cbd-c75c06f84f74,$session_start,"iPhone14,7",ko-KR


⚠️ df_230524_df의 event_datetime 변환 중 오류 발생: time data "2023-05" doesn't match format "%Y-%m-%d %H:%M:%S", at position 95147. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

 df_230530 불러오기 완료
→ shape: (2313, 6)
→ 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-31 09:01:06,VAb6ai91iKPZZQhGKenhHZpAG5B2,6DA6A8DC-1E61-4643-963A-ED3D782BE77A,$session_start,"iPhone15,2",ko-KR
1,2023-05-31 09:02:20,fe556647-de19-43ff-9152-b3c38a8b4d26,fe556647-de19-43ff-9152-b3c38a8b4d26,$session_start,SM-G991N,ko
2,2023-05-31 09:02:46,LsAqqCTVtQPpevfSHXI8oRepVs73,5D47D549-EF22-41F3-A0D2-AD9EC1D21A2B,$session_start,"iPhone15,2",ko-KR
3,2023-05-31 09:03:50,NBfi1jm5FKfB6We7YfVRHFgeJY83,FD419DE8-B3CE-4630-B234-959ADA5F47CF,$session_start,"iPhone9,3",ko-KR
4,2023-05-31 09:04:12,3CBFC179-1BB1-40A2-84EF-C39EFBD40C14,3CBFC179-1BB1-40A2-84EF-C39EFBD40C14,$session_start,"iPhone14,7",ko-KR


📆 df_230530_df의 event_datetime 범위: 2023-05-31 ~ 2023-06-06

 df_230605 불러오기 완료
→ shape: (649, 5)
→ 첫 5행:


Unnamed: 0,event_datetime,device_id,event_key,devicemodel,language
0,2023-06-06 09:01:35,7918DF48-A239-45CE-AC1E-7E2E356454C4,$session_start,"iPhone13,1",ko-KR
1,2023-06-06 09:16:11,02F51978-FDD9-4C66-AE8D-820943E00AEF,$session_start,"iPad13,8",ko-KR
2,2023-06-06 09:27:07,E75FFCB7-5AAB-4674-AE34-7A231136C367,$session_start,"iPad13,4",ko-KR
3,2023-06-06 09:28:38,92D752F2-E7B9-4C01-892B-C9B3F1416B74,$session_start,"iPhone15,2",ko-KR
4,2023-06-06 09:44:06,47955d3d-b77f-47a6-9d3e-3fcb45a350df,$session_start,SM-A235N,ko


📆 df_230605_df의 event_datetime 범위: 2023-06-06 ~ 2023-06-12

 df_230611 불러오기 완료
→ shape: (263, 5)
→ 첫 5행:


Unnamed: 0,event_datetime,device_id,event_key,devicemodel,language
0,2023-06-12 09:00:33,8A5F41F9-D126-453C-8EA8-E0C2484584E3,$session_start,"iPhone13,1",ko-KR
1,2023-06-12 09:10:18,A81A2A45-6260-41C6-B81B-071F006D60BD,$session_start,"iPhone14,2",ko-KR
2,2023-06-12 09:17:21,CD41A81E-8D5C-4E7A-8289-587AD8C306F5,$session_start,"iPhone13,1",ko-KR
3,2023-06-12 09:21:00,5A92997F-362C-400B-8613-893A2FD6197F,$session_start,"iPhone13,2",ko-KR
4,2023-06-12 09:22:44,7FB46B64-B405-49C4-9378-8FABA73D4038,$session_start,"iPad13,16",ko-KR


📆 df_230611_df의 event_datetime 범위: 2023-06-12 ~ 2023-06-18

 df_230617 불러오기 완료
→ shape: (1124983, 5)
→ 첫 5행:


Unnamed: 0,event_datetime,device_id,event_key,devicemodel,language
0,2023-06-18 09:07:08,8b6f9e35-473d-45d7-b8ad-884359609fdd,$session_start,"iPhone15,2",ko-KR
1,2023-06-18 09:07:11,8b6f9e35-473d-45d7-b8ad-884359609fdd,$session_end,"iPhone15,2",ko-KR
2,2023-06-18 09:07:34,4ff18855-0de6-4e51-a00c-7cdfe36bfc64,$session_start,"iPhone8,4",ko-KR
3,2023-06-18 10:01:12,99a6de2d-d222-46ff-81c2-729daff8e688,$session_start,"iPad13,1",ko-KR
4,2023-06-18 11:30:47,4ff18855-0de6-4e51-a00c-7cdfe36bfc64,$session_start,"iPhone8,4",ko-KR


📆 df_230617_df의 event_datetime 범위: 2023-06-18 ~ 2023-06-23


In [None]:
# 20230430의 event_datetime, session_id, device_id, event_key, devicemodel, language 컬럼만 가져오면 5월 파일들과 concat 가능
# 20230506 ~ 20230530 파일까지 concat 가능 (단, 20230512는 'Asia/Seoul' 컬럼을 event_datetime으로 바꾼 뒤에 가능)
# 20230605 ~ 20230617 파일까지 concat 가능

In [58]:
# Inspect column dtypes and non-null counts of the 230524 frame.
df_230524.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903053 entries, 0 to 903052
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   event_datetime  903053 non-null  object
 1   session_id      903052 non-null  object
 2   device_id       903052 non-null  object
 3   event_key       903052 non-null  object
 4   devicemodel     903052 non-null  object
 5   language        903052 non-null  object
dtypes: object(6)
memory usage: 41.3+ MB


In [62]:
# Print the rows of the 230524 frame that contain any missing value,
# then remove those rows from the frame.
print(df_230524.loc[df_230524.isna().any(axis="columns")])

df_230524 = df_230524.dropna(axis="index", how="any")

       event_datetime session_id device_id event_key devicemodel language
903052        2023-05        NaN       NaN       NaN         NaN      NaN


In [63]:
# Parse event_datetime of the 230524 frame.
# Assumes the malformed '2023-05' row has already been dropped from it.
df_230524['event_datetime'] = pd.to_datetime(df_230524['event_datetime'])

# Cell output: (earliest, latest) timestamp in the frame.
tuple(df_230524['event_datetime'].agg(['min', 'max']))

(Timestamp('2023-05-25 09:00:00'), Timestamp('2023-05-26 15:04:21'))

# 230512 kst 로 변경

In [66]:
# Convert df_230512's timestamp column to Korea Standard Time and rename it.
# The raw column is called 'Asia/Seoul', but the values shown in the load
# output end in 'Z' (UTC), so they are parsed as UTC and shifted to KST.
# The guard makes the cell safe to re-run: once the column has been renamed,
# the conversion is skipped instead of raising KeyError.
if 'Asia/Seoul' in df_230512.columns:
    df_230512['Asia/Seoul'] = (
        # 1. Parse as timezone-aware UTC timestamps.
        pd.to_datetime(df_230512['Asia/Seoul'], utc=True)
        # 2. Shift to Korea Standard Time.
        .dt.tz_convert('Asia/Seoul')
        # Render as 'YYYY-MM-DD HH:MM:SS' strings; this drops the
        # sub-second part and the timezone offset.
        .dt.strftime('%Y-%m-%d %H:%M:%S')
    )
    # 3. Rename the column to event_datetime.
    df_230512 = df_230512.rename(columns={'Asia/Seoul': 'event_datetime'})

print(df_230512.shape)
df_230512.head()

(1097522, 6)


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-13 09:00:00,6zkImM4PD0MHIbJ9zRVXs6vX6e62,cb7a48f6-6882-4223-98fd-4fe73cb3903b,$session_start,"iPhone14,4",ko-KR
1,2023-05-13 09:00:00,Sq5vui6fg2Nhz2EHeC8e4PWyxo13,b2512274-1161-4fd2-a4e8-b40f1ecc0372,$session_end,"iPhone12,8",ko-KR
2,2023-05-13 09:00:00,RyWeiMNMACUhbawb63ITBMX1c1U2,75f9bcd9-78c8-4dd6-a91e-0faf85a10526,$session_end,"iPhone15,2",ko-KR
3,2023-05-13 09:00:00,3OaNLuqTVDTkMEc8IsZNBxF9hsH2,380ebcac-4f9d-4004-968d-cf6372908adc,$session_start,"iPhone12,1",en-KR
4,2023-05-13 09:00:00,rWBKAsEaG3cpHuAPRyohFr3k6PB2,290b1a00-659d-4d24-a26d-5524d7ac44ce,$session_end,"iPhone14,3",ko-KR


In [67]:
# Persist the KST-converted 230512 frame as parquet (row index not written).
# A CSV export to '../data/df_230512_kst.csv' was used earlier and is disabled.
df_230512.to_parquet(path='../data/df_230512_kst.parquet', index=False)

In [68]:
# Reload the KST-converted 230512 frame from the relative path it is saved to
# ('../data/df_230512_kst.parquet'), so the notebook does not depend on one
# machine's home directory.
# NOTE(review): the original read the absolute path
# /home/codeit_project_vm/codeit_project/codeit-project-docker/data/ ;
# this assumes '../data' resolves to that same directory. Confirm.
df_230512_kst_test = pd.read_parquet('../data/df_230512_kst.parquet')

print(df_230512_kst_test.shape)
df_230512_kst_test.head()

(1097522, 6)


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-13 09:00:00,6zkImM4PD0MHIbJ9zRVXs6vX6e62,cb7a48f6-6882-4223-98fd-4fe73cb3903b,$session_start,"iPhone14,4",ko-KR
1,2023-05-13 09:00:00,Sq5vui6fg2Nhz2EHeC8e4PWyxo13,b2512274-1161-4fd2-a4e8-b40f1ecc0372,$session_end,"iPhone12,8",ko-KR
2,2023-05-13 09:00:00,RyWeiMNMACUhbawb63ITBMX1c1U2,75f9bcd9-78c8-4dd6-a91e-0faf85a10526,$session_end,"iPhone15,2",ko-KR
3,2023-05-13 09:00:00,3OaNLuqTVDTkMEc8IsZNBxF9hsH2,380ebcac-4f9d-4004-968d-cf6372908adc,$session_start,"iPhone12,1",en-KR
4,2023-05-13 09:00:00,rWBKAsEaG3cpHuAPRyohFr3k6PB2,290b1a00-659d-4d24-a26d-5524d7ac44ce,$session_end,"iPhone14,3",ko-KR


In [54]:
# Parse the reloaded event_datetime values into datetimes, then summarise.
df_230512_kst_test['event_datetime'] = pd.to_datetime(
    df_230512_kst_test['event_datetime']
)
df_230512_kst_test.describe()

Unnamed: 0,event_datetime
count,1097522
mean,2023-05-13 20:38:04.599332608
min,2023-05-13 09:00:00
25%,2023-05-13 18:42:42
50%,2023-05-13 21:43:26
75%,2023-05-14 00:25:34
max,2023-05-14 03:01:39


# 2023.04.30 + 2023.05.06 ~ 2023.05.30. 파일 병합

In [6]:
# Keep only the six columns of the 230430 frame that are used for merging
# with the May files.
df_230430_new = df_230430.loc[
    :,
    ['event_datetime', 'session_id', 'device_id', 'event_key', 'devicemodel', 'language'],
]

In [7]:
# Concatenate the 230506 ~ 230530 frames into a single frame.

# Names of the module-level DataFrames to combine, in file order.
df_names = ['df_230506', 'df_230512', 'df_230518', 'df_230524', 'df_230530']

# Look each name up in the notebook's global namespace.
dfs_to_concat = list(map(globals().__getitem__, df_names))

# Stack the frames vertically with a fresh 0..n-1 index.
merged_0506_0530_df = pd.concat(dfs_to_concat, ignore_index=True)

# Report the result.
print(f"병합된 DataFrame shape: {merged_0506_0530_df.shape}")
merged_0506_0530_df.head()

병합된 DataFrame shape: (4264604, 6)


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-07 09:00:00,OoWMTpTWpUa4m8I1RiKOk75Xnvs1,498dbc0f-4537-41dc-93ec-f4406302a3b0,$session_end,"iPhone14,5",ko-KR
1,2023-05-07 09:00:00,CrFi87nvwiVvHExxP9uSfIHOmBy1,7770afa8-e228-4736-aa2a-658ebe2e3d25,$session_end,"iPhone12,8",ko-KR
2,2023-05-07 09:00:00,W6frWgWJtbWTJwdm6eVWPGKwY2Y2,9fbdc45d-a74a-4d9f-9d40-0f400b72877e,$session_end,"iPhone14,8",ko-KR
3,2023-05-07 09:00:00,4EFB4D45-0AAF-424A-8E4E-2F19249C8892,4efb4d45-0aaf-424a-8e4e-2f19249c8892,$session_start,"iPhone12,3",ko-KR
4,2023-05-07 09:00:00,JdxVFvtz6AcxxOZVvl6plHayjYJ3,96348488-4d6a-49a8-8995-452718157fd7,$session_start,SM-A315N,ko


In [8]:
# Put the trimmed 230430 frame in front of the merged May frames.
merged_0430_0530_df = pd.concat(
    objs=[df_230430_new, merged_0506_0530_df],
    ignore_index=True,
)
merged_0430_0530_df.shape

(5465309, 6)

In [9]:
# Print the standard summary for the merged 0430 ~ 0530 frame.
show_df_info(df=merged_0430_0530_df, df_name="merged_0430_0530_df")


merged_0430_0530_df 형태:


(5465309, 6)


merged_0430_0530_df 첫 5행:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
0,2023-05-01 09:00:00,jgGFnoyc6GWuGIEgjp3nUoKRQbF3,cfc34eab-6930-488b-8109-70d37cec7dae,button,"iPhone10,4",ko-KR
1,2023-05-01 09:00:00,n3PI8GZnRFOKpJslIRgckMeRiln1,0dde2ecb-ff03-440f-98f3-94f20978b796,button,"iPhone14,4",ko-CA
2,2023-05-01 09:00:00,L31Dl8vRtdWPJ8XC0gGs1mPwzJH2,b50774ed-24ce-49e1-9383-945ff3aea9ff,button,SM-A125N,ko
3,2023-05-01 09:00:00,QjToElcYNkVxszJDHBhqtITlzDJ3,0999c6b6-c579-4c0f-916c-9a113010f84a,button,SM-A305N,ko
4,2023-05-01 09:00:00,VRpXf303RGU730fIO4EjWstzAJi1,45985493-ed35-4cc5-935f-1bcf922e213c,button,"iPhone14,5",ko-KR



merged_0430_0530_df 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5465309 entries, 0 to 5465308
Data columns (total 6 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   event_datetime  object
 1   session_id      object
 2   device_id       object
 3   event_key       object
 4   devicemodel     object
 5   language        object
dtypes: object(6)
memory usage: 250.2+ MB

merged_0430_0530_df 통계 요약:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
count,5465309,5465308,5465305,5465304,5465304,5465304
unique,405553,696031,548955,3,635,203
top,2023-05-14 01:38:46,fT1BNDNQRyR5PBAgG9tg3dQkfnm1,b66aeb3c-1e08-403a-8c73-14e02e3fb2c0,$session_start,"iPhone14,5",ko-KR
freq,125,985,985,2664451,681014,4123106



merged_0430_0530_df 통계 요약:


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
count,5465309,5465308,5465305,5465304,5465304,5465304
unique,405553,696031,548955,3,635,203
top,2023-05-14 01:38:46,fT1BNDNQRyR5PBAgG9tg3dQkfnm1,b66aeb3c-1e08-403a-8c73-14e02e3fb2c0,$session_start,"iPhone14,5",ko-KR
freq,125,985,985,2664451,681014,4123106



merged_0430_0530_df 결측치 확인:


event_datetime    0
session_id        1
device_id         4
event_key         5
devicemodel       5
language          5
dtype: int64

In [10]:
# Rows of the merged frame with a missing value in any column.
merged_0430_0530_df.loc[merged_0430_0530_df.isna().any(axis="columns")]


Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
1200704,2023-05-01 23:27:41,qN1W72oUIfd4laYwk6Ux103cFX42,f9ddd6f5-69d9-4639-81cd-ffd80a,,,
2466826,2023-05-09 01:53:10,BnHO3D8rBrM,,,,
3564348,2023-05-14 03:01:39,IYlwbLIgh4,,,,
4559942,2023-05-20 02:23:13,4I9Exy4M1ogpOGgZNDPH5,,,,
5462995,2023-05,,,,,


In [11]:
# Does the stray '2023-05' timestamp come from the 230512 frame?
df_230512.loc[df_230512['event_datetime'].eq('2023-05')]

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language


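- 결측치가 있는 5개의 행 중 event_datetime이 년-월(2023-05)까지만 있는 경우가 1건 존재
- 이 행(병합 인덱스 5462995)은 2023-05-12 파일이 아니라 df_230524의 마지막 행(인덱스 903052)임: 1200705 + 1266122 + 1097522 + 995594 + 903053 - 1 = 5462995
- 나머지 4개 행도 각각 df_230430, df_230506, df_230512, df_230518의 마지막 행이므로, 각 CSV 파일의 끝부분이 잘린 것으로 보임 (확인 필요)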

In [11]:
# For now, drop every row that has a missing value, then check the new shape.
merged_0430_0530_df = merged_0430_0530_df.dropna(axis="index", how="any")
merged_0430_0530_df.shape

(5465304, 6)

In [12]:
# Parse the event_datetime column of the merged frame into datetimes.
merged_0430_0530_df['event_datetime'] = pd.to_datetime(
    merged_0430_0530_df['event_datetime']
)

In [13]:
# Re-check the summary statistics across every column after cleaning.
merged_0430_0530_df.describe(include="all")

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
count,5465304,5465304,5465304,5465304,5465304,5465304
unique,,696028,548954,3,635,203
top,,fT1BNDNQRyR5PBAgG9tg3dQkfnm1,b66aeb3c-1e08-403a-8c73-14e02e3fb2c0,$session_start,"iPhone14,5",ko-KR
freq,,985,985,2664451,681014,4123106
mean,2023-05-12 23:59:18.904272128,,,,,
min,2023-05-01 09:00:00,,,,,
25%,2023-05-07 20:15:32,,,,,
50%,2023-05-13 18:35:02,,,,,
75%,2023-05-19 19:39:29,,,,,
max,2023-06-06 08:55:03,,,,,


- 2023-05-01 09:00:00 ~ 2023-06-06 08:55:03까지 있음(0430 ~ 0530 병합 시)
- session_id의 고유값 개수가 device_id 보다 많음
- event_key : session_start, session_end, button 3개인데, session_start가 가장 많음

In [None]:
# Row count per event_key value.
# Note from the original author: 'button' only appears in the 230430 file.
merged_0430_0530_df.loc[:, 'event_key'].value_counts()

event_key
$session_start    2664451
$session_end      1653814
button            1147039
Name: count, dtype: int64

- session_start가 session_end보다 많음 -> 모든게 짝지어지진 않을 확률 높음....
- button은 0430 파일에만 있는거라 사실 의미 없음.

In [16]:
# Open question: which id should be the unit of analysis?
# Number of distinct device_ids seen for each session_id, largest first.
(
    merged_0430_0530_df
    .groupby('session_id')['device_id']
    .nunique()
    .reset_index()
    .sort_values(by='device_id', ascending=False)
)

Unnamed: 0,session_id,device_id
650373,uHVfeabKAfPVlsaVzCdbmhyByTC3,20
322619,ISz4K45Bu6RApABHjdl0DAh1Hn33,19
537785,gE1y248PkmTzwpzhKzWCMSzOjvg1,16
403259,Sa3OqUfCeLakNbD3G0qzarA10Pk2,16
13415,0JlVUGW5bdbW8nrM3halq8O2XtR2,16
...,...,...
233869,BYoWnF1JpQXS2AtMBwgV7VMHipH2,1
233870,BYon1bp4WfPxLG3Xk9Gr0Tg32I32,1
233871,BYp9y9GPgXMowTasMma5a6cuIS32,1
233872,BYpap1fyARaNsUbAArZj96I0bEz1,1


In [None]:
# All rows for one sample session_id (the one with the most device_ids).
merged_0430_0530_df.loc[
    merged_0430_0530_df['session_id'].eq('uHVfeabKAfPVlsaVzCdbmhyByTC3')
]

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
1212317,2023-05-07 09:18:28,uHVfeabKAfPVlsaVzCdbmhyByTC3,56c268f4-dc82-4c70-8080-de52b2bc8c3e,$session_start,"iPhone12,8",ko-KR
1390493,2023-05-07 20:57:14,uHVfeabKAfPVlsaVzCdbmhyByTC3,3827cffd-c1f8-46ff-b078-2db64212b42d,$session_start,"iPhone12,8",ko-KR
1409875,2023-05-07 21:27:27,uHVfeabKAfPVlsaVzCdbmhyByTC3,a03d7d91-e820-4d3b-9ed0-4a1c60931539,$session_start,"iPhone12,8",ko-KR
1476544,2023-05-07 23:03:24,uHVfeabKAfPVlsaVzCdbmhyByTC3,79064dc0-f2f1-46c0-9dc0-b1e6eda23ac4,$session_start,"iPhone12,8",ko-KR
1566108,2023-05-08 01:12:30,uHVfeabKAfPVlsaVzCdbmhyByTC3,efccb5ec-b656-4019-a0c1-af597666d246,$session_start,"iPhone12,8",ko-KR
1668301,2023-05-08 03:32:54,uHVfeabKAfPVlsaVzCdbmhyByTC3,d0ccadb5-61ff-4109-b160-2ddb042c5f06,$session_start,"iPhone12,8",ko-KR
1802049,2023-05-08 06:16:17,uHVfeabKAfPVlsaVzCdbmhyByTC3,8e5d0723-2b2f-42d1-a38f-4290bc6aedfc,$session_start,"iPhone12,8",ko-KR
1960919,2023-05-08 09:02:59,uHVfeabKAfPVlsaVzCdbmhyByTC3,363cf2c2-e4f4-4aa0-992f-c4bf041f6d1a,$session_start,"iPhone12,8",ko-KR
2016428,2023-05-08 10:19:00,uHVfeabKAfPVlsaVzCdbmhyByTC3,2a2c65b4-aa09-4996-921c-e2cedaa33a14,$session_start,"iPhone12,8",ko-KR
2026262,2023-05-08 10:39:33,uHVfeabKAfPVlsaVzCdbmhyByTC3,b05008bc-b7bf-4a30-8e43-a570378b1aaf,$session_start,"iPhone12,8",ko-KR


- device_id가 66a9a11f-c527-4116-b05e-3eac949c706f인 경우만 session_start 다음 session_end가 오지만 다시 session_start 로 마무리..
- 그 외 나머지는 모두 session_start 뿐임...

In [23]:
# Number of distinct session_ids seen for each device_id, largest first.
(
    merged_0430_0530_df
    .groupby('device_id')['session_id']
    .nunique()
    .reset_index()
    .sort_values(by='session_id', ascending=False)
)

Unnamed: 0,device_id,session_id
372245,ad97ef42-477f-42c0-b1fd-3ec83098020a,5
54947,19a3d22c-5b44-49b6-93a5-d124d5245b4f,5
401879,bb47b31f-cf5b-41b6-b859-aafbd375b519,4
173037,50c7f22c-b6cd-4685-8af3-6f828f430f0b,4
270149,7e00cbd3-294f-4af5-9114-4ac58e466442,4
...,...,...
218101,65b91c67-77a8-4fd0-80ee-87d2230377ce,1
218100,65b90754-ddef-4c08-9aca-c3588e74e3dd,1
218098,65b8f4a2-7317-4e23-b1d2-9151d46a5d3b,1
218097,65b8eeb0-5017-4014-8d7c-6a0440150673,1


In [24]:
# Number of device_ids that are associated with exactly one session_id.
(
    merged_0430_0530_df
    .groupby('device_id')['session_id']
    .nunique()
    .eq(1)
    .sum()
)

371262

- 548954 중에서 371262개는 session_id 1개
- 548954 중에서 177692개가 2개 이상의 session_id를 가지고 있음

In [14]:
# All rows for one sample device_id (one of those with the most session_ids).
merged_0430_0530_df.loc[
    merged_0430_0530_df['device_id'].eq('ad97ef42-477f-42c0-b1fd-3ec83098020a')
]

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
270510,2023-05-01 10:17:52,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,button,"iPhone13,2",ko-KR
270705,2023-05-01 10:17:58,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,button,"iPhone13,2",ko-KR
270763,2023-05-01 10:17:59,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,button,"iPhone13,2",ko-KR
270847,2023-05-01 10:18:01,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,button,"iPhone13,2",ko-KR
270935,2023-05-01 10:18:03,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,button,"iPhone13,2",ko-KR
...,...,...,...,...,...,...
281967,2023-05-01 10:22:33,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_start,"iPhone13,2",ko-KR
2017473,2023-05-08 10:21:01,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_end,"iPhone13,2",ko-KR
2017483,2023-05-08 10:21:02,fvscVUlvnnTqnH8FopkHiTIu3jg1,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_start,"iPhone13,2",ko-KR
2019039,2023-05-08 10:24:05,fvscVUlvnnTqnH8FopkHiTIu3jg1,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_end,"iPhone13,2",ko-KR


In [None]:
# Same sample device, with 'button' events filtered out.
merged_0430_0530_df.loc[
    merged_0430_0530_df['device_id'].eq('ad97ef42-477f-42c0-b1fd-3ec83098020a')
    & merged_0430_0530_df['event_key'].ne('button')
]

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
271364,2023-05-01 10:18:15,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_end,"iPhone13,2",ko-KR
271412,2023-05-01 10:18:16,jMa8EuiY23XL5yvOaiFL4Vcqxo32,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_start,"iPhone13,2",ko-KR
273083,2023-05-01 10:18:56,jMa8EuiY23XL5yvOaiFL4Vcqxo32,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_end,"iPhone13,2",ko-KR
273106,2023-05-01 10:18:57,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_start,"iPhone13,2",ko-KR
273525,2023-05-01 10:19:08,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_end,"iPhone13,2",ko-KR
273574,2023-05-01 10:19:10,jMa8EuiY23XL5yvOaiFL4Vcqxo32,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_start,"iPhone13,2",ko-KR
274000,2023-05-01 10:19:21,jMa8EuiY23XL5yvOaiFL4Vcqxo32,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_end,"iPhone13,2",ko-KR
274055,2023-05-01 10:19:22,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_start,"iPhone13,2",ko-KR
275086,2023-05-01 10:19:49,AD97EF42-477F-42C0-B1FD-3EC83098020A,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_end,"iPhone13,2",ko-KR
275148,2023-05-01 10:19:51,YBj6GbybYJU17f2Z4YMAexV7ys72,ad97ef42-477f-42c0-b1fd-3ec83098020a,$session_start,"iPhone13,2",ko-KR


- session_id가 같을 때 session_start와 session_end가 잘 짝지어져서 나옴
- 어떤 session_id는 device_id와 같은데 알파벳만 소문자 -> 대문자로 바뀜 (AD97EF42-477F-42C0-B1FD-3EC83098020A)

In [18]:
# A second device_id spot check, again with 'button' events filtered out.
merged_0430_0530_df.loc[
    merged_0430_0530_df['device_id'].eq('bb47b31f-cf5b-41b6-b859-aafbd375b519')
    & merged_0430_0530_df['event_key'].ne('button')
]

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
1319302,2023-05-07 18:37:07,bb47b31f-cf5b-41b6-b859-aafbd375b519,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_start,SM-G981N,ko
1319404,2023-05-07 18:37:25,bb47b31f-cf5b-41b6-b859-aafbd375b519,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_end,SM-G981N,ko
1319408,2023-05-07 18:37:26,bIuEKENvxKYdTHRTDj215UpDXAw2,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_start,SM-G981N,ko
1326496,2023-05-07 18:56:04,bIuEKENvxKYdTHRTDj215UpDXAw2,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_end,SM-G981N,ko
1347648,2023-05-07 19:41:47,bIuEKENvxKYdTHRTDj215UpDXAw2,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_start,SM-G981N,ko
1348019,2023-05-07 19:42:36,bIuEKENvxKYdTHRTDj215UpDXAw2,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_end,SM-G981N,ko
1387827,2023-05-07 20:52:59,bIuEKENvxKYdTHRTDj215UpDXAw2,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_start,SM-G981N,ko
1388087,2023-05-07 20:53:23,bIuEKENvxKYdTHRTDj215UpDXAw2,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_end,SM-G981N,ko
1388093,2023-05-07 20:53:23,bb47b31f-cf5b-41b6-b859-aafbd375b519,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_start,SM-G981N,ko
1388236,2023-05-07 20:53:38,bb47b31f-cf5b-41b6-b859-aafbd375b519,bb47b31f-cf5b-41b6-b859-aafbd375b519,$session_end,SM-G981N,ko


- 이전에 확인한 device_id와 비슷한 양상을 보임
- session_id == device_id 인 경우가 있음 

In [20]:
# Count distinct session_ids that are exactly equal to their row's device_id.
merged_0430_0530_df.loc[
    merged_0430_0530_df['session_id'].eq(merged_0430_0530_df['device_id']),
    'session_id',
].nunique()

67083

- session_id 고유값 개수 : 696028
- device_id 고유값 개수 : 548954     
그 중, 67083개가 동일함!! (대문자 <-> 소문자 까지 매칭한다면 더 많아질지도?)

In [36]:
# Same comparison ignoring letter case: both ids are lower-cased before
# comparing, and the matching session_ids are lower-cased before counting.
merged_0430_0530_df.loc[
    merged_0430_0530_df['session_id'].str.lower().eq(
        merged_0430_0530_df['device_id'].str.lower()
    ),
    'session_id',
].str.lower().nunique()

199292

- 대소문자까지 변환해서 모두 같은 경우 199292개! 

In [30]:
# Question: when a device_id has exactly one session_id, is that the case
# where device_id == session_id?
group_device_df = (
    merged_0430_0530_df
    .groupby('device_id')['session_id']
    .nunique()
    .reset_index(name='session_cnt')
)
# device_ids that map to exactly one session_id.
device_list = group_device_df.loc[
    group_device_df['session_cnt'].eq(1), 'device_id'
].unique()
device_list

array(['0000065b-8c82-4982-b67e-ac67b1e4c38e',
       '000007c9-e103-4eb5-9777-a9084d4952df',
       '00002245-458f-4cdd-8533-b448cd43dbd2', ...,
       'ffff2485-894d-4854-b4f0-a3b5c79c23d5',
       'ffff29f0-c716-42a8-9347-3efaf7c21ca6',
       'ffff80ca-d517-40f4-bdef-e759411e1fe7'], dtype=object)

In [33]:
# Rows of single-session devices whose device_id equals session_id exactly
# (case-sensitive comparison).
merged_0430_0530_df.loc[
    merged_0430_0530_df['device_id'].isin(device_list)
    & merged_0430_0530_df['device_id'].eq(merged_0430_0530_df['session_id'])
]

Unnamed: 0,event_datetime,session_id,device_id,event_key,devicemodel,language
23651,2023-05-01 09:05:16,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,button,SM-G965N,ko
23999,2023-05-01 09:05:20,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,button,SM-G965N,ko
24124,2023-05-01 09:05:21,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,button,SM-G965N,ko
24296,2023-05-01 09:05:23,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,button,SM-G965N,ko
25514,2023-05-01 09:05:38,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,dc8b8d7c-03b1-42f0-9541-a88c4f282e86,button,SM-G965N,ko
...,...,...,...,...,...,...
5465291,2023-06-06 06:12:08,5ceb4e48-d1f9-4069-96b9-98e9957911af,5ceb4e48-d1f9-4069-96b9-98e9957911af,$session_start,SM-N976N,ko
5465294,2023-06-06 06:26:30,7d6e0b00-d462-4ecb-a336-fa25adef3c61,7d6e0b00-d462-4ecb-a336-fa25adef3c61,$session_start,SM-F711N,ko
5465297,2023-06-06 07:18:25,B9CE934F-EDC8-43DC-A2DF-412EB4E05EB1,B9CE934F-EDC8-43DC-A2DF-412EB4E05EB1,$session_start,"iPhone13,2",ko-KR
5465304,2023-06-06 08:28:56,39275C5A-CEF4-4182-807B-19B2495AB6BF,39275C5A-CEF4-4182-807B-19B2495AB6BF,$session_start,"iPhone13,3",ko-KR


- device_id에 session_id가 1개인 371262개 중에서 8483개만 device_id == session_id