#### 모듈

In [72]:
import os
import os.path as path

import pandas as pd

from sklearn.preprocessing import LabelEncoder
import numpy as np

#### 인코더 불러오기

In [73]:
# path
BasePath = "./encoders"

# get list of encoder files
encoder_file_list = os.listdir(BasePath)

# print list of encoder files
print(encoder_file_list)

# load encoder files
encoder_dict = {}

for file in encoder_file_list:
    encoder_name = file.replace(".npy", "")
    encoder_dict[encoder_name] = LabelEncoder()
    classes = np.load(path.join(BasePath, file), allow_pickle=True)
    encoder_dict[encoder_name].classes_ = np.load(path.join(BasePath, file), allow_pickle=True)
    
    if encoder_name == "os_version":
        encoder_dict[encoder_name].fit(list(map(lambda x: str(x), classes)))

# encoder modify
for encoder_name in encoder_dict.keys():
    # reorder encoder : make encoder[0] = nan
    encoder_dict[encoder_name].classes_ = \
        np.append(encoder_dict[encoder_name].classes_[-1],\
                    encoder_dict[encoder_name].classes_[:-1])

    print(f"{encoder_name} : {encoder_dict[encoder_name].classes_.shape[0]} / {encoder_dict[encoder_name].classes_[:5]}")

# reorder 'event_type' encoder classes
encoder_dict['event_type'].classes_ = np.array([
    'error',
    'enter.main_page',
    'enter.signup_page',
    'complete.signup',
    'enter.content_page',
    'click.content_page_start_content_button',
    'click.content_page_more_review_button',
    'enter.payment_page',
    'complete.subscription',
    'renew.subscription',
    'resubscribe.subscription',
    'start.free_trial',
    'start.content',
    'enter.lesson_page',
    'complete.lesson',
    'click.lesson_page_related_question_box',
    'end.content',
    'click.cancel_plan_button'
])

# encoder test
print(encoder_dict["event_type"].classes_[:5])


['button.name.npy', 'button_name.npy', 'city.npy', 'content.difficulty.npy', 'content.id.npy', 'country.npy', 'coupon.discount_amount.npy', 'device_family.npy', 'device_type.npy', 'event_type.npy', 'is_free_trial.npy', 'is_trial.npy', 'language.npy', 'lesson.id.npy', 'os_name.npy', 'os_version.npy', 'paid_amount.npy', 'pg.type.npy', 'plan.price.npy', 'plan.type.npy', 'platform.npy', 'question.id.npy', 'trial.type.npy', 'type.npy', 'user_id.npy']
button.name : 10 / [nan '구독 시작하기' '무료로 들어보기' '무료로 시작하기' '최저가 혜택 받기']
button_name : 6 / [nan '무료로 시작하기' '수강시작' '수강하기' '이어하기']
city : 4341 / [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
content.difficulty : 6 / [nan 'advanced' 'beginner' 'hard' 'intermediate']
content.id : 213 / [nan '011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61']
country : 190 / [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
coupon.discount_amount : 46 / [nan -52

#### 인코더 테스트

In [74]:
## 주의사항
# 1. BaseFilePath에 데이터 파일 경로를 설정해주세요.
# 2. 제외하고 싶은 컬럼이 있으면 exclude_columns에 추가해주세요.
# 3. 인코딩을 하고 컬럼 이름을 바꿔 주세요.

## 함수 설명
# .transform()을 이용해서 데이터를 변환해주세요.
# .inverse_transform()을 이용해서 데이터를 역변환해주세요.


## 인코딩

# 개인적으로 BaseFilePath를 설정해주었는데, 이를 이용해서 파일 리스트를 불러오는 코드입니다.
BaseFilePath = 'd:\Codeit\intermediate_project\주제 2. 구독서비스 프로덕트 데이터 분석'
filelist = os.listdir(BaseFilePath)[:-1]

# 제외 컬럼
exclude_columns = ['device_carrier', 'os_version']

# 제외 컬럼 : 제외하고 싶은 컬럼을 추가합니다.
exclude_columns_plus = []


# display file list
print(filelist)

# load data
dfs = []

for file in filelist:
    if file.endswith('.csv'):
        df = pd.read_csv(path.join(BaseFilePath, file), nrows=1000)
        
        # preprocess
        # convert os_version to string
        df['os_version'] = df['os_version'].astype(str)
        # drop 
        df = df.drop('device_carrier', axis=1)
        df = df.drop('os_version', axis=1)
        # convert client_event_time to datetime
        df['client_event_time'] = pd.to_datetime(df['client_event_time'])
        
        dfs.append(df)
        print(f"file: {file}, shape: {df.shape}")

for i, df in enumerate(dfs):
    print(f"file: {filelist[i]} encodings - ")
    
    # encode categorical columns using the loaded encoders
    for col in df.columns:
        # exception handling
        if col not in encoder_dict:
            continue
        if col in exclude_columns + exclude_columns_plus:
            continue
        
        # encode using the loaded encoder
        df[col] = encoder_dict[col].transform(df[col])
        print(f"{col}: {encoder_dict[col].classes_[:5]}")
    
    # display result head
    display(df.head())

  BaseFilePath = 'd:\Codeit\intermediate_project\주제 2. 구독서비스 프로덕트 데이터 분석'


['click.cancel_plan_button.csv', 'click.content_page_more_review_button.csv', 'click.content_page_start_content_button.csv', 'click.lesson_page_related_question_box.csv', 'complete.lesson.csv', 'complete.signup.csv', 'complete.subscription.csv', 'end.content.csv', 'enter.content_page.csv', 'enter.lesson_page-002.csv', 'enter.main_page.csv', 'enter.payment_page.csv', 'enter.signup_page.csv', 'renew.subscription.csv', 'resubscribe.subscription.csv', 'start.content.csv', 'start.free_trial.csv']


  BaseFilePath = 'd:\Codeit\intermediate_project\주제 2. 구독서비스 프로덕트 데이터 분석'


KeyboardInterrupt: 

In [None]:
## 디코딩

for i, df in enumerate(dfs):
    print(f"file: {filelist[i]} decodings - ")
    
    # encode categorical columns using the loaded encoders
    for col in df.columns:
        # exception handling
        if col not in encoder_dict:
            continue
        if col in exclude_columns + exclude_columns_plus:
            continue
        
        # encode using the loaded encoder
        df[col] = encoder_dict[col].inverse_transform(df[col])
        print(f"{col}: {encoder_dict[col].classes_[:5]}")
    
    # display result head
    display(df.head())

file: click.cancel_plan_button.csv decodings - 


city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id
0,Daegu,2023-04-10 14:57:48.556,South Korea,Windows,Windows,click.cancel_plan_button,Korean,Chrome,Web,9e59ecf9a8fbe9051bd0d54f4b702f30
1,Seoul,2023-03-12 08:26:16.224,South Korea,Apple iPad,Apple iPad,click.cancel_plan_button,English,Mobile Safari,Web,4ed4d8f12c45504a148bfff0f2e9466e
2,Uijeongbu-si,2023-03-12 08:23:04.955,South Korea,Windows,Windows,click.cancel_plan_button,Korean,Chrome,Web,b6a1bba3cc4139d7dc821561319221e6
3,Dobong-gu,2023-03-12 08:32:39.271,South Korea,Windows,Windows,click.cancel_plan_button,Korean,Chrome,Web,40acc1ddfc012f3d2f58a3de95e9e748
4,Geumjeong-gu,2023-04-12 11:36:28.311,South Korea,Windows,Windows,click.cancel_plan_button,Korean,Chrome,Web,bdffe1f18684704d4ea2d7eebcaed180


file: click.content_page_more_review_button.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
content.id: [nan '011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id
0,Seo-gu,2023-04-10 14:10:01.402,South Korea,Windows,Windows,click.content_page_more_review_button,Korean,Chrome,Web,558fe7e8c06b8505f11b33d76f076e56,a1eeebeb6c307641b75b94a2a1d5b9aa
1,London,2023-01-24 21:37:43.417,United Kingdom,Windows,Windows,click.content_page_more_review_button,Korean,Whale,Web,e48956538e4df690a5d12adf1e6f2ee3,f491517f4737a60d661cd1fcacc702c0
2,Eunpyeong-gu,2023-01-31 14:44:16.897,South Korea,Windows,Windows,click.content_page_more_review_button,Korean,Chrome,Web,e745da89019e303b122bfceac0521a8f,9aa7628a347707fb155943041e2cb524
3,Seoul,2023-01-31 14:30:36.568,South Korea,Apple iPhone,Apple iPhone,click.content_page_more_review_button,Korean,Mobile Safari,Web,e2c59d444736eceee9eedba85bd4e209,f491517f4737a60d661cd1fcacc702c0
4,Seoul,2023-02-12 03:24:32.337,South Korea,Windows,Windows,click.content_page_more_review_button,Korean,Chrome,Web,74b66b738185ffdbec960db4a0da330a,101e1d0dcc38d9c86156f008a145083e


file: click.content_page_start_content_button.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
content.id: [nan '011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61']
button.name: [nan '구독 시작하기' '무료로 들어보기' '무료로 시작하기' '최저

Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id,button.name,button_name
0,Daejeon,2023-04-05 21:14:38.432,South Korea,Windows,Windows,click.content_page_start_content_button,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,445fac33232adcb3d1cdab7d8a579430,콘텐츠 구독하기,
1,Seongnam-si,2023-04-10 14:57:20.033,South Korea,Windows,Windows,click.content_page_start_content_button,Korean,Edge,Web,f7155af01d277f11b1c7d05cf772c686,18992b667be06d6f29ba0008f99d9745,콘텐츠 이어보기,
2,Yangp'yong,2023-04-10 14:28:29.515,South Korea,Windows,Windows,click.content_page_start_content_button,Korean,Chrome,Web,1b05f9fe82b80771323caa1cd5683143,dfbdae782996c25daed7517b8835f3aa,콘텐츠 이어보기,
3,Seo-gu,2023-04-10 14:09:30.564,South Korea,Windows,Windows,click.content_page_start_content_button,Korean,Chrome,Web,353528475f2bd85934263421ccd579b3,a1eeebeb6c307641b75b94a2a1d5b9aa,콘텐츠 이어보기,
4,Seo-gu,2023-04-10 14:10:52.368,South Korea,Windows,Windows,click.content_page_start_content_button,Korean,Chrome,Web,353528475f2bd85934263421ccd579b3,7de35a9d8f14c68e5f2a8788dda5f72e,콘텐츠 구독하기,


file: click.lesson_page_related_question_box.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
question.id: [nan '000468dc7826f179a01db37bff526ca4' '001024597887f49ec01816cc0360ce2a'
 '0015d6a21cc7eaa4c71241b0128d8d09' '0031005db08ae21ea9152cd538a388ca']
content.id: [nan '011a040b817225cb7c01d4676c21ea44' '

Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,question.id,content.id,lesson.id
0,Dong-gu,2023-04-05 21:51:48.377,South Korea,Samsung Phone,Samsung Galaxy S21 5G,click.lesson_page_related_question_box,English,Chrome,Web,117f95e12266dcf8f911ae39ff55a800,c654f9825245640595af9b8aecb150aa,c269eb6df3a374b464f7c18f12fa398f,e7dbf75457e67ef644d4ae1981b9e01b
1,Dong-gu,2023-04-05 21:52:21.689,South Korea,Samsung Phone,Samsung Galaxy S21 5G,click.lesson_page_related_question_box,English,Chrome,Web,117f95e12266dcf8f911ae39ff55a800,898fb79fc740dc8705b52fb32521d5d5,c269eb6df3a374b464f7c18f12fa398f,e7dbf75457e67ef644d4ae1981b9e01b
2,Dong-gu,2023-04-05 21:52:42.481,South Korea,Samsung Phone,Samsung Galaxy S21 5G,click.lesson_page_related_question_box,English,Chrome,Web,117f95e12266dcf8f911ae39ff55a800,9fb80c124d57fd31494738e0af5437a3,c269eb6df3a374b464f7c18f12fa398f,e7dbf75457e67ef644d4ae1981b9e01b
3,Goyang-si,2023-04-05 21:20:15.032,South Korea,Mac,Mac,click.lesson_page_related_question_box,Korean,Chrome,Web,e1f3ecae035a3c8ae8cf789fbde98995,9016e2246486e852992ccd9ae0eb27ca,c269eb6df3a374b464f7c18f12fa398f,395bdf0293b24ec47d5a9e960574f851
4,Goyang-si,2023-04-05 21:21:55.275,South Korea,Mac,Mac,click.lesson_page_related_question_box,Korean,Chrome,Web,e1f3ecae035a3c8ae8cf789fbde98995,4ecd552eeded70b3274b8edf3ecacbd9,c269eb6df3a374b464f7c18f12fa398f,a738c7835388066e2618af1e5ebacb3c


file: complete.lesson.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
content.id: [nan '011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61']
lesson.id: [nan '001e4570bb0fc346a1c969b019bdc22b' '0021f3a8597cb32f360563144

Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id,lesson.id
0,Wonju,2023-04-05 21:55:51.836,South Korea,Windows,Windows,complete.lesson,Korean,Chrome,Web,91053eda0bffb4db6c9317acf15bc44a,94bdcbd9f329aafa84ab464b6721187d,d360d68b65fd3e0eafb7982386d75fd9
1,Daejeon,2023-04-05 21:19:01.243,South Korea,Windows,Windows,complete.lesson,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,445fac33232adcb3d1cdab7d8a579430,7e0f6aae886368ef9fa97007a8e9f7ac
2,Daejeon,2023-04-05 21:20:27.939,South Korea,Windows,Windows,complete.lesson,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,445fac33232adcb3d1cdab7d8a579430,d1e93c9dc08267efab31849bc6083854
3,Daejeon,2023-04-05 21:23:01.534,South Korea,Windows,Windows,complete.lesson,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,445fac33232adcb3d1cdab7d8a579430,f22ea5dc431035305e705552bbc4f45b
4,Daejeon,2023-04-05 21:28:34.965,South Korea,Windows,Windows,complete.lesson,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,445fac33232adcb3d1cdab7d8a579430,ce0bfeb7fb1259c588f437d48ab2690c


file: complete.signup.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
type: [nan 'email' 'facebook' 'google' 'kakao']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,type
0,,2023-04-05 21:33:26.068,,,,complete.signup,,,,74d17c27a41656e7788b0c81d9f76cda,kakao
1,,2023-04-05 21:45:18.672,,,,complete.signup,,,,06b6868000a5d576180aca47896d6cce,kakao
2,Seo-gu,2023-04-10 14:47:56.424,South Korea,Windows,Windows,complete.signup,Korean,Chrome,Web,e131c319972bd580146c5a2faba26613,kakao
3,Suwon,2023-04-10 14:25:45.920,South Korea,Apple iPhone,Apple iPhone,complete.signup,Korean,Mobile Safari,Web,e3b8bfa80d866f296aad139e09023ef3,naver
4,,2023-04-10 14:09:50.698,,,,complete.signup,,,,5208a60918f18562ac9d668a35b20303,kakao


file: complete.subscription.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
plan.price: [nan 14328 15920 19920 42960]
paid_amount: [nan 3960 7960 9552 11144]
coupon.discount_amount: [nan -52400 -48400 0 478]
pg.type: [nan 'A' 'B' 'C' None]


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,plan.price,paid_amount,coupon.discount_amount,pg.type
0,Nowon-gu,2023-04-05 21:40:04.000,South Korea,Windows,Windows,complete.subscription,Korean,Edge,Web,1116a2ae90ca6b06f85feba27ad51ef0,15920,15920,0,
1,Seoul,2023-04-10 14:20:03.008,South Korea,Apple iPhone,Apple iPhone,complete.subscription,Korean,Mobile Safari,Web,c0f5678df8967163e6e2c464ed006791,131600,131600,0,
2,Yangsan,2023-03-12 08:46:14.026,South Korea,Windows,Windows,complete.subscription,Korean,Chrome,Web,79421b5d038205ad7adbb1e2c3b89741,131600,119756,11844,
3,Geumjeong-gu,2023-04-12 11:35:01.865,South Korea,Windows,Windows,complete.subscription,Korean,Chrome,Web,bdffe1f18684704d4ea2d7eebcaed180,131600,131600,0,
4,Gwangmyeong,2023-05-31 01:25:55.820,South Korea,Mac,Mac,complete.subscription,Korean,Chrome,Web,d4f54ecf6c4d7bf8742f80505f02d6dc,15920,15920,0,


file: end.content.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
content.id: [nan '011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id
0,Yokohama,2023-04-10 14:02:10.600,Japan,Windows,Windows,end.content,Japanese,Chrome,Web,3fd801c6192296cf7b85efc01bd547b9,a914987711c7e2db9fd4bff7555ec87c
1,Seoul,2023-04-10 14:21:26.466,South Korea,Windows,Windows,end.content,Korean,Chrome,Web,85a332649d34bc7b5ebdec7660b1d57d,f64c475280676531a31f2f8e0648599c
2,Seongbuk-gu,2023-04-10 14:37:04.440,South Korea,Windows,Windows,end.content,Korean,Chrome,Web,2bf8cb1d9c5d4556f5165c2aaf804f2e,7fef928cdccef0185dfb931ef59b37d7
3,Seo-gu,2023-04-10 14:37:19.909,South Korea,Windows,Windows,end.content,Korean,Chrome,Web,831813f54c0d55c17db612f0ea311c93,68a3658f1b936bfff3bd6960cd561b0a
4,Seo-gu,2023-04-10 14:08:58.968,South Korea,Windows,Windows,end.content,Korean,Chrome,Web,353528475f2bd85934263421ccd579b3,a1eeebeb6c307641b75b94a2a1d5b9aa


file: enter.content_page.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
content.id: [nan '011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id
0,Seoul,2023-04-05 21:40:35.740,South Korea,Apple iPhone,Apple iPhone,enter.content_page,Korean,Chrome,Web,d57aba7b4527e71b96135deaaec57def,220ce7873b8895d0c81c37600b5bd1e7
1,Hwaseong-si,2023-04-05 21:01:59.950,South Korea,Samsung Galaxy Note,Samsung Galaxy Note10+ 5G,enter.content_page,Korean,Chrome,Web,,ecde9a1b01763f791c3a185d53b5f394
2,Daejeon,2023-04-05 21:13:12.038,South Korea,Windows,Windows,enter.content_page,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,b8d4ec0133e5ce485055bfd69cd28abd
3,Daejeon,2023-04-05 21:14:25.355,South Korea,Windows,Windows,enter.content_page,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,445fac33232adcb3d1cdab7d8a579430
4,Seoul,2023-04-05 21:44:23.759,South Korea,Samsung Phone,Samsung Galaxy S20 FE 5G,enter.content_page,Korean,Samsung Browser,Web,,7cebec6e79cc981dbf421c5ec2aedd9d


file: enter.lesson_page-002.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
content.id: [nan '011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61']
is_trial: [nan False True]
lesson.id: [nan '001e4570bb0fc346a1c969b019b

Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id,is_trial,lesson.id,is_free_trial
0,Wonju,2023-04-05 21:46:04.333,South Korea,Windows,Windows,enter.lesson_page,Korean,Chrome,Web,91053eda0bffb4db6c9317acf15bc44a,94bdcbd9f329aafa84ab464b6721187d,False,d360d68b65fd3e0eafb7982386d75fd9,
1,Wonju,2023-04-05 21:55:50.787,South Korea,Windows,Windows,enter.lesson_page,Korean,Chrome,Web,91053eda0bffb4db6c9317acf15bc44a,94bdcbd9f329aafa84ab464b6721187d,False,d360d68b65fd3e0eafb7982386d75fd9,
2,Wonju,2023-04-05 21:55:55.684,South Korea,Windows,Windows,enter.lesson_page,Korean,Chrome,Web,91053eda0bffb4db6c9317acf15bc44a,94bdcbd9f329aafa84ab464b6721187d,False,770740696ab1809d6b50d4013b64a75c,
3,Dongdaemun-gu,2023-04-05 21:04:55.828,South Korea,Mac,Mac,enter.lesson_page,Korean,Chrome,Web,b2b31f00b82edc6419b25324a7ae41a4,5fc64d786416b980cfda86afd69e4516,False,68cc362ca81b5899b3c9b4dea0aaddff,
4,Dongdaemun-gu,2023-04-05 21:05:34.139,South Korea,Mac,Mac,enter.lesson_page,Korean,Chrome,Web,b2b31f00b82edc6419b25324a7ae41a4,db43a841c994231e2795d4df8931af50,False,144682f4648f81de8e6aabd11b92105e,


file: enter.main_page.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id
0,Seongnam-si,2023-04-05 21:55:06.140,South Korea,Samsung Phone,Samsung Galaxy S21 5G,enter.main_page,Korean,Whale,Web,
1,Dongdaemun-gu,2023-04-05 21:04:28.967,South Korea,Mac,Mac,enter.main_page,Korean,Chrome,Web,b2b31f00b82edc6419b25324a7ae41a4
2,Hwaseong-si,2023-04-05 21:01:31.258,South Korea,Samsung Galaxy Note,Samsung Galaxy Note10+ 5G,enter.main_page,Korean,Chrome,Web,
3,Hwaseong-si,2023-04-05 21:02:50.374,South Korea,Samsung Galaxy Note,Samsung Galaxy Note10+ 5G,enter.main_page,Korean,Chrome,Web,
4,Seoul,2023-04-05 21:30:33.727,South Korea,Samsung Phone,Samsung Galaxy A51 5G,enter.main_page,Korean,Chrome,Web,


file: enter.payment_page.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id
0,Seoul,2023-04-05 21:31:27.602,South Korea,Samsung Phone,Samsung Galaxy A51 5G,enter.payment_page,Korean,Chrome,Web,8ee58bcaa05b234cdbf84167c0983385
1,Wonju,2023-04-05 21:33:30.699,South Korea,Windows,Windows,enter.payment_page,Korean,Chrome,Web,74d17c27a41656e7788b0c81d9f76cda
2,Daejeon,2023-04-05 21:05:28.337,South Korea,Windows,Windows,enter.payment_page,Korean,Edge,Web,62cfa08370da60b8db4495baaff62806
3,Seoul,2023-04-05 21:45:27.935,South Korea,Mac,Mac,enter.payment_page,Korean,Safari,Web,06b6868000a5d576180aca47896d6cce
4,Seoul,2023-04-05 21:46:50.632,South Korea,Mac,Mac,enter.payment_page,Korean,Safari,Web,06b6868000a5d576180aca47896d6cce


file: enter.signup_page.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id
0,Seoul,2023-04-05 21:30:42.557,South Korea,Samsung Phone,Samsung Galaxy A51 5G,enter.signup_page,Korean,Chrome,Web,
1,Seoul,2023-04-05 21:37:41.467,South Korea,Samsung SM-S916N,,enter.signup_page,Korean,Chrome,Web,
2,Seoul,2023-04-05 21:05:53.237,South Korea,Apple iPhone,Apple iPhone,enter.signup_page,Korean,Mobile Safari,Web,
3,Gangnam-gu,2023-04-05 21:41:58.269,South Korea,Mac,Mac,enter.signup_page,Korean,Safari,Web,
4,Seoul,2023-04-05 21:44:53.294,South Korea,Mac,Mac,enter.signup_page,Korean,Safari,Web,


file: renew.subscription.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
plan.price: [nan 14328 15920 19920 42960]
paid_amount: [nan 3960 7960 9552 11144]
coupon.discount_amount: [nan -52400 -48400 0 478]
pg.type: [nan 'A' 'B' 'C' None]


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,plan.price,paid_amount,coupon.discount_amount,pg.type
0,Nam-gu,2022-09-08 03:05:02.068,South Korea,Windows,Windows,renew.subscription,Korean,Edge,Web,6ddea3af2eaae869861c1190dcc9d4a3,15920,15920,0,
1,Nam-gu,2022-09-08 03:40:02.050,South Korea,Windows,Windows,renew.subscription,Korean,Chrome,Web,c2d47034960fdff5df495822ff66fbc1,15920,15920,0,
2,Dongjak-gu,2022-09-08 03:49:00.829,South Korea,Windows,Windows,renew.subscription,Korean,Chrome,Web,de91f1226dd938990fefc79545daf468,42960,40812,2148,
3,Yongin-si,2022-09-08 04:10:06.406,South Korea,Windows,Windows,renew.subscription,Korean,Chrome,Web,bf14840a8cb4643e6d557da96e35a3af,15920,15920,0,
4,Dongjak-gu,2022-09-08 04:40:01.895,South Korea,Android,Android,renew.subscription,Korean,Chrome Mobile,Web,7ddc1c2990d0645d82885394e3c518b5,15920,15920,0,


file: resubscribe.subscription.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
plan.price: [nan 14328 15920 19920 42960]
paid_amount: [nan 3960 7960 9552 11144]
coupon.discount_amount: [nan -52400 -48400 0 478]
pg.type: [nan 'A' 'B' 'C' None]


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,plan.price,paid_amount,coupon.discount_amount,pg.type
0,Daejeon,2023-04-05 21:11:45.525,South Korea,Windows,Windows,resubscribe.subscription,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,15920,15920,0,
1,Seoul,2023-06-30 09:59:56.848,South Korea,Samsung Phone,Samsung Galaxy S22+,resubscribe.subscription,Korean,Chrome WebView,Web,68831c27e47f0e8f04ba809baeec218e,15920,15920,0,
2,Nowon-gu,2023-06-13 06:20:22.524,South Korea,Windows,Windows,resubscribe.subscription,Korean,Chrome,Web,4e7568cb7db01f8386614ec2c6c961cd,15920,15920,0,
3,Suwon,2023-06-11 11:02:11.878,South Korea,Windows,Windows,resubscribe.subscription,Korean,Chrome,Web,f91f19527b9ec688e028ec884b1b5c1c,131600,131600,0,
4,Koishikawa,2023-06-13 14:39:25.736,Japan,Apple iPhone,Apple iPhone,resubscribe.subscription,Japanese,Chrome,Web,2276d7d2a31e05f4a5558c6406cc36c9,131600,131600,0,


file: start.content.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
content.id: [nan '011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61']
content.difficulty: [nan 'advanced' 'beginner' 'hard' 'intermediate']


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id,content.difficulty
0,Daejeon,2023-04-05 21:14:39.906,South Korea,Windows,Windows,start.content,Korean,Edge,Web,41362ad5ebcade2bb1b78344a53e7ccf,445fac33232adcb3d1cdab7d8a579430,beginner
1,Seoul,2023-04-05 21:50:19.674,South Korea,Mac,Mac,start.content,Korean,Safari,Web,06b6868000a5d576180aca47896d6cce,f4be1a0fe0e301b03115591777fddf29,beginner
2,Seoul,2023-04-10 14:06:26.585,South Korea,Mac,Mac,start.content,Korean,Whale,Web,7b2ff3116db46b5e4326d49c7b027631,61b6463287573f00de13a930805a52d6,beginner
3,Toronto,2023-04-10 14:15:29.040,Canada,Windows,Windows,start.content,English,Chrome,Web,eb7515590646ead796fdfb54e2729240,61b6463287573f00de13a930805a52d6,beginner
4,Seoul,2023-04-10 14:21:36.467,South Korea,Windows,Windows,start.content,Korean,Chrome,Web,85a332649d34bc7b5ebdec7660b1d57d,18992b667be06d6f29ba0008f99d9745,advanced


file: start.free_trial.csv decodings - 
city: [nan "'Ewa Beach" '6th of October City' 'A Coruña' 'A Me']
country: [nan 'Afghanistan' 'Albania' 'Algeria' 'American Samoa']
device_family: [nan '10A30Q' '21051182G' '21061110AG' '21061119AG']
device_type: [nan 'Amazon Fire HD 10' 'Amazon Fire HD 8' 'Amazon Kindle Fire HDX'
 'Android']
event_type: ['error' 'enter.main_page' 'enter.signup_page' 'complete.signup'
 'enter.content_page']
language: [nan 'Afrikaans' 'Albanian' 'Arabic' 'Azerbaijani']
os_name: [nan 'AVG Secure Browser' 'Android Browser' 'Avast Secure Browser'
 'Chrome']
platform: [nan 'Web' None]
user_id: [nan '0000572f2ba8079bcefd2760b632e820' '000087537cbfd934375364218ff10f91'
 '000117267e1dd62481ef3ae7fb420107' '00012b6dbfccfb458ad0d08f34477944']
plan.price: [nan 14328 15920 19920 42960]
plan.type: [nan '12개월 플랜' '1개월 플랜' None]
trial.type: [nan 'A' 'B' None]


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,plan.price,plan.type,trial.type
0,Seoul,2023-04-05 21:49:36.156,South Korea,Mac,Mac,start.free_trial,Korean,Safari,Web,06b6868000a5d576180aca47896d6cce,15920,1개월 플랜,B
1,Dobong-gu,2023-03-12 08:28:29.384,South Korea,Windows,Windows,start.free_trial,Korean,Chrome,Web,40acc1ddfc012f3d2f58a3de95e9e748,15920,1개월 플랜,B
2,,2023-03-12 08:35:00.107,,,,start.free_trial,,,,38aca03fba9753a6b2f31b829fb16a30,15920,1개월 플랜,B
3,Seoul,2023-04-12 11:36:50.250,South Korea,Mac,Mac,start.free_trial,Korean,Safari,Web,cc8051a75f62278b0e4216dd0a182d8c,15920,1개월 플랜,B
4,,2023-02-18 04:13:20.822,,,,start.free_trial,,,,4cc436ad294973993ccad3e3a70053a7,131600,12개월 플랜,B


#### 함수 정의

In [82]:
# 데이터를 불러오되 일정 크기 이상이면 샘플로 추출하여 크기를 줄입니다.
# 샘플크기는 4000000으로 설정되어 있습니다.
# 샘플크기를 변경하고 싶으면 samplesize를 변경해주세요.
def read_sample(path, samplesize=4000000):
    # set
    df = []
    # read
    temp = pd.read_csv(path, chunksize=2000000)
    
    # append and union
    for chunk in temp:
        df.append(chunk)
    
    # check df length
    if len(df) == 1:
        df = df[0]
    else:
        df = pd.concat(df, ignore_index=True)
    
    # sample and drop duplicates
    if df.shape[0] > samplesize:
        df = df.sample(samplesize, replace=True)
        df.drop_duplicates(inplace=True)
        df.reset_index(drop=True, inplace=True) 
        
    return df

In [134]:
def check_too_close_log(df, offset = 1):
    ## error handling
    
    # check 'user_id' column
    if 'user_id' not in df.columns:
        print("'user_id' column not found")
        return
    
    # check 'client_event_time' column
    if 'client_event_time' not in df.columns:
        print("'client_event_time' column not found")
        return
    
    # if 'client_event_time' is str, convert to datetime
    if type(df['client_event_time']) == str:    
        df['client_event_time'] = pd.to_datetime(df['client_event_time'])

    ## main

    # drop duplicate
    temp = df.drop_duplicates().copy()
    print(f"drop duplicate : {df.shape[0] - temp.shape[0]} \\")
    df = temp.copy()
    
    # 'user_id' - max conut
    temp = df.groupby('user_id')['user_id'].count().sort_values(ascending=False)
    print(f"'user_id' - max conut : {temp.max()} \\")

    # over 1
    df_len = df.shape[0]
    print(f"over 1 : {((temp > 1).sum() / df_len * 100).round(2)}% \\" )
    
    # 'user_id', 'client_event_time' - max conut
    temp = df.groupby(['user_id', 'client_event_time'])['client_event_time']\
        .count()\
        .sort_values(ascending=False)
    print(f"'user_id', 'client_event_time' - max conut : {temp.max()} \\\n")
    
    if temp.max() > 1:
        mask = df.duplicated(subset=['user_id', 'client_event_time'], keep=False)
        display(df[mask].sort_values(['user_id', 'client_event_time']).head(10))
    
    # too_close_log check
    temp = df.sort_values(['user_id', 'client_event_time'])\
            .groupby(['user_id'])['client_event_time']\
            .diff().dt.total_seconds()
    temp = temp.dropna()
    
    # display
    print(temp.head(3))
    print(temp.tail(3), '\n\n')
    
    # display too close log
    if (temp < offset).sum() > (temp == 0).sum():
        cnt = (temp < offset).sum() - (temp == 0).sum()
        print(f"too close log : {cnt}, {(cnt / df_len * 100).round(2)}%")
        result = temp[((temp < offset) & (~(temp == 0)))]
        display(result)
        
    return result.index

In [210]:
def concat_enc_df_int32(df1, df2):
    # make common columns in both dfs
    for col in df1:
        if col in df2.columns:
            continue    
        df2.loc[:, col] = 0

    for col in df2:
        if col in df1.columns:
            continue
        df1.loc[:, col] = 0

    # concat
    result = pd.concat([df1, df2], axis=0)\
                .sort_values('client_event_time')\
                .reset_index(drop=True)

    # set convert dict
    convert_dict = {col : 'int16' for col in result.columns}
    convert_dict['client_event_time'] = 'datetime64[ns]'
    convert_dict['user_id'] = 'int32'

    # convert
    result = result.astype(convert_dict)
    
    return result

In [258]:
def get_combinations(li):
    # exception handling
    if len(li) == 0:
        return set()
    if len(li) == 1:
        return set([li[0]])
    if len(li) == 2:
        if len(set(li)) == 1:
            return set([li[0]])
        return set([li[0], li[1]])
    
    combinations = set()
    act = li[0]
    only_act = 0
    
    # get combinations
    for i in li[1:]:
        next_act = i
        
        if next_act == act:
            only_act = act
            continue
        
        combinations.add(str([act, next_act]))
        
        act = next_act
    
    # exception handling
    if len(combinations) == 1:
        li = list(combinations)[0].replace("[", "").replace("]", "").split(", ")
        if len(li) == 1:
            return set([np.number(li[0])])
        else:
            return set([np.number(li[0]), np.number(li[1])])

    if len(combinations) == 0:
        return set([only_act])
    
    return combinations

In [221]:
def get_set(li):
    # exception handling
    if len(li) == 0:
        return set()
    if len(li) == 1:
        return set(str(li[0]))
    if len(li) == 2:
        if len(set(li)) == 1:
            return set(str(li[0]))
        return set([str(li[0]), str(li[1])])
    
    combinations = set()
    act = li[0]
    
    # get combinations
    for i in li[1:]:
        next_act = i
        
        combinations.add(str(act))
        
        act = next_act
    
    return combinations

#### 파일 탐색

In [None]:
BaseFilePath = 'd:\Codeit\intermediate_project'
filelist = os.listdir(BaseFilePath)

# filter filename
dirname = [f for f in filelist if not f.endswith('.zip')][1]
print(dirname)

주제 2. 구독서비스 프로덕트 데이터 분석


  BaseFilePath = 'd:\Codeit\intermediate_project'


#### 로드 데이터

In [None]:
# set
DF = []
dfs = []
common_cols = []
all_cols = []
core_cols = ['user_id', 'event_type', 'client_event_time']

# get filelist
filelist = os.listdir(path.join(BaseFilePath, dirname))

# read csv files and check common columns
for filename in filelist:
    # csv
    if filename.endswith('.csv'):
        print('csv file found')
    
        # read csv
        df = read_sample(path.join(BaseFilePath, dirname, filename))

        # if common columns are not set, set it            
        if common_cols == [] and list(df.columns):
            common_cols = list(df.columns)
        
        # if common columns are set, check common columns
        common_cols = list(set(common_cols).intersection(set(df.columns)))
        all_cols = list(set(all_cols).union(set(df.columns)))

        ## preprocessing
        # drop device_carrier
        df = df.drop('device_carrier', axis=1)
        # convert client_event_time to datetime
        df['client_event_time'] = pd.to_datetime(df['client_event_time'])
        # convert os_version to string
        df['os_version'] = df['os_version'].astype(str)
        # # before update, fillna with '0'
        # df.fillna({'country' : '0', 'city' : '0', 'device_type' : '0', 'device_family' : '0'}, inplace=True)
        
        # update
        dfs.append(df)

    # if not csv, just pass    
    else:
        print('other file found')
        print(filename)

print(common_cols)

csv file found
csv file found
csv file found


  for chunk in temp:


csv file found
csv file found
csv file found
csv file found
csv file found
csv file found
csv file found


  for chunk in temp:
  for chunk in temp:
  for chunk in temp:
  for chunk in temp:
  for chunk in temp:
  for chunk in temp:
  for chunk in temp:


csv file found
csv file found
csv file found
csv file found
csv file found
csv file found
csv file found
other file found
중급1_구독서비스 데이터 이벤트 명세서.xlsx
['user_id', 'device_family', 'language', 'country', 'os_name', 'platform', 'event_type', 'device_carrier', 'client_event_time', 'os_version', 'device_type', 'city']


#### 데이터 통합

In [None]:
# reset DF
if len(DF) > 0:
    DF = []

# concat dfs
DF = pd.concat(dfs[:9] + dfs[10:], axis=0).reset_index(drop=True)
print(DF.shape[0])

## preprocessing
# drop os_version
DF = DF.drop('os_version', axis=1)

# encode categorical columns using the loaded encoders
for col in DF.columns:
    # exception handling
    if col in ['client_event_time', 'device_carrier']:
        continue
    
    print(f"\n\n{col}")
    
    # encode using the loaded encoder
    DF[col] = encoder_dict[col].transform(DF[col])
    
    # display
    display(DF[col].value_counts())

# display result head   
DF

9752541


city


city
3440    1916308
3436     344756
0        342556
3694     310699
1228     289534
         ...   
1517          1
1270          1
3349          1
992           1
2641          1
Name: count, Length: 4288, dtype: int64



country


country
158    9322437
0       210472
181      64529
85       27167
31       19944
        ...   
90           1
103          1
38           1
118          1
123          1
Name: count, Length: 189, dtype: int64



device_family


device_family
1029    5834764
570     1560911
135      874018
773      494798
129      371802
         ...   
656           1
675           1
781           1
728           1
302           1
Name: count, Length: 1118, dtype: int64



device_type


device_type
609    5834764
158    1560911
38      667763
4       371802
0       277627
        ...   
583          1
224          1
171          1
375          1
597          1
Name: count, Length: 652, dtype: int64



event_type


event_type
14    2826664
1     2133548
4     1914257
15    1379136
2      596829
5      172911
7      165061
3      145133
12     124917
16     120582
6       93962
11      30621
17      17895
9       15945
8       14289
10        791
Name: count, dtype: int64



language


language
38    9133833
15     459578
0      131288
33      10821
10       3596
       ...   
9           1
60          1
63          1
61          1
34          1
Name: count, Length: 71, dtype: int64



os_name


os_name
4     6092324
10     985713
23     507982
34     393408
40     342060
17     334080
35     315700
7      198787
6      156895
0      134018
39     110200
12      58060
5       54052
13      39014
15      19624
26       3919
41       2628
3        1653
2         746
20        487
28        245
38        239
36        155
11        115
1         111
30         52
32         51
19         48
16         37
33         32
42         30
8          26
37         24
31          8
14          5
21          4
27          2
22          1
24          1
25          1
43          1
18          1
29          1
9           1
Name: count, dtype: int64



platform


platform
1    9621253
0     131288
Name: count, dtype: int64



user_id


user_id
0         2861025
56112       30063
147616      24341
101729       5151
3799         4771
           ...   
165479          1
85303           1
52643           1
38467           1
169601          1
Name: count, Length: 178828, dtype: int64



content.id


content.id
0      3674696
201     465634
200     449619
85      323967
159     317454
        ...   
58           1
63           1
120          1
178          1
25           1
Name: count, Length: 211, dtype: int64



button.name


button.name
0    9666299
6      40961
7      27797
2      13596
3       2513
4        907
5        293
1        174
8          1
Name: count, dtype: int64



button_name


button_name
0    9713590
4      23793
1       6094
3       5348
2       3716
Name: count, dtype: int64



question.id


question.id
0       8927752
6967       6160
7619       3607
5970       3324
5982       2875
         ...   
6005          1
7319          1
4875          1
1462          1
1173          1
Name: count, Length: 7796, dtype: int64



lesson.id


lesson.id
0       6101088
4121      22985
2074      16370
49        15657
3563      15599
         ...   
4499          1
3082          1
5089          1
3366          1
4187          1
Name: count, Length: 5016, dtype: int64



type


type
0    9607583
4      89484
3      21465
5      19440
1      14126
2        442
6          1
Name: count, dtype: int64



plan.price


plan.price
0     9719077
2       21343
12       4060
4        2230
5        2171
10       1500
6         850
7         496
9         461
8         326
1          23
11          3
3           1
Name: count, dtype: int64



paid_amount


paid_amount
0     9721516
13      16836
50       2261
31       2160
23       1742
41       1483
2         896
1         871
7         855
34        827
35        494
40        461
37        325
44        316
18        254
8         242
6         213
17        101
45         99
43         95
15         71
12         68
42         56
11         40
5          38
16         28
38         27
22         24
49         24
30         20
39         17
10         13
48         12
9           9
29          8
3           6
46          5
27          4
20          4
19          3
21          3
47          3
26          2
25          2
4           1
14          1
33          1
24          1
32          1
28          1
36          1
Name: count, dtype: int64



coupon.discount_amount


coupon.discount_amount
0     9721516
3       26590
29        896
34        871
13        855
38        316
26        254
9         219
17        213
30        101
35         96
39         95
37         71
4          68
40         56
5          40
21         38
32         28
41         27
7          24
19         24
43         20
2          18
10         17
6          13
27         12
8           9
12          8
22          7
25          6
33          5
20          4
16          4
31          3
11          3
18          3
44          2
24          2
36          1
1           1
14          1
28          1
15          1
42          1
23          1
Name: count, dtype: int64



pg.type


pg.type
0    9747994
1       3731
2        767
3         49
Name: count, dtype: int64



content.difficulty


content.difficulty
0    9627624
2      58628
4      36109
1      27485
3       2695
Name: count, dtype: int64



plan.type


plan.type
0    9750102
2       1278
1       1161
Name: count, dtype: int64



trial.type


trial.type
0    9721920
1      28182
2       2439
Name: count, dtype: int64

Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,...,question.id,lesson.id,type,plan.price,paid_amount,coupon.discount_amount,pg.type,content.difficulty,plan.type,trial.type
0,886,2023-04-10 14:57:48.556,158,1029,609,17,38,4,1,111466,...,0,0,0,0,0,0,0,0,0,0
1,3440,2023-03-12 08:26:16.224,158,130,5,17,15,23,1,55652,...,0,0,0,0,0,0,0,0,0,0
2,3945,2023-03-12 08:23:04.955,158,1029,609,17,38,4,1,128371,...,0,0,0,0,0,0,0,0,0,0
3,967,2023-03-12 08:32:39.271,158,1029,609,17,38,4,1,45697,...,0,0,0,0,0,0,0,0,0,0
4,1260,2023-04-12 11:36:28.311,158,1029,609,17,38,4,1,133511,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9752536,2765,2022-02-21 04:56:12.117,158,1029,609,11,38,4,1,149187,...,0,0,0,0,0,0,0,0,0,1
9752537,3440,2022-02-21 04:49:35.114,158,130,5,11,38,23,1,68438,...,0,0,0,0,0,0,0,0,0,1
9752538,3424,2022-02-21 04:37:53.294,158,1029,609,11,38,4,1,177735,...,0,0,0,0,0,0,0,0,0,1
9752539,3440,2022-02-21 04:13:10.443,158,135,38,11,38,23,1,114710,...,0,0,0,0,0,0,0,0,0,1


In [None]:
com_DF = DF[core_cols].copy() # copy core_cols

print(com_DF.info())
DF.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9752541 entries, 0 to 9752540
Data columns (total 3 columns):
 #   Column             Dtype         
---  ------             -----         
 0   user_id            int32         
 1   event_type         int32         
 2   client_event_time  datetime64[ns]
dtypes: datetime64[ns](1), int32(2)
memory usage: 148.8 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9752541 entries, 0 to 9752540
Data columns (total 23 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   city                    int32         
 1   client_event_time       datetime64[ns]
 2   country                 int32         
 3   device_family           int32         
 4   device_type             int32         
 5   event_type              int32         
 6   language                int32         
 7   os_name                 int32         
 8   platform                int32         
 9   user_id                 int

In [None]:
big_df = pd.read_csv(path.join(BaseFilePath, dirname, filelist[9]), chunksize=2000000)
enc_chunks = []

for i, chunk in enumerate(big_df):
    # preprocessing
    chunk = chunk.drop('device_carrier', axis=1)
    chunk = chunk.drop('os_version', axis=1)
    chunk['client_event_time'] = pd.to_datetime(chunk['client_event_time'])
    
    # display
    print(f"\n\n{i}")
    print(chunk.info())
    
    # encoding
    for col in chunk.columns:
        # exception handling
        if col in ['client_event_time', 'device_carrier']:
            continue
        
        # label encoding
        chunk[col] = encoder_dict[col].transform(chunk[col])
        
    enc_chunks.append(chunk)
    
big_df = pd.concat(enc_chunks, axis=0)

  for i, chunk in enumerate(big_df):




0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               object        
 1   client_event_time  datetime64[ns]
 2   country            object        
 3   device_family      object        
 4   device_type        object        
 5   event_type         object        
 6   language           object        
 7   os_name            object        
 8   platform           object        
 9   user_id            object        
 10  content.id         object        
 11  is_trial           object        
 12  lesson.id          object        
 13  is_free_trial      float64       
dtypes: datetime64[ns](1), float64(1), object(12)
memory usage: 213.6+ MB
None


1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 2000000 to 3999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------       

  for i, chunk in enumerate(big_df):




2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 4000000 to 5999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               object        
 1   client_event_time  datetime64[ns]
 2   country            object        
 3   device_family      object        
 4   device_type        object        
 5   event_type         object        
 6   language           object        
 7   os_name            object        
 8   platform           object        
 9   user_id            object        
 10  content.id         object        
 11  is_trial           object        
 12  lesson.id          object        
 13  is_free_trial      float64       
dtypes: datetime64[ns](1), float64(1), object(12)
memory usage: 213.6+ MB
None


  for i, chunk in enumerate(big_df):




3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 6000000 to 7999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               object        
 1   client_event_time  datetime64[ns]
 2   country            object        
 3   device_family      object        
 4   device_type        object        
 5   event_type         object        
 6   language           object        
 7   os_name            object        
 8   platform           object        
 9   user_id            object        
 10  content.id         object        
 11  is_trial           object        
 12  lesson.id          object        
 13  is_free_trial      object        
dtypes: datetime64[ns](1), object(13)
memory usage: 213.6+ MB
None


4
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 8000000 to 9999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             

  for i, chunk in enumerate(big_df):




5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 10000000 to 11999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               object        
 1   client_event_time  datetime64[ns]
 2   country            object        
 3   device_family      object        
 4   device_type        object        
 5   event_type         object        
 6   language           object        
 7   os_name            object        
 8   platform           object        
 9   user_id            object        
 10  content.id         object        
 11  is_trial           float64       
 12  lesson.id          object        
 13  is_free_trial      object        
dtypes: datetime64[ns](1), float64(1), object(12)
memory usage: 213.6+ MB
None


  for i, chunk in enumerate(big_df):




6
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 12000000 to 13999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               object        
 1   client_event_time  datetime64[ns]
 2   country            object        
 3   device_family      object        
 4   device_type        object        
 5   event_type         object        
 6   language           object        
 7   os_name            object        
 8   platform           object        
 9   user_id            object        
 10  content.id         object        
 11  is_trial           float64       
 12  lesson.id          object        
 13  is_free_trial      object        
dtypes: datetime64[ns](1), float64(1), object(12)
memory usage: 213.6+ MB
None


  for i, chunk in enumerate(big_df):




7
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 14000000 to 15999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               object        
 1   client_event_time  datetime64[ns]
 2   country            object        
 3   device_family      object        
 4   device_type        object        
 5   event_type         object        
 6   language           object        
 7   os_name            object        
 8   platform           object        
 9   user_id            object        
 10  content.id         object        
 11  is_trial           object        
 12  lesson.id          object        
 13  is_free_trial      object        
dtypes: datetime64[ns](1), object(13)
memory usage: 213.6+ MB
None


8
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 16000000 to 17999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------         

  for i, chunk in enumerate(big_df):




9
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 18000000 to 19999999
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               object        
 1   client_event_time  datetime64[ns]
 2   country            object        
 3   device_family      object        
 4   device_type        object        
 5   event_type         object        
 6   language           object        
 7   os_name            object        
 8   platform           object        
 9   user_id            object        
 10  content.id         object        
 11  is_trial           object        
 12  lesson.id          object        
 13  is_free_trial      float64       
dtypes: datetime64[ns](1), float64(1), object(12)
memory usage: 213.6+ MB
None


10
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029707 entries, 20000000 to 21029706
Data columns (total 14 columns):
 #   Column             Non-Null Count    Dtyp

In [None]:
print(big_df.info())
big_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21029707 entries, 0 to 21029706
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               int32         
 1   client_event_time  datetime64[ns]
 2   country            int32         
 3   device_family      int32         
 4   device_type        int32         
 5   event_type         int32         
 6   language           int32         
 7   os_name            int32         
 8   platform           int32         
 9   user_id            int32         
 10  content.id         int32         
 11  is_trial           int32         
 12  lesson.id          int32         
 13  is_free_trial      int32         
dtypes: datetime64[ns](1), int32(13)
memory usage: 1.2 GB
None


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id,is_trial,lesson.id,is_free_trial
0,4171,2023-04-05 21:46:04.333,158,1029,609,13,38,4,1,102081,125,1,4544,0
1,4171,2023-04-05 21:55:50.787,158,1029,609,13,38,4,1,102081,125,1,4544,0
2,4171,2023-04-05 21:55:55.684,158,1029,609,13,38,4,1,102081,125,1,2558,0
3,979,2023-04-05 21:04:55.828,158,570,158,13,38,4,1,125592,81,1,2278,0
4,979,2023-04-05 21:05:34.139,158,570,158,13,38,4,1,125592,182,1,410,0


#### 인코딩 데이터 저장

In [None]:
# save the big_df
big_df.to_csv('total_lesson.csv', index=False)
print('done - save total_lesson.csv')

# save the DF
DF.to_csv('total.csv', index=False)
print('done - save total.csv')

done


#### 인코딩 데이터 불러오기

In [None]:
big_df = pd.read_csv('total_lesson.csv')
print('done - load total_lesson.csv')

DF = pd.read_csv('total.csv')
print('done - load total.csv')

#### 데이터 분할

In [None]:
# separate 2022 / 2023
enter_lesson_2022 = big_df[big_df['client_event_time'].dt.year == 2022]
print(enter_lesson_2022.shape[0])
enter_lesson_2023 = big_df[big_df['client_event_time'].dt.year == 2023]
print(enter_lesson_2023.shape[0])

DF_2022 = DF[DF['client_event_time'].dt.year == 2022]
print(DF_2022.shape[0])
DF_2023 = DF[DF['client_event_time'].dt.year == 2023]
print(DF_2023.shape[0])

7373559
13655574
4818791
4933363


#### 각 연도 묶음

In [90]:
df_2022 = concat_enc_df_int32(DF_2022, enter_lesson_2022)

display(df_2022.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12192350 entries, 0 to 12192349
Data columns (total 25 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   city                    int16         
 1   client_event_time       datetime64[ns]
 2   country                 int16         
 3   device_family           int16         
 4   device_type             int16         
 5   event_type              int16         
 6   language                int16         
 7   os_name                 int16         
 8   platform                int16         
 9   user_id                 int32         
 10  content.id              int16         
 11  button.name             int16         
 12  button_name             int16         
 13  question.id             int16         
 14  lesson.id               int16         
 15  type                    int16         
 16  plan.price              int16         
 17  paid_amount             int16         
 18  

None

In [None]:
df_2023 = concat_enc_df_int32(DF_2023, enter_lesson_2023)

display(df_2023.head(5))
display(df_2023.info())


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,...,type,plan.price,paid_amount,coupon.discount_amount,pg.type,content.difficulty,plan.type,trial.type,is_trial,is_free_trial
0,1229,2023-01-01 00:00:02.833,158,498,157,13,38,4,1,40880,...,0,0,0,0,0,0,0,0,0,0
1,3436,2023-01-01 00:00:02.975,158,570,158,14,38,34,1,173501,...,0,0,0,0,0,0,0,0,0,0
2,3436,2023-01-01 00:00:03.391,158,570,158,13,38,34,1,173501,...,0,0,0,0,0,0,0,0,1,0
3,1332,2023-01-01 00:00:04.900,158,135,38,1,38,39,1,0,...,0,0,0,0,0,0,0,0,0,0
4,3436,2023-01-01 00:00:06.338,158,570,158,13,38,34,1,173501,...,0,0,0,0,0,0,0,0,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18588937 entries, 0 to 18588936
Data columns (total 25 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   city                    int16         
 1   client_event_time       datetime64[ns]
 2   country                 int16         
 3   device_family           int16         
 4   device_type             int16         
 5   event_type              int16         
 6   language                int16         
 7   os_name                 int16         
 8   platform                int16         
 9   user_id                 int32         
 10  content.id              int16         
 11  button.name             int16         
 12  button_name             int16         
 13  question.id             int16         
 14  lesson.id               int16         
 15  type                    int16         
 16  plan.price              int16         
 17  paid_amount             int16         
 18  

None

#### 변수 선언

In [77]:
core_col = ['client_event_time', 'user_id', 'event_type']

In [79]:
com_df_23 = df_2023[core_col]
com_df_23

Unnamed: 0,client_event_time,user_id,event_type
0,2023-01-01 00:00:02.833,40880,13
1,2023-01-01 00:00:02.975,173501,14
2,2023-01-01 00:00:03.391,173501,13
3,2023-01-01 00:00:04.900,0,1
4,2023-01-01 00:00:06.338,173501,13
...,...,...,...
18588932,2023-12-31 23:58:39.902,86601,4
18588933,2023-12-31 23:58:48.291,86601,13
18588934,2023-12-31 23:58:57.928,65802,14
18588935,2023-12-31 23:59:18.981,12167,4


In [136]:
close_indexs = check_too_close_log(com_df_23[com_df_23['user_id'] != 0], offset = 1)

drop duplicate : 731 \
'user_id' - max conut : 17086 \
over 1 : 0.65% \
'user_id', 'client_event_time' - max conut : 2 \



Unnamed: 0,client_event_time,user_id,event_type
13867359,2023-09-11 02:50:11.945,135,14
13867360,2023-09-11 02:50:11.945,135,13
5439479,2023-04-10 13:37:29.981,172,13
5439480,2023-04-10 13:37:29.981,172,14
7755620,2023-05-29 05:10:17.814,316,14
7755621,2023-05-29 05:10:17.814,316,13
15530264,2023-10-10 12:52:29.628,455,13
15530265,2023-10-10 12:52:29.628,455,14
2449603,2023-02-07 12:25:09.638,567,13
2449604,2023-02-07 12:25:09.638,567,14


4344483    56.551
4344490     9.494
4344496     6.610
Name: client_event_time, dtype: float64
18079268    15.266
18079276    31.539
18059088     5.730
Name: client_event_time, dtype: float64 


too close log : 2268247, 14.76%


4356256     0.320
4356451     0.100
4356455     0.013
4356678     0.051
4356976     0.447
            ...  
9362299     0.477
13905650    0.349
13907956    0.072
17606790    0.244
17608777    0.176
Name: client_event_time, Length: 2268247, dtype: float64

In [120]:
temp = com_df_23[com_df_23['user_id'] != 0].groupby(['user_id', 'client_event_time'])['client_event_time']\
    .count()\
    .sort_values(ascending=False)

if temp.max() > 1:
    mask = com_df_23.duplicated(subset=['user_id', 'client_event_time'], keep=False)
    
result = com_df_23[mask]\
    .groupby(['user_id', 'client_event_time'])['event_type']\
    .agg(lambda x: str(x.values))\
    .value_counts()
rate = result / result.sum() * 100
rate_cum = rate.cumsum()

print(result.sum())

result = pd.concat([result, rate, rate_cum], axis = 1)
display(result.head(20))

2503


Unnamed: 0_level_0,count,count,count
event_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
[1 1],465,18.577707,18.577707
[13 13],439,17.538953,36.11666
[5 5],376,15.021974,51.138634
[14 13],319,12.744706,63.88334
[13 14],307,12.265282,76.148622
[4 4],191,7.630843,83.779465
[2 2],57,2.277267,86.056732
[13 1],36,1.438274,87.495006
[13 12],30,1.198562,88.693568
[ 1 13],28,1.118658,89.812225


[1 1]	465	18.577707	18.577707 \
메인 페이지 중복

[13 13]	439	17.538953	36.116660 \
레슨 시작 중복

[5 5]	376	15.021974	51.138634 \
콘텐츠 수강하기 중복

[14 13]	319	12.744706	63.883340 \
[13 14]	307	12.265282	76.148622 \
레슨 시작과 완료 중복

[4 4]	191	7.630843	83.779465 \
콘텐츠 개별 페이지 진입 중복

[2 2]	57	2.277267	86.056732 \
회원가입 페이지 중복

[13 1]	36	1.438274	87.495006 \
[ 1 13]	28	1.118658	89.812225 \
메인 페이지 진입과 레슨 시작 중복

[13 12]	30	1.198562	88.693568 \
[12 13]	20	0.799041	92.528965 \
콘텐츠 수강 시작과 레슨 시작 중복

[7 7]	26	1.038753	90.850979 \
결제 페이지 중복

[14 16]	22	0.878945	91.729924 \
[16 14]	16	0.639233	93.847383 \
레슨 완료와 콘텐츠 수강 완료 중복

[ 2 13]	17	0.679185	93.208150 \
[13 2]	12	0.479425	96.004794 \
레슨시작과 회원 가입 페이지 진입 중복

[16 13]	16	0.639233	94.486616 \
[13 16]	14	0.559329	95.045945 \
레슨 시작과 콘텐츠 수강 완료 중복

[ 5 12]	12	0.479425	95.525370 \
[12 5]	11	0.439473	96.444267 \
콘텐츠 수강하기 버튼 클릭과 콘텐츠 수강시작 중복

In [137]:
com_df_23.loc[close_indexs]

Unnamed: 0,client_event_time,user_id,event_type
4356256,2023-03-18 15:45:24.704,1,13
4356451,2023-03-18 15:53:42.851,1,13
4356455,2023-03-18 15:53:48.730,1,14
4356678,2023-03-18 16:04:54.296,1,14
4356976,2023-03-18 16:23:11.315,1,14
...,...,...,...
9362299,2023-07-03 04:52:16.705,179753,7
13905650,2023-09-11 10:06:34.388,179755,3
13907956,2023-09-11 10:38:24.622,179755,7
17606790,2023-11-15 04:45:44.402,179755,7


In [225]:
# set target df
df = com_df_23.loc[close_indexs]

#   
result = df\
    .sort_values(['user_id', 'client_event_time'])\
    .groupby(['user_id'])['event_type']\
    .agg(lambda x: get_combinations(x.values))\
    .value_counts()
rate = result / result.sum() * 100
rate_cum = rate.cumsum()

print(result.sum())

result = pd.concat([result, rate, rate_cum], axis = 1)
display(result.head(30).round(2))

#   
result = df\
    .sort_values(['user_id', 'client_event_time'])\
    .groupby(['user_id'])['event_type']\
    .agg(lambda x: get_set(x.values))\
    .value_counts()
rate = result / result.sum() * 100
rate_cum = rate.cumsum()

print(result.sum())

result = pd.concat([result, rate, rate_cum], axis = 1)
display(result.head(30).round(2))

64373


Unnamed: 0_level_0,count,count,count
event_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"{1, 3}",15068,23.41,23.41
{7},9867,15.33,38.74
{3},4135,6.42,45.16
"{13, 14}",3783,5.88,51.04
"{13, 7}",2568,3.99,55.02
"{[13, 14], [13, 16]}",2474,3.84,58.87
"{[13, 14], [14, 16], [13, 16]}",1315,2.04,60.91
"{1, 4}",1118,1.74,62.65
"{13, 12}",999,1.55,64.2
"{13, 16}",976,1.52,65.72


64373


Unnamed: 0_level_0,count,count,count
event_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"{1, 3}",12240,19.01,19.01
{7},10138,15.75,34.76
"{13, 14}",4223,6.56,41.32
{3},4135,6.42,47.75
"{13, 16, 14}",3856,5.99,53.74
{13},3616,5.62,59.35
"{13, 7}",2337,3.63,62.98
"{12, 13, 16, 14}",1334,2.07,65.06
"{13, 12}",1270,1.97,67.03
"{13, 12, 14}",1189,1.85,68.88


{7}	                            9867	15.33	38.74 \
결제 페이지 연속 로그
{3}	                            4135	6.42	45.16 \
회원 가입 연속 로그
{1}	                            947	    1.47	67.19 \
메인 페이지 연속 로그
{4}	                            603	    0.94	71.82 \
콘텐츠 개별 페이지 연속 로그

{1, 3}	                        15068	23.41	23.41 \
서비스 메인과 회원가입 완료
{13, 14}	                    3783	5.88	51.04 \
레슨 시작과 완료
{13, 7}	                        2568	3.99	55.02 \
레슨 시작과 결제 페이지 진입
{1, 4}	                        1118	1.74	62.65 \
서비스 메인과 콘텐츠 개별 페이지
{13, 12}	                    999	    1.55	64.20 \
콘텐츠 수강 시작과 레슨 시작
{13, 16}	                    976	    1.52	65.72 \
레슨 시작과 콘텐츠 수강 완료
{7, 3}	                        860	    1.34	68.52 \
결제 페이지 진입과 회원가입 완료
{13, 3}	                        579	    0.90	72.72 \
레슨 시작과 회원가입 완료
{1, 6}	                        519	    0.81	73.52 \
서비스 메인과 콘텐츠 후기 더보기
{13, 5}	                        377	    0.59	74.70 \
레슨 시작과 콘텐츠 수강하기
{1, 2}	                        377	    0.59	75.28 \
서비스 메인과 회원가입 페이지 진입

{[13, 14], [13, 16]}	        2474	3.84	58.87 \
{[13, 14], [14, 16], [13, 16]}	1315	2.04	60.91 \
레슨 시작과 완료
레슨 시작과 콘텐츠 수강 완료
{[13, 14], [7, 13]}	            775	    1.20	69.73 \
레슨 시작과 결제 페이지 진입
레슨 시작과 완료
{[12, 13], [13, 14]}	        743	    1.15	70.88 \
콘텐츠 수강 시작과 레슨 시작
레슨 시작과 완료

{[12, 13], [13, 14], [13, 16]}	378	    0.59	74.11 \
콘텐츠 수강 시작과 레슨 시작
레슨 시작과 완료
레슨 시작과 콘텐츠 수강 완료

In [259]:
target_df = com_df_23[com_df_23['event_type'].isin([8, 9, 10, 17])]
target_df = target_df[target_df['user_id'] != 0]
target_df.groupby('user_id')['event_type']\
    .agg(lambda x: str(get_combinations(x.values)))\
    .value_counts()

TypeError: cannot create 'numpy.number' instances