#### 모듈

In [23]:
import os
import os.path as path

import pandas as pd

from sklearn.preprocessing import LabelEncoder
import numpy as np

#### 옵션 설정

In [24]:
# Turn off pandas' chained-assignment check for the whole session, so assigning
# into a frame that was sliced from another frame neither warns nor raises.
pd.set_option('mode.chained_assignment', None)

#### 함수 정의

In [25]:
def read_sample(path, samplesize=4000000):
    """Load a CSV file, down-sampling it when it has more rows than ``samplesize``.

    The file is read in chunks of 2,000,000 rows and the chunks are stitched
    back together into a single frame.

    Args:
        path: Location of the CSV file.
        samplesize: Row threshold and number of draws (default 4000000).
            Pass 0 to skip sampling entirely.

    Returns:
        The loaded DataFrame. When sampling is applied, rows are drawn with
        replacement, duplicate rows are then dropped and the index is reset,
        so the result can hold fewer than ``samplesize`` rows.
    """
    # read the file piece by piece and keep every piece
    chunks = list(pd.read_csv(path, chunksize=2000000))

    # a lone chunk is used as-is; several chunks are glued together
    if len(chunks) == 1:
        frame = chunks[0]
    else:
        frame = pd.concat(chunks, ignore_index=True)

    # sampling disabled, or the frame is already small enough
    if samplesize == 0 or frame.shape[0] <= samplesize:
        return frame

    # draw with replacement, then collapse repeated rows and renumber
    return (
        frame.sample(samplesize, replace=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [26]:
def concat_enc_df_int32(df1, df2):
    """Stack two encoded frames, order them by event time and shrink the dtypes.

    A column present in only one of the frames is added to the other one
    filled with 0, so both frames share the same columns before stacking.
    The inputs are left untouched: the alignment is done on re-indexed copies
    instead of writing the filler columns into ``df1`` / ``df2`` themselves.

    Args:
        df1: First frame.
        df2: Second frame.
            The combined columns must include ``client_event_time`` and
            ``user_id``; every remaining column must be castable to ``int16``.

    Returns:
        A new DataFrame holding the rows of both inputs, sorted by
        ``client_event_time`` with a fresh 0..n-1 index. ``client_event_time``
        is ``datetime64[ns]``, ``user_id`` is ``int32`` and every other column
        is ``int16``. Columns follow ``df1``'s order, then the df2-only ones.
    """
    # union of the columns: df1's order first, then the ones only df2 has
    known = set(df1.columns)
    columns = list(df1.columns) + [col for col in df2.columns if col not in known]

    # reindex builds new frames; only columns a frame lacks receive the 0 filler
    aligned = [
        frame.reindex(columns=columns, fill_value=0)
        for frame in (df1, df2)
    ]

    # concat
    result = (
        pd.concat(aligned, axis=0)
        .sort_values('client_event_time')
        .reset_index(drop=True)
    )

    # int16 everywhere, except the timestamp and the wider user_id
    convert_dict = dict.fromkeys(result.columns, 'int16')
    convert_dict['client_event_time'] = 'datetime64[ns]'
    convert_dict['user_id'] = 'int32'

    return result.astype(convert_dict)

#### 인코더 불러오기

In [27]:
# directory holding one saved classes array (.npy) per encoder
BasePath = "./encoders"

# get list of encoder files
encoder_file_list = os.listdir(BasePath)

# print list of encoder files
print('encoder_file_list :', encoder_file_list)

# load encoder files: one LabelEncoder per .npy file, keyed by the file stem
encoder_dict = {}

for encoder_file in encoder_file_list:
    # only the trailing extension is stripped; other directory entries are skipped
    encoder_name, extension = path.splitext(encoder_file)
    if extension != ".npy":
        continue

    # NOTE(review): allow_pickle=True unpickles arbitrary objects - only load trusted files
    classes = np.load(path.join(BasePath, encoder_file), allow_pickle=True)

    encoder = LabelEncoder()
    # reuse the array loaded above instead of reading the same file a second time
    encoder.classes_ = classes

    if encoder_name == "os_version":
        # os_version is refitted on the string form of its classes
        # NOTE(review): fit() rebuilds classes_ from these strings, so the rotation
        # below may not put the nan entry first for this encoder - confirm
        encoder.fit([str(value) for value in classes])

    encoder_dict[encoder_name] = encoder

# encoder modify
for encoder_name in encoder_dict:
    encoder = encoder_dict[encoder_name]

    # rotate the classes one step to the right so the last class gets code 0
    # (intended to make code 0 the nan class)
    encoder.classes_ = np.append(encoder.classes_[-1], encoder.classes_[:-1])

    # number of classes excluding the entry moved to the front, plus a short preview
    print(f"\n{encoder_name} : {(encoder_dict[encoder_name].classes_.shape[0]) - 1} 가지\n{encoder_dict[encoder_name].classes_[1:6]}")

# replace the 'event_type' classes with a hand-written order;
# the position in this list is the integer code transform() will produce
encoder_dict['event_type'].classes_ = np.array([
    'error',
    'enter.main_page',
    'enter.signup_page',
    'complete.signup',
    'enter.content_page',
    'click.content_page_start_content_button',
    'click.content_page_more_review_button',
    'enter.payment_page',
    'complete.subscription',
    'renew.subscription',
    'resubscribe.subscription',
    'start.free_trial',
    'start.content',
    'enter.lesson_page',
    'complete.lesson',
    'click.lesson_page_related_question_box',
    'end.content',
    'click.cancel_plan_button'
])
# encoder test
print(encoder_dict["event_type"].classes_[:5])


encoder_file_list : ['button.name.npy', 'button_name.npy', 'city.npy', 'content.difficulty.npy', 'content.id.npy', 'country.npy', 'coupon.discount_amount.npy', 'device_family.npy', 'device_type.npy', 'event_type.npy', 'is_free_trial.npy', 'is_trial.npy', 'language.npy', 'lesson.id.npy', 'os_name.npy', 'os_version.npy', 'paid_amount.npy', 'pg.type.npy', 'plan.price.npy', 'plan.type.npy', 'platform.npy', 'question.id.npy', 'trial.type.npy', 'type.npy', 'user_id.npy']

button.name : 9 가지
['구독 시작하기' '무료로 들어보기' '무료로 시작하기' '최저가 혜택 받기' '최저가로 시작하기']

button_name : 5 가지
['무료로 시작하기' '수강시작' '수강하기' '이어하기' None]

city : 4340 가지
["'Ewa Beach" '6th of October City' 'A Coruña' 'A Me' 'AElmhult']

content.difficulty : 5 가지
['advanced' 'beginner' 'hard' 'intermediate' None]

content.id : 212 가지
['011a040b817225cb7c01d4676c21ea44' '01d9424fd9d2b1d7a4392042fc646061'
 '035f018008f1696ded6e10e1dd757726' '03b4a290c08f7fa83933827a994f2e61'
 '040ccaa97a52e006a2c94bb7c5dce263']

country : 189 가지
['Afghanistan' 

#### 파일 탐색

In [28]:
## Change this to the directory that holds your data.
# Raw string: the Windows path contains backslashes (\C, \i, ...) that a normal
# string literal treats as invalid escape sequences and warns about.
BaseFilePath = r'd:\Codeit\intermediate_project\주제 2. 구독서비스 프로덕트 데이터 분석'

# get filelist
filelist = os.listdir(BaseFilePath)

  BaseFilePath = 'd:\Codeit\intermediate_project\주제 2. 구독서비스 프로덕트 데이터 분석'


#### 로드 데이터

In [29]:
# accumulators
DF = []
dfs = []
common_cols = []
all_cols = []
core_cols = ['user_id', 'event_type', 'client_event_time']

# columns removed from every file
exclude_columns = ['device_carrier', 'os_version']

# extra columns to remove: add any column you also want dropped
exclude_columns_plus = []

# walk the directory listing, loading each csv and tracking its columns
for filename in filelist:
    # anything that is not a csv is only reported
    if not filename.endswith('.csv'):
        print('other file found')
        print(filename)
        continue

    print('csv file found')

    # load the whole file (samplesize 0 = no sampling)
    df = read_sample(path.join(BaseFilePath, filename), 0)
    file_cols = set(df.columns)

    # seed the common columns from the first file that has any
    if not common_cols and file_cols:
        common_cols = list(df.columns)

    # narrow the common columns / widen the list of all columns seen
    common_cols = list(set(common_cols) & file_cols)
    all_cols = list(set(all_cols) | file_cols)

    ## preprocessing
    # parse the event timestamp
    df['client_event_time'] = pd.to_datetime(df['client_event_time'])
    # os_version as text; it is also listed in exclude_columns, so this only
    # has a lasting effect if it is taken out of that list
    df['os_version'] = df['os_version'].astype(str)
    # remove the excluded columns
    df = df.drop(columns=exclude_columns + exclude_columns_plus)
    # add any further preprocessing you need here

    # keep the prepared frame
    dfs.append(df)

print(common_cols)

csv file found
csv file found
csv file found


  for chunk in temp:


csv file found
csv file found
csv file found
csv file found
csv file found
csv file found
csv file found


  for chunk in temp:
  for chunk in temp:
  for chunk in temp:
  for chunk in temp:
  for chunk in temp:
  for chunk in temp:
  for chunk in temp:


csv file found
csv file found
csv file found
csv file found
csv file found
csv file found
csv file found
other file found
중급1_구독서비스 데이터 이벤트 명세서.xlsx
['event_type', 'platform', 'device_carrier', 'device_type', 'country', 'client_event_time', 'device_family', 'os_version', 'os_name', 'user_id', 'city', 'language']


#### 데이터 통합

In [30]:
# clear any earlier result before rebuilding it (presumably to release it on a re-run)
if len(DF) > 0:
    DF = []

# stack every loaded frame except the one at position 9
DF = pd.concat([*dfs[:9], *dfs[10:]], axis=0).reset_index(drop=True)
print(DF.shape[0])

# replace each categorical column by the integer codes of its loaded encoder
for col in DF.columns:
    # the timestamp and device_carrier are not encoded
    if col not in ('client_event_time', 'device_carrier'):
        print(f"\n{col}")
        DF[col] = encoder_dict[col].transform(DF[col])

# target dtypes: int16 everywhere, except the timestamp and user_id
convert_dict = {
    **dict.fromkeys(DF.columns, 'int16'),
    'client_event_time': 'datetime64[ns]',
    'user_id': 'int32',
}

# convert
DF = DF.astype(convert_dict)

# show the result
DF

12330946

city

country

device_family

device_type

event_type

language

os_name

platform

user_id

content.id

button.name

button_name

question.id

lesson.id

type

plan.price

paid_amount

coupon.discount_amount

pg.type

content.difficulty

plan.type

trial.type


Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,...,question.id,lesson.id,type,plan.price,paid_amount,coupon.discount_amount,pg.type,content.difficulty,plan.type,trial.type
0,886,2023-04-10 14:57:48.556,158,1029,609,17,38,4,1,111466,...,0,0,0,0,0,0,0,0,0,0
1,3440,2023-03-12 08:26:16.224,158,130,5,17,15,23,1,55652,...,0,0,0,0,0,0,0,0,0,0
2,3945,2023-03-12 08:23:04.955,158,1029,609,17,38,4,1,128371,...,0,0,0,0,0,0,0,0,0,0
3,967,2023-03-12 08:32:39.271,158,1029,609,17,38,4,1,45697,...,0,0,0,0,0,0,0,0,0,0
4,1260,2023-04-12 11:36:28.311,158,1029,609,17,38,4,1,133511,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12330941,2765,2022-02-21 04:56:12.117,158,1029,609,11,38,4,1,149187,...,0,0,0,0,0,0,0,0,0,1
12330942,3440,2022-02-21 04:49:35.114,158,130,5,11,38,23,1,68438,...,0,0,0,0,0,0,0,0,0,1
12330943,3424,2022-02-21 04:37:53.294,158,1029,609,11,38,4,1,177735,...,0,0,0,0,0,0,0,0,0,1
12330944,3440,2022-02-21 04:13:10.443,158,135,38,11,38,23,1,114710,...,0,0,0,0,0,0,0,0,0,1


In [31]:
# Print the column/dtype summary. DataFrame.info() writes the summary itself and
# returns None, so wrapping it in display() only adds a stray "None" to the output.
DF.info()
DF.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330946 entries, 0 to 12330945
Data columns (total 23 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   city                    int16         
 1   client_event_time       datetime64[ns]
 2   country                 int16         
 3   device_family           int16         
 4   device_type             int16         
 5   event_type              int16         
 6   language                int16         
 7   os_name                 int16         
 8   platform                int16         
 9   user_id                 int32         
 10  content.id              int16         
 11  button.name             int16         
 12  button_name             int16         
 13  question.id             int16         
 14  lesson.id               int16         
 15  type                    int16         
 16  plan.price              int16         
 17  paid_amount             int16         
 18  

None

Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,...,question.id,lesson.id,type,plan.price,paid_amount,coupon.discount_amount,pg.type,content.difficulty,plan.type,trial.type
0,886,2023-04-10 14:57:48.556,158,1029,609,17,38,4,1,111466,...,0,0,0,0,0,0,0,0,0,0
1,3440,2023-03-12 08:26:16.224,158,130,5,17,15,23,1,55652,...,0,0,0,0,0,0,0,0,0,0
2,3945,2023-03-12 08:23:04.955,158,1029,609,17,38,4,1,128371,...,0,0,0,0,0,0,0,0,0,0
3,967,2023-03-12 08:32:39.271,158,1029,609,17,38,4,1,45697,...,0,0,0,0,0,0,0,0,0,0
4,1260,2023-04-12 11:36:28.311,158,1029,609,17,38,4,1,133511,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# the frame at position 9 is processed on its own
big_df = dfs[9]

# replace each categorical column by the integer codes of its loaded encoder
for col in big_df.columns:
    # the timestamp and device_carrier are not encoded
    if col not in ('client_event_time', 'device_carrier'):
        print(f"\n{col}")
        big_df[col] = encoder_dict[col].transform(big_df[col])

# target dtypes: int16 everywhere, except the timestamp and user_id
convert_dict = {
    **dict.fromkeys(big_df.columns, 'int16'),
    'client_event_time': 'datetime64[ns]',
    'user_id': 'int32',
}

# convert
big_df = big_df.astype(convert_dict)


city

country

device_family

device_type

event_type

language

os_name

platform

user_id

content.id

is_trial

lesson.id

is_free_trial


In [33]:
# Print the column/dtype summary. DataFrame.info() writes the summary itself and
# returns None, so wrapping it in display() only adds a stray "None" to the output.
big_df.info()
big_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21029707 entries, 0 to 21029706
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               int16         
 1   client_event_time  datetime64[ns]
 2   country            int16         
 3   device_family      int16         
 4   device_type        int16         
 5   event_type         int16         
 6   language           int16         
 7   os_name            int16         
 8   platform           int16         
 9   user_id            int32         
 10  content.id         int16         
 11  is_trial           int16         
 12  lesson.id          int16         
 13  is_free_trial      int16         
dtypes: datetime64[ns](1), int16(12), int32(1)
memory usage: 722.0 MB


None

Unnamed: 0,city,client_event_time,country,device_family,device_type,event_type,language,os_name,platform,user_id,content.id,is_trial,lesson.id,is_free_trial
0,4171,2023-04-05 21:46:04.333,158,1029,609,13,38,4,1,102081,125,1,4544,0
1,4171,2023-04-05 21:55:50.787,158,1029,609,13,38,4,1,102081,125,1,4544,0
2,4171,2023-04-05 21:55:55.684,158,1029,609,13,38,4,1,102081,125,1,2558,0
3,979,2023-04-05 21:04:55.828,158,570,158,13,38,4,1,125592,81,1,2278,0
4,979,2023-04-05 21:05:34.139,158,570,158,13,38,4,1,125592,182,1,410,0


#### 인코딩 데이터 저장

In [34]:
# save the big_df
# index=False: the row index is not written as a column
big_df.to_csv('total_lesson.csv', index=False)
print('done - save total_lesson.csv')

# save the DF
# written to the current working directory, same as the file above
DF.to_csv('total.csv', index=False)
print('done - save total.csv')

done - save total_lesson.csv
done - save total.csv


#### 인코딩 데이터 불러오기

In [35]:
# reload the encoded frames from the csv files in the current working directory;
# no dtype or date-parsing options are passed, so column types are whatever
# read_csv infers from the text
big_df = pd.read_csv('total_lesson.csv')
print('done - load total_lesson.csv')

DF = pd.read_csv('total.csv')
print('done - load total.csv')

done - load total_lesson.csv
done - load total.csv


#### 칼럼 변환

In [36]:
# time columns
time_cols = ['client_event_time']

# target dtypes per frame: int16 everywhere, except the timestamp and user_id
convert_dict1 = {
    **dict.fromkeys(big_df.columns, 'int16'),
    'client_event_time': 'datetime64[ns]',
    'user_id': 'int32',
}

convert_dict2 = {
    **dict.fromkeys(DF.columns, 'int16'),
    'client_event_time': 'datetime64[ns]',
    'user_id': 'int32',
}

# apply the conversions
big_df = big_df.astype(convert_dict1)
DF = DF.astype(convert_dict2)

In [37]:
# Print the column/dtype summary of both frames. DataFrame.info() writes the summary
# itself and returns None, so display() would only add a stray "None" after each one.
big_df.info()
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21029707 entries, 0 to 21029706
Data columns (total 14 columns):
 #   Column             Dtype         
---  ------             -----         
 0   city               int16         
 1   client_event_time  datetime64[ns]
 2   country            int16         
 3   device_family      int16         
 4   device_type        int16         
 5   event_type         int16         
 6   language           int16         
 7   os_name            int16         
 8   platform           int16         
 9   user_id            int32         
 10  content.id         int16         
 11  is_trial           int16         
 12  lesson.id          int16         
 13  is_free_trial      int16         
dtypes: datetime64[ns](1), int16(12), int32(1)
memory usage: 722.0 MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330946 entries, 0 to 12330945
Data columns (total 23 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   city                    int16         
 1   client_event_time       datetime64[ns]
 2   country                 int16         
 3   device_family           int16         
 4   device_type             int16         
 5   event_type              int16         
 6   language                int16         
 7   os_name                 int16         
 8   platform                int16         
 9   user_id                 int32         
 10  content.id              int16         
 11  button.name             int16         
 12  button_name             int16         
 13  question.id             int16         
 14  lesson.id               int16         
 15  type                    int16         
 16  plan.price              int16         
 17  paid_amount             int16         
 18  

None

#### 데이터 분할

In [38]:
# split each frame by the calendar year of the event timestamp and report the row counts
enter_lesson_2022 = big_df[big_df['client_event_time'].dt.year.eq(2022)]
print(len(enter_lesson_2022))
enter_lesson_2023 = big_df[big_df['client_event_time'].dt.year.eq(2023)]
print(len(enter_lesson_2023))

DF_2022 = DF[DF['client_event_time'].dt.year.eq(2022)]
print(len(DF_2022))
DF_2023 = DF[DF['client_event_time'].dt.year.eq(2023)]
print(len(DF_2023))

7373559
13655574
5978077
6352482


#### 각 연도 묶음

In [39]:
# merge both 2022 frames into one time-ordered frame
df_2022 = concat_enc_df_int32(DF_2022, enter_lesson_2022)
# info() prints the summary itself and returns None, so no display() wrapper is needed
df_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13351636 entries, 0 to 13351635
Data columns (total 25 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   city                    int16         
 1   client_event_time       datetime64[ns]
 2   country                 int16         
 3   device_family           int16         
 4   device_type             int16         
 5   event_type              int16         
 6   language                int16         
 7   os_name                 int16         
 8   platform                int16         
 9   user_id                 int32         
 10  content.id              int16         
 11  button.name             int16         
 12  button_name             int16         
 13  question.id             int16         
 14  lesson.id               int16         
 15  type                    int16         
 16  plan.price              int16         
 17  paid_amount             int16         
 18  

None

In [40]:
# merge both 2023 frames into one time-ordered frame
df_2023 = concat_enc_df_int32(DF_2023, enter_lesson_2023)
# info() prints the summary itself and returns None, so no display() wrapper is needed
df_2023.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20008056 entries, 0 to 20008055
Data columns (total 25 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   city                    int16         
 1   client_event_time       datetime64[ns]
 2   country                 int16         
 3   device_family           int16         
 4   device_type             int16         
 5   event_type              int16         
 6   language                int16         
 7   os_name                 int16         
 8   platform                int16         
 9   user_id                 int32         
 10  content.id              int16         
 11  button.name             int16         
 12  button_name             int16         
 13  question.id             int16         
 14  lesson.id               int16         
 15  type                    int16         
 16  plan.price              int16         
 17  paid_amount             int16         
 18  

None