In [2]:
import pandas as pd
from datetime import datetime
from collections import namedtuple
from util import cfg, load_file, read_csv

import time
import numpy as np
import pickle

In [104]:
class SeqDataset():
    
    def __init__(self, year, month, maxlen=4):
        self.year = year
        self.month = month
        self.maxlen = maxlen
        
        print('데이터 셋 로딩중..')
        datelist = self.create_seq_dataset()
        print(datelist)
        print('데이터 셋 로딩 완료')
        print()
        
    def preprocess(self):

        appds = [data[0] for data in self.dataset]
        mbrds = [data[1] for data in self.dataset]
        gmds = [data[2] for data in self.dataset]
        pcd = read_csv(cfg.pgcd)
        
        print('applog 전처리중')
        appdf = prep_applogs(appds)
        pcd, code2name = prep_pagecd(pcd)
        appdf = merge_app_and_pcd(appdf, pcd)
        print('applog 전처리 완료')
        print()
        
        print('member 전처리중')
        mbrdf = prep_mbrlogs(mbrds)
        print('member 전처리 완료')
        print()
        
        return appdf, mbrdf
        
    def create_seq_dataset(self):
        """
        year:현재년도
        month:현재월
        maxlen:시계열에서 고려하는 개월 수 
        ex) 현재 개월수가 11월이면, 11월/10월/9월/8월을 고려함
        dataset: [[app, mbr, gm], [app, mbr, gm],...]
        """
        datelist = self.list_dates()
        dataset = self.load_dataset(datelist)
        self.dataset = dataset
        return datelist
    
    def load_dataset(self, datelist):
        data = []
        for date in datelist:
            data.append(load_file(date.year, date.month))

#         #임시
#         data = []
#         with open('sample_dataset0.pickle', 'rb') as f:
#             data.append(pickle.load(f))
#         with open('sample_dataset1.pickle', 'rb') as f:
#             data.append(pickle.load(f))
        return data
        
    def list_dates(self):
        mydate = namedtuple('mydate', ['year', 'month'])
        curdate = mydate(self.year, self.month)
        datelist = [curdate]
        for i in range(1, self.maxlen):
            year = curdate.year
            month = curdate.month - i
            if year == 2021 and month <= 0:
                year = 2020
                month += 12
            elif year == 2020 and month <= 0:
                return datelist       
            datelist.append(mydate(year, month))
        return datelist      
    
    
    def create_seqds_per_sess(self, appdf, mbrdf, logdf):
        return
    
    def create_seqds(self, appdf, mbrdf, logdf):
        return
    
    def prep_appdf_per_sess(self, appdf):
        return
    
def prep_applogs(applogs):
    df = pd.concat(applogs, axis=0)
    
    #null제거
    print('Orig. data len:', len(df))
    df = df.dropna()
    print('Aft. drop-nan:', len(df))
    
    #방문일시 변경
    vst_dtm = df['vst_dtm'].astype('str')
    f = lambda x: x[:-3]
    vst_dtm = vst_dtm.apply(f)
    vst_dtm = pd.to_datetime(vst_dtm, format='%Y%m%d%H%M%S')
    df['vst_dtm'] = vst_dtm
    
    #필요 없는 칼럼 drop
    df = df.drop(['login_yn', 'new_vst_yn', 'tlcom_co_cd'], axis=1)
    
    #1970년대 데이터 제외
    df = df[df['vst_dtm'].dt.year != 1970]
    df = df.reset_index(drop=True)

    #session_id '#' 제거
    inds = np.where(df['sesn_id'] == '#')[0]
    df = df.drop(inds)
    
    #sorting
    df = df.sort_values(['party_id', 'vst_dtm', 'sesn_id'])
    return df

def prep_pagecd(pcd):
    pcd = pcd.reset_index(drop=True)
    pcd = pcd.drop(columns=['No'])
    
    code2name = {}
    for k, v in zip(pcd['page_cd'].values,  pcd['page_nm'].values):
        if pd.isnull(v):
            code2name[k]=k
        else:
            code2name[k]=v
    return pcd, code2name

def prep_mbrlogs(mbrlogs):
    
    def count_vtlt_age_eff_dt(x):
        count_vtlt_age = np.zeros(len(x['vtlt_age_eff_dt']), dtype=np.float32)
        vtlt_effs = np.unique(x['vtlt_age_eff_dt'])
        for eff in vtlt_effs:
            if eff == 99991231:
                continue
            else:
                ind = np.where(x['vtlt_age_eff_dt'] == eff)[0][0]
                count_vtlt_age[ind:] += 1
        return pd.Series(count_vtlt_age, name='count_vtlt_age')
    
    #concat
    mbrdf = pd.concat(mbrlogs, axis=0)
    
    #dt -> datetime 으로 변경
#     mbrdf['dt'] = pd.to_datetime(mbrdf['dt'], format='%Y%m%d')
    mbrdf['dt'] = pd.to_datetime(mbrdf['dt'], format='%Y-%m-%d')
    
    #party_id당 dt순으로 sorting
    mbrdf = mbrdf.sort_values(['party_id', 'dt'])
    
    #사용안하는 컬럼 drop
    mbrdf = mbrdf.drop(columns=cfg.unused_mbrcol)
    
    #null제거
    print('Orig. data len:', len(mbrdf))
    mbrdf = mbrdf.dropna()
    print('Aft. drop-nan:', len(mbrdf), '\n')
    
    #party_id -> int형으로 변환
    mbrdf['party_id'] = mbrdf['party_id'].astype('int32')
    
    #바이탈리티 나이 측정 횟수 관련 전처리
    newcol = mbrdf.groupby(['party_id']).apply(count_vtlt_age_eff_dt)
    mbrdf['count_vtlt_age_dt'] = newcol.values
    
    #바이탈리티 나이 차이 관련 전처리
    mbrdf = mbrdf.reset_index(drop=True)
    inds = np.where(mbrdf['vtlt_age'] == 'NOT_ENOUGH_DATA')[0]
    mbrdf.loc[inds, 'vtlt_age'] = '0'
    mbrdf['vtlt_age'] = mbrdf['vtlt_age'].astype('int32')
    mbrdf['diff_age'] = mbrdf['vtlt_age'] - mbrdf['age']
    
    #주간미션달성률 관련 전처리
    mbrdf['achv_rat'] = mbrdf['cur_mbrsh_pd_goal_achv_cnt'] / mbrdf['cur_mbrsh_pd_goal_alct_cnt']
    
    #회원가입이후 경과일
    pids = np.unique(mbrdf.loc[mbrdf['mbr_scrb_dt'] == 99991231]['party_id'].values)
    newval = []
    for pid in pids:
        vals = np.unique(mbrdf.loc[mbrdf['party_id']==pid]['mbr_scrb_dt'].values)
        newval.append(vals[np.where(vals != 9991231)[0][0]])
    
    for val, pid in zip(newval, pids):
        inds = np.where(mbrdf['party_id'] == pid)[0]
        mbrdf.loc[inds, 'mbr_scrb_dt'] = val 
    
    mbrdf['mbr_scrb_dt'] = pd.to_datetime(mbrdf['mbr_scrb_dt'], format='%Y%m%d')
    mbrdf['active_dur'] = mbrdf['dt'] - mbrdf['mbr_scrb_dt']
    
    #멤버십 등급 -> 1,2,3,4로 변경
    mbrsh_dic = {'Bronze': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}
    f = lambda x : mbrsh_dic[x]
    newmbrsh = mbrdf['cur_mbrsh_rwrd_st_cd'].transform(f)
    mbrdf['cur_mbrsh_rwrd_st_cd'] = newmbrsh
    
    #필요없는 칼럼 drop
    mbrdf = mbrdf.drop(columns=['vtlt_age_eff_dt', 'mbr_scrb_dt', 'cur_mbrsh_pd_goal_alct_cnt','cur_mbrsh_pd_goal_achv_cnt'])
    
    return mbrdf

def prep_gmlogs(gmlogs):
    
    gmdf = pd.concat(gmlogs, axis=0)
    gmdf = gmdf[['party_id', 'p_event_apl_dte','points_value','points_effective_dte']]
    #gmdf = gmdf[cfg.used_gmcol]
    gmdf = gmdf.replace('#', np.nan)
    
    print('Orig. data len:', len(gmdf))
    gmdf = gmdf.dropna()
    print('Aft. drop-nan:', len(gmdf))
    
    #party_id -> int
    gmdf['party_id'] = gmdf['party_id'].astype('int32')
    
    #datetime형으로 변환
    gmdf['p_event_apl_dte'] = pd.to_datetime(gmdf['p_event_apl_dte'], format='%Y%m%d')
    gmdf['points_effective_dte'] = pd.to_datetime(gmdf['points_effective_dte'], format='%Y%m%d')
    
    #sorting
    gmdf = gmdf.sort_values(['party_id', 'p_event_apl_dte'])
    return gmdf

def merge_app_and_pcd(df, pcd):
    return pd.merge(left=df, right=pcd[['page_cd','menu_nm_1','menu_nm_2']], on=['page_cd'], how='left', sort=False)



In [45]:
seqds = SeqDataset(year=2021, month=2, maxlen=3)

데이터 셋 로딩중..
data/applog/applog_202102.csv
data/applog/applog_202102.csv is read
data/member/mbr_202102.csv
data/member/mbr_202102.csv is read
data/mission/goal_misn_202102.csv
data/mission/goal_misn_202102.csv is read
data/applog/applog_202101.csv
data/applog/applog_202101.csv is read
data/member/mbr_202101.csv
data/member/mbr_202101.csv is read
data/mission/goal_misn_202101.csv
data/mission/goal_misn_202101.csv is read
data/applog/applog_202012.csv
data/applog/applog_202012.csv is read
data/member/mbr_202012.csv
data/member/mbr_202012.csv is read
data/mission/goal_misn_202012.csv
data/mission/goal_misn_202012.csv is read
[mydate(year=2021, month=2), mydate(year=2021, month=1), mydate(year=2020, month=12)]
데이터 셋 로딩 완료



In [112]:
import gc
gc.collect()

2736

In [113]:
appdf = prep_applogs(appds)
mbrdf = prep_mbrlogs(mbrds)
gmdf = prep_gmlogs(gmds)
pcd = read_csv(cfg.pgcd)
lbl = read_csv(cfg.label)

Orig. data len: 28039392
Aft. drop-nan: 28039392


In [134]:
uni_pcd_depth1 = pcd['menu_nm_1'].unique()
uni_pcd_depth2 = pcd['menu_nm_2'].unique()

In [136]:
print(len(uni_pcd_depth1))
uni_pcd_depth1

24


array([nan, '바이탈리티', '이벤트', '마음챙김', '활동', '디바이스연동', '공통', '보험', '설정',
       '갤럭시프로그램', '사이트맵', '꿀팁', '바이탈리티 할인', '건강도전', '건강', '바이탈리티 나이',
       '가입/로그인/비밀번호', '내바이탈리티', '등급리워드', '헬시푸드', '위젯', '주간 리워드', '주간미션',
       '건강걷기'], dtype=object)

In [137]:
print(len(uni_pcd_depth2))
uni_pcd_depth2

60


array([nan, '바이탈리티 메인', '이벤트', '마음챙김', '활동-헤더메시지', '디바이스연동', '맞춤보험', '채널',
       '갤럭시프로그램', '팝업', '바이탈리티-헤더메시지', '사이트맵', '꿀팁 메인', '꿀팁-컨텐츠', '자동로그인',
       '바이탈리티 할인', '꿀팁-영양', '꿀팁-예방', '꿀팁-운동', '꿀팁-일상', '건강도전', '디지털다이렉트',
       '서비스가이드', '바이탈리티 나이', '건강증진형보험', '회원가입', '내바이탈리티', '멀티팝업', '공지팝업',
       'FAQ', '공지사항', '등급리워드', '헬시푸드', '약관', '건강-배너', '건강메인', '정밀검진/예방접종',
       '마음건강진단', '기초건강검진', '영양균형진단', '금연선언', '시작', '다이렉트보험', '위젯',
       '비밀번호재설정', '로그인', '꿀팁-상단배너', '바이탈리티-상단배너', '보험-상단배너', '보험 메인',
       '보험/금융팁', '보험-헤더메시지', '개인설정', '회원탈퇴', '제휴사 할인 이벤트', '주간 리워드',
       '활동 메인', '주간미션', '건강걷기', '활동-상단배너'], dtype=object)

In [142]:
gc.collect()

1511

In [114]:
appdf = merge_app_and_pcd(appdf, pcd)

In [143]:
#session별 총 페이지 수
pgsess_depth1 = appdf.groupby(['party_id','sesn_id','menu_nm_1']).count().reset_index()[['party_id','sesn_id','menu_nm_1','page_cd']]
pgsess_depth2 = appdf.groupby(['party_id','sesn_id','menu_nm_2']).count().reset_index()[['party_id','sesn_id','menu_nm_2','page_cd']]

MemoryError: Unable to allocate 214. MiB for an array with shape (28039274,) and data type int64

In [140]:
pgsess_depth1

Unnamed: 0,party_id,sesn_id,page_cd
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2
1,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,1
2,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,3
3,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2
4,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,3
...,...,...,...
8111923,1200374790,6495507a-229e-4438-9d3a-f9d3d08282f8,2
8111924,1200374790,6495507a-229e-4438-9d3a-f9d3d08282f8,2
8111925,1200374790,6495507a-229e-4438-9d3a-f9d3d08282f8,2
8111926,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,3


In [None]:
pgsess_depth2

In [101]:
#session별 상위 카테고리별 방문횟수
uv_group = appdf.groupby(['party_id','sesn_id','menu_nm_1'])
uv_per_depth1 = uv_group.count().reset_index()[['party_id', 'sesn_id', 'menu_nm_1','page_cd']]

In [102]:
uv_per_depth1

Unnamed: 0,party_id,sesn_id,menu_nm_1,page_cd
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,갤럭시프로그램,2
1,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,건강,1
2,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,공통,3
3,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,꿀팁,2
4,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,디바이스연동,3
...,...,...,...,...
8111947,1200374790,6495507a-229e-4438-9d3a-f9d3d08282f8,디바이스연동,2
8111948,1200374790,6495507a-229e-4438-9d3a-f9d3d08282f8,사이트맵,2
8111949,1200374790,6495507a-229e-4438-9d3a-f9d3d08282f8,활동,2
8111950,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,가입/로그인/비밀번호,3


In [103]:
uv_per_depth1.sort_values(['sesn_id','menu_nm_1'])

Unnamed: 0,party_id,sesn_id,menu_nm_1,page_cd
7838242,1200336704,#,가입/로그인/비밀번호,1
7995940,1200344622,#,가입/로그인/비밀번호,1
8091798,1200357313,#,가입/로그인/비밀번호,1
8091979,1200357319,#,가입/로그인/비밀번호,1
633749,200056999,#,건강걷기,3
...,...,...,...,...
611353,200047745,fffff4d5-efad-4f93-90c0-5d04c3c661a9,공통,2
611354,200047745,fffff4d5-efad-4f93-90c0-5d04c3c661a9,디바이스연동,6
611355,200047745,fffff4d5-efad-4f93-90c0-5d04c3c661a9,바이탈리티,3
611356,200047745,fffff4d5-efad-4f93-90c0-5d04c3c661a9,활동,3
