In [1]:
import pandas as pd
from datetime import datetime
from collections import namedtuple
from util import cfg, load_file, read_csv

import time
import numpy as np
import pickle

In [2]:
class SeqDataset():
    
    def __init__(self, year, month, maxlen=4):
        self.year = year
        self.month = month
        self.maxlen = maxlen
        
        print('데이터 셋 로딩중..')
        datelist = self.create_seq_dataset()
        print(datelist)
        print('데이터 셋 로딩 완료')
        print()
        
    def preprocess(self):

        appds = [data[0] for data in self.dataset]
        mbrds = [data[1] for data in self.dataset]
        gmds = [data[2] for data in self.dataset]
        pcd = read_csv(cfg.pgcd)
        
        print('applog 전처리중')
        appdf = prep_applogs(appds)
        pcd, code2name = prep_pagecd(pcd)
        appdf = merge_app_and_pcd(appdf, pcd)
        print('applog 전처리 완료')
        print()
        
        print('member 전처리중')
        mbrdf = prep_mbrlogs(mbrds)
        print('member 전처리 완료')
        print()
        
        print('goal_mission 전처리중')
        gmdf = prep_gmlogs(gmds)
        print("goal_mission 전처리 완료")
        print()
        return appdf, mbrdf, gmdf
        
    def create_seq_dataset(self):
        """
        year:현재년도
        month:현재월
        maxlen:시계열에서 고려하는 개월 수 
        ex) 현재 개월수가 11월이면, 11월/10월/9월/8월을 고려함
        dataset: [[app, mbr, gm], [app, mbr, gm],...]
        """
        datelist = self.list_dates()
        dataset = self.load_dataset(datelist)
        self.dataset = dataset
        return datelist
    
    def load_dataset(self, datelist):
        data = []
        for date in datelist:
            data.append(load_file(date.year, date.month))
        return data
        
    def list_dates(self):
        mydate = namedtuple('mydate', ['year', 'month'])
        curdate = mydate(self.year, self.month)
        datelist = [curdate]
        for i in range(1, self.maxlen):
            year = curdate.year
            month = curdate.month - i
            if year == 2021 and month <= 0:
                year = 2020
                month += 12
            elif year == 2020 and month <= 0:
                return datelist       
            datelist.append(mydate(year, month))
        return datelist      
    
    
    def create_seqds_per_sess(self, appdf, mbrdf, logdf):
        return
    
    def create_seqds(self, appdf, mbrdf, logdf):
        return
    
    def prep_appdf_per_sess(self, appdf):
        return
    
def prep_applogs(applogs):
    df = pd.concat(applogs, axis=0)
    
    #null제거
    print('Orig. data len:', len(df))
    df = df.dropna()
    print('Aft. drop-nan:', len(df))
    
    #방문일시 변경
    vst_dtm = df['vst_dtm'].astype('str')
    f = lambda x: x[:-3]
    vst_dtm = vst_dtm.apply(f)
    vst_dtm = pd.to_datetime(vst_dtm, format='%Y%m%d%H%M%S')
    df['vst_dtm'] = vst_dtm
    
    #필요 없는 칼럼 drop
    df = df.drop(['login_yn', 'new_vst_yn', 'tlcom_co_cd'], axis=1)
    
    #1970년대 데이터 제외
    df = df[df['vst_dtm'].dt.year != 1970]
    df = df.reset_index(drop=True)

    #session_id '#' 제거
    inds = np.where(df['sesn_id'] == '#')[0]
    df = df.drop(inds)
    
    #'month'칼럼 추가
    df['month'] = df['vst_dtm'].dt.to_period('M')
    
    #sorting
    df = df.sort_values(['party_id', 'vst_dtm', 'sesn_id'])
    return df

def prep_pagecd(pcd):
    pcd = pcd.reset_index(drop=True)
    pcd = pcd.drop(columns=['No'])
    
    code2name = {}
    for k, v in zip(pcd['page_cd'].values,  pcd['page_nm'].values):
        if pd.isnull(v):
            code2name[k]=k
        else:
            code2name[k]=v
    return pcd, code2name

def prep_mbrlogs(mbrlogs):
    
    def count_vtlt_age_eff_dt(x):
        count_vtlt_age = np.zeros(len(x['vtlt_age_eff_dt']), dtype=np.float32)
        vtlt_effs = np.unique(x['vtlt_age_eff_dt'])
        for eff in vtlt_effs:
            if eff == 99991231:
                continue
            else:
                ind = np.where(x['vtlt_age_eff_dt'] == eff)[0][0]
                count_vtlt_age[ind:] += 1
        return pd.Series(count_vtlt_age, name='count_vtlt_age')
    
    #concat
    mbrdf = pd.concat(mbrlogs, axis=0)
    
    #dt -> datetime 으로 변경
    mbrdf['dt'] = pd.to_datetime(mbrdf['dt'], format='%Y%m%d')
#     mbrdf['dt'] = pd.to_datetime(mbrdf['dt'], format='%Y-%m-%d')
    
    #party_id당 dt순으로 sorting
    mbrdf = mbrdf.sort_values(['party_id', 'dt'])
    
    #사용안하는 컬럼 drop
    mbrdf = mbrdf.drop(columns=cfg.unused_mbrcol)
    
    #null제거
    print('Orig. data len:', len(mbrdf))
    mbrdf = mbrdf.dropna()
    print('Aft. drop-nan:', len(mbrdf), '\n')
    
    #party_id -> int형으로 변환
    mbrdf['party_id'] = mbrdf['party_id'].astype('int32')
    
    #바이탈리티 나이 측정 횟수 관련 전처리
    newcol = mbrdf.groupby(['party_id']).apply(count_vtlt_age_eff_dt)
    mbrdf['count_vtlt_age_dt'] = newcol.values
    
    #바이탈리티 나이 차이 관련 전처리
    mbrdf = mbrdf.reset_index(drop=True)
    inds = np.where(mbrdf['vtlt_age'] == 'NOT_ENOUGH_DATA')[0]
    mbrdf.loc[inds, 'vtlt_age'] = '0'
    mbrdf['vtlt_age'] = mbrdf['vtlt_age'].astype('int32')
    mbrdf['diff_age'] = mbrdf['vtlt_age'] - mbrdf['age']
    
    #주간미션달성률 관련 전처리
    mbrdf['achv_rat'] = mbrdf['cur_mbrsh_pd_goal_achv_cnt'] / mbrdf['cur_mbrsh_pd_goal_alct_cnt']
    
    #회원가입이후 경과일
    pids = np.unique(mbrdf.loc[mbrdf['mbr_scrb_dt'] == 99991231]['party_id'].values)
    newval = []
    passpids = []
    for pid in pids:
        pidmbrdf = mbrdf.loc[mbrdf['party_id'] == pid]
        vals = np.unique(pidmbrdf['mbr_scrb_dt'].values)
        inds = np.where(vals != 99991231)[0]
        if len(inds) > 1:
            newval.append(vals[np.where(vals != 99991231)[0][0]])
        else:
            mbrdf = mbrdf.drop(pidmbrdf.index)
            passpids.append(pid)
            
    for val, pid in zip(newval, pids):
        if pid in passpids:
            pass
        inds = np.where(mbrdf['party_id'] == pid)[0]
        mbrdf.loc[inds, 'mbr_scrb_dt'] = val 
    
    mbrdf['mbr_scrb_dt'] = pd.to_datetime(mbrdf['mbr_scrb_dt'], format='%Y%m%d')
    mbrdf['active_dur'] = mbrdf['dt'] - mbrdf['mbr_scrb_dt']
    
    #멤버십 등급 -> 1,2,3,4로 변경
    mbrsh_dic = {'Bronze': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}
    f = lambda x : mbrsh_dic[x]
    newmbrsh = mbrdf['cur_mbrsh_rwrd_st_cd'].transform(f)
    mbrdf['cur_mbrsh_rwrd_st_cd'] = newmbrsh
    
    #필요없는 칼럼 drop
    mbrdf = mbrdf.drop(columns=['vtlt_age_eff_dt', 'mbr_scrb_dt', 'cur_mbrsh_pd_goal_alct_cnt','cur_mbrsh_pd_goal_achv_cnt'])
    
    return mbrdf

def prep_gmlogs(gmlogs):
    
    gmdf = pd.concat(gmlogs, axis=0)
    gmdf = gmdf[['party_id', 'p_event_apl_dte','points_value','points_effective_dte']]
    #gmdf = gmdf[cfg.used_gmcol]
    gmdf = gmdf.replace('#', np.nan)
    
    print('Orig. data len:', len(gmdf))
    gmdf = gmdf.dropna()
    print('Aft. drop-nan:', len(gmdf))
    
    #party_id -> int
    gmdf['party_id'] = gmdf['party_id'].astype('int32')
    
    #datetime형으로 변환
    gmdf['p_event_apl_dte'] = pd.to_datetime(gmdf['p_event_apl_dte'], format='%Y%m%d')
    gmdf['points_effective_dte'] = pd.to_datetime(gmdf['points_effective_dte'], format='%Y%m%d')
    
    #sorting
    gmdf = gmdf.sort_values(['party_id', 'p_event_apl_dte'])
    return gmdf

def merge_app_and_pcd(df, pcd):
    return pd.merge(left=df, right=pcd[['page_cd','menu_nm_1','menu_nm_2']], on=['page_cd'], how='left', sort=False)



In [3]:
seqds = SeqDataset(year=2021, month=2, maxlen=1)

데이터 셋 로딩중..
data/applog/applog_202102.csv 로딩중
data/applog/applog_202102.csv 로딩완료

data/member/mbr_202102.csv 로딩중
data/member/mbr_202102.csv 로딩완료

data/mission/goal_misn_202102.csv 로딩중
data/mission/goal_misn_202102.csv 로딩완료

[mydate(year=2021, month=2)]
데이터 셋 로딩 완료



In [4]:
appdf, mbrdf, gmdf = seqds.preprocess()

applog 전처리중
Orig. data len: 13439757
Aft. drop-nan: 13439757
applog 전처리 완료

member 전처리중
Orig. data len: 2234074
Aft. drop-nan: 2234074 

member 전처리 완료

goal_mission 전처리중
Orig. data len: 987857
Aft. drop-nan: 528117
goal_mission 전처리 완료



In [5]:
appdf['dt'] = appdf['vst_dtm'].dt.to_period('D')
appdf['mth'] = appdf['vst_dtm'].dt.to_period('M')

In [6]:
#appdf 전처리
#1.위젯 제거 #수인선임님이 진행해주실 예정
print('before drop 위젯', len(appdf))
appdf = appdf.loc[appdf['menu_nm_1'] != '위젯']
print('after drop 위젯', len(appdf))

before drop 위젯 13439671
after drop 위젯 13302819


In [7]:
#2. menu_nm_1 == Nan or menu_nm_2 == Nan인 경우로만 이뤄진 session_id 제거하기
## 먼저, 전체 고유 party_id와 session_id 갯수 -> menu_nm_1이 nan인 경우에 대해서 제거된 케이스
menusess1 = appdf[['party_id','sesn_id','page_cd']].groupby(['party_id','sesn_id']).first().reset_index()
menusess1 = menusess1[['party_id','sesn_id']]
len(menusess1)

727030

In [8]:
## nan이 제거된 전체 고유 party_id와 session_id갯수
menusess2 = appdf[['party_id','sesn_id','menu_nm_1','page_cd']].groupby(['party_id','sesn_id','menu_nm_1']).count().reset_index()
menusess2 = menusess2[['party_id','sesn_id']].drop_duplicates()
len(menusess2)

726857

In [9]:
menusess = pd.concat([menusess1, menusess2], axis=0)
menusess = menusess.loc[~menusess.duplicated(keep=False)]

In [10]:
pids_isin = np.isin(appdf['party_id'], menusess['party_id'])

In [11]:
sess_isin = np.isin(appdf['sesn_id'], menusess['sesn_id'])

In [12]:
print(len(pids_isin), len(sess_isin))

13302819 13302819


In [13]:
appdf = appdf.loc[~np.all([pids_isin, sess_isin], axis=0)]

In [14]:
len(appdf)

13302599

In [15]:
menusess3 = appdf[['party_id','sesn_id','menu_nm_2','page_cd']].groupby(['party_id','sesn_id','menu_nm_2']).count().reset_index()
menusess3 = menusess3[['party_id','sesn_id']].drop_duplicates()
len(menusess3)

726817

In [16]:
menusess = pd.concat([menusess1, menusess3], axis=0)
menusess = menusess.loc[~menusess.duplicated(keep=False)]

pids_isin = np.isin(appdf['party_id'], menusess['party_id'])
sess_isin = np.isin(appdf['sesn_id'], menusess['sesn_id'])
appdf = appdf.loc[~np.all([pids_isin, sess_isin], axis=0)]
len(appdf)

13302549

In [17]:
seqdf = appdf[['party_id','page_cd','sesn_id','dt']].groupby(['party_id','sesn_id']).last()['dt']
seqdf = seqdf.reset_index()
seqdf = seqdf.sort_values(['party_id','dt'])
seqdf = seqdf.reset_index(drop=True)

In [18]:
#session간의 방문일자 차이
def diff_vstdate(x):
    b = pd.concat([pd.Series(x['dt'].iloc[0]), x['dt'].iloc[:-1]]).reset_index(drop=True)
    seqdiff = x['dt'].reset_index(drop=True).dt.to_timestamp() - b.dt.to_timestamp()
    seqdiff.name = "diff_dt"
    return seqdiff
diffdf = seqdf.groupby(['party_id']).apply(diff_vstdate)

In [19]:
diffdf = diffdf.reset_index(drop=True)
seqdf = pd.concat([seqdf, diffdf], axis=1)
seqdf

Unnamed: 0,party_id,sesn_id,dt,diff_dt
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2021-02-01,0 days
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,2021-02-02,1 days
2,861497,3591EC2D-406E-4470-9E37-ED529276CA59,2021-02-05,3 days
3,861497,32662597-024D-464B-8B7A-8214A49D44F0,2021-02-09,4 days
4,861497,F4EA8E37-9627-47CA-BE7D-BC8ABC3F5882,2021-02-09,0 days
...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,2021-02-28,0 days
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,2021-02-28,0 days
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,2021-02-03,0 days
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,2021-02-08,5 days


In [20]:
pcd = read_csv(cfg.pgcd)
lbl = read_csv(cfg.label)

In [21]:
uni_pcd_depth1 = pcd['menu_nm_1'].unique()
uni_pcd_depth2 = pcd['menu_nm_2'].unique()

In [22]:
print(len(uni_pcd_depth1))
uni_pcd_depth1

24


array([nan, '바이탈리티', '이벤트', '마음챙김', '활동', '디바이스연동', '공통', '보험', '설정',
       '갤럭시프로그램', '사이트맵', '꿀팁', '바이탈리티 할인', '건강도전', '건강', '바이탈리티 나이',
       '가입/로그인/비밀번호', '내바이탈리티', '등급리워드', '헬시푸드', '위젯', '주간 리워드', '주간미션',
       '건강걷기'], dtype=object)

In [23]:
print(len(uni_pcd_depth2))
uni_pcd_depth2

60


array([nan, '바이탈리티 메인', '이벤트', '마음챙김', '활동-헤더메시지', '디바이스연동', '맞춤보험', '채널',
       '갤럭시프로그램', '팝업', '바이탈리티-헤더메시지', '사이트맵', '꿀팁 메인', '꿀팁-컨텐츠', '자동로그인',
       '바이탈리티 할인', '꿀팁-영양', '꿀팁-예방', '꿀팁-운동', '꿀팁-일상', '건강도전', '디지털다이렉트',
       '서비스가이드', '바이탈리티 나이', '건강증진형보험', '회원가입', '내바이탈리티', '멀티팝업', '공지팝업',
       'FAQ', '공지사항', '등급리워드', '헬시푸드', '약관', '건강-배너', '건강메인', '정밀검진/예방접종',
       '마음건강진단', '기초건강검진', '영양균형진단', '금연선언', '시작', '다이렉트보험', '위젯',
       '비밀번호재설정', '로그인', '꿀팁-상단배너', '바이탈리티-상단배너', '보험-상단배너', '보험 메인',
       '보험/금융팁', '보험-헤더메시지', '개인설정', '회원탈퇴', '제휴사 할인 이벤트', '주간 리워드',
       '활동 메인', '주간미션', '건강걷기', '활동-상단배너'], dtype=object)

In [24]:
#session별 페이지 길이
pglen_perse = appdf.groupby(['party_id','sesn_id']).count().reset_index()[['party_id','sesn_id','page_cd']]
pglen_perse

Unnamed: 0,party_id,sesn_id,page_cd
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,23
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,1
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,6
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,6
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,19
...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,47
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,25
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,2
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,3


In [25]:
#카테고리별 방문횟수
uv_per_d1 = appdf.groupby(['party_id','sesn_id','menu_nm_1']).count().reset_index()[['party_id','sesn_id','menu_nm_1','page_cd']]
uv_per_d2 = appdf.groupby(['party_id','sesn_id','menu_nm_2']).count().reset_index()[['party_id','sesn_id','menu_nm_2','page_cd']]

uv_per_d1 = uv_per_d1.pivot(index=['party_id','sesn_id'], columns='menu_nm_1', values='page_cd')
uv_per_d1 = uv_per_d1.fillna(0).reset_index()

uv_per_d2 = uv_per_d2.pivot(index=['party_id','sesn_id'], columns='menu_nm_2', values='page_cd')
uv_per_d2 = uv_per_d2.fillna(0).reset_index()

In [26]:
uv_per_d1

menu_nm_1,party_id,sesn_id,가입/로그인/비밀번호,갤럭시프로그램,건강,건강걷기,건강도전,공통,꿀팁,내바이탈리티,...,바이탈리티 나이,바이탈리티 할인,보험,사이트맵,설정,이벤트,주간 리워드,주간미션,헬시푸드,활동
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,2.0,1.0,0.0,0.0,3.0,2.0,0.0,...,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,...,0.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,1.0,4.0,1.0,0.0,0.0,8.0,0.0,5.0,...,17.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,16.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
uv_per_d2

menu_nm_2,party_id,sesn_id,FAQ,개인설정,갤럭시프로그램,건강-배너,건강걷기,건강도전,건강메인,건강증진형보험,...,제휴사 할인 이벤트,주간 리워드,주간미션,채널,헬시푸드,활동 메인,활동-상단배너,활동-헤더메시지,회원가입,회원탈퇴
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,0.0,0.0,4.0,0.0,0.0,0.0,1.0,3.0,...,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


In [28]:
stydepth1 = appdf[['party_id','sesn_id','menu_nm_1','sty_tms']].groupby(['party_id','sesn_id','menu_nm_1']).mean()
stydepth1 = stydepth1.reset_index()
stydepth1 = stydepth1.pivot(index=['party_id','sesn_id'], columns=['menu_nm_1'], values=['sty_tms'])
stydepth1 = stydepth1.fillna(0).reset_index()
stydepth1

Unnamed: 0_level_0,party_id,sesn_id,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms
menu_nm_1,Unnamed: 1_level_1,Unnamed: 2_level_1,가입/로그인/비밀번호,갤럭시프로그램,건강,건강걷기,건강도전,공통,꿀팁,내바이탈리티,...,바이탈리티 나이,바이탈리티 할인,보험,사이트맵,설정,이벤트,주간 리워드,주간미션,헬시푸드,활동
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.000000,0.50,5.0,0.0,0.0,5.333333,3.0,0.0,...,0.000000,0.0,7.5,10.0,0.0,0.0,0.000000,0.0,0.0,1.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.000000,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.000000,0.00,0.0,0.0,0.0,4.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.000000,0.00,0.0,0.0,0.0,8.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.000000,0.00,0.0,0.0,0.0,3.666667,0.0,14.5,...,0.000000,0.0,16.0,1.5,8.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,5.000000,405.75,1.0,0.0,0.0,8.500000,0.0,7.2,...,7.470588,0.0,1.8,0.0,0.0,0.0,3.666667,0.0,7.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,3.000000,0.00,0.0,0.0,0.0,6.000000,0.0,0.0,...,13.125000,0.0,0.0,0.0,0.5,0.0,0.000000,0.0,0.0,1.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,14.333333,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [30]:
stydepth2 = appdf[['party_id','sesn_id','menu_nm_2','sty_tms']].groupby(['party_id','sesn_id','menu_nm_2']).mean()
stydepth2 = stydepth2.reset_index()
stydepth2 = stydepth2.pivot(index=['party_id','sesn_id'], columns=['menu_nm_2'],values=['sty_tms'])
stydepth2 = stydepth2.fillna(0).reset_index()
stydepth2

Unnamed: 0_level_0,party_id,sesn_id,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms,sty_tms
menu_nm_2,Unnamed: 1_level_1,Unnamed: 2_level_1,FAQ,개인설정,갤럭시프로그램,건강-배너,건강걷기,건강도전,건강메인,건강증진형보험,...,제휴사 할인 이벤트,주간 리워드,주간미션,채널,헬시푸드,활동 메인,활동-상단배너,활동-헤더메시지,회원가입,회원탈퇴
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,0.0,0.50,0.0,0.0,0.0,5.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,8.0,0.00,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,0.0,0.0,405.75,0.0,0.0,0.0,1.0,2.333333,...,0.0,3.666667,0.0,0.0,7.0,0.0,0.0,0.0,5.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,8.5,0.0


In [32]:
#종료율 관련
endmenu = appdf[['party_id','page_cd','sesn_id','menu_nm_1']].groupby(['party_id','sesn_id']).last().reset_index()

In [33]:
endmenu['value'] = 1

In [35]:
endmenu = endmenu.pivot(index=['party_id','sesn_id'], columns=['menu_nm_1'], values=['value'])
endmenu = endmenu.fillna(0).reset_index()
endmenu

Unnamed: 0_level_0,party_id,sesn_id,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
menu_nm_1,Unnamed: 1_level_1,Unnamed: 2_level_1,가입/로그인/비밀번호,갤럭시프로그램,건강,건강걷기,건강도전,공통,꿀팁,내바이탈리티,...,바이탈리티 나이,바이탈리티 할인,보험,사이트맵,설정,이벤트,주간 리워드,주간미션,헬시푸드,활동
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
#seqdf, pglen_perse, uv_per_d1, uv_per_d2, stydepth1, stydepth2, endmenu
assert len(seqdf) == len(pglen_perse) == len(uv_per_d1) ==len(uv_per_d2) == len(stydepth1) == len(stydepth2) == len(endmenu)

In [53]:
display(seqdf)
display(pglen_perse)

Unnamed: 0,party_id,sesn_id,dt,diff_dt
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2021-02-01,0 days
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,2021-02-02,1 days
2,861497,3591EC2D-406E-4470-9E37-ED529276CA59,2021-02-05,3 days
3,861497,32662597-024D-464B-8B7A-8214A49D44F0,2021-02-09,4 days
4,861497,F4EA8E37-9627-47CA-BE7D-BC8ABC3F5882,2021-02-09,0 days
...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,2021-02-28,0 days
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,2021-02-28,0 days
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,2021-02-03,0 days
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,2021-02-08,5 days


Unnamed: 0,party_id,sesn_id,page_cd
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,23
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,1
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,6
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,6
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,19
...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,47
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,25
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,2
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,3


In [56]:
beflen = len(seqdf)
seqdf = pd.merge(seqdf, pglen_perse, on=['party_id', 'sesn_id'])
assert beflen == len(seqdf)

In [63]:
display(uv_per_d1)
uv_per_d1.columns.name = None

menu_nm_1,party_id,sesn_id,가입/로그인/비밀번호,갤럭시프로그램,건강,건강걷기,건강도전,공통,꿀팁,내바이탈리티,...,바이탈리티 나이,바이탈리티 할인,보험,사이트맵,설정,이벤트,주간 리워드,주간미션,헬시푸드,활동
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,2.0,1.0,0.0,0.0,3.0,2.0,0.0,...,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,...,0.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,1.0,4.0,1.0,0.0,0.0,8.0,0.0,5.0,...,17.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,16.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
beflen = len(seqdf)
seqdf = pd.merge(seqdf, uv_per_d1, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

In [66]:
uv_per_d2.columns.name = None
display(uv_per_d2)

Unnamed: 0,party_id,sesn_id,FAQ,개인설정,갤럭시프로그램,건강-배너,건강걷기,건강도전,건강메인,건강증진형보험,...,제휴사 할인 이벤트,주간 리워드,주간미션,채널,헬시푸드,활동 메인,활동-상단배너,활동-헤더메시지,회원가입,회원탈퇴
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,0.0,0.0,4.0,0.0,0.0,0.0,1.0,3.0,...,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


In [67]:
beflen = len(seqdf)
seqdf = pd.merge(seqdf, uv_per_d2, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

In [68]:
beflen = len(seqdf)
seqdf = pd.merge(seqdf, stydepth1, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [73]:
stydepth2.columns.name = None
beflen = len(seqdf)
seqdf = pd.merge(seqdf, stydepth2, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [75]:
display(endmenu)
endmenu.columns.name = None
beflen = len(seqdf)
seqdf = pd.merge(seqdf, endmenu, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

Unnamed: 0_level_0,party_id,sesn_id,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
menu_nm_1,Unnamed: 1_level_1,Unnamed: 2_level_1,가입/로그인/비밀번호,갤럭시프로그램,건강,건강걷기,건강도전,공통,꿀팁,내바이탈리티,...,바이탈리티 나이,바이탈리티 할인,보험,사이트맵,설정,이벤트,주간 리워드,주간미션,헬시푸드,활동
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [76]:
seqdf

Unnamed: 0,party_id,sesn_id,dt,diff_dt,page_cd,가입/로그인/비밀번호,갤럭시프로그램_x,건강,건강걷기_x,건강도전_x,...,"(value, 바이탈리티 나이)","(value, 바이탈리티 할인)","(value, 보험)","(value, 사이트맵)","(value, 설정)","(value, 이벤트)","(value, 주간 리워드)","(value, 주간미션)","(value, 헬시푸드)","(value, 활동)"
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2021-02-01,0 days,23,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,2021-02-02,1 days,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,3591EC2D-406E-4470-9E37-ED529276CA59,2021-02-05,3 days,6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,861497,32662597-024D-464B-8B7A-8214A49D44F0,2021-02-09,4 days,6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,861497,F4EA8E37-9627-47CA-BE7D-BC8ABC3F5882,2021-02-09,0 days,8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726812,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,2021-02-28,0 days,47,1.0,4.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726813,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,2021-02-28,0 days,25,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
726814,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,2021-02-03,0 days,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
726815,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,2021-02-08,5 days,3,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
#mbrdf와 합치기
mbrdf

Unnamed: 0,party_id,gender_cd,age,vtlt_age,cur_mbrsh_rwrd_st_cd,cur_mbrsh_pd_acqr_pt,wk_misn_sta_dt,fee_yn,fcip_yn,lst_vst_dt,push_alarm_yn,dt,count_vtlt_age_dt,diff_age,achv_rat,active_dur
0,861497,MALE,33,31,1,4100,20180727,N,N,20210331,Y,2021-02-01,1.0,-2,0.318182,920 days
1,861497,MALE,33,31,1,4100,20180727,N,N,20210331,Y,2021-02-02,1.0,-2,0.318182,921 days
2,861497,MALE,33,31,1,4100,20180727,N,N,20210331,Y,2021-02-03,1.0,-2,0.318182,922 days
3,861497,MALE,33,31,1,4100,20180727,N,N,20210331,Y,2021-02-04,1.0,-2,0.315789,923 days
4,861497,MALE,33,31,1,4100,20180727,N,N,20210331,Y,2021-02-05,1.0,-2,0.315789,924 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2234069,1200362289,MALE,73,75,1,550,20210228,N,N,20210331,Y,2021-02-28,1.0,2,0.000000,0 days
2234070,1200362290,MALE,29,34,1,500,20210228,N,N,20210316,Y,2021-02-28,1.0,5,0.000000,0 days
2234071,1200362291,FEMALE,43,48,1,550,20210228,N,N,20210331,Y,2021-02-28,1.0,5,0.000000,0 days
2234072,1200362301,MALE,30,44,1,500,20210228,N,N,20210301,Y,2021-02-28,1.0,14,0.000000,0 days


In [342]:
def diff_vst(x):
    a = x['vst_dtm'].values
    dur_vst = a[-1] - a[0]
    return dur_vst
vstdf = ab[['party_id','sesn_id','vst_dtm']].groupby(['party_id','sesn_id']).apply(diff_vst)

In [343]:
vstdf = vstdf.reset_index()

In [353]:
dd = timedelta(minutes=30)

In [354]:
vstdf[0] < dd

0          True
1          True
2          True
3          True
4          True
           ... 
1569477    True
1569478    True
1569479    True
1569480    True
1569481    True
Name: 0, Length: 1569482, dtype: bool

In [355]:
vstdf.loc[vstdf[0] < dd]

Unnamed: 0,party_id,sesn_id,0
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0 days 00:01:21
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0 days 00:00:00
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0 days 00:00:06
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0 days 00:00:10
4,861497,3E68155F-8B15-44D6-BF3C-E790662E277B,0 days 00:00:00
...,...,...,...
1569477,1200362302,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,0 days 00:04:23
1569478,1200374790,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,0 days 00:00:00
1569479,1200374790,6495507a-229e-4438-9d3a-f9d3d08282f8,0 days 00:00:45
1569480,1200374790,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,0 days 00:00:43


In [356]:
vstdf.loc[vstdf[0] >= dd]

Unnamed: 0,party_id,sesn_id,0
63,863497,FE6A5FE0-302E-44C1-A19B-407A3DB05880,0 days 12:39:10
78,870485,05c8ca41-759b-4e9d-9f48-dd20ed77353e,0 days 05:34:52
90,870485,0fea45b3-78b3-4127-92ea-f853e7e6b2cc,0 days 00:42:42
406,870494,002f55a1-fbdd-4088-90ef-b8cc291d1d18,0 days 06:50:00
407,870494,01ffeffc-e45b-46a9-946c-7cef983f3506,0 days 15:56:18
...,...,...,...
1569344,1200362205,733e65f4-b4b1-43ef-810d-538e48ef4811,0 days 01:42:20
1569347,1200362207,176e0ec7-69f2-43b1-b838-28def67e6061,0 days 00:41:34
1569364,1200362210,c27a8973-2d8d-4870-8b3e-426da73a2c5c,0 days 01:36:05
1569398,1200362233,7d1a625b-c265-4031-96c3-e33604d6169d,0 days 00:54:46


In [197]:
appdf.loc[appdf['sesn_id'] == '0fea45b3-78b3-4127-92ea-f853e7e6b2cc'].head(50)

Unnamed: 0,party_id,vst_dtm,page_cd,sty_tms,sesn_id,month,menu_nm_1,menu_nm_2,dt,mth
2013,870485,2021-02-15 09:52:35,/WID_2_05,0,0fea45b3-78b3-4127-92ea-f853e7e6b2cc,2021-02,위젯,위젯,2021-02-15,2021-02
2014,870485,2021-02-15 09:52:35,/WID_2_05,1210,0fea45b3-78b3-4127-92ea-f853e7e6b2cc,2021-02,위젯,위젯,2021-02-15,2021-02
2015,870485,2021-02-15 09:52:35,/WID_2_99,0,0fea45b3-78b3-4127-92ea-f853e7e6b2cc,2021-02,위젯,위젯,2021-02-15,2021-02
2016,870485,2021-02-15 09:52:35,/WID_2_99,1210,0fea45b3-78b3-4127-92ea-f853e7e6b2cc,2021-02,위젯,위젯,2021-02-15,2021-02


In [359]:
appdf.loc[appdf['sesn_id'] == '6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8']

Unnamed: 0,party_id,vst_dtm,page_cd,sty_tms,sesn_id,menu_nm_1,menu_nm_2,month,month_day,dt,mth
28039234,1200362302,2021-02-28 23:08:11,/member/success/joinSuccess,3,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,가입/로그인/비밀번호,회원가입,2021-02,2021-02-28,2021-02-28,2021-02
28039235,1200362302,2021-02-28 23:08:14,/health/vitalityAge/start,6,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02
28039236,1200362302,2021-02-28 23:08:20,/health/vitalityAge/general/1,5,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02
28039237,1200362302,2021-02-28 23:08:25,/health/vitalityAge/general/2,4,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02
28039238,1200362302,2021-02-28 23:08:29,/health/vitalityAge/general/3,13,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02
28039239,1200362302,2021-02-28 23:08:42,/health/vitalityAge/general/4,16,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02
28039240,1200362302,2021-02-28 23:08:58,/health/vitalityAge/general/5,12,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02
28039241,1200362302,2021-02-28 23:09:10,/health/vitalityAge/general/6,2,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02
28039242,1200362302,2021-02-28 23:09:12,/health/vitalityAge/smoking/7,6,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02
28039243,1200362302,2021-02-28 23:09:18,/health/vitalityAge/smoking/8,2,6FCA01B9-6C57-4F98-AB8C-F9AA9FBA2DF8,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-28,2021-02-28,2021-02


In [352]:
appdf.loc[appdf['sesn_id'] == '8ddc13b0-358c-4d65-97f2-f04d1d8e3d60']

Unnamed: 0,party_id,vst_dtm,page_cd,sty_tms,sesn_id,menu_nm_1,menu_nm_2,month,month_day,dt,mth
28018044,1200360611,2021-02-26 13:42:57,/member/success/joinSuccess,1695,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,가입/로그인/비밀번호,회원가입,2021-02,2021-02-26,2021-02-26,2021-02
28018045,1200360611,2021-02-26 14:11:12,/health/vitalityAge/start,1,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02
28018046,1200360611,2021-02-26 14:11:13,/health/vitalityAge/general/1,28177,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02
28018047,1200360611,2021-02-26 22:00:50,/health/vitalityAge/general/2,11,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02
28018048,1200360611,2021-02-26 22:01:01,/health/vitalityAge/general/3,2,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02
28018049,1200360611,2021-02-26 22:01:03,/health/vitalityAge/general/4,23,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02
28018050,1200360611,2021-02-26 22:01:26,/health/vitalityAge/general/5,20,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02
28018051,1200360611,2021-02-26 22:01:46,/health/vitalityAge/general/6,22,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02
28018052,1200360611,2021-02-26 22:02:08,/health/vitalityAge/smoking/7,33,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02
28018053,1200360611,2021-02-26 22:02:41,/health/vitalityAge/smoking/8,16,8ddc13b0-358c-4d65-97f2-f04d1d8e3d60,바이탈리티 나이,바이탈리티 나이,2021-02,2021-02-26,2021-02-26,2021-02


In [320]:
from datetime import timedelta
dd = timedelta(minutes=30)

In [321]:
dd

datetime.timedelta(0, 1800)

In [351]:
appdf.groupby(['party_id','sesn_id'])

Unnamed: 0,party_id,vst_dtm,page_cd,sty_tms,sesn_id,menu_nm_1,menu_nm_2,month,month_day,dt,mth
0,861497,2021-01-01 01:51:03,/WID_2_99,64610,1E2DD380-1784-4BFF-AEAD-D5470A112525,위젯,위젯,2021-01,2021-01-01,2021-01-01,2021-01
1,861497,2021-01-01 19:47:53,/WID_2_99,22,1E2DD380-1784-4BFF-AEAD-D5470A112525,위젯,위젯,2021-01,2021-01-01,2021-01-01,2021-01
2,861497,2021-01-01 19:48:15,/WID_2_99,0,1E2DD380-1784-4BFF-AEAD-D5470A112525,위젯,위젯,2021-01,2021-01-01,2021-01-01,2021-01
3,861497,2021-01-02 15:42:20,/WID_2_99,0,1E2DD380-1784-4BFF-AEAD-D5470A112525,위젯,위젯,2021-01,2021-01-02,2021-01-02,2021-01
4,861497,2021-01-03 09:59:22,/WID_2_99,0,1E2DD380-1784-4BFF-AEAD-D5470A112525,위젯,위젯,2021-01,2021-01-03,2021-01-03,2021-01
...,...,...,...,...,...,...,...,...,...,...,...
28039269,1200374790,2021-02-03 21:21:51,/MY_2_01,0,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,바이탈리티,바이탈리티 메인,2021-02,2021-02-03,2021-02-03,2021-02
28039270,1200374790,2021-02-08 14:29:35,/member/login,26,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,가입/로그인/비밀번호,로그인,2021-02,2021-02-08,2021-02-08,2021-02
28039271,1200374790,2021-02-08 14:30:01,/member/login/retry_membership,17,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,가입/로그인/비밀번호,회원가입,2021-02,2021-02-08,2021-02-08,2021-02
28039272,1200374790,2021-02-08 14:30:18,/member/login/retry_membership,0,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,가입/로그인/비밀번호,회원가입,2021-02,2021-02-08,2021-02-08,2021-02
