In [1]:
import pandas as pd
from datetime import datetime
from collections import namedtuple
from util import cfg, load_file, read_csv

import time
import numpy as np
import pickle

In [256]:
class SeqDataset():
    
    def __init__(self, year, month, maxlen=4):
        self.year = year
        self.month = month
        self.maxlen = maxlen
        
        print('데이터 셋 로딩중..')
        datelist = self.create_seq_dataset()
        print(datelist)
        print('데이터 셋 로딩 완료')
        print()
        
    def preprocess(self):

        appds = [data[0] for data in self.dataset]
        mbrds = [data[1] for data in self.dataset]
        gmds = [data[2] for data in self.dataset]
        pcd = read_csv(cfg.pgcd)
        lbl = read_csv(cfg.label)
        
        print('applog 전처리중')
        appdf = prep_applogs(appds)
        pcd, code2name = prep_pagecd(pcd)
        appdf = merge_app_and_pcd(appdf, pcd)
#         appdf = merge_app_and_lbl(appdf, lbl)
        print('applog 전처리 완료')
        print()
        
        print('member 전처리중')
        mbrdf = prep_mbrlogs(mbrds)
        print('member 전처리 완료')
        print()
        
        print('goal_mission 전처리중')
        gmdf = prep_gmlogs(gmds)
        print("goal_mission 전처리 완료")
        print()
        return appdf, mbrdf, gmdf
        
    def create_seq_dataset(self):
        """
        year:현재년도
        month:현재월
        maxlen:시계열에서 고려하는 개월 수 
        ex) 현재 개월수가 11월이면, 11월/10월/9월/8월을 고려함
        dataset: [[app, mbr, gm], [app, mbr, gm],...]
        """
        datelist = self.list_dates()
        dataset = self.load_dataset(datelist)
        self.dataset = dataset
        return datelist
    
    def load_dataset(self, datelist):
        data = []
        for date in datelist:
            data.append(load_file(date.year, date.month))
        return data
        
    def list_dates(self):
        mydate = namedtuple('mydate', ['year', 'month'])
        curdate = mydate(self.year, self.month)
        datelist = [curdate]
        for i in range(1, self.maxlen):
            year = curdate.year
            month = curdate.month - i
            if year == 2021 and month <= 0:
                year = 2020
                month += 12
            elif year == 2020 and month <= 0:
                return datelist       
            datelist.append(mydate(year, month))
        return datelist      
    
    
    def create_seqds_per_sess(self, appdf, mbrdf, logdf):
        return
    
    def create_seqds(self, appdf, mbrdf, logdf):
        return
    
    def prep_appdf_per_sess(self, appdf):
        return
    
def prep_applogs(applogs):
    df = pd.concat(applogs, axis=0)
    
    #null제거
    print('Orig. data len:', len(df))
    df = df.dropna()
    print('Aft. drop-nan:', len(df))
    
    #방문일시 변경
    vst_dtm = df['vst_dtm'].astype('str')
    f = lambda x: x[:-3]
    vst_dtm = vst_dtm.apply(f)
    vst_dtm = pd.to_datetime(vst_dtm, format='%Y%m%d%H%M%S')
    df['vst_dtm'] = vst_dtm
    
    #필요 없는 칼럼 drop
    df = df.drop(['login_yn', 'new_vst_yn', 'tlcom_co_cd'], axis=1)
    
    #1970년대 데이터 제외
    df = df[df['vst_dtm'].dt.year != 1970]
    df = df.reset_index(drop=True)

    #session_id '#' 제거
    inds = np.where(df['sesn_id'] == '#')[0]
    df = df.drop(inds)
    
    #'month'칼럼 추가
    df['month'] = df['vst_dtm'].dt.to_period('M')
    
    #sorting
    df = df.sort_values(['party_id', 'vst_dtm', 'sesn_id'])
    return df

def prep_pagecd(pcd):
    pcd = pcd.reset_index(drop=True)
    pcd = pcd.drop(columns=['No'])
    
    code2name = {}
    for k, v in zip(pcd['page_cd'].values,  pcd['page_nm'].values):
        if pd.isnull(v):
            code2name[k]=k
        else:
            code2name[k]=v
    return pcd, code2name

def prep_lbl(lbl):
    lbl['party_id'] = lbl['PartyId']
    lbl = lbl.drop(columns=['Unnamed: 0', 'PartyId'])
    lbl['month'] = pd.to_datetime(lbl['month'], format='%Y-%m')
    lblmth = lbl.loc[np.all([lbl['month'].dt.year==2021, lbl['month'].dt.month==2], axis=0)]
    lblmth = lblmth.drop(columns=['month'])

def prep_mbrlogs(mbrlogs):
    
    def count_vtlt_age_eff_dt(x):
        count_vtlt_age = np.zeros(len(x['vtlt_age_eff_dt']), dtype=np.float32)
        vtlt_effs = np.unique(x['vtlt_age_eff_dt'])
        for eff in vtlt_effs:
            if eff == 99991231:
                continue
            else:
                ind = np.where(x['vtlt_age_eff_dt'] == eff)[0][0]
                count_vtlt_age[ind:] += 1
        return pd.Series(count_vtlt_age, name='count_vtlt_age')
    
    #concat
    mbrdf = pd.concat(mbrlogs, axis=0)
    
    #dt -> datetime 으로 변경
    mbrdf['dt'] = pd.to_datetime(mbrdf['dt'], format='%Y%m%d')
#     mbrdf['dt'] = pd.to_datetime(mbrdf['dt'], format='%Y-%m-%d')
    
    #party_id당 dt순으로 sorting
    mbrdf = mbrdf.sort_values(['party_id', 'dt'])
    
    #사용안하는 컬럼 drop
    mbrdf = mbrdf.drop(columns=cfg.unused_mbrcol)
    
    #null제거
    print('Orig. data len:', len(mbrdf))
    mbrdf = mbrdf.dropna()
    print('Aft. drop-nan:', len(mbrdf), '\n')
    
    #party_id -> int형으로 변환
    mbrdf['party_id'] = mbrdf['party_id'].astype('int32')
    
    #바이탈리티 나이 측정 횟수 관련 전처리
    newcol = mbrdf.groupby(['party_id']).apply(count_vtlt_age_eff_dt)
    mbrdf['count_vtlt_age_dt'] = newcol.values
    
    #바이탈리티 나이 차이 관련 전처리
    mbrdf = mbrdf.reset_index(drop=True)
    inds = np.where(mbrdf['vtlt_age'] == 'NOT_ENOUGH_DATA')[0]
    mbrdf.loc[inds, 'vtlt_age'] = '0'
    mbrdf['vtlt_age'] = mbrdf['vtlt_age'].astype('int32')
    mbrdf['diff_age'] = mbrdf['vtlt_age'] - mbrdf['age']
    
    #주간미션달성률 관련 전처리
    mbrdf['achv_rat'] = mbrdf['cur_mbrsh_pd_goal_achv_cnt'] / mbrdf['cur_mbrsh_pd_goal_alct_cnt']
    
    #회원가입이후 경과일
    pids = np.unique(mbrdf.loc[mbrdf['mbr_scrb_dt'] == 99991231]['party_id'].values)
    newval = []
    passpids = []
    for pid in pids:
        pidmbrdf = mbrdf.loc[mbrdf['party_id'] == pid]
        vals = np.unique(pidmbrdf['mbr_scrb_dt'].values)
        inds = np.where(vals != 99991231)[0]
        if len(inds) > 1:
            newval.append(vals[np.where(vals != 99991231)[0][0]])
        else:
            mbrdf = mbrdf.drop(pidmbrdf.index)
            passpids.append(pid)
            
    for val, pid in zip(newval, pids):
        if pid in passpids:
            pass
        inds = np.where(mbrdf['party_id'] == pid)[0]
        mbrdf.loc[inds, 'mbr_scrb_dt'] = val 
    
    mbrdf['mbr_scrb_dt'] = pd.to_datetime(mbrdf['mbr_scrb_dt'], format='%Y%m%d')
    mbrdf['active_dur'] = mbrdf['dt'] - mbrdf['mbr_scrb_dt']
    
    #멤버십 등급 -> 1,2,3,4로 변경
    mbrsh_dic = {'Bronze': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}
    f = lambda x : mbrsh_dic[x]
    newmbrsh = mbrdf['cur_mbrsh_rwrd_st_cd'].transform(f)
    mbrdf['cur_mbrsh_rwrd_st_cd'] = newmbrsh
    
    #필요없는 칼럼 drop
#     mbrdf = mbrdf.drop(columns=['vtlt_age_eff_dt', 'mbr_scrb_dt', 'cur_mbrsh_pd_goal_alct_cnt','cur_mbrsh_pd_goal_achv_cnt'])
    
    return mbrdf

def prep_gmlogs(gmlogs):
    
    gmdf = pd.concat(gmlogs, axis=0)
    gmdf = gmdf[['party_id', 'p_event_apl_dte','points_value','points_effective_dte','conn_equip']]
    #gmdf = gmdf[cfg.used_gmcol]
    gmdf = gmdf.replace('#', np.nan)
    
    print('Orig. data len:', len(gmdf))
    gmdf = gmdf.dropna()
    print('Aft. drop-nan:', len(gmdf))
    
    #party_id -> int
    gmdf['party_id'] = gmdf['party_id'].astype('int32')
    
    #datetime형으로 변환
    gmdf['p_event_apl_dte'] = pd.to_datetime(gmdf['p_event_apl_dte'], format='%Y%m%d')
    gmdf['points_effective_dte'] = pd.to_datetime(gmdf['points_effective_dte'], format='%Y%m%d')
    
    #sorting
    gmdf = gmdf.sort_values(['party_id', 'p_event_apl_dte'])
    return gmdf

def merge_app_and_pcd(df, pcd):
    return pd.merge(left=df, right=pcd[['page_cd','menu_nm_1','menu_nm_2']], on=['page_cd'], how='left', sort=False)

def merge_app_and_lbl(df, lbl):   
    return pd.merge(appdf, lblmth, on=['party_id'], how='inner', sort=False)

    



In [257]:
seqds = SeqDataset(year=2021, month=2, maxlen=1)
df = seqds.preprocess()

데이터 셋 로딩중..
data/applog/applog_202102.csv 로딩중
data/applog/applog_202102.csv 로딩완료

data/member/mbr_202102.csv 로딩중
data/member/mbr_202102.csv 로딩완료

data/mission/goal_misn_202102.csv 로딩중
data/mission/goal_misn_202102.csv 로딩완료

[mydate(year=2021, month=2)]
데이터 셋 로딩 완료

applog 전처리중
Orig. data len: 13439757
Aft. drop-nan: 13439757
applog 전처리 완료

member 전처리중
Orig. data len: 2234074
Aft. drop-nan: 2234074 

member 전처리 완료

goal_mission 전처리중
Orig. data len: 987857
Aft. drop-nan: 171133
goal_mission 전처리 완료



In [258]:
appdf, mbrdf, gmdf = df

In [27]:
appdf

Unnamed: 0,party_id,vst_dtm,page_cd,sty_tms,sesn_id,month,menu_nm_1,menu_nm_2
0,861497,2021-02-01 00:10:13,/WID_2_99,1938,6DBC7D4C-55B4-4021-9AB4-D55AA02CC90C,2021-02,위젯,위젯
1,861497,2021-02-01 00:42:31,/WID_2_99,920,6DBC7D4C-55B4-4021-9AB4-D55AA02CC90C,2021-02,위젯,위젯
2,861497,2021-02-01 00:57:51,/WID_2_99,764,6DBC7D4C-55B4-4021-9AB4-D55AA02CC90C,2021-02,위젯,위젯
3,861497,2021-02-01 01:10:35,/WID_2_99,25726,6DBC7D4C-55B4-4021-9AB4-D55AA02CC90C,2021-02,위젯,위젯
4,861497,2021-02-01 08:19:21,/WID_2_99,77,6DBC7D4C-55B4-4021-9AB4-D55AA02CC90C,2021-02,위젯,위젯
...,...,...,...,...,...,...,...,...
13439666,1200374790,2021-02-03 21:21:51,/MY_2_01,0,247eead9-c9d3-4d1b-bb00-e81fcda5a8a0,2021-02,바이탈리티,바이탈리티 메인
13439667,1200374790,2021-02-08 14:29:35,/member/login,26,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,2021-02,가입/로그인/비밀번호,로그인
13439668,1200374790,2021-02-08 14:30:01,/member/login/retry_membership,17,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,2021-02,가입/로그인/비밀번호,회원가입
13439669,1200374790,2021-02-08 14:30:18,/member/login/retry_membership,0,c50a6e6c-de5a-40fd-b954-0d59d7dd1b2d,2021-02,가입/로그인/비밀번호,회원가입


In [28]:
appdf['dt'] = appdf['vst_dtm'].dt.to_period('D')
appdf['mth'] = appdf['vst_dtm'].dt.to_period('M')

In [29]:
#appdf 전처리
#1.위젯 제거 #수인선임님이 진행해주실 예정
print('before drop 위젯', len(appdf))
appdf = appdf.loc[appdf['menu_nm_1'] != '위젯']
print('after drop 위젯', len(appdf))

before drop 위젯 13439671
after drop 위젯 13302819


In [30]:
#2. menu_nm_1 == Nan or menu_nm_2 == Nan인 경우로만 이뤄진 session_id 제거하기
## 먼저, 전체 고유 party_id와 session_id 갯수 -> menu_nm_1이 nan인 경우에 대해서 제거된 케이스
menusess1 = appdf[['party_id','sesn_id','page_cd']].groupby(['party_id','sesn_id']).first().reset_index()
menusess1 = menusess1[['party_id','sesn_id']]
len(menusess1)

727030

In [31]:
## nan이 제거된 전체 고유 party_id와 session_id갯수
menusess2 = appdf[['party_id','sesn_id','menu_nm_1','page_cd']].groupby(['party_id','sesn_id','menu_nm_1']).count().reset_index()
menusess2 = menusess2[['party_id','sesn_id']].drop_duplicates()
len(menusess2)

726857

In [32]:
menusess = pd.concat([menusess1, menusess2], axis=0)
menusess = menusess.loc[~menusess.duplicated(keep=False)]

In [33]:
pids_isin = np.isin(appdf['party_id'], menusess['party_id'])
sess_isin = np.isin(appdf['sesn_id'], menusess['sesn_id'])

In [34]:
print(len(pids_isin), len(sess_isin))

13302819 13302819


In [35]:
appdf = appdf.loc[~np.all([pids_isin, sess_isin], axis=0)]

In [36]:
len(appdf)

13302599

In [37]:
menusess3 = appdf[['party_id','sesn_id','menu_nm_2','page_cd']].groupby(['party_id','sesn_id','menu_nm_2']).count().reset_index()
menusess3 = menusess3[['party_id','sesn_id']].drop_duplicates()
len(menusess3)

726817

In [38]:
menusess = pd.concat([menusess1, menusess3], axis=0)
menusess = menusess.loc[~menusess.duplicated(keep=False)]

pids_isin = np.isin(appdf['party_id'], menusess['party_id'])
sess_isin = np.isin(appdf['sesn_id'], menusess['sesn_id'])
appdf = appdf.loc[~np.all([pids_isin, sess_isin], axis=0)]
len(appdf)

13302549

In [39]:
#전처리3 보험가입자 제외
lbl = read_csv(cfg.label)
lbl['party_id'] = lbl['PartyId']
lbl = lbl.drop(columns=['Unnamed: 0', 'PartyId'])

In [40]:
lbl['month'] = pd.to_datetime(lbl['month'], format='%Y-%m')
lblmth = lbl.loc[np.all([lbl['month'].dt.year==2021, lbl['month'].dt.month==2], axis=0)]
lblmth = lblmth.drop(columns=['month'])
appdf = pd.merge(appdf, lblmth, on=['party_id'], how='inner', sort=False)

In [41]:
seqdf = appdf[['party_id','page_cd','sesn_id','dt']].groupby(['party_id','sesn_id']).last()['dt']
seqdf = seqdf.reset_index()
seqdf = seqdf.sort_values(['party_id','dt'])
seqdf = seqdf.reset_index(drop=True)

In [42]:
#session간의 방문일자 차이
def diff_vstdate(x):
    b = pd.concat([pd.Series(x['dt'].iloc[0]), x['dt'].iloc[:-1]]).reset_index(drop=True)
    seqdiff = x['dt'].reset_index(drop=True).dt.to_timestamp() - b.dt.to_timestamp()
    seqdiff.name = "diff_dt"
    return seqdiff
diffdf = seqdf.groupby(['party_id']).apply(diff_vstdate)

In [43]:
diffdf = diffdf.reset_index(drop=True)
seqdf = pd.concat([seqdf, diffdf], axis=1)
seqdf

Unnamed: 0,party_id,sesn_id,dt,diff_dt
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2021-02-01,0 days
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,2021-02-02,1 days
2,861497,3591EC2D-406E-4470-9E37-ED529276CA59,2021-02-05,3 days
3,861497,32662597-024D-464B-8B7A-8214A49D44F0,2021-02-09,4 days
4,861497,F4EA8E37-9627-47CA-BE7D-BC8ABC3F5882,2021-02-09,0 days
...,...,...,...,...
664271,1200362291,17a02086-c213-4057-8b7a-ce05f5e57972,2021-02-28,0 days
664272,1200362291,9654c21a-9511-4cc3-abda-e2b149102e86,2021-02-28,0 days
664273,1200362291,a4532e32-182d-4b99-bddf-4ec28a8b7f01,2021-02-28,0 days
664274,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,2021-02-28,0 days


In [44]:
pcd = read_csv(cfg.pgcd)
lbl = read_csv(cfg.label)

In [45]:
uni_pcd_depth1 = pcd['menu_nm_1'].unique()
uni_pcd_depth2 = pcd['menu_nm_2'].unique()

In [46]:
print(len(uni_pcd_depth1))
uni_pcd_depth1

24


array([nan, '바이탈리티', '이벤트', '마음챙김', '활동', '디바이스연동', '공통', '보험', '설정',
       '갤럭시프로그램', '사이트맵', '꿀팁', '바이탈리티 할인', '건강도전', '건강', '바이탈리티 나이',
       '가입/로그인/비밀번호', '내바이탈리티', '등급리워드', '헬시푸드', '위젯', '주간 리워드', '주간미션',
       '건강걷기'], dtype=object)

In [47]:
print(len(uni_pcd_depth2))
uni_pcd_depth2

60


array([nan, '바이탈리티 메인', '이벤트', '마음챙김', '활동-헤더메시지', '디바이스연동', '맞춤보험', '채널',
       '갤럭시프로그램', '팝업', '바이탈리티-헤더메시지', '사이트맵', '꿀팁 메인', '꿀팁-컨텐츠', '자동로그인',
       '바이탈리티 할인', '꿀팁-영양', '꿀팁-예방', '꿀팁-운동', '꿀팁-일상', '건강도전', '디지털다이렉트',
       '서비스가이드', '바이탈리티 나이', '건강증진형보험', '회원가입', '내바이탈리티', '멀티팝업', '공지팝업',
       'FAQ', '공지사항', '등급리워드', '헬시푸드', '약관', '건강-배너', '건강메인', '정밀검진/예방접종',
       '마음건강진단', '기초건강검진', '영양균형진단', '금연선언', '시작', '다이렉트보험', '위젯',
       '비밀번호재설정', '로그인', '꿀팁-상단배너', '바이탈리티-상단배너', '보험-상단배너', '보험 메인',
       '보험/금융팁', '보험-헤더메시지', '개인설정', '회원탈퇴', '제휴사 할인 이벤트', '주간 리워드',
       '활동 메인', '주간미션', '건강걷기', '활동-상단배너'], dtype=object)

In [55]:
#session별 페이지 길이
pglen_perse = appdf.groupby(['party_id','sesn_id']).count().reset_index()[['party_id','sesn_id','page_cd']]
pglen_perse

Unnamed: 0,party_id,sesn_id,page_cd
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,23
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,1
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,6
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,6
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,19
...,...,...,...
664271,1200362291,17a02086-c213-4057-8b7a-ce05f5e57972,36
664272,1200362291,9654c21a-9511-4cc3-abda-e2b149102e86,24
664273,1200362291,a4532e32-182d-4b99-bddf-4ec28a8b7f01,13
664274,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,47


In [48]:
#카테고리별 방문횟수
uv_per_d1 = appdf.groupby(['party_id','sesn_id','menu_nm_1']).count().reset_index()[['party_id','sesn_id','menu_nm_1','page_cd']]
uv_per_d2 = appdf.groupby(['party_id','sesn_id','menu_nm_2']).count().reset_index()[['party_id','sesn_id','menu_nm_2','page_cd']]

uv_per_d1 = uv_per_d1.pivot(index=['party_id','sesn_id'], columns='menu_nm_1', values='page_cd')
uv_per_d1 = uv_per_d1.fillna(0).reset_index()

uv_per_d2 = uv_per_d2.pivot(index=['party_id','sesn_id'], columns='menu_nm_2', values='page_cd')
uv_per_d2 = uv_per_d2.fillna(0).reset_index()

In [49]:
stydepth1 = appdf[['party_id','sesn_id','menu_nm_1','sty_tms']].groupby(['party_id','sesn_id','menu_nm_1']).mean()
stydepth1 = stydepth1.reset_index()
stydepth1 = stydepth1.pivot(index=['party_id','sesn_id'], columns=['menu_nm_1'], values=['sty_tms'])
stydepth1 = stydepth1.fillna(0).reset_index()

In [50]:
stydepth2 = appdf[['party_id','sesn_id','menu_nm_2','sty_tms']].groupby(['party_id','sesn_id','menu_nm_2']).mean()
stydepth2 = stydepth2.reset_index()
stydepth2 = stydepth2.pivot(index=['party_id','sesn_id'], columns=['menu_nm_2'],values=['sty_tms'])
stydepth2 = stydepth2.fillna(0).reset_index()

In [51]:
#종료율 관련
endmenu = appdf[['party_id','page_cd','sesn_id','menu_nm_1']].groupby(['party_id','sesn_id']).last().reset_index()

In [52]:
endmenu['value'] = 1

In [53]:
endmenu = endmenu.pivot(index=['party_id','sesn_id'], columns=['menu_nm_1'], values=['value'])
endmenu = endmenu.fillna(0).reset_index()

In [56]:
#seqdf, pglen_perse, uv_per_d1, uv_per_d2, stydepth1, stydepth2, endmenu
assert len(seqdf) == len(pglen_perse) == len(uv_per_d1) ==len(uv_per_d2) == len(stydepth1) == len(stydepth2) == len(endmenu)

In [57]:
beflen = len(seqdf)
seqdf = pd.merge(seqdf, pglen_perse, on=['party_id', 'sesn_id'])
assert beflen == len(seqdf)

In [58]:
uv_per_d1.columns.name = None

In [59]:
beflen = len(seqdf)
seqdf = pd.merge(seqdf, uv_per_d1, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

In [60]:
uv_per_d2.columns.name = None
display(uv_per_d2)

Unnamed: 0,party_id,sesn_id,FAQ,개인설정,갤럭시프로그램,건강-배너,건강걷기,건강도전,건강메인,건강증진형보험,...,제휴사 할인 이벤트,주간 리워드,주간미션,채널,헬시푸드,활동 메인,활동-상단배너,활동-헤더메시지,회원가입,회원탈퇴
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664271,1200362291,17a02086-c213-4057-8b7a-ce05f5e57972,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
664272,1200362291,9654c21a-9511-4cc3-abda-e2b149102e86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
664273,1200362291,a4532e32-182d-4b99-bddf-4ec28a8b7f01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
664274,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,0.0,0.0,4.0,0.0,0.0,0.0,1.0,3.0,...,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [61]:
beflen = len(seqdf)
seqdf = pd.merge(seqdf, uv_per_d2, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

In [62]:
beflen = len(seqdf)
seqdf = pd.merge(seqdf, stydepth1, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [63]:
stydepth2.columns.name = None
beflen = len(seqdf)
seqdf = pd.merge(seqdf, stydepth2, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

In [64]:
display(endmenu)
endmenu.columns.name = None
beflen = len(seqdf)
seqdf = pd.merge(seqdf, endmenu, on=['party_id','sesn_id'])
assert beflen == len(seqdf)

Unnamed: 0_level_0,party_id,sesn_id,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
menu_nm_1,Unnamed: 1_level_1,Unnamed: 2_level_1,가입/로그인/비밀번호,갤럭시프로그램,건강,건강걷기,건강도전,공통,꿀팁,내바이탈리티,...,바이탈리티 나이,바이탈리티 할인,보험,사이트맵,설정,이벤트,주간 리워드,주간미션,헬시푸드,활동
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,32662597-024D-464B-8B7A-8214A49D44F0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,861497,3591EC2D-406E-4470-9E37-ED529276CA59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,861497,ABF36BBE-E91B-4EAF-BB84-CA656AB49566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664271,1200362291,17a02086-c213-4057-8b7a-ce05f5e57972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664272,1200362291,9654c21a-9511-4cc3-abda-e2b149102e86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664273,1200362291,a4532e32-182d-4b99-bddf-4ec28a8b7f01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
664274,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [65]:
seqdf

Unnamed: 0,party_id,sesn_id,dt,diff_dt,page_cd,가입/로그인/비밀번호,갤럭시프로그램_x,건강,건강걷기_x,건강도전_x,...,"(value, 바이탈리티 나이)","(value, 바이탈리티 할인)","(value, 보험)","(value, 사이트맵)","(value, 설정)","(value, 이벤트)","(value, 주간 리워드)","(value, 주간미션)","(value, 헬시푸드)","(value, 활동)"
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2021-02-01,0 days,23,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,2021-02-02,1 days,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,861497,3591EC2D-406E-4470-9E37-ED529276CA59,2021-02-05,3 days,6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,861497,32662597-024D-464B-8B7A-8214A49D44F0,2021-02-09,4 days,6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,861497,F4EA8E37-9627-47CA-BE7D-BC8ABC3F5882,2021-02-09,0 days,8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664271,1200362291,17a02086-c213-4057-8b7a-ce05f5e57972,2021-02-28,0 days,36,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664272,1200362291,9654c21a-9511-4cc3-abda-e2b149102e86,2021-02-28,0 days,24,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
664273,1200362291,a4532e32-182d-4b99-bddf-4ec28a8b7f01,2021-02-28,0 days,13,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
664274,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,2021-02-28,0 days,47,1.0,4.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
#mbrdf와 합치기
#추가로 drop해야할 row --> util에 넣채
# mbrdf = mbrdf.drop(columns=['wk_misn_sta_dt','fee_yn','fcip_yn','lst_vst_dt'])

KeyError: "['wk_misn_sta_dt' 'fee_yn' 'fcip_yn'] not found in axis"

In [67]:
seqdf['dt'] = seqdf['dt'].dt.to_timestamp()

In [77]:
np.unique(seqdf['party_id'].values).shape

(52136,)

In [78]:
np.unique(mbrdf['party_id'].values).shape

(96339,)

In [68]:
merged_mbr = pd.merge(seqdf, mbrdf, on=['party_id', 'dt'], how='inner')

In [168]:
#gmdf와 합치기

In [170]:
gmdf

Unnamed: 0,party_id,p_event_apl_dte,points_value,points_effective_dte
32181,870485,2021-02-01,100,2021-02-01
71412,870485,2021-02-02,50,2021-02-02
151320,870485,2021-02-04,100,2021-02-04
224056,870485,2021-02-06,100,2021-02-06
265218,870485,2021-02-08,100,2021-02-08
...,...,...,...,...
983509,1200362265,2021-02-28,100,2021-02-28
973873,1200362284,2021-03-06,100,2021-02-28
985846,1200362289,2021-02-28,50,2021-02-28
981557,1200362291,2021-02-28,50,2021-02-28


In [149]:
gmjan = read_csv('data/mission/goal_misn_202101.csv')

In [150]:
gmjan = prep_gmlogs([gmjan])

Orig. data len: 1091213
Aft. drop-nan: 368879


In [None]:
gmjan['points_effective_dte'] = pd.to_datetime(gmjan['points_effective_dte'], format='%Y-%m-%d'

In [157]:
gmjan.loc[(gmjan['p_event_apl_dte'] - gmjan['points_effective_dte']).dt.days > 10]

Unnamed: 0,party_id,p_event_apl_dte,points_value,points_effective_dte,conn_equip
423348,5406482,2021-01-24,100,2021-01-13,Garmin
342822,19698977,2021-01-24,50,2021-01-11,fitbit
783686,33593996,2021-02-04,50,2021-01-23,fitbit
437750,123522597,2021-02-01,50,2021-01-13,fitbit
639979,169705088,2021-02-01,0,2021-01-19,suunto
...,...,...,...,...,...
207570,1200288901,2021-01-24,50,2021-01-06,Garmin
207572,1200288901,2021-01-24,50,2021-01-06,Garmin
630786,1200291249,2021-02-16,50,2021-01-19,S Health Third-party
918911,1200330013,2021-02-16,50,2021-01-27,Garmin


In [158]:
gmdf = pd.concat([gmjan, gmdf], axis=0)

In [162]:
gmdf = gmdf.sort_values(['party_id','p_event_apl_dte'])

In [164]:
gmdf = gmdf.loc[gmdf['p_event_apl_dte'].dt.month == 2]

In [165]:
gmdf

Unnamed: 0,party_id,p_event_apl_dte,points_value,points_effective_dte,conn_equip
32181,870485,2021-02-01,100,2021-02-01,
71412,870485,2021-02-02,50,2021-02-02,
151320,870485,2021-02-04,100,2021-02-04,
224056,870485,2021-02-06,100,2021-02-06,
265218,870485,2021-02-08,100,2021-02-08,
...,...,...,...,...,...
980303,1200362245,2021-02-28,100,2021-02-28,
983509,1200362265,2021-02-28,100,2021-02-28,
985846,1200362289,2021-02-28,50,2021-02-28,
981557,1200362291,2021-02-28,50,2021-02-28,


In [168]:
gmdf.loc[gmdf['party_id'] == 870485]

Unnamed: 0,party_id,p_event_apl_dte,points_value,points_effective_dte,conn_equip
32181,870485,2021-02-01,100,2021-02-01,
71412,870485,2021-02-02,50,2021-02-02,
151320,870485,2021-02-04,100,2021-02-04,
224056,870485,2021-02-06,100,2021-02-06,
265218,870485,2021-02-08,100,2021-02-08,
303852,870485,2021-02-09,100,2021-02-09,
342791,870485,2021-02-10,100,2021-02-10,
408960,870485,2021-02-12,50,2021-02-12,
437501,870485,2021-02-14,50,2021-02-13,
545029,870485,2021-02-16,100,2021-02-16,


In [167]:
seqdf.loc[seqdf['party_id'] == 870485]

Unnamed: 0,party_id,sesn_id,dt,diff_dt,page_cd,가입/로그인/비밀번호,갤럭시프로그램_x,건강,건강걷기_x,건강도전_x,...,"(value, 바이탈리티 나이)","(value, 바이탈리티 할인)","(value, 보험)","(value, 사이트맵)","(value, 설정)","(value, 이벤트)","(value, 주간 리워드)","(value, 주간미션)","(value, 헬시푸드)","(value, 활동)"
38,870485,22d558d5-46c1-4b3e-8b6e-6c2555649a6d,2021-02-01,0 days,20,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39,870485,234eaa48-4adb-430a-b0a2-094f458137c5,2021-02-01,0 days,29,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,870485,5763a9ab-2f88-45b9-8ff6-207eeb61430a,2021-02-01,0 days,14,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
41,870485,6a918435-3550-444c-ae0a-b39408375f1d,2021-02-01,0 days,9,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,870485,dfc642f8-6794-47f1-8cf6-459264187acc,2021-02-01,0 days,12,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,870485,48ba9f3c-a79b-4748-a7fb-14bf4da69e9c,2021-02-28,1 days,10,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209,870485,63a57116-16ed-4ad7-9d65-67a2a6d5b962,2021-02-28,0 days,8,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
210,870485,a5162c95-bc8c-42de-8743-9c3277f9c7b6,2021-02-28,0 days,12,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,870485,adae473f-17d7-4ce3-ae7e-d0e9bb326663,2021-02-28,0 days,20,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [171]:
def check_ge5(x):
    b = pd.concat([pd.Series(x['p_event_apl_dte'].iloc[0]), x['p_event_apl_dte'].iloc[:-1]]).reset_index(drop=True)
    seqdiff = x['p_event_apl_dte'].reset_index(drop=True) - b
    seqdiff.name = "diff_dt"
    return seqdiff
    
gmdiff = gmdf.groupby(['party_id']).apply(check_ge5)

In [183]:
gmdiff.loc[gmdiff.dt.days < 5]

party_id     
870485      0   0 days
            1   1 days
            2   2 days
            3   2 days
            4   2 days
                 ...  
1200362245  0   0 days
1200362265  0   0 days
1200362289  0   0 days
1200362291  0   0 days
1200362302  0   0 days
Name: diff_dt, Length: 483741, dtype: timedelta64[ns]

In [184]:
gmdf.loc[gmdf['party_id'] == 870485]

Unnamed: 0,party_id,p_event_apl_dte,points_value,points_effective_dte,conn_equip
32181,870485,2021-02-01,100,2021-02-01,
71412,870485,2021-02-02,50,2021-02-02,
151320,870485,2021-02-04,100,2021-02-04,
224056,870485,2021-02-06,100,2021-02-06,
265218,870485,2021-02-08,100,2021-02-08,
303852,870485,2021-02-09,100,2021-02-09,
342791,870485,2021-02-10,100,2021-02-10,
408960,870485,2021-02-12,50,2021-02-12,
437501,870485,2021-02-14,50,2021-02-13,
545029,870485,2021-02-16,100,2021-02-16,


In [189]:
seqdf.loc[seqdf['party_id'] == 870485].iloc[100:150]

Unnamed: 0,party_id,sesn_id,dt,diff_dt,page_cd,가입/로그인/비밀번호,갤럭시프로그램_x,건강,건강걷기_x,건강도전_x,...,"(value, 바이탈리티 나이)","(value, 바이탈리티 할인)","(value, 보험)","(value, 사이트맵)","(value, 설정)","(value, 이벤트)","(value, 주간 리워드)","(value, 주간미션)","(value, 헬시푸드)","(value, 활동)"
138,870485,da879ed1-dd95-453e-8183-11d97176d22a,2021-02-15,0 days,25,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139,870485,03a3d43c-9fab-45eb-bbf2-aa6e7c9992f8,2021-02-16,1 days,8,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140,870485,73b3b412-c8e6-4967-a1b2-a25b60f574ff,2021-02-16,0 days,16,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
141,870485,940d9303-7973-4a1e-9049-c45f761433b1,2021-02-16,0 days,10,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142,870485,a5fb4d38-605a-4e7d-bb57-68871d8a8336,2021-02-16,0 days,12,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
143,870485,c3aab861-4d34-4ea2-a522-32a8ddb616e6,2021-02-16,0 days,9,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,870485,dec3f818-4784-43f7-9a4e-1e0b5289cec3,2021-02-16,0 days,9,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145,870485,40218e8a-7c31-4961-8ca7-e9c9c155e659,2021-02-17,1 days,9,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,870485,751ae6f3-7db5-43b6-880c-f217fc0b8540,2021-02-17,0 days,9,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,870485,8d6de2ed-c985-4e09-bbad-46dae48d7da5,2021-02-17,0 days,15,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
#포인트 반영일이랑 획득일 차이가 10일이상인 row 제외
print(len(gmdf))
gmdf = gmdf.loc[(gmdf['p_event_apl_dte'] - gmdf['points_effective_dte']).dt.days <= 10]
print(len(gmdf))

506526
506083


In [239]:
pointsdf = gmdf[['party_id','p_event_apl_dte','points_value']]
pointsdf['points_value'] = pointsdf['points_value'].astype('float32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [241]:
pointsdf = pointsdf.groupby(['party_id','p_event_apl_dte']).sum()

In [242]:
pointsdf = pointsdf.reset_index()

In [246]:
pointsdf['dt'] = pointsdf['p_event_apl_dte']

In [247]:
pointsdf = pointsdf.drop(columns=['p_event_apl_dte'])

In [248]:
pointsdf.head()

Unnamed: 0,party_id,points_value,dt
0,870485,100.0,2021-02-01
1,870485,50.0,2021-02-02
2,870485,100.0,2021-02-04
3,870485,100.0,2021-02-06
4,870485,100.0,2021-02-08


In [249]:
mergedaa = pd.merge(merged_mbr, pointsdf, on=['party_id','dt'], how='left')

In [277]:
mergedaa[['achv_rat','points_value']] = mergedaa[['achv_rat','points_value']].fillna(value=0)

In [279]:
mergedaa

Unnamed: 0,party_id,sesn_id,dt,diff_dt,page_cd,가입/로그인/비밀번호,갤럭시프로그램_x,건강,건강걷기_x,건강도전_x,...,vtlt_age,cur_mbrsh_rwrd_st_cd,cur_mbrsh_pd_acqr_pt,lst_vst_dt,push_alarm_yn,count_vtlt_age_dt,diff_age,achv_rat,active_dur,points_value
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2021-02-01,0 days,23,0.0,2.0,1.0,0.0,0.0,...,31,1,4100,20210331,Y,1.0,-2,0.318182,920 days,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,2021-02-02,1 days,1,0.0,0.0,0.0,0.0,0.0,...,31,1,4100,20210331,Y,1.0,-2,0.318182,921 days,0.0
2,861497,3591EC2D-406E-4470-9E37-ED529276CA59,2021-02-05,3 days,6,0.0,0.0,0.0,0.0,0.0,...,31,1,4100,20210331,Y,1.0,-2,0.315789,924 days,0.0
3,861497,32662597-024D-464B-8B7A-8214A49D44F0,2021-02-09,4 days,6,0.0,0.0,0.0,0.0,0.0,...,31,1,4100,20210331,Y,1.0,-2,0.315789,928 days,0.0
4,861497,F4EA8E37-9627-47CA-BE7D-BC8ABC3F5882,2021-02-09,0 days,8,0.0,0.0,0.0,0.0,0.0,...,31,1,4100,20210331,Y,1.0,-2,0.315789,928 days,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664271,1200362291,17a02086-c213-4057-8b7a-ce05f5e57972,2021-02-28,0 days,36,1.0,0.0,1.0,0.0,0.0,...,48,1,550,20210331,Y,1.0,5,0.000000,0 days,50.0
664272,1200362291,9654c21a-9511-4cc3-abda-e2b149102e86,2021-02-28,0 days,24,1.0,0.0,0.0,0.0,0.0,...,48,1,550,20210331,Y,1.0,5,0.000000,0 days,50.0
664273,1200362291,a4532e32-182d-4b99-bddf-4ec28a8b7f01,2021-02-28,0 days,13,0.0,0.0,0.0,0.0,0.0,...,48,1,550,20210331,Y,1.0,5,0.000000,0 days,50.0
664274,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,2021-02-28,0 days,47,1.0,4.0,1.0,0.0,0.0,...,44,1,500,20210301,Y,1.0,14,0.000000,0 days,0.0


In [280]:
mergedaa.iloc[:,:10]

Unnamed: 0,party_id,sesn_id,dt,diff_dt,page_cd,가입/로그인/비밀번호,갤럭시프로그램_x,건강,건강걷기_x,건강도전_x
0,861497,0DE31D0B-9E4A-483A-A35C-E3D081100872,2021-02-01,0 days,23,0.0,2.0,1.0,0.0,0.0
1,861497,2E53BA03-0D5B-4508-A5FE-BD28A5516C61,2021-02-02,1 days,1,0.0,0.0,0.0,0.0,0.0
2,861497,3591EC2D-406E-4470-9E37-ED529276CA59,2021-02-05,3 days,6,0.0,0.0,0.0,0.0,0.0
3,861497,32662597-024D-464B-8B7A-8214A49D44F0,2021-02-09,4 days,6,0.0,0.0,0.0,0.0,0.0
4,861497,F4EA8E37-9627-47CA-BE7D-BC8ABC3F5882,2021-02-09,0 days,8,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
664271,1200362291,17a02086-c213-4057-8b7a-ce05f5e57972,2021-02-28,0 days,36,1.0,0.0,1.0,0.0,0.0
664272,1200362291,9654c21a-9511-4cc3-abda-e2b149102e86,2021-02-28,0 days,24,1.0,0.0,0.0,0.0,0.0
664273,1200362291,a4532e32-182d-4b99-bddf-4ec28a8b7f01,2021-02-28,0 days,13,0.0,0.0,0.0,0.0,0.0
664274,1200362301,31dd3ca9-89f7-40b8-841a-47ddcb78dc36,2021-02-28,0 days,47,1.0,4.0,1.0,0.0,0.0


In [298]:
mergedaa.iloc[:,179:]

Unnamed: 0,gender_cd,age,vtlt_age,cur_mbrsh_rwrd_st_cd,cur_mbrsh_pd_acqr_pt,lst_vst_dt,push_alarm_yn,count_vtlt_age_dt,diff_age,achv_rat,active_dur,points_value
0,MALE,33,31,1,4100,20210331,Y,1.0,-2,0.318182,920 days,0.0
1,MALE,33,31,1,4100,20210331,Y,1.0,-2,0.318182,921 days,0.0
2,MALE,33,31,1,4100,20210331,Y,1.0,-2,0.315789,924 days,0.0
3,MALE,33,31,1,4100,20210331,Y,1.0,-2,0.315789,928 days,0.0
4,MALE,33,31,1,4100,20210331,Y,1.0,-2,0.315789,928 days,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
664271,FEMALE,43,48,1,550,20210331,Y,1.0,5,0.000000,0 days,50.0
664272,FEMALE,43,48,1,550,20210331,Y,1.0,5,0.000000,0 days,50.0
664273,FEMALE,43,48,1,550,20210331,Y,1.0,5,0.000000,0 days,50.0
664274,MALE,30,44,1,500,20210301,Y,1.0,14,0.000000,0 days,0.0


In [302]:
mergedaa['gender_cd'].astype("category").cat.codes

0         1
1         1
2         1
3         1
4         1
         ..
664271    0
664272    0
664273    0
664274    1
664275    0
Length: 664276, dtype: int8

In [303]:
mergedaa['gender_cd']

0           MALE
1           MALE
2           MALE
3           MALE
4           MALE
           ...  
664271    FEMALE
664272    FEMALE
664273    FEMALE
664274      MALE
664275    FEMALE
Name: gender_cd, Length: 664276, dtype: object

In [304]:
mergedaa['push_alarm_yn'].astype("category").cat.codes

0         1
1         1
2         1
3         1
4         1
         ..
664271    1
664272    1
664273    1
664274    1
664275    1
Length: 664276, dtype: int8