In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed event log; index_col=0 uses the first CSV column as the row index.
# NOTE(review): the filename is spelled 'processd' -- confirm it matches the file on disk.
df = pd.read_csv('processd_data_0613.csv', index_col=0)

In [3]:
# Show the column labels of the loaded frame to check which fields are available.
df.columns

Index(['event_time', 'event_type', 'product_id', 'category_id',
       'category_code', 'brand', 'price', 'user_id', 'user_session', 'year',
       'month', 'day', 'hour', 'main_category', 'sub_category',
       'sub_sub_category'],
      dtype='object')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,year,month,day,hour,main_category,sub_category,sub_sub_category
0,2020-09-24 11:57:06+00:00,view,1996170,2144415922528452715,electronics.telephone,others,31.9,1515915625519388267,LJuJVLEjPT,2020,9,24,11,electronics,telephone,others
1,2020-09-24 11:57:26+00:00,view,139905,2144415926932472027,computers.components.cooler,zalman,17.16,1515915625519380411,tdicluNnRY,2020,9,24,11,computers,components,cooler
2,2020-09-24 11:57:27+00:00,view,215454,2144415927158964449,others,others,9.81,1515915625513238515,4TMArHtXQy,2020,9,24,11,others,others,others
3,2020-09-24 11:57:33+00:00,view,635807,2144415923107266682,computers.peripherals.printer,pantum,113.81,1515915625519014356,aGFYrNgC08,2020,9,24,11,computers,peripherals,printer
4,2020-09-24 11:57:36+00:00,view,3658723,2144415921169498184,others,cameronsino,15.87,1515915625510743344,aa4mmk0kwQ,2020,9,24,11,others,others,others


In [5]:
# Drop the columns that the per-user features below do not use.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# a KeyError.
df = df.drop(
    columns=[
        'product_id', 'category_id', 'category_code',
        'year', 'month', 'day', 'hour',
        'sub_category', 'sub_sub_category',
    ],
    errors='ignore',
)

In [6]:
# Parse the timestamp column once, then derive the calendar date from the
# parsed values (`.dt.date` yields plain Python date objects).
parsed_event_time = pd.to_datetime(df['event_time'])
df['event_time'] = parsed_event_time
df['event_date'] = parsed_event_time.dt.date

In [7]:
# Most recent calendar date present in the dataset; used below as the
# reference date for the "days since ..." features.
latest_date = df['event_date'].max()

In [8]:
# Group all events by user; the per-user aggregations in the following cells reuse this.
gr=df.groupby('user_id')

In [9]:
# Per-user recency/frequency features, one row per user_id.
# Built-in aggregation names ('max', 'nunique', 'min') replace the equivalent
# lambdas so pandas can use its own groupby implementations instead of calling
# a Python function once per user.
User_RF = gr.agg(
    last_visit=('event_date', 'max'),  # most recent visit date
    total_session=('user_session', 'nunique'),  # number of distinct sessions per user
    frequency_days=('event_date', 'nunique'),  # number of distinct days with activity
    # Cumulative number of events of each type per user.
    # NOTE: despite the "_amount" suffix these three columns are event counts.
    view_amount=('event_type', lambda x: (x == 'view').sum()),
    cart_amount=('event_type', lambda x: (x == 'cart').sum()),
    purchase_amount=('event_type', lambda x: (x == 'purchase').sum()),
    first_visit_date=('event_date', 'min'),  # first visit date
)

In [19]:
# Whole days between each user's first / last visit and the latest date in the
# dataset. The first/last dates come from a vectorised groupby min/max rather
# than DataFrameGroupBy.apply with a Python lambda per user, which is slow and
# is deprecated when it operates on the grouping columns.
# Dates are converted to datetime64 so the subtraction yields timedelta64 values
# that support `.dt.days`; `.rename(None)` leaves the Series unnamed.
latest_timestamp = pd.Timestamp(latest_date)
first_visit_per_user = pd.to_datetime(gr['event_date'].min())
last_visit_per_user = pd.to_datetime(gr['event_date'].max())

# Days since the first visit (how long the user has been around).
since_first_visit_date = (latest_timestamp - first_visit_per_user).dt.days.rename(None)
# Days since the last visit (how long the user has been inactive).
since_last_visit_date = (latest_timestamp - last_visit_per_user).dt.days.rename(None)

In [22]:
# Put the two per-user day counts side by side, naming the columns through
# `keys` as part of the concatenation.
User_visit = pd.concat(
    [since_first_visit_date, since_last_visit_date],
    axis=1,
    keys=['since_first_visit(days)', 'since_last_visit(days)'],
)

In [None]:
# Append the "days since first/last visit" columns to the per-user features.
# Any copy of these columns left by a previous run is dropped first, so
# re-running the cell does not add duplicate columns to User_RF.
User_RF = pd.concat(
    [User_RF.drop(columns=User_visit.columns, errors='ignore'), User_visit],
    axis=1,
)

In [11]:
# Keep only purchase events and group them by user.
purchase_df = df.loc[df['event_type'] == 'purchase']
purchase_gr = purchase_df.groupby('user_id')

# Per-user monetary features computed from the price of purchase events.
# NOTE(review): `avg_min_amount` / `avg_max_amount` are the plain minimum and
# maximum purchase price, not averages.
User_M = purchase_gr['price'].agg(
    purchase_amount='sum',        # cumulative amount spent per user
    avg_purchase_amount='mean',   # mean price per purchase event
    avg_min_amount='min',         # smallest purchase price
    avg_max_amount='max',         # largest purchase price
)

In [12]:
# Display the per-user monetary features.
User_M

Unnamed: 0_level_0,purchase_amount,avg_purchase_amount,avg_min_amount,avg_max_amount
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1515915625353286099,119.03,119.030000,119.03,119.03
1515915625353457259,55.16,55.160000,55.16,55.16
1515915625353534622,57.15,19.050000,19.05,19.05
1515915625353561691,345.72,172.860000,172.86,172.86
1515915625353900095,57.85,28.925000,18.10,39.75
...,...,...,...,...
1515915625610995356,29.40,29.400000,29.40,29.40
1515915625610997879,881.65,293.883333,292.83,294.41
1515915625610999486,1444.62,481.540000,481.54,481.54
1515915625611008742,38.83,38.830000,38.83,38.83


- User_preference

In [24]:
def get_mode(series):
    """Return the most frequent value of `series`, or None if it has no mode.

    `Series.mode()` is evaluated once and reused for both the emptiness check
    and the lookup. When several values are returned by `mode()`, the first
    one is used.

    Parameters
    ----------
    series : pandas.Series
        Values to summarise.

    Returns
    -------
    object or None
        The first mode, or None when `mode()` returns an empty Series.
    """
    modes = series.mode()
    return None if modes.empty else modes.iloc[0]

def brand_mode_by_event(event_type):
    """Return each user's most frequent brand among events of `event_type`.

    Reads the module-level `df` and summarises each user's brands with
    `get_mode`. The result has a `user_id` column and a
    `<event_type>_brand_prefer` column.
    """
    matches_event = df['event_type'] == event_type
    brand_mode_per_user = df[matches_event].groupby('user_id')['brand'].apply(get_mode)
    preferred_column = f'{event_type}_brand_prefer'
    return brand_mode_per_user.reset_index().rename(columns={'brand': preferred_column})

def cat_mode_by_event(event_type):
    """Return each user's most frequent main category among events of `event_type`.

    Reads the module-level `df` and summarises each user's `main_category`
    values with `get_mode`. The result has a `user_id` column and a
    `<event_type>_cat_prefer` column.
    """
    matches_event = df['event_type'] == event_type
    category_mode_per_user = (
        df[matches_event].groupby('user_id')['main_category'].apply(get_mode)
    )
    preferred_column = f'{event_type}_cat_prefer'
    return category_mode_per_user.reset_index().rename(
        columns={'main_category': preferred_column}
    )

In [25]:
# Build one table holding, per user, the preferred brand for each event type.
# The first event type seeds the table; each later one is outer-merged on
# user_id, so a user missing from one event type gets NaN in that column.
brand_preferences = None

for event_type in ('view', 'cart', 'purchase'):
    brand_prefer_df = brand_mode_by_event(event_type)
    brand_preferences = (
        brand_prefer_df
        if brand_preferences is None
        else pd.merge(brand_preferences, brand_prefer_df, on='user_id', how='outer')
    )

brand_preferences.head()

Unnamed: 0,user_id,view_brand_prefer,cart_brand_prefer,purchase_brand_prefer
0,1515915625353226922,honor,,
1,1515915625353230067,kester,,
2,1515915625353230683,creative,,
3,1515915625353230922,msi,,
4,1515915625353234047,samsung,,


In [26]:
# Build one table holding, per user, the preferred main category for each
# event type. The first event type seeds the table; each later one is
# outer-merged on user_id, so a user missing from one event type gets NaN there.
category_preferences = None

for event_type in ('view', 'cart', 'purchase'):
    cat_prefer_df = cat_mode_by_event(event_type)
    category_preferences = (
        cat_prefer_df
        if category_preferences is None
        else pd.merge(category_preferences, cat_prefer_df, on='user_id', how='outer')
    )

category_preferences.head()

Unnamed: 0,user_id,view_cat_prefer,cart_cat_prefer,purchase_cat_prefer
0,1515915625353226922,electronics,,
1,1515915625353230067,others,,
2,1515915625353230683,electronics,,
3,1515915625353230922,computers,,
4,1515915625353234047,electronics,,


- Duration

In [16]:
# Make sure the timestamps are datetimes, then group events by (user, session).
df['event_time'] = pd.to_datetime(df['event_time'])
session_gr = df.groupby(['user_id', 'user_session'])

# First and last event time of every session, plus the elapsed seconds between
# them. A session whose first and last event coincide gets a duration of 0.
session_df = session_gr['event_time'].agg(['min', 'max'])
session_df = session_df.assign(
    session_duration=lambda bounds: (bounds['max'] - bounds['min']).dt.total_seconds()
)

In [28]:
# Per-user dwell time in seconds, aggregated over that user's sessions:
#   avg_session_time   - mean session duration
#   total_session_time - sum of all session durations
# The built-in 'mean' / 'sum' aggregations replace the equivalent lambdas so
# pandas uses its own groupby implementations instead of calling Python per user.
duration = session_df.groupby('user_id').agg(
    avg_session_time=('session_duration', 'mean'),
    total_session_time=('session_duration', 'sum'),
)
duration

Unnamed: 0_level_0,avg_session_time,total_session_time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1515915625353226922,0.0,0.0
1515915625353230067,0.0,0.0
1515915625353230683,624.5,2498.0
1515915625353230922,0.0,0.0
1515915625353234047,12244190.0,12244190.0
...,...,...
1515915625611023671,0.0,0.0
1515915625611023730,0.0,0.0
1515915625611024014,0.0,0.0
1515915625611024020,0.0,0.0


In [14]:
from functools import reduce


def _with_user_id_column(frame):
    """Return `frame` with `user_id` as a regular column rather than an index level."""
    return frame if 'user_id' in frame.columns else frame.reset_index()


# Merge every per-user feature table into one frame keyed by user_id.
#
# * User_RF and User_M both define `purchase_amount` (number of purchase events
#   vs. total money spent). Merged as-is, pandas silently emits
#   `purchase_amount_x` / `purchase_amount_y`. The event count is renamed to
#   `purchase_count` so `purchase_amount` unambiguously means money spent.
# * Some tables carry user_id in the index and others as a column. All are
#   normalised to a column so the key is explicit and survives `index=False`.
# * validate='one_to_one' fails loudly if any table has more than one row per user.
dfs_to_merge = [
    _with_user_id_column(frame)
    for frame in (
        User_RF.rename(columns={'purchase_amount': 'purchase_count'}),
        User_M,
        brand_preferences,
        category_preferences,
        duration,
    )
]
preferences = reduce(
    lambda left, right: pd.merge(
        left, right, on='user_id', how='outer', validate='one_to_one'
    ),
    dfs_to_merge,
)

# Save the merged frame as CSV; user_id is a regular column, so the index is not written.
preferences.to_csv('merged_all_sunhye_4.csv', index=False)