# 0. 환경설정

In [2]:
# 라이브러리 설정
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# import math
# from torch import nn, optim
# import torch
# import torch.nn.functional as F
# from torch.autograd import Variable

from tqdm import tqdm

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# pandas deprecation and chained-assignment warnings raised by the cells below.
warnings.filterwarnings('ignore')

# 1. 데이터 업로드

## 1.1) 3개 데이터(유저 프로파일, 이미지, 검색 기록) 업로드

In [3]:
# Load the three raw CSV exports.
# 1. User profile data (all users; the original notes say 50,172 users).
n_user_info_member = pd.read_csv(
    './data/n_user_info_member.csv'
)  # optional, currently disabled: dtype = {'birth_year' : str}

# 2. Product view log (called "image search history" in the original notes; 262,372 records).
#    product_id is read as a string.
n_prod_view_log = pd.read_csv(
    './data/n_prod_view_log.csv',
    dtype = {'product_id' : str},
)

# 3. Search log (160,552 records before filtering). product_id is read as a string.
n_search_log = pd.read_csv(
    './data/n_search_log.csv',
    dtype = {'product_id' : str},
)
# Discard search rows whose product_id is the blank (spaces-only) placeholder.
n_search_log = n_search_log[n_search_log['product_id'].ne('          ')]

In [11]:
# Show the (rows, columns) shape of each loaded table on a single line.
print(*(frame.shape for frame in (n_user_info_member, n_prod_view_log, n_search_log)))

(50172, 20) (262372, 4) (65285, 4)


# 2. 데이터 전처리

## 2.1) 사용자 정보 데이터 전처리

In [85]:
def preprocessing(data, reference_year = 2021):
    """Clean the raw user-profile table and return a new DataFrame.

    The input frame is never modified. Steps, in order:

    1. keep only rows that have a ``member_id``;
    2. drop the ``fcm_token`` and ``beauty_level`` columns;
    3. replace the blank (spaces-only) placeholder in ``skin_type``,
       ``skin_tone`` and ``personal_color`` with the string ``'0'``;
    4. strip ``gender`` and encode it as ``'0'`` (M) / ``'1'`` (F);
    5. keep rows with 1950 < birth_year < 2016 and overwrite ``birth_year``
       with ``(reference_year - birth_year) + 1``;
    6. cast every column whose name contains ``"worry"`` to ``int8``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw user table containing the columns named above.
    reference_year : int, default 2021
        Year used to turn the birth year into an age-like value.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If a stripped ``gender`` value is neither ``'M'`` nor ``'F'``.
    """
    # 1. Keep only rows identified by member_id (copy so `data` stays untouched).
    df = data[data['member_id'].notna()].copy()

    # 2. Drop columns that are not used downstream.
    df = df.drop(columns = ['fcm_token', 'beauty_level'])

    # 3. Blank placeholder -> '0'. The result is assigned back to the column:
    #    `df[col].replace(..., inplace = True)` acts on a temporary Series and
    #    does not update `df` when pandas Copy-on-Write is active.
    blank_placeholders = {
        'skin_type' : '         ',
        'skin_tone' : '         ',
        'personal_color' : '              ',
    }
    for column, blank in blank_placeholders.items():
        df[column] = df[column].replace(blank, '0')

    # 4. gender: strip padding, then encode M -> '0', F -> '1'.
    #    The dictionary lookup is kept strict so unexpected codes raise KeyError.
    gender_codes = {'M' : '0', 'F' : '1'}
    df['gender'] = df['gender'].str.strip().map(lambda code : gender_codes[code])

    # 5. birth_year: keep the accepted range, then convert to an age-like value.
    df['birth_year'] = df['birth_year'].astype('int')
    in_range = (df['birth_year'] > 1950) & (df['birth_year'] < 2016)
    df = df.loc[in_range].copy()   # explicit copy: the assignment below targets a real frame
    df['birth_year'] = (reference_year - df['birth_year']) + 1

    # 6. Cast all worry_* flags to int8 in a single astype call.
    worry_cols = [name for name in df.columns if "worry" in name]
    if worry_cols:
        df = df.astype({name : 'int8' for name in worry_cols})

    return df

In [86]:
# Clean the raw user table, then show the resulting shape and the first rows.
user_info_data = preprocessing(n_user_info_member)
print(user_info_data.shape)
user_info_data.head() # 7,004 rows

(7004, 18)


Unnamed: 0,member_id,skin_type,gender,birth_year,skin_tone,personal_color,worry_1,worry_2,worry_3,worry_4,worry_5,worry_6,worry_7,worry_8,worry_9,worry_10,worry_11,worry_12
0,MlRwTkpBOHdmd1pHeVFGZDBnSkRvSm5fQ1FaekNGQ2NRVD...,2,1,12,2,1,1,0,0,0,0,0,0,1,1,0,0,1
1,WUZwWnFPcFA4TXJ2aHlxdU43aEYxVDltVEtVeVUwWTZzek...,4,1,42,4,1,0,0,1,1,0,0,1,1,0,0,0,1
2,X0RUZHM0bHVRdmdTU1N6WFdvdkdQbnFzS3o3OHFLeEQzMk...,4,1,15,4,1,1,0,0,0,1,0,0,1,0,1,0,1
3,M3NuTWRYWVREeU5zWE5xWVpYOHJkUGpkRUVDa2dpS3A1aE...,3,1,23,2,3,1,0,1,0,0,0,0,0,0,0,0,0
4,Qjlncm5RaUNEb0ZzZmxFMW9CTWVlb0xaazRCRVZqMU4tdE...,2,0,20,4,2,0,0,1,1,0,0,0,0,1,1,0,1


## 2.2) 검색 기록 데이터 전처리

In [87]:
def make_user_product_df(user, prod_view_log, search_log):
    """Combine the view and search logs into a per-(member, product) click table.

    The two logs are stacked, every row is annotated with how many times its
    (member_id, product_id) pair occurs in the stacked log, fully identical
    rows are collapsed, and the count is capped at 5.

    Parameters
    ----------
    user : pandas.DataFrame
        User table. Accepted for interface compatibility; it is not used in
        the computation (the logs are not filtered by it).
    prod_view_log, search_log : pandas.DataFrame
        Log tables that both contain ``member_id`` and ``product_id`` columns.
        Neither input is modified.

    Returns
    -------
    pandas.DataFrame
        The stacked log columns plus a ``Clicks`` column (1-5). Rows whose
        member_id or product_id is missing are not counted by the group-by,
        so their ``Clicks`` stays NaN instead of being reported as 5.
    """
    # 1. Stack both logs (concat builds a new frame, so the inputs are safe).
    item_view = pd.concat([prod_view_log, search_log], axis = 0).sort_index()

    # 2. Occurrences of every (member_id, product_id) pair.
    click_counts = (
        item_view
        .groupby(['member_id', 'product_id'])
        .size()
        .reset_index(name = 'Clicks')
    )

    # 3. Attach the count to each log row.
    item_view = pd.merge(item_view, click_counts, on = ['member_id', 'product_id'], how = 'left')

    # 4. Cap the count at 5. clip() leaves NaN untouched, whereas the comparison
    #    `i <= 5` is False for NaN and used to turn missing counts into 5.
    item_view['Clicks'] = item_view['Clicks'].clip(upper = 5)

    # 5. Collapse identical rows; the count depends only on the pair, so capping
    #    before de-duplicating yields the same rows as capping afterwards.
    return item_view.drop_duplicates()

In [88]:
# Build the member/product click table, then show its shape, the distribution
# of the capped click counts, and the first rows.
# NOTE(review): `prod_view_log` and `search_log` are not assigned in the cells
# shown here (the load cell creates `n_prod_view_log` / `n_search_log`) —
# confirm which frames are meant to be passed.
model_df = make_user_product_df(user_info_data, prod_view_log, search_log)
print(model_df.shape)
print(model_df['Clicks'].value_counts())
model_df.head()

(26015, 3)
1    14871
2     8683
4      969
3      886
5      606
Name: Clicks, dtype: int64


Unnamed: 0,member_id,product_id,Clicks
0,NTRqT3J1RHdkLS1oVlJzMHRtV1lvNUt0aWFsN3U1VVg2YU...,4215,2
1,dk5aZmJ0SElaTGVxUzhuRnlkZjhFeElaUTJoNy1ZU1h5VU...,3968,2
2,aGF0cGp5VlozdDdZRnRWbXNjaHpIMWJYQTNUNGNTelNwNm...,3615,1
4,Z3ZVZ2x2YkJVWjlEeGlCSTJVOFN3M1cwWFhmbTVLVmpvN2...,3755,5
5,UE1Vbzh1d3hhNXNhTWRMMTc2LTNaUXltREtMUkQ5R0ZXQz...,6544,2


## 2.3) 유저 프로파일 생성

In [89]:
# Keep only the profiles of members whose member_id appears in the click table.
user_df = user_info_data[user_info_data['member_id'].isin(model_df['member_id'])]
user_df.head()

Unnamed: 0,member_id,skin_type,gender,birth_year,skin_tone,personal_color,worry_1,worry_2,worry_3,worry_4,worry_5,worry_6,worry_7,worry_8,worry_9,worry_10,worry_11,worry_12
0,MlRwTkpBOHdmd1pHeVFGZDBnSkRvSm5fQ1FaekNGQ2NRVD...,2,1,12,2,1,1,0,0,0,0,0,0,1,1,0,0,1
1,WUZwWnFPcFA4TXJ2aHlxdU43aEYxVDltVEtVeVUwWTZzek...,4,1,42,4,1,0,0,1,1,0,0,1,1,0,0,0,1
2,X0RUZHM0bHVRdmdTU1N6WFdvdkdQbnFzS3o3OHFLeEQzMk...,4,1,15,4,1,1,0,0,0,1,0,0,1,0,1,0,1
6,RkNKdkxGeHZnXzRvQ25TUDQ4a25zZE90aTZ6Ml9hZ2FHLT...,2,1,38,1,2,1,1,0,1,1,0,0,0,0,0,0,1
7,LU1JTm5rSGZoOTBKM0Q4Zy1DSVRveGI4NFJfY3JoejRaWV...,3,1,27,1,4,0,1,0,1,0,0,0,0,1,0,0,1


## 2.4) 데이터 타입 변경 후 매트릭스 생성

In [101]:
# Drop incomplete rows, renumber the index, then cast Clicks to float32
# (to reduce memory) and product_id to int64.
user_item_click = (
    model_df
    .dropna()
    .reset_index(drop = True)
    .astype({'Clicks' : 'float32', 'product_id' : 'int64'})
)

# Pivot into a member x product table of click values; pairs with no record become 0.
user_item_matrix = user_item_click.pivot(
    index='member_id', columns='product_id', values='Clicks'
).fillna(0)

user_item_matrix.head()

product_id,1,2,3,5,7,8,9,10,11,12,...,41701,41702,41705,41707,41708,41735,41740,41741,41754,41762
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LS1FMWJ3cG5qaVRWdzBQQWlnZkl3SGowdUpMWXFKTmwyZlhlMnNJVzZtZw==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LTBDR2FkbUFPLTFuY2lmYkNXQVJ5U1liU1liVllETTlfTFp2bW44N2Vadw==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LTFlVGU5VXE5Qnl5WkM4amgyYmp4N3FVaFpwYTcyVFJZc3FBVENENlJ6SQ==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LTFnQmVwbllnZURacjdYcjZDWGx0TFBBdnc2QWVmWmtWX2pOUFNCNmZrUQ==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LTFvTnVVdUlMVHY5bTd0WXNQd2Y3LU1ra3dFSVBrb0ZjZTZrdWg0YUJDSQ==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2.5) 데이터 저장 (matrix, user_profile)

In [97]:
# Number of distinct member and product ids in the click table
# (a missing value counts as a distinct value, exactly as with len(unique())).
print("Member_id 갯수 : ", model_df['member_id'].nunique(dropna = False))
print("Product_id 갯수 : ", model_df['product_id'].nunique(dropna = False))

Member_id 갯수 :  3956
Product_id 갯수 :  9450


In [98]:
# 1. User profile data (profiles of members present in the click table)
user_df.to_csv('./data/user_df.csv', index = False)

# 2. User-item click data (long format)
model_df.to_csv('./data/model_df.csv', index = False)

# 3. User-item matrix data (export currently disabled)
# user_item_matrix.to_csv('.csv', index = False)

# 3. 탐색적 데이터 분석

In [None]:
# import pandas_profiling
# data.profile_report()

# 스킨 타입 유형

# 성별 유형

# 출생연도

# 스킨 톤

# 개인 컬러?

# [유저] 출생연도 분포
# sns.distplot(user_info_data['birth_year'])

# 4. 데이터 모델링

In [None]:
# member_id 인코딩 (보기 좋게 하기 위해서)

# 5. 평가

# 6. Top K