# FM모델을 활용한 Recommend System


### *RecSys 2015 Challenge Dataset 데이터: 대형 E-Commerce 상에서 사용자들이 6개월 간 '클릭' 데이터 (클릭이 구매용 클릭일 수도, 그저 의미없는 클릭일 수 있음)


### *목적: 사용자가 물건을 구매할 것인지의 여부와, 구매한다면  어떤 품목을, 얼마나 구매할 것인지 예측하여 추천하기


In [3]:
import tensorflow as tf
from tffm import TFFMRegressor
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 데이터 가져오기

In [4]:
buys = open('yoochoose-buys.dat', 'r')
clicks = open('yoochoose-clicks.dat', 'r')

FileNotFoundError: [Errno 2] No such file or directory: 'yoochoose-buys.dat'

In [40]:
initial_buys_df = pd.read_csv(buys, names=['Session ID', 'Timestamp', 'Item ID', 'Category', 'Quantity'],
                              dtype={'Session ID': 'float32', 'Timestamp': 'str', 'Item ID': 'float32', 'Category': 'str'})

initial_buys_df.set_index('Session ID', inplace=True)

initial_clicks_df = pd.read_csv(clicks, names=['Session ID', 'Timestamp', 'Item ID', 'Category'],
                                dtype={'Category': 'str'})

initial_clicks_df.set_index('Session ID', inplace=True)

In [41]:
initial_buys_df.tail()

Unnamed: 0_level_0,Timestamp,Item ID,Category,Quantity
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11368701.0,2014-09-26T07:52:51.357Z,214849808.0,554,2
11368691.0,2014-09-25T09:37:44.206Z,214700000.0,6806,5
11523941.0,2014-09-25T06:14:47.965Z,214578016.0,14556,1
11423202.0,2014-09-26T18:49:34.024Z,214849168.0,1046,1
11423202.0,2014-09-26T18:49:34.026Z,214560496.0,5549,1


In [42]:
initial_clicks_df.head()

Unnamed: 0_level_0,Timestamp,Item ID,Category
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2014-04-07T10:51:09.277Z,214536502,0
1,2014-04-07T10:54:09.868Z,214536500,0
1,2014-04-07T10:54:46.998Z,214536506,0
1,2014-04-07T10:57:00.306Z,214577561,0
2,2014-04-07T13:56:37.614Z,214662742,0


In [43]:
len(initial_clicks_df), len(initial_buys_df)

(33003944, 1150753)

# 데이터 전처리
### 1) 필요없는 열 제거
### 2) 일부 데이터만 추출 (용량, 속도관계로)
### 3) 필요한 열 추가
### 4) One-hot encoding (벡터형태로 바꾸기 위해)
### 5) 데이터 통합

In [44]:
# 여기선 Timestamp를 사용하지 않을 것이므로 column 삭제
# 즉, 여기선 사용자 ID와 구매/클릭 이력만 사용할 것

initial_buys_df = initial_buys_df.drop('Timestamp', 1)
initial_clicks_df = initial_clicks_df.drop('Timestamp', 1)


In [45]:
# 데이터가 굉장히 큼! 여기선 간단하게 보여주기 위해 구매/클릭 수 상위 10,000명의 데이터만 가져옴

x = Counter(initial_buys_df.index).most_common(10000) # most_common(n): 상위 n개 데이터만 가져옴
top_k = dict(x).keys()
initial_buys_df = initial_buys_df[initial_buys_df.index.isin(top_k)]
initial_clicks_df = initial_clicks_df[initial_clicks_df.index.isin(top_k)]

In [46]:
# index를 나타내는 열 추가. index(Session, 즉, 클릭)도 벡터에 포함시키기 위해

initial_buys_df['_Session ID'] = initial_buys_df.index

In [47]:
initial_buys_df.head()

Unnamed: 0_level_0,Item ID,Category,Quantity,_Session ID
Session ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
420471.0,214717888.0,2092,1,420471.0
420471.0,214821024.0,1570,1,420471.0
420471.0,214829280.0,837,1,420471.0
420471.0,214819552.0,418,1,420471.0
420471.0,214746384.0,784,1,420471.0


In [48]:
# One-hot encoding(벡터화)

transformed_buys = pd.get_dummies(initial_buys_df)
transformed_clicks = pd.get_dummies(initial_clicks_df)

In [49]:
# 아이템과 카테고리에 대한 과거 데이터를 추리기
# Aggregate historical data for Items and Categories

filtered_buys = transformed_buys.filter(regex="Item.*|Category.*")
filtered_clicks = transformed_clicks.filter(regex="Item.*|Category.*")

historical_buy_data = filtered_buys.groupby(filtered_buys.index).sum()
historical_buy_data = historical_buy_data.rename(columns=lambda column_name: 'buy history:' + column_name)

historical_click_data = filtered_clicks.groupby(filtered_clicks.index).sum()
historical_click_data = historical_click_data.rename(columns=lambda column_name: 'click history:' + column_name)


In [63]:
# 각 사용자id를 기준으로 과거 데이터와 원본 데이터 병합
# Merge historical data of every user_id

merged1 = pd.merge(transformed_buys, historical_buy_data, left_index=True, right_index=True)
merged2 = pd.merge(merged1, historical_click_data, left_index=True, right_index=True)

# TFFM라이브러리를 사용하여 학습모델 구성

In [64]:
model = TFFMRegressor(
    order=2, 
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1), #다른 알고리즘을 써도 됌 
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense'
)


merged2.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID'], 1, inplace=True)
X = np.array(merged2)
X = np.nan_to_num(X)
y = np.array(merged2['Quantity'].as_matrix())


  from ipykernel import kernelapp as app


# 학습 데이터, 테스트 데이터 나누기

In [65]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)

In [66]:
# What happens if we only have access to categories and no historical click/purchase data?
# Let's delete historical click and purchasing data for the cold_start test set

for column in X_te_cs.columns:
    if ('buy' in column or 'click' in column) and ('Category' not in column):
        X_te_cs[column] = 0

In [67]:
# Compute the mean squared error for both test sets

model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
cold_start_predictions = model.predict(X_te_cs)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [01:31<00:00,  1.15epoch/s]


MSE: 0.7006379736490871
