### 데이터 살펴보기

In [1]:
# 라이브러리 불러오기
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random

In [2]:
df = pd.read_csv('events.csv')

# 데이터 살펴보기
df.head(10)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
5,1433224086234,972639,view,22556,
6,1433221923240,810725,view,443030,
7,1433223291897,794181,view,439202,
8,1433220899221,824915,view,428805,
9,1433221204592,339335,view,82389,


In [3]:
# 데이터 개수
len(df)

2756101

In [4]:
# 한 사람이 여러 방문자인지 알수 없으므로
# 모든 visitorid는 독립적인 방문자로 가정
print("Num of unique visitors:", df['visitorid'].nunique())
print("Num of unique items:", df['itemid'].nunique())

Num of unique visitors: 1407580
Num of unique items: 235061


In [5]:
# 이벤트의 종류
df['event'].unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

In [9]:
# 한번이라도 물건을 구입한 방문자의 수
df[df['transactionid'].notnull()]['visitorid'].nunique()

11719

In [10]:
# 구매 기록만 모은다
transaction_df = df[df['transactionid'].notnull()]
transaction_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
130,1433222276276,599528,transaction,356475,4000.0
304,1433193500981,121688,transaction,15335,11117.0
418,1433193915008,552148,transaction,81345,5444.0
814,1433176736375,102019,transaction,150318,13556.0
843,1433174518180,189384,transaction,310791,7244.0


In [11]:
# 이번 실습에서는 구매했다/안했다만 예측
# 중복구매기록은 지운다
deduplicated_df = transaction_df.drop_duplicates(subset=['visitorid','itemid']).copy()

In [12]:
# 구매를 일정 수준 이하로 한 방문자는 필터링한다.
min_visitor_transactions = 1
filter_visitors = (deduplicated_df['visitorid'].value_counts() > min_visitor_transactions)
filter_visitors = filter_visitors[filter_visitors].index.tolist()

In [13]:
# 구매를 일정 수준 이하로 받은 상품은 필터링한다.
min_item_transactions = 1
filter_items = (deduplicated_df['itemid'].value_counts() > min_item_transactions)
filter_items = filter_items[filter_items].index.tolist()

In [14]:
df_purchased = deduplicated_df[lambda x: (x['visitorid'].isin(filter_visitors)) & (x['itemid'].isin(filter_items))][['visitorid', 'itemid']]
print('필터링 전 구매 기록 총 수:\t{}'.format(len(deduplicated_df)))
print('필터링 후 구매 기록 총 수:\t{}'.format(len(df_purchased)))

필터링 전 구매 기록 총 수:	21270
필터링 후 구매 기록 총 수:	6968


In [15]:
# 네거티브 샘플, 즉 구매하지 않은 경우의 데이터를 생성한다
not_purchased_data = {'visitorid': [], 'itemid': []}
random.seed(0)
while len(not_purchased_data['visitorid']) < len(df_purchased):
  random_visitor = random.choice(filter_visitors)
  random_item = random.choice(filter_items)
  random_purchased = df_purchased[lambda x: (x['visitorid'] == random_visitor) & (x['itemid'] == random_item)]
  # if already in df_purchased
  if len(random_purchased) > 0:
    continue
  else:
    not_purchased_data['visitorid'].append(random_visitor)
    not_purchased_data['itemid'].append(random_item)

In [16]:
# 전처리된 데이터 개수 확인
df_not_purchased = pd.DataFrame(not_purchased_data)
len(df_not_purchased)

6968

In [17]:
# 구매한 경우 1, 구매하지 않은 경우 0으로 두는 컬럼을 만든다.
df_balanced = pd.concat([df_purchased.assign(purchased=1), df_not_purchased.assign(purchased=0)])
df_balanced

Unnamed: 0,visitorid,itemid,purchased
814,102019,150318,1
1215,350566,284871,1
1234,404403,150100,1
2315,911093,277119,1
2316,911093,251130,1
...,...,...,...
6963,98772,354832,0
6964,858411,50648,0
6965,557700,399049,0
6966,1084741,307428,0


In [32]:
# 학습 데이터와 테스트 데이터로 나눈다.
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df_balanced, random_state=0)

In [33]:
df_train.head()

Unnamed: 0,visitorid,itemid,purchased
1158284,698816,175914,1
2353280,1365340,334823,1
1936146,1143383,390093,1
106988,1093035,324833,1
6126,9145,276494,0


In [34]:
# 구매 데이터를 피봇해서 (방문자 수 X 상품 수)의 행렬로 만든다. 원소의 값은 구매했는지 여부가 된다.

df_p = pd.pivot_table(df_train, index='visitorid', columns='itemid', values='purchased')

print(df_p.shape)

(2118, 3680)


In [35]:
df_p.head()

itemid,25,496,546,829,869,1022,1152,1255,1261,1377,...,465565,465751,465833,465951,466008,466109,466114,466135,466319,466614
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
172,,,,,,,,,,,...,,,,,,,,,,
264,,,,,,,,,,,...,,,,,,,,,,
2019,,,,,,,,,,,...,,,,,,,,,,
3104,,,,,,,,,,,...,,,,,,,,,,
3258,,,,,,,,,,,...,,,,,,,,,,


In [37]:
# 2118 x 3680 행렬에 5248개 데이터 존재
df_p.sum().sum()

5248.0

### 모델 기반 협업 필터링

In [38]:
# 케라스를 불러온다
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras import activations
import numpy as np
from sklearn.metrics import mean_squared_error

In [45]:
# visitorid와 itemid는 연속적이지 않다
# 인덱스로 쓰기 위해 매핑을 만든다
visitor_id_mapping = {visitorid:i for i, visitorid in enumerate(df_balanced['visitorid'].unique())}
item_id_mapping = {itemid:i for i, itemid in enumerate(df_balanced['itemid'].unique())}

i = 0
for visitorid, index in visitor_id_mapping.items():
  print(visitorid, ':', index)
  i += 1
  if i >= 10:
    break

102019 : 0
350566 : 1
404403 : 2
911093 : 3
273406 : 4
1233140 : 5
1161163 : 6
189384 : 7
286616 : 8
1235292 : 9


In [46]:
# 학습 데이터와 테스트 데이터를 인덱스로 매핑한다
train_visitor_data = df_train['visitorid'].map(visitor_id_mapping)
train_item_data = df_train['itemid'].map(item_id_mapping)

test_visitor_data = df_test['visitorid'].map(visitor_id_mapping)
test_item_data = df_test['itemid'].map(item_id_mapping)

In [47]:
train_visitor_data

1158284     817
2353280    1538
1936146     138
106988       17
6126        936
           ... 
6155        761
1251729     877
2877        940
3831       1309
1043656     742
Name: visitorid, Length: 10452, dtype: int64

In [48]:
# 사이즈를 구한다
num_visitors = len(visitor_id_mapping)
num_items = len(item_id_mapping)
embedding_size = 10

print('num_visitors', num_visitors)
print('num_items', num_items)

num_visitors 2156
num_items 3862


In [49]:
# 모델 만들기

# 입력 레이어를 만든다
visitor_id_input = Input(shape=[1], name='visitor')
item_id_input = Input(shape=[1], name='item')

In [50]:
# 임베딩 레이어를 만든다
visitor_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=num_visitors,
                           input_length=1, 
                           name='user_embedding')(visitor_id_input)
item_embedding = Embedding(output_dim=embedding_size, 
                            input_dim=num_items,
                            input_length=1, 
                            name='item_embedding')(item_id_input)

In [51]:
# 임베딩 레이어의 차원을 바꾼다.
user_vector = Reshape([embedding_size])(visitor_embedding)
item_vector = Reshape([embedding_size])(item_embedding)

In [52]:
# 차원을 조정한 임베딩 레이어 사이를 내적한다.
y = Dot(1, normalize=False)([user_vector, item_vector])

In [53]:
# 모델을 구성한다
model = Model(inputs=[visitor_id_input, item_id_input], outputs=y)
opt = optimizers.Adam(learning_rate=0.01)
model.compile(loss='mse', optimizer=opt)

In [54]:
# 모델을 학습시킨다
model.fit([train_visitor_data, train_item_data],
          df_train['purchased'],
          batch_size=1024,
          epochs=10,
          validation_split=0.01,
          shuffle=True)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c1e72a0ac0>

In [55]:
# 모델 테스트하기
y_pred = model.predict([test_visitor_data, test_item_data])
y_true = df_test['purchased'].values


In [56]:
# RMSE 계산하기
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('{:.4f} RMSE'.format(rmse))

0.7129 RMSE


### 행렬 분해 기법 확장하기

- 모델 구조를 데이터셋의 특징에 맞추면 정확도를 더 올릴수 있다

In [57]:
y

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dot')>

In [58]:
binary_y = activations.sigmoid(y)

In [59]:
binary_y

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'tf.math.sigmoid')>

In [60]:
# 모델을 구성한다
model = Model(inputs=[visitor_id_input, item_id_input], outputs=binary_y)
opt = optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt)

In [61]:
# 모델을 학습시킨다
model.fit([train_visitor_data, train_item_data],
          df_train['purchased'],
          batch_size=1024, 
          epochs=10,
          validation_split=0.01,
          shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c1e764a760>

In [62]:
# 모델 테스트하기
y_pred = model.predict([test_visitor_data, test_item_data])
y_true = df_test['purchased'].values

In [63]:
# RMSE 계산하기
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('{:.4f} RMSE'.format(rmse))

0.5253 RMSE
