# Recommendation using Factorization Machine 

## Load Libraries

In [None]:
%reload_ext sql
%run lib.py

import random

import numpy as np
import pandas as pd
import xlearn as xl

%sql postgresql+psycopg2://postgres:@127.0.0.1:5432/fcrec

# Create Feature Data

### Join click event with metadata

In [None]:
%%sql

-- Rebuild the click table from scratch on every run.
DROP TABLE IF EXISTS cmc_click_event;

-- One row per click_item event inside the date window, enriched with the
-- profile of the clicking user and the attributes of the clicked product.
-- Both joins are inner joins, so events without a matching product or user
-- row are dropped.
CREATE TABLE cmc_click_event AS
SELECT
	ev.session_id,
	ev.event_timestamp,
	usr.user_no,
	usr.birth_date,
	usr.gender,
	prod.item_no,
	prod.price,
	prod.category1_code,
	prod.category2_code,
	prod.category3_code,
	prod.brand_no
FROM cmc_event AS ev
	INNER JOIN cmc_product AS prod ON prod.item_no = ev.item_no
	INNER JOIN cmc_user AS usr ON usr.user_no = ev.user_no
WHERE event_name = 'click_item'
	AND event_timestamp BETWEEN '2021-07-18' AND '2021-07-25';

 * postgresql+psycopg2://postgres:***@127.0.0.1:5432/fcrec
Done.
546756 rows affected.


[]

### Create feature index table

In [None]:
%%sql
-- Feature dictionary. Every distinct feature value seen in cmc_click_event
-- gets one global integer index (idx, starting at 0).
-- The type column identifies the feature family
--   1 user_no, 2 first four characters of birth_date, 3 gender, 4 item_no,
--   5 category1_code, 6 category2_code, 7 category3_code, 8 brand_no
drop table if exists cmc_feat_idx;

-- The window is ordered explicitly so that idx is reproducible across re-runs.
-- With an empty OVER () the numbering follows whatever order the union
-- happens to emit rows in, and the outer ORDER BY does not change that.
create table cmc_feat_idx as
select type, feat, (row_number() over (order by type, feat) - 1) as idx
from (
	select distinct 1 as type, user_no as feat from cmc_click_event where user_no is not null
	union all
	select distinct 2 as type, left(birth_date, 4) as feat from cmc_click_event where birth_date is not null
	union all
	select distinct 3 as type, gender as feat from cmc_click_event where gender is not null
	union all
	select distinct 4 as type, item_no as feat from cmc_click_event where item_no is not null
	union all
	select distinct 5 as type, category1_code as feat from cmc_click_event where category1_code is not null
	union all
	select distinct 6 as type, category2_code as feat from cmc_click_event where category2_code is not null
	union all
	select distinct 7 as type, category3_code as feat from cmc_click_event where category3_code is not null
	union all
	select distinct 8 as type, brand_no as feat from cmc_click_event where brand_no is not null
) t
order by type, feat;

 * postgresql+psycopg2://postgres:***@127.0.0.1:5432/fcrec
Done.
170508 rows affected.


[]

### Create click event feature table

In [None]:
%%sql
-- Rebuild the encoded click table from scratch on every run.
DROP TABLE IF EXISTS cmc_click_event_with_index;

-- Encode every click event as a positive row (label 1) whose columns are the
-- cmc_feat_idx indices of its features. Left joins keep the event even when
-- a feature value has no index, in which case that column is NULL.
CREATE TABLE cmc_click_event_with_index AS
SELECT
	1 AS label,
	f_user.idx AS user_no_idx,
	f_birth.idx AS birth_date_idx,
	f_gender.idx AS gender_idx,
	f_item.idx AS item_no_idx,
	f_cat1.idx AS category1_code_idx,
	f_cat2.idx AS category2_code_idx,
	f_cat3.idx AS category3_code_idx,
	f_brand.idx AS brand_no_idx
FROM cmc_click_event AS ce
	LEFT JOIN cmc_feat_idx AS f_user ON f_user.type = 1 AND ce.user_no = f_user.feat
	LEFT JOIN cmc_feat_idx AS f_birth ON f_birth.type = 2 AND left(ce.birth_date, 4) = f_birth.feat
	LEFT JOIN cmc_feat_idx AS f_gender ON f_gender.type = 3 AND ce.gender = f_gender.feat
	LEFT JOIN cmc_feat_idx AS f_item ON f_item.type = 4 AND ce.item_no = f_item.feat
	LEFT JOIN cmc_feat_idx AS f_cat1 ON f_cat1.type = 5 AND ce.category1_code = f_cat1.feat
	LEFT JOIN cmc_feat_idx AS f_cat2 ON f_cat2.type = 6 AND ce.category2_code = f_cat2.feat
	LEFT JOIN cmc_feat_idx AS f_cat3 ON f_cat3.type = 7 AND ce.category3_code = f_cat3.feat
	LEFT JOIN cmc_feat_idx AS f_brand ON f_brand.type = 8 AND ce.brand_no = f_brand.feat

 * postgresql+psycopg2://postgres:***@127.0.0.1:5432/fcrec
Done.
546756 rows affected.


[]

## Read feature from DB

### Read samples

In [None]:
# Pull every encoded click row into the kernel. Later cells index these rows
# by position (0 = label, 1 = user_no_idx, 4 = item_no_idx), which follows the
# column order used when cmc_click_event_with_index was created.
encoded_samples = %sql select * from cmc_click_event_with_index;

 * postgresql+psycopg2://postgres:***@127.0.0.1:5432/fcrec
546756 rows affected.


## Add Negative Samples

In [None]:
# Build negative (label 0) samples by pairing the user-side features of one
# random click row with the item-side features of another random click row,
# keeping the pair only if that user never clicked that item.
RANDOM_SEED = 42  # fixed so that Restart & Run All reproduces the same samples
random.seed(RANDOM_SEED)

# "user_no_idx:item_no_idx" keys of every pair actually observed as a click.
positive_sample_keys = {f'{s[1]}:{s[4]}' for s in encoded_samples}

negative_samples = []
dataset_size = len(encoded_samples)
# Attempt dataset_size // 2 random pairings. A pairing that collides with an
# observed click is skipped rather than retried, so the final number of
# negatives can be somewhat lower than the number of attempts.
for _ in range(dataset_size // 2):
    item_row = encoded_samples[random.randrange(dataset_size)]
    user_row = encoded_samples[random.randrange(dataset_size)]

    if f'{user_row[1]}:{item_row[4]}' in positive_sample_keys:
        continue

    negative_samples.append(
        (
            0,            # label: not clicked
            user_row[1],  # user_no_idx
            user_row[2],  # birth_date_idx
            user_row[3],  # gender_idx
            item_row[4],  # item_no_idx
            item_row[5],  # category1_code_idx
            item_row[6],  # category2_code_idx
            item_row[7],  # category3_code_idx
            item_row[8],  # brand_no_idx
        )
    )

negative_samples[0:10]

[(0, 13069, 64499, 64569, 149087, 164779, 164844, 165308, 167619),
 (0, 56446, 64513, 64569, 128143, 164769, 164789, 164992, 165552),
 (0, 27988, 64554, 64569, 132746, 164769, 164822, 165429, 169305),
 (0, 20605, 64499, 64569, 141338, 164769, 164822, 165147, 167924),
 (0, 14162, 64550, 64569, 163951, 164769, 164822, 165147, 167315),
 (0, 22879, 64502, 64569, 138926, 164777, 164850, 165390, 170163),
 (0, 17263, 64537, 64569, 139530, 164769, 164789, 165425, 170296),
 (0, 24287, 64524, 64569, 102872, 164769, 164789, 165425, 168657),
 (0, 1959, 64516, 64569, 115265, 164779, 164833, 165285, 170262),
 (0, 11162, 64497, 64569, 157568, 164779, 164844, 165308, 169750)]

## Create Training Data

In [None]:
def to_sparse_vector_str(sample):
    """Format one encoded sample as a libsvm-style line: ``label idx:1 idx:1 ...``.

    Args:
        sample: Sequence whose first element is the label and whose remaining
            elements are feature indices; ``None`` marks a missing feature.

    Returns:
        The label followed by one ``idx:1`` token per present feature index,
        separated by single spaces. An empty sample yields an empty string.
    """
    parts = []
    for position, value in enumerate(sample):
        if position == 0:
            parts.append(f'{value}')
        # Compare against None instead of relying on truthiness: feature
        # index 0 is valid (cmc_feat_idx numbering starts at 0) and must not
        # be dropped from the vector.
        elif value is not None:
            parts.append(f'{value}:1')
    return ' '.join(parts)

In [None]:
# Merge positives with negatives and shuffle them so the labels are
# interleaved, then write one libsvm-style line per sample.
samples = encoded_samples + negative_samples
random.shuffle(samples)

train_lines = (to_sparse_vector_str(sample) + "\n" for sample in samples)
with open("temp/train.txt", "w") as train_file:
    train_file.writelines(train_lines)

## FM Model Train

https://github.com/aksnzhy/xlearn

In [None]:
# Train a factorization machine on the libsvm-style file at temp/train.txt.
fm_model = xl.create_fm()
fm_model.setTrain("temp/train.txt")
# Hyper-parameters handed to both cv() and fit() below. The meanings noted
# here follow the xLearn documentation - TODO confirm against the installed
# version.
param = {
    "task": "binary",    # labels are 1 (click) and 0 (sampled negative)
    "lr": 0.2,           # learning rate
    "epoch": 10,         # number of training epochs
    "lambda": 0.002,     # regularization strength
    "metric": "prec",    # evaluation metric reported during training
    "k": 20,             # presumably the latent factor dimension
    "fold": 5            # presumably the number of cross-validation folds
}
# Also request a text dump of the model next to the binary one written by fit().
fm_model.setTXTModel("temp/fm_model_out.txt")
# Run cross-validation first, then fit and save the model that the prediction
# cells load from temp/fm_model_out.bin.
fm_model.cv(param)
fm_model.fit(param, "temp/fm_model_out.bin")

# Recommend to a User

### Set a user_no '++MXKfwkOw4VFn9HkVCRrw=='

In [None]:
# user_no of the user to recommend for; it is interpolated into the SQL of the
# candidate query further below, so it must remain a trusted constant.
user_no = '++MXKfwkOw4VFn9HkVCRrw=='

### Check the user's history

In [None]:
# Show the 20 most recent events of the selected user inside the date window,
# joined with product metadata, as a sanity check for the recommendations.
# The user comes from the user_no variable rather than a repeated literal, so
# changing user_no in one place keeps this history in sync with the
# recommendation cells.
# NOTE: the value is interpolated straight into the SQL text; user_no must
# stay a trusted constant and never come from external input.
query = f'''
    select a.session_id, a.event_timestamp, a.user_no, b.*
    from cmc_event a join cmc_product b on b.item_no = a.item_no 
    where user_no = '{user_no}'
        and event_timestamp between '2021-07-18' and '2021-07-25'
    order by event_timestamp desc
    limit 20;
    '''

result = executeQuery(query)

displayItemInRows(result)

## Predict clicks on all candidate items (already-clicked items are not filtered out)

In [None]:
# Build one candidate row per (selected user, indexed product) pair. The comma
# between the two join chains cross-joins the user row with every product
# that has a type-4 (item) entry in cmc_feat_idx.
# Only user_no_idx and item_no_idx are selected; the other feature columns are
# commented out inside the SQL, so the joins aliased c, d, f, g, h and i do
# not contribute to the output.
# NOTE(review): nothing here excludes items the user has already clicked, and
# training used all eight feature columns while prediction uses two - confirm
# that both are intended.
query = f'''
select
	b.idx as user_no_idx,
--	c.idx as birth_date_idx,
--	d.idx as gender_idx,
	e.idx as item_no_idx
--	f.idx as category1_code_idx,
--	g.idx as category2_code_idx,
--	h.idx as category3_code_idx,
--	i.idx as brand_no_idx
from cmc_user u
	left join cmc_feat_idx b on b.type = 1 and u.user_no = b.feat
	left join cmc_feat_idx c on c.type = 2 and left(u.birth_date, 4) = c.feat
	left join cmc_feat_idx d on d.type = 3 and u.gender = d.feat,
	cmc_product p
	join cmc_feat_idx e on e.type = 4 and p.item_no = e.feat
	left join cmc_feat_idx f on f.type = 5 and p.category1_code = f.feat
	left join cmc_feat_idx g on g.type = 6 and p.category2_code = g.feat
	left join cmc_feat_idx h on h.type = 7 and p.category3_code = h.feat
	left join cmc_feat_idx i on i.type = 8 and p.brand_no = i.feat
where u.user_no = '{user_no}'
'''

# executeQuery is provided by lib.py; the rows are later accessed by column
# name (row['item_no_idx']).
result = executeQuery(query)

### Make feature data to predict

In [None]:
def to_sparse_vector_str_from_dic(dic):
    """Format a mapping of feature indices as an unlabeled libsvm-style line.

    Args:
        dic: Mapping from column name to feature index; a ``None`` value
            marks a missing feature.

    Returns:
        Space-separated ``idx:1`` tokens in the mapping's iteration order, or
        an empty string when no feature is present.
    """
    # Skip None instead of emitting a literal "None:1" token, which is not a
    # valid feature entry. This matches how to_sparse_vector_str treats
    # missing features. Index 0 is kept because it is a valid feature index.
    return ' '.join(
        f'{dic[key]}:1' for key in dic if dic[key] is not None
    )

### Make predictions

In [None]:
# Write one unlabeled feature line per candidate row and remember the item
# index of each line, so that line N of the prediction output can be mapped
# back to item_no_idx_arr[N].
item_no_idx_arr = []
with open("temp/user_features_to_predict.txt", "w") as file:
    for row in result:
        item_no_idx_arr.append(row['item_no_idx'])
        file.write(to_sparse_vector_str_from_dic(row) + "\n")

# Score the candidates with the model saved by fit(). setSigmoid presumably
# maps the raw scores into (0, 1) - TODO confirm against the xLearn docs.
fm_model.setSigmoid()
fm_model.setTest("temp/user_features_to_predict.txt")
fm_model.predict("temp/fm_model_out.bin", "temp/predictions.txt")

In [None]:
# Read the predicted click scores: one float per line, in the same order as
# the feature lines that were written from item_no_idx_arr.
with open("temp/predictions.txt", "r") as file:
    preds = [float(line) for line in file]

# Fail loudly on a count mismatch. Assigning a shorter Series would silently
# pad the missing scores with NaN and corrupt the ranking.
if len(preds) != len(item_no_idx_arr):
    raise ValueError(
        f'expected {len(item_no_idx_arr)} predictions, got {len(preds)}'
    )

cands = pd.DataFrame({'item_no_idx': item_no_idx_arr, 'pred': preds})
# Keep the 40 items with the HIGHEST predicted click score. Label 1 means
# "clicked", so an ascending sort would select the least likely clicks.
cands = cands.sort_values(by='pred', ascending=False)[:40]

NameError: name 'Ture' is not defined

## Display Items

In [None]:
# Map each selected item index to its predicted score. zip over the two
# columns keeps the item index as a plain int; iterrows would upcast it to
# float because each row is returned as a single-dtype Series.
item_score_dic = dict(
    zip(cands['item_no_idx'].tolist(), cands['pred'].tolist())
)

item_index_str = ", ".join(str(item_idx) for item_idx in item_score_dic)

# Resolve the item feature indices (type 4) back to product metadata.
query = f'''
    select a.idx, b.*
    from cmc_feat_idx a join cmc_product b on a.type = 4 and a.feat = b.item_no
    where a.type = 4 and a.idx in ({item_index_str})
'''

rec_result = executeQuery(query)

# Attach the score to every product row returned by the query.
for r in rec_result:
    r['pred'] = item_score_dic[r['idx']]

rec = pd.DataFrame(data = rec_result)
# Show the most likely clicks first.
rec = rec.sort_values(by='pred', ascending=False)

displayItemInRows(rec.to_dict('records'))
