In [1]:
import math
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
import heapq # for retrieval topK
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import random
import json
from datetime import datetime
from pprint import pprint 
from time import time
from scipy import spatial
%matplotlib inline

![title](./F1.png)

In [2]:
#每篇文章都將由3072個向量表示。
#讀取BERT的JSON檔放入bert_vector
bert_vector = []
with open('../BERT test/output_1.json') as file:
    for line in file.readlines():
        bert_vector.append(json.loads(line))
        


![title](./F2.png)

In [3]:
#讀取文章檔案
df = pd.read_csv('../news_info2.csv')
df.shape

(48503, 3)

In [4]:
#顯示部分文章
df.head()

Unnamed: 0,news_id,news_class,news_class_Count
0,d9c9f480-da76-412c-90f3-000006bb0ad0,2.0,472
1,cea69ac1-5c24-41f8-936a-00013f62de89,1.0,126
2,1c1dbc9e-78d9-4e5e-92a4-000599b43691,1.0,194
3,36187583-96b3-43f8-8733-000717d75a7a,5.0,911
4,628bb991-b9f7-4112-8926-0007f6bd2e7e,1.0,109


In [5]:
#增加欄位features每篇文章都將由3072個向量表示
df = df.reset_index().join(pd.DataFrame(bert_vector).features)

In [6]:
#顯示部分文章跟features
df.head()

Unnamed: 0,index,news_id,news_class,news_class_Count,features
0,0,d9c9f480-da76-412c-90f3-000006bb0ad0,2.0,472,"[0.5079685996093749, 0.05682145703124999, -0.5..."
1,1,cea69ac1-5c24-41f8-936a-00013f62de89,1.0,126,"[0.6301766582031247, -0.11561564843750012, -0...."
2,2,1c1dbc9e-78d9-4e5e-92a4-000599b43691,1.0,194,"[0.7734264025316461, -0.22052073924050605, -0...."
3,3,36187583-96b3-43f8-8733-000717d75a7a,5.0,911,"[0.5616207304687506, 0.013011304687500011, -0...."
4,4,628bb991-b9f7-4112-8926-0007f6bd2e7e,1.0,109,"[0.6127381406249995, -0.04040539062499999, -0...."


In [7]:
#news_id跟features放入pid_vector
#後面模型會用到
pid_vector = {}
for i in range(len(df)):
    pid_vector[df['news_id'].values[i]] = df['features'].values[i]

In [8]:
##定義函式getHitRatio & getNDCG
#Hit ratio將計算測試集的平均得分。
#NDCG 等級關聯性

def getHitRatio(ranklist, gtItem):
    if gtItem in ranklist:
        return 1
    return 0

def getNDCG(ranklist, gtItem):
    ar = np.array(ranklist)
    if gtItem in ar:
        return math.log(2) / math.log(np.where(ar == gtItem)[0][0] + 2)
    return 0

![title](./F3.jpeg)

In [9]:
##定義函式
#只是對測試集合中的某個用戶的某個物品，以及和事先劃分好的負樣本組合在一起進行預測，最終輸出該測試物品是否在topK中。
def eval_one_rating(idx, eval_mode, uim, pid_vector,test_user_last_pid):
    
    '''''
    eval_mode = 'Keras', 'ALS', 'matrix'
    '''''
    rating = _testRatings[idx]
    ### rating should be like (  [(only one article, which is ground truth)]  ) 
    items = _testNegatives[idx][1]
    ### items should be like (  [(19 ones, which are ones the reader haven't read)]  )
    u = rating[0]
    gtItem = rating[1]
    items.append(gtItem)
    
    # Get prediction scores, the process is we offer 100(99 negative + 1 ground truth) 
    map_item_score = {}
#     users = np.full(len(items), u, dtype = 'int32')

    if eval_mode == 'ALS':
        predictions = _model.rank_items(u, uim.T, items)
        items.pop()
        ranklist = np.array(predictions, dtype = int)[:_K,0]
        
    else:
        if eval_mode == 'Keras':
            predictions = _model.predict([users, np.array(items)], 
                                         batch_size=100, verbose=0)
        elif eval_mode == 'matrix':         
            predictions = _model[u,items]
        
        elif eval_mode == 'content-based':
            predictions = []
            pid_vectors = []
            user_history_article_list = test_user_last_pid[u]
            for pid in user_history_article_list:
                if pid in pid_vector:
                    pid_vectors.append(pid_vector[pid])
            user_history_article_mean = np.array(pid_vectors).mean(axis = 0)
        
            for i in range(len(items)):
                if items[i] in pid_vector:
                    similarity = 1 - spatial.distance.cosine(user_history_article_mean,pid_vector[items[i]])
                    predictions.append(similarity)
                else:
                    predictions.append(0)
        for i in range(len(items)):
            item = items[i]
            map_item_score[item] = predictions[i]
        items.pop()
        
        # Evaluate top rank list
        ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
        
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)

In [10]:
##定義函式
#評估模型：
#評估top-K推薦的效果（Hit_Ratio，NDCG）
#每個測試評分的分數。
_model = None
_testRatings = None
_testNegatives = None
_K = None

def evaluate_model(model, testRatings, testNegatives, K, num_thread, eval_mode = 'Keras', uim = None,
                   pid_vector = None, test_user_last_pid = None):

    global _model
    global _testRatings
    global _testNegatives
    global _K
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K
        
    hits, ndcgs = [],[]
    if(num_thread > 1): # Multi-thread
        pool = multiprocessing.Pool(processes=num_thread)
        res = pool.map(eval_one_rating, range(len(_testRatings)))
        pool.close()
        pool.join()
        hits = [r[0] for r in res]
        ndcgs = [r[1] for r in res]
        return (hits, ndcgs)
    else:# Single thread
        for idx in range(len(_testRatings)):
            (hr,ndcg) = eval_one_rating(idx, eval_mode, uim, pid_vector, test_user_last_pid)
            hits.append(hr)
            ndcgs.append(ndcg)      
    return (hits, ndcgs)

# Training Testing Split

In [11]:
##定義函式
#將訓練/測試集隨機分成9：1
def train_test_split_time(df , columns_1 , columns_2 , columns_time , ratio = 0.9):

    
    retreive_name = df[columns_1].value_counts()[df[columns_1].value_counts()>5].keys()
    df_test = df[df[columns_1].isin(retreive_name)]
    df_test = df_test.sort_values(columns_time).groupby(columns_1).tail(1)
    df_train = df.drop(index=df_test.index)
    
    if df_test.shape[0] + df_train.shape[0] == df.shape[0]:
        print('train_test_split succeed!! with df_train shape:(%d,%d), df_test shape:(%d,%d)'
              %(df_train.shape[0],df_train.shape[1],df_test.shape[0],df_test.shape[1])
             )
        return df_train, df_test, retreive_name
    else:
        print('Oops, something wrong, with df_train shape : (%d,%d), df_test shape : (%d,%d)'
              %(df_train.shape[0],df_train.shape[1],df_test.shape[0],df_test.shape[1])
             )
        print('df_shape : (%d,%d)'
              %(df.shape[0],df.shape[1])
             )

In [12]:
#讀user_id
df1 = pd.read_csv('../newslog_df.csv')
df1.shape

  interactivity=interactivity, compiler=compiler, result=result)


(11308282, 5)

In [13]:
#顯示部分user
df1.head()

Unnamed: 0,user_id,human,my_time,time,guid
0,7ee141b7-5941-403f-93cc-b4975eab2769,False,1532336000.0,2018-07-23 17:00:01,550c8436-1174-4d60-b6ec-1199337fb287
1,539edc42-7362-47de-bf94-ce0cf68fb5de,False,1532855000.0,2018-07-29 17:00:01,35606c49-c92e-4b70-9264-6dabd763d34d
2,a576a197-ce6c-482f-8117-a1b4b73f21e5,True,1532073000.0,2018-07-20 15:49:28,f3d50a1e-2744-4610-b91b-b55983c1db60
3,4151c8d5-6604-482d-85d9-7d40915ac15c,False,1532164000.0,2018-07-21 17:00:01,17e27e49-d45a-48ed-a54e-d4666bde7455
4,59d3dc4e-d6e2-42c8-bdcd-bf5ce7832078,False,1531645000.0,2018-07-15 17:00:01,a236e2e1-54c9-4fe7-aa2e-c33b60ed7250


In [14]:
#只取人閱讀的
df_log=df1[df1['human'] == 1]
df_log.shape

(3571058, 5)

In [15]:
#分割資料
#將訓練/測試集隨機分成9：1
df_train,df_test, retrieve = train_test_split_time(df_log,'user_id','guid','time')

train_test_split succeed!! with df_train shape:(3498499,5), df_test shape:(72559,5)


In [16]:
#顯示部分訓練
df_train.head()

Unnamed: 0,user_id,human,my_time,time,guid
2,a576a197-ce6c-482f-8117-a1b4b73f21e5,True,1532073000.0,2018-07-20 15:49:28,f3d50a1e-2744-4610-b91b-b55983c1db60
8,0910ca21-0e55-4d8b-aa0c-8df93b47177c,True,1531282000.0,2018-07-11 12:06:17,38f168a1-b69f-4532-9e7d-8fe963c53611
13,13ab7907-71d5-4eae-97be-06491d91d90f,True,1531188000.0,2018-07-10 09:58:38,7bc023ae-a347-4dd8-8d38-2b1cba576946
16,6b72782c-48f9-4d51-bfc4-1bd5fa0c583e,True,1531716000.0,2018-07-16 12:45:11,179d5a43-3d0a-41f4-8e23-8efa8b9b93c5
24,{3DA5A7EC-6FBB-4B06-B587-D9642273BF9E},True,1530600000.0,2018-07-03 14:38:12,553023B7-856C-45B5-A5FC-BE7EC88FB857


In [17]:
#顯示部分測試
df_test.head()

Unnamed: 0,user_id,human,my_time,time,guid
7146529,b4547e4d-c650-4f7a-acf7-f5f3ca30f073,True,1514787000.0,2018-01-01 14:16:21,c23c3b1a-14fb-4521-9781-20dfb7531a31
7448985,{712A1770-9A53-4BE6-8A15-D8CC15C228A0},True,1514819000.0,2018-01-01 23:05:30,D8F9B736-5F04-4983-ADFA-62B3710EBBBB
7309630,{BCD87577-D699-4F8E-96BB-8C2E24F412D8},True,1514882000.0,2018-01-02 16:37:40,C41DF70E-4D1B-46C0-BDE0-FB7D2353FFBE
6823924,e371cfec-cd8f-4ded-a3c1-1ca6fad31986,True,1514941000.0,2018-01-03 09:02:40,b6a9f7b7-3d14-407d-b6b3-7bd3ea4928eb
6447252,d4b82def-3f94-4315-a29b-7fab969b927a,True,1514944000.0,2018-01-03 09:39:14,14254f9a-87e4-45c6-bc37-b359c008a693


In [18]:
##定義函式
#產生模擬的測試集
#1。產生[[user_id，文章ID] * XXXXX]形式的test_rating
#2。產生列表形式為[[（（（user_id，ground true））），[199（負樣本）]] * XXXXX]形式的negative_test_rating
def get_sampling(df_target,df_source,numbers_of_N_sample, generate_negative = False):

    
    test_rating = df_target[['user_id','guid']].values.tolist()
    
    print('test_rating yield successfully!!!')
    
    if generate_negative == True:
        df_temp = df_source.drop_duplicates(subset = 'guid', keep = 'first')
        negative_test_rating = []
        for i in range(df_target.shape[0]):
            drop_id = test_rating[i][0]
            list_ = df_temp[df_temp['user_id'] != drop_id]['guid'].sample(numbers_of_N_sample).values.tolist()
            
            negative_test_rating.append([test_rating[i],list_])
        print('negative_test_rating yield successfully!!!')

        return test_rating, negative_test_rating
    else:
        return test_rating

In [19]:
#加速評估過程
#選擇讀者尚未閱讀的其他199篇文章，計算這200（199 + 1）篇文章
time1 = time()
test_rating = get_sampling(df_test,df_train,None)
with open("Negative_test_rating_199N.txt",'rb') as f: #in read mode, not in write mode, careful
    negative_test_rating = pickle.load(f)
    print('negative_test_rating yield successfully!!!')
time2 = time()
print('Took for %d seconds' %(time2-time1))

test_rating yield successfully!!!
negative_test_rating yield successfully!!!
Took for 4 seconds


In [20]:
##定義函式
#測試用戶已閱讀的訓練數據中的最後一篇文章
#並根據此閱讀文章按餘弦相似度對候選文章進行排名。
def test_user_last_pid_func(number_of_last_seen):
    ### Create the last seen dict
    test_user_last_pid = {}
    retrieve_name = df_test.user_id.values
    df_train_lookup = df_train[df_train['user_id'].isin(retrieve_name)].sort_values('time')
    df_train_lookup = df_train_lookup[['user_id','guid','time']].drop_duplicates(subset = 'user_id', keep = 'last')
    for i in range(len(df_train_lookup)):
        test_user_last_pid[df_train_lookup['user_id'].values[i]]  = [df_train_lookup['guid'].values[i]]
    ### Append the last nth seen dict
    if number_of_last_seen > 1:
        for number in range(2,number_of_last_seen+1):
            df_train_lookup_append = df_train[df_train['user_id'].isin(retrieve_name)].sort_values('time')[['guid','user_id']].groupby(['user_id'],as_index = False).nth(-number)
            for i in range(len(df_train_lookup_append)):
                test_user_last_pid[df_train_lookup_append['user_id'].values[i]].append(df_train_lookup_append['guid'].values[i])
    
    return test_user_last_pid

# Evaluation

#基於內容的推薦

#建議與用戶之前點擊過的商品最相似的商品

In [21]:
#讀取次數（每個用戶閱讀文章的次數）
#包含三列的數據框，分別為user_id，item_id和read_times
#1 for read once 0 for never-read 
df_train['read_time'] = 1
df_test['read_time'] = 1
df_train_group = df_train[['user_id','guid','read_time']].groupby(by = ['user_id','guid'],as_index=False).sum()
df_test_group = df_test[['user_id','guid','read_time']].groupby(by = ['user_id','guid'],as_index=False).sum()

### For the reason that normally people don't read articles more than 10 times
cliper = 10
df_train_group['read_time'] = df_train_group['read_time'].apply(lambda w: min(cliper,w))


### To feed into the keras model, we have to turn both the eruid and pid into integer index
eruid_map = {i:v for i,v in enumerate(df_log.user_id.unique())}
inverse_eruid_map = {v:i for i,v in enumerate(df_log.user_id.unique())}
pid_map = {i:v for i,v in enumerate(df_log.guid.unique())}
inverse_pid_map = {v:i for i,v in enumerate(df_log.guid.unique())}

In [22]:
#對user_id和item_id進行索引和轉換
#針對training data
### creating a mapping table for training data
df_train_group_map = df_train_group.copy()
df_test_group_map = df_test_group.copy()
df_train_group_map['user_id'] = df_train_group['user_id'].map(inverse_eruid_map)
df_train_group_map['guid'] = df_train_group['guid'].map(inverse_pid_map)
df_test_group_map['user_id'] = df_test_group['user_id'].map(inverse_eruid_map)
df_test_group_map['guid'] = df_test_group['guid'].map(inverse_pid_map)

#對user_id和item_id進行索引和轉換
#針對testing data 
### creating a mapping list for testing data   
test_rating_map = []
for i in range(len(test_rating)):
    test_rating_map.append([inverse_eruid_map[test_rating[i][0]],inverse_pid_map[test_rating[i][1]]])

#對user_id和item_id進行索引和轉換
#針對negative testing data
### creating a mapping list for negative testing data
negative_test_rating_map = []
for i in range(len(negative_test_rating)):
    negative_test_rating_map.append([
                                     [inverse_eruid_map[negative_test_rating[i][0][0]],inverse_pid_map[negative_test_rating[i][0][1]]],
                                     list(map(lambda w:inverse_pid_map[w],negative_test_rating[i][1]))])

In [23]:
#顯示資料
print('df_train_group_map shape: ',df_train_group_map.shape)
print('number of users: ', len(eruid_map.items()))
print('number of items: ', len(pid_map.items()))
print('The sparse matrix is one with shape (%d , %d), with %d non-zero read_times'
      %(len(eruid_map.items()),len(pid_map.items()),df_train_group_map.shape[0]))

df_train_group_map shape:  (3294247, 3)
number of users:  1929869
number of items:  224640
The sparse matrix is one with shape (1929869 , 224640), with 3294247 non-zero read_times


In [None]:
#使用文章向量來計算文章之間的相似度。每篇文章都有3072個維度，我計算彼此之間的餘弦相似度，以衡量兩篇文章之間的相似度。
#開始進行training。將topK設置為20， 這意味著如果將ground truth排在測試用戶的前20名文章中，則hit ratio將僅計為1，否則為0。 
#我們開始訓練並在之前創建的negative test rating上進行評估（1 ground truth and 199 negative samples）。
### Decide how many last pieces of articles to calculate
number_of_seen_article = [i for i in range(20)]
for number in number_of_seen_article:
    test_user_last_pid = test_user_last_pid_func(number)

    ### Evaluate how the 'most similar content recommendation works'
    topK = 20
    verbose = 0
    evaluation_threads = 1

    time_1 = time()
    testRatings, testNegatives = test_rating, negative_test_rating
    time1 = time()
    (hits, ndcgs) = evaluate_model(None, testRatings, testNegatives, topK, evaluation_threads,
                                   eval_mode = 'content-based', pid_vector = pid_vector,
                                   test_user_last_pid = test_user_last_pid)
    hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()

    print('HR = %.4f, NDCG = %.4f [%.1f s] number of last seen = %d' % (hr, ndcg, time()-time1,number))

  ret = ret.dtype.type(ret / rcount)


HR = 0.2063, NDCG = 0.1068 [379.6 s] number of last seen = 0
HR = 0.2063, NDCG = 0.1068 [378.5 s] number of last seen = 1
HR = 0.2521, NDCG = 0.1262 [389.5 s] number of last seen = 2
HR = 0.2698, NDCG = 0.1299 [392.1 s] number of last seen = 3
HR = 0.2770, NDCG = 0.1300 [395.0 s] number of last seen = 4
HR = 0.2828, NDCG = 0.1299 [400.3 s] number of last seen = 5
HR = 0.2847, NDCG = 0.1290 [398.7 s] number of last seen = 6
HR = 0.2845, NDCG = 0.1280 [401.5 s] number of last seen = 7
HR = 0.2851, NDCG = 0.1274 [402.6 s] number of last seen = 8
HR = 0.2849, NDCG = 0.1268 [406.7 s] number of last seen = 9
HR = 0.2857, NDCG = 0.1266 [405.2 s] number of last seen = 10
HR = 0.2850, NDCG = 0.1261 [446.4 s] number of last seen = 11
HR = 0.2848, NDCG = 0.1257 [424.5 s] number of last seen = 12
HR = 0.2849, NDCG = 0.1256 [404.5 s] number of last seen = 13
HR = 0.2844, NDCG = 0.1254 [414.2 s] number of last seen = 14


如果前20篇相似的文章hit the ground truth（用戶真正閱讀過的一篇文章），
The HR is 28%，這意味著如果我根據相似性對商品進行排名和推薦，則它僅對28％的用戶有效。

總之，在這一部分中，我演示了基於內容的推薦系統如何工作。
關鍵是必須有一種表示項目內容的方法，
因此可以根據要推薦的內容（這裡我使用BERT嵌入矢量來表示項目）。
在這裡，結果似乎似乎在構建強大的推薦系統時，基於內容的 content-based system要比基於model-based 的系統弱，
此外，當構建混合推薦系統時， 是strengthens。