In [3]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [16]:
articles_df = pd.read_csv('/Users/lucaschang/Documents/python/rec_sys_implicit/shared_articles.csv')
interactions_df = pd.read_csv('/Users/lucaschang/Documents/python/rec_sys_implicit/users_interactions.csv')
articles_df.drop(['authorUserAgent', 'authorRegion', 'authorCountry'], axis=1, inplace=True)
interactions_df.drop(['userAgent', 'userRegion', 'userCountry'], axis=1, inplace=True)

In [19]:
articles_df.head(3)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en


In [20]:
articles_df['eventType'].value_counts()

CONTENT SHARED     3047
CONTENT REMOVED      75
Name: eventType, dtype: int64

In [21]:
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
articles_df.drop('eventType', axis=1, inplace=True)

In [22]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3047 entries, 1 to 3121
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   timestamp        3047 non-null   int64 
 1   contentId        3047 non-null   int64 
 2   authorPersonId   3047 non-null   int64 
 3   authorSessionId  3047 non-null   int64 
 4   contentType      3047 non-null   object
 5   url              3047 non-null   object
 6   title            3047 non-null   object
 7   text             3047 non-null   object
 8   lang             3047 non-null   object
dtypes: int64(4), object(5)
memory usage: 238.0+ KB


In [23]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72312 entries, 0 to 72311
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  72312 non-null  int64 
 1   eventType  72312 non-null  object
 2   contentId  72312 non-null  int64 
 3   personId   72312 non-null  int64 
 4   sessionId  72312 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 2.8+ MB


In [71]:
interactions_df['personId'].unique

<bound method Series.unique of 0       -8845298781299428018
1       -1032019229384696495
2       -1130272294246983140
3         344280948527967603
4        -445337111692715325
                ...         
72307   -9016528795238256703
72308     102305705598210278
72309   -9196668942822132778
72310   -9016528795238256703
72311    5713241217519616260
Name: personId, Length: 72312, dtype: int64>

In [24]:
df = pd.merge(interactions_df[['contentId','personId', 'eventType']], 
              articles_df[['contentId', 'title']], 
              how = 'inner', 
              on = 'contentId')

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72269 entries, 0 to 72268
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   contentId  72269 non-null  int64 
 1   personId   72269 non-null  int64 
 2   eventType  72269 non-null  object
 3   title      72269 non-null  object
dtypes: int64(2), object(2)
memory usage: 2.8+ MB


In [26]:
df.head(3)

Unnamed: 0,contentId,personId,eventType,title
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem


In [72]:
df['personId'].unique

<bound method Series.unique of 0       -8845298781299428018
2        -108842214936804958
3       -1443636648652872475
6       -8020832670974472349
8       -9009798162809551896
                ...         
72264   -4028919343899978105
72265   -3643155458357242906
72266    5660542693104786364
72267    5660542693104786364
72268   -7496361692498935601
Name: personId, Length: 50910, dtype: int64>

In [27]:
df['eventType'].value_counts()

VIEW               61043
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: eventType, dtype: int64

In [28]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 3.0, 
   'FOLLOW': 4.0,
   'COMMENT CREATED': 5.0,  
}

df['eventStrength'] = df['eventType'].apply(lambda x: event_type_strength[x])

In [29]:
df.head(10)

Unnamed: 0,contentId,personId,eventType,title,eventStrength
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem,1.0
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
5,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
6,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem,1.0
7,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem,1.0
8,-3499919498720038879,-9009798162809551896,LIKE,Hiri wants to fix the workplace email problem,2.0
9,-3499919498720038879,-9009798162809551896,VIEW,Hiri wants to fix the workplace email problem,1.0


In [30]:
df = df.drop_duplicates()

#each person,content and title 的加總
grouped_df = df.groupby(['personId', 'contentId', 'title']).sum().reset_index()

In [33]:
grouped_df['eventStrength'].value_counts()

1.0     32742
3.0      4305
4.0      1011
6.0       885
10.0      540
12.0      420
5.0       230
8.0       173
7.0       121
2.0       121
15.0       77
11.0       23
13.0       22
9.0        10
14.0        2
Name: eventStrength, dtype: int64

In [34]:
grouped_df.dtypes

personId           int64
contentId          int64
title             object
eventStrength    float64
dtype: object

In [35]:
grouped_df['title'] = grouped_df['title'].astype("category")
grouped_df['personId'] = grouped_df['personId'].astype("category")
grouped_df['contentId'] = grouped_df['contentId'].astype("category")
grouped_df['person_id'] = grouped_df['personId'].cat.codes
grouped_df['content_id'] = grouped_df['contentId'].cat.codes

In [62]:
grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40682 entries, 0 to 40681
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   personId       40682 non-null  category
 1   contentId      40682 non-null  category
 2   title          40682 non-null  category
 3   eventStrength  40682 non-null  int64   
 4   person_id      40682 non-null  int16   
 5   content_id     40682 non-null  int16   
dtypes: category(3), int16(2), int64(1)
memory usage: 1016.3 KB


In [63]:
#因為若是比較基本的是不論是什麼行為都算是1
grouped_df['eventStrength'] = 1
grouped_df['eventStrength'].value_counts()

1    40682
Name: eventStrength, dtype: int64

In [42]:
#?兩個差在哪
sparse_content_person = sparse.csr_matrix((grouped_df['eventStrength'].astype(float), (grouped_df['content_id'], grouped_df['person_id'])))
sparse_person_content = sparse.csr_matrix((grouped_df['eventStrength'].astype(float), (grouped_df['person_id'], grouped_df['content_id'])))

In [69]:
#?? csr_matrix把資料做壓縮那最後推薦要怎麼推薦
sparse_content_person.toarray().shape

(2979, 1895)

In [73]:
sparse_content_person.toarray()[1:100,1:100]

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [74]:
np.savetxt("/Users/lucaschang/Documents/python/rec_sys_implicit/sparse_content_person.csv", 
           sparse_content_person.toarray()[1:100,1:100], delimiter=",")

In [75]:
sparse_person_content.toarray().shape

(1895, 2979)

In [76]:
sparse_person_content.toarray()[1:100,1:100]

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [77]:
np.savetxt("/Users/lucaschang/Documents/python/rec_sys_implicit/sparse_person_content.csv", 
           sparse_content_person.toarray()[1:100,1:100], delimiter=",")

In [43]:
print(sparse_content_person[1, ])

  (0, 2)	1.0
  (0, 130)	1.0
  (0, 210)	1.0
  (0, 555)	1.0
  (0, 941)	1.0
  (0, 946)	1.0
  (0, 1149)	1.0
  (0, 1235)	1.0
  (0, 1499)	1.0
  (0, 1532)	1.0


In [44]:
print(sparse_person_content[1,:])

  (0, 1152)	1.0
  (0, 1319)	1.0
  (0, 1760)	1.0
  (0, 2308)	1.0
  (0, 2586)	1.0


In [78]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

In [79]:
alpha = 15
data = (sparse_content_person * alpha).astype('double')

# Fit the model
model.fit(data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [82]:
#針對特定商品的相似度計算
content_id = 450
n_similar = 10

#？？model.user_factors要怎麼查的出來
person_vecs = model.user_factors
content_vecs = model.item_factors

#?
content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

#?好像是做標準化？
scores = content_vecs.dot(content_vecs[content_id]) / content_norms

#?
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]

#?zip 是幹嘛的，similar是相似度
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[content_id]), key=lambda x: -x[1])

In [87]:
sorted(similar)

[(388, 0.7084298),
 (450, 1.0),
 (729, 0.7759288),
 (1153, 0.72980464),
 (1388, 0.7075799),
 (1718, 0.6997923),
 (1765, 0.78623885),
 (1824, 0.7107821),
 (2343, 0.7397392),
 (2840, 0.7386269)]

In [48]:
for content in similar:
    idx, score = content
    print(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])

Google's fair use victory is good for open source
Up your DevOps chops with this online Kubernetes class
Google lags behind Amazon and Microsoft's cloud in one important area
How Companies Are Using Machine Learning to Get Faster and More Efficient
Python Ecosystem for Machine Learning - Machine Learning Mastery
An independent organization just ranked Google as the best cloud, beating Amazon
Meet Mycroft, the open source AI who wants to rival Siri, Cortana, and Alexa | ZDNet
Tensorflow wins
Deep learning software knows that a rose is a rose is a rosa rubiginosa
Google's Cloud Dataflow stomps on Apache Spark in new benchmark tests


In [50]:
#針對特定個人推薦內容
#return recommedation results to a specific person (input:person_id,person_content_matrix,person_vecs,content_vecs,num_contents)
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix（取得特定個人的購買向量）
    person_interactions = sparse_person_content[person_id,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0
    # Get dot product of person vector and all content vectors（specific person all content vecs)
    rec_vector = person_vecs[person_id,:].dot(content_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    #?
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero ??
    recommend_vector = person_interactions * rec_vector_scaled   ##why miltiple rec_vector_scaled ??
    # Sort the indices of the content into order of best recommendations   ##argsort??
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    titles = []
    scores = []

    for idx in content_idx:
        # Append titles and scores to the list
        titles.append(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'title': titles, 'score': scores})

    return recommendations

In [51]:
# Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for person with id 50
person_id = 50

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

                                               title     score
0  Custo do Erro - Cinco motivos para investir em...  1.000000
1  Former Google career coach shares a visual tri...  0.832930
2                       Discutindo Devops na Prática  0.731799
3  'The Simpsons' celebrates 600 episodes with a ...  0.725046
4  Do You Suffer From Deployment Anxiety? - DZone...  0.721484
5           Drupal and ambitious digital experiences  0.719188
6  GitLab launches Issue Boards, an open-source t...  0.709348
7  How I built an app with 500,000 users in 5 day...  0.703791
8            Why Learning Angular 2 Was Excruciating  0.694935
9               The technology behind preview photos  0.694694


In [52]:
grouped_df.loc[grouped_df['person_id'] == 50].sort_values(by=['eventStrength'], ascending=False)[['title', 'person_id', 'eventStrength']].head(10)

Unnamed: 0,title,person_id,eventStrength
1727,Acquia Engage 2016: Day One,50,1
1771,Google vai reduzir em 50% consumo de memória d...,50,1
1777,"Waterwheel, the Drupal SDK Ecosystem",50,1
1776,Can virtual reality revolutionise education?,50,1
1775,Seja esperto no trabalho: Melhore a comunicaçã...,50,1
1774,Google's new tool helps test your website's sp...,50,1
1773,"Razorfish, US digital revenues, drag down Publ...",50,1
1772,"Applying the Linus Torvalds ""Good Taste"" Codin...",50,1
1770,Rating the English Proficiency of Countries an...,50,1
1779,Cognizant Named a Digital Transformation Leade...,50,1


In [53]:
#??為何training_set,test_set是這樣分割法
#是用隨機挑選法遮蔽客人＊文章的配對嗎？
import random

def make_train(ratings, pct_test = 0.2):
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of item,user index into list ＃zip如何使用？？

    
    random.seed(0) # Set the random seed to zero for reproducibility
    
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of item-user pairs without replacement

    content_inds = [index[0] for index in samples] # Get the item row indices

    person_inds = [index[1] for index in samples] # Get the user column indices

    #把抽到的pairs都改成零（把答案遮起來）
    training_set[content_inds, person_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    
    return training_set, test_set, list(set(person_inds))

In [54]:
content_train, content_test, content_persons_altered = make_train(sparse_content_person, pct_test = 0.2)

In [90]:
content_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [91]:
content_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [55]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [56]:
#rec_sys auc is compare popularity and rec_sys result by each person  
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1) # Get sum of item iteractions to find most popular
    content_vecs = predictions[1]
    for person in altered_persons: # Iterate through each user that had an item altered
        training_column = training_set[:,person].toarray().reshape(-1) # Get the training set column
        zero_inds = np.where(training_column == 0) # Find where the interaction had not yet occurred
        
        # Get the predicted values based on our user/item vectors
        person_vec = predictions[0][person,:]
        pred = person_vec.dot(content_vecs).toarray()[0,zero_inds].reshape(-1)
        
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[:,person].toarray()[zero_inds,0].reshape(-1)
        
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_contents[zero_inds] # Get the item popularity for our chosen items
        
        store_auc.append(auc_score(pred, actual)) # Calculate AUC for the given user and store
        
        popularity_auc.append(auc_score(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [57]:
calc_mean_auc(content_train, content_persons_altered,[person_vecs, content_vecs.T], content_test)

(0.983, 0.819)