# Collaborative Filtering Using `implicit` library

In [None]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.5.2-cp37-cp37m-manylinux2014_x86_64.whl (18.5 MB)
[K     |████████████████████████████████| 18.5 MB 237 kB/s 
Installing collected packages: implicit
Successfully installed implicit-0.5.2


In [None]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
import random
import implicit
from implicit.evaluation import ranking_metrics_at_k

In [None]:
from google.colab import drive
import os
drive.mount('/content/drive')
cur_path = "/content/drive/My Drive/social-networks-project/"
os.chdir(cur_path)

Mounted at /content/drive


# Data

Read in, select only necessary cols: customer_id, article_id. Add column for purchase, because rows in this df indicate purchases

In [None]:
# read in
train_df = pd.read_csv(cur_path + 'training_test/train_revise.csv')
test_df = pd.read_csv(cur_path + 'training_test/test_revise.csv')

# select cols
train_df = train_df[['customer_id','article_id']]
test_df = test_df[['customer_id','article_id']]

# add purchase col
train_df['purchase'] = 1
test_df['purchase'] = 1

print('train',len(train_df))
print('test',len(test_df))

train 1781011
test 445253


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [None]:
print('train', train_df['customer_id'].nunique())
print('test', test_df['customer_id'].nunique())

train 390826
test 286521


The overlap between the train and test set customers is the test set. This makes sense considering the test set includes customers who bought >=2 things.

In [None]:
len(set(train_df['customer_id'].unique()).intersection(test_df['customer_id'].unique()))

286521

Aggregate rows where user has purchased an item more than once!

In [None]:
train_df = train_df.groupby(['customer_id', 'article_id'])['purchase'].sum().reset_index()
test_df = test_df.groupby(['customer_id', 'article_id'])['purchase'].sum().reset_index()

In [None]:
# use a binary representation (one row for each customer X article, rmv multiple purchases)
#train_df = train_df.drop_duplicates(['customer_id', 'article_id'], inplace=False)
#test_df = test_df.drop_duplicates(['customer_id', 'article_id'], inplace=False)

Create a numeric user_id column. article_id is already numeric

In [None]:
# get all customer ids
users_df = train_df['customer_id'].append(test_df['customer_id'], ignore_index=True)\
           .reset_index()\
           .drop('index', axis=1)\
           .drop_duplicates('customer_id')\
           .reset_index()\
           .drop('index', axis=1)\
           .sort_values('customer_id')
users_df['user'] = users_df['customer_id'].astype("category")
users_df['user_id'] = users_df['user'].cat.codes

# get all item ids across both and add new ids
items_df = train_df['article_id'].append(test_df['article_id'], ignore_index=True)\
           .reset_index()\
           .drop('index', axis=1)\
           .drop_duplicates('article_id')\
           .reset_index()\
           .drop('index', axis=1)\
           .sort_values('article_id')
items_df['item'] = items_df['article_id'].astype("category")
items_df['item_id'] = items_df['item'].cat.codes

# add user and item ids to train and test dfs
train_df = train_df.merge(users_df, how='inner', on='customer_id')
train_df = train_df.merge(items_df, how='inner', on='article_id')
test_df = test_df.merge(users_df, how='inner', on='customer_id')
test_df = test_df.merge(items_df, how='inner', on='article_id')

In [None]:
# one row per customer per item
print(len(train_df))
print(len(test_df))

1512046
430815


In [None]:
# add all ids straight up
'''train_df['user'] = train_df['customer_id'].astype("category")
train_df['item'] = train_df['article_id'].astype("category")
train_df['user_id'] = train_df['user'].cat.codes
train_df['item_id'] = train_df['item'].cat.codes

test_df['user'] = test_df['customer_id'].astype("category")
test_df['item'] = test_df['article_id'].astype("category")
test_df['user_id'] = test_df['user'].cat.codes
test_df['item_id'] = test_df['item'].cat.codes

train_df.head(2)'''

# create users df (all ids)
'''users_df = train_df['customer_id'].append(test_df['customer_id'], ignore_index=True)\
           .reset_index()\
           .drop('index', axis=1)\
           .drop_duplicates('customer_id')\
           .reset_index()\
           .drop('index', axis=1)\
           .sort_values('customer_id')
users_df['user'] = users_df['customer_id'].astype("category")
users_df['user_id'] = users_df['user'].cat.codes'''

'''train_df = train_df.merge(users_df, how='inner', on='customer_id')
train_df = train_df.merge(items_df, how='inner', on='article_id')
test_df = test_df.merge(users_df, how='inner', on='customer_id')
test_df = test_df.merge(items_df, how='inner', on='article_id')'''

'''train_user_cust = dict(zip(train_df.user_id, train_df.customer_id))
train_item_art = dict(zip(train_df.item_id, train_df.article_id))

test_user_cust = dict(zip(test_df.user_id, test_df.customer_id))
test_item_art = dict(zip(test_df.item_id, test_df.article_id))'''

#train_user_cust = dict(zip(train_df.user_id, train_df.customer_id))
#test_user_cust = dict(zip(test_df.user_id, test_df.customer_id))

'train_user_cust = dict(zip(train_df.user_id, train_df.customer_id))\ntrain_item_art = dict(zip(train_df.item_id, train_df.article_id))\n\ntest_user_cust = dict(zip(test_df.user_id, test_df.customer_id))\ntest_item_art = dict(zip(test_df.item_id, test_df.article_id))'

Save a dictionary that maps user_id back to customer_id and item_id back to article_id

In [None]:
user_cust = dict(zip(users_df.user_id, users_df.customer_id))
item_art = dict(zip(items_df.item_id, items_df.article_id))

If we use the implicit library, we need a user-item matrix. So, we should create two matrices: 

* one for fitting the model (item-user)
* one for recommendations (user-item)

In [None]:
# using train data
train_iu = sparse.csr_matrix((train_df['purchase'].astype(float),
                                     (train_df['item_id'], train_df['user_id'])))
train_ui = sparse.csr_matrix((train_df['purchase'].astype(float),
                                     (train_df['user_id'], train_df['item_id'])))

# for test data, we won't be fitting model, so only create second matrix
test_iu = sparse.csr_matrix((test_df['purchase'].astype(float),
                                     (test_df['item_id'], test_df['user_id'])))
test_ui = sparse.csr_matrix((test_df['purchase'].astype(float),
                                     (test_df['user_id'], test_df['item_id'])))

# Implicit Model

Init the model

* factors: number of latent factors to compute
* regularization: regularization factor
* iterations: number of ALS iterations
* calculate_training_loss: log training loss at each interation
* random_state: random state for seeding init user and item factors

In [None]:
# init model
model = implicit.als.AlternatingLeastSquares(factors=15, regularization=0.01,
                                             iterations=50, random_state = 99,
                                             calculate_training_loss=True)

Fit model!

According to the documentation, when we fit the model we need:
* user_items: csr_matrix
            Matrix of confidences for the liked items. This matrix should be a csr_matrix where
            the rows of the matrix are the users, the columns are the items liked that user,
            and the value is the confidence that the user liked the item.

For some reason, as github user MDTsai pointed out in an [issue](https://github.com/benfred/implicit/issues/567), that the tutorial the creators of the package put out send sparse_iu to the model.fit() function, but the docs (above) say to do the opposite. There must have been some change after tutorials released.

Therefore, I use train_ui in model.fit(), as the rows of this matrix are users and the columns are products. 

In [None]:
train_iu.shape

(1000, 390826)

In [None]:
train_ui.shape

(390826, 1000)

Calculate confidence

* alpha_val (int): The rate in which we'll increase our confidence in a preference with more interactions.

In [None]:
alpha_val = 5 #default is 40
# from tutorial
#data_conf = (train_iu * alpha_val).astype('double')
# what we should do according to documentation
data_conf = (train_ui * alpha_val).astype('double')

Fit model

In [None]:
model.fit(data_conf)

  0%|          | 0/50 [00:00<?, ?it/s]

# Find Similar Items



In [None]:
# Find the 10 most similar to item id
item_id = 85 
n_similar = 10

# Use implicit to get similar items.
similar = model.similar_items(item_id, n_similar)

idx = similar[0]
score = similar[1]

# Print the names of our most similar artists
for i in range(len(idx)):
  print(train_df['article_id'].loc[train_df['item_id'] == idx[i]].head(1))

1452682    399201002
Name: article_id, dtype: int64
736108    399201020
Name: article_id, dtype: int64
364917    399201005
Name: article_id, dtype: int64
737940    399201026
Name: article_id, dtype: int64
1412647    399201023
Name: article_id, dtype: int64
129133    399201022
Name: article_id, dtype: int64
990193    399201024
Name: article_id, dtype: int64
605929    399136009
Name: article_id, dtype: int64
998296    399136061
Name: article_id, dtype: int64
1492674    399136033
Name: article_id, dtype: int64


# Recommendations

An example using one user id. 

In [None]:
# Create recommendations for user with id 2025
user_id = 72945

# Use the implicit recommender.
recommended = model.recommend(user_id, train_ui[user_id],
                              filter_already_liked_items=True)

articles = []
scores = []

idx = recommended[0]
score = recommended[1]

# Get artist names from ids
for i in range(len(idx)):
  articles.append(train_df['article_id'].loc[train_df['item_id'] == idx[i]].iloc[0])
  scores.append(score[i])

# Create a dataframe of artist names and scores
recommendations = pd.DataFrame({'user_id':user_id,'articles': articles, 'score': scores})

print(recommendations)

   user_id   articles     score
0    72945  507910001  0.204355
1    72945  568601006  0.179209
2    72945  507909001  0.170565
3    72945  573716012  0.161125
4    72945  539723005  0.154264
5    72945  507909003  0.133605
6    72945  539723001  0.128036
7    72945  399256001  0.115379
8    72945  673677002  0.112244
9    72945  568597006  0.104448


On all train IDs

In [None]:
# number of products to recommend
n_recs = 10

# user ids
user_ids = np.array(train_df['user_id'])

# use the implicit recommender 
recommended = model.recommend(user_ids, train_ui[user_ids], filter_already_liked_items=False)

Create dataframe to store recommendations

In [None]:
# convert item_id to article_id
rec_article_ids = np.ndarray(recommended[0].shape)
for k in item_art:
  rec_article_ids[recommended[0] == k] = item_art[k]

# get items
item_rec_df = pd.DataFrame(rec_article_ids, columns=['rec_' + str(n+1) for n in range(n_recs)])
item_rec_df = item_rec_df.astype({'rec_1': int, 'rec_2': int, 'rec_3': int, 'rec_4': int,
                                  'rec_5': int, 'rec_6': int, 'rec_7': int, 'rec_8': int,
                                  'rec_9': int, 'rec_10': int,})

# add user id
item_rec_df.insert(0, 'user_id', user_ids)

# get scores in a df, add user_id
score_rec_df = pd.DataFrame(recommended[1], columns=['score_' + str(n+1) for n in range(n_recs)])
score_rec_df.insert(0, 'user_id', user_ids)

# merge dataframes by user_id
train_rec_df = item_rec_df.merge(score_rec_df, how="inner", on="user_id")

# map user_id to customer_id
train_rec_df.insert(0, 'customer_id', train_rec_df['user_id'].map(user_cust))
train_rec_df.drop('user_id', axis=1, inplace=True)

train_rec_df = train_rec_df.drop_duplicates('customer_id',inplace=False)

In [None]:
len(train_rec_df)

390826

In [None]:
# above matches len for nunique users in train
print(train_df['customer_id'].nunique())

390826


Test

In [None]:
# number of products to recommend
n_recs = 5

# user ids
user_ids = np.array(test_df['user_id'].unique())

# use the implicit recommender 
recommended = model.recommend(user_ids, test_ui[user_ids], N=n_recs,
                              recalculate_user=False,
                              filter_already_liked_items=False)

#### make dataframe #####################################

# convert item_id to article_id
rec_article_ids = np.ndarray(recommended[0].shape)
for k in item_art:
  rec_article_ids[recommended[0] == k] = item_art[k]

# get items
item_rec_df = pd.DataFrame(rec_article_ids, columns=['rec_' + str(n+1) for n in range(n_recs)])
item_rec_df = item_rec_df.astype({'rec_1': int, 'rec_2': int, 'rec_3': int, 'rec_4': int,
                                  'rec_5': int, })
                                  #'rec_6': int, 'rec_7': int, 'rec_8': int,
                                  #'rec_9': int, 'rec_10': int,})
                                  #'rec_11': int, 'rec_12': int,'rec_13': int, 'rec_14': int,
                                  #'rec_15': int, 'rec_16': int, 'rec_17': int, 'rec_18': int,
                                  #'rec_19': int, 'rec_20': int,})

# add user id
item_rec_df.insert(0, 'user_id', user_ids)

# get scores in a df, add user_id
score_rec_df = pd.DataFrame(recommended[1], columns=['score_' + str(n+1) for n in range(n_recs)])
score_rec_df.insert(0, 'user_id', user_ids)

# merge dataframes by user_id
test_rec_df = item_rec_df.merge(score_rec_df, how="inner", on="user_id")

# map user_id to customer_id
test_rec_df.insert(0, 'customer_id', test_rec_df['user_id'].map(user_cust))
test_rec_df.drop('user_id', axis=1, inplace=True)

test_rec_df = test_rec_df.drop_duplicates('customer_id',inplace=False)

In [None]:
len(test_rec_df)

286521

In [None]:
# above matches len for nunique users in train
print(test_df['customer_id'].nunique())

286521


# Evaluate 

Built in function (not sure if this is exactly right)

In [None]:
# k = 10
ranking_metrics_at_k(model, train_ui, test_ui, K=10, show_progress=True, num_threads=1)

  0%|          | 0/286521 [00:00<?, ?it/s]

{'auc': 0.5461308374079085,
 'map': 0.0432126812800124,
 'ndcg': 0.06040387611492828,
 'precision': 0.09093211693927866}

In [None]:
# k = 20 same, k = 100 same auc but precision increases
ranking_metrics_at_k(model, train_ui, test_ui, K=100, show_progress=True, num_threads=1)

  0%|          | 0/286521 [00:00<?, ?it/s]

{'auc': 0.6084337117159164,
 'map': 0.05038062438925609,
 'ndcg': 0.10373796300881959,
 'precision': 0.28602300291308336}

Manual

Get df with customer_id and predictions

In [None]:
test_rec_df['test_recs'] = test_rec_df[['rec_'+str(n+1) for n in range(n_recs)]].values.tolist()
train_rec_df['train_recs'] = train_rec_df[['rec_'+str(n+1) for n in range(n_recs)]].values.tolist()
test_recs = test_rec_df[['customer_id', 'test_recs']]
train_recs = train_rec_df[['customer_id', 'train_recs']]

Get df with customer_id and true purchases

In [None]:
train_true = train_df.groupby('customer_id').agg({"article_id": lambda x: list(x)}).reset_index()
test_true = test_df.groupby('customer_id').agg({"article_id": lambda x: list(x)}).reset_index()

Get number of correct predictions divided by total number of actual purchases

In [None]:
train_acc = train_true.merge(train_recs, how = "left", on = "customer_id")
train_acc['correct_preds'] =  [set(a).intersection(b) for a, b in zip(train_acc['article_id'], train_acc['train_recs'])]
train_acc['num_correct'] = train_acc['correct_preds'].apply(len)
train_acc.head()

Unnamed: 0,customer_id,article_id,train_recs,correct_preds,num_correct
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[666448006, 673677002]","[673677002, 507910001, 372860001, 537116001, 5...",{673677002},1
1,00007d2de826758b65a93dd24ce629ed66842531df6699...,"[351933001, 444325004, 478549001, 572187001, 6...","[355072002, 615141002, 355569001, 589222001, 3...",{},0
2,00008469a21b50b3d147c97135e25b4201a8c58997f787...,"[673677001, 673677004]","[673677002, 507910001, 537116001, 507909001, 3...",{},0
3,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,"[573085010, 573085028, 573716053, 573716054, 6...","[706016001, 706016002, 539723005, 712587003, 5...",{},0
4,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,[591334019],"[507910001, 673677002, 507909001, 579541001, 6...",{},0


Metrics

* number of correct recommendations
* precision (num correct recs / total purchases)
* recall (num correct recs / total recs)

In [None]:
print('avg number of correct recs:', train_acc['num_correct'].mean())
train_acc['corr'] = (train_acc['num_correct']>0)*1
print('accuracy', train_acc['corr'].sum()/len(train_acc))
print('precision:', train_acc['num_correct'].sum() / sum(train_acc['article_id'].map(len)))
print('recall:', train_acc['num_correct'].sum() / sum(train_acc['train_recs'].map(len)))

avg number of correct recs: 0.7910527958733554
accuracy 0.5958150174246339
precision: 0.20446732440679716
recall: 0.15821055917467108


For test

In [None]:
test_acc = test_true.merge(test_recs, how = "left", on = "customer_id")
test_acc['correct_preds'] =  [set(a).intersection(b) for a, b in zip(test_acc['article_id'], test_acc['test_recs'])]
test_acc['num_correct'] = test_acc['correct_preds'].apply(len)
test_acc.head()

Unnamed: 0,customer_id,article_id,test_recs,correct_preds,num_correct
0,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,[666448006],"[673677002, 507910001, 372860001, 537116001, 5...",{},0
1,00007d2de826758b65a93dd24ce629ed66842531df6699...,"[671502001, 681376001, 685687001]","[355072002, 615141002, 355569001, 589222001, 3...",{},0
2,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,"[573085010, 636455003, 684210001]","[706016001, 706016002, 539723005, 712587003, 5...",{},0
3,0000f1c71aafe5963c3d195cf273f7bfd50bbf17761c91...,[632982036],"[673677002, 372860001, 537116001, 673396002, 6...",{},0
4,0000f2ea26b7f0a9175f428c8cf7743e9e10e193465ecd...,[562245059],"[615141002, 673396002, 708352001, 692454002, 6...",{},0


In [None]:
test_acc['correct_preds'] =  [set(a).intersection(b) for a, b in zip(test_acc['article_id'], test_acc['test_recs'])]
test_acc['num_correct'] = test_acc['correct_preds'].apply(len)
print('avg number of correct recs:', test_acc['num_correct'].mean())
test_acc['corr'] = (test_acc['num_correct']>0)*1
print('accuracy', test_acc['num_correct'].sum()/445253)
print('precision:', test_acc['num_correct'].sum() / sum(test_acc['article_id'].map(len)))
print('recall:', test_acc['num_correct'].sum() / sum(test_acc['test_recs'].map(len)))

avg number of correct recs: 0.20222601484707928
accuracy 0.13013275598367668
precision: 0.13449392430625676
recall: 0.04044520296941585


Save

In [None]:
final_res = test_acc[['customer_id', 'article_id', 'test_recs']]
final_res.rename({'test_recs': 'cf_recs', 'article_id':'purchases'}, axis=1, inplace=True)
final_res.to_csv("top5_collaborative_filtering_results.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
[float(i.rstrip()) for i in str(model.item_factors[0]).replace('[','').replace(']','').split(' ') if i != '' ]


[-0.06226186,
 -0.06586203,
 -0.03584025,
 0.03642365,
 0.24569519,
 0.0306516,
 0.04594054,
 -0.00867645,
 0.03291946,
 -0.05255023,
 -0.01077794,
 -0.15198345,
 0.15606453,
 0.03707664,
 -0.08060411]

In [None]:
[[float(i.rstrip()) for i in str(model.item_factors[j]).replace('[','').replace(']','').split(' ') if i != '' ] for j in range(model.item_factors.shape[0])]