In [1]:
from lightfm.data import Dataset
import pandas as pd
from lightfm.evaluation import precision_at_k, auc_score
from lightfm.cross_validation import random_train_test_split
from lightfm import LightFM
import numpy as np
from posixpath import join
import itertools

In [2]:
%%time
# load full data
RANK=10
EPOCHS=5
LOSS='bpr'
from scipy.sparse import load_npz
data = load_npz('/home/suils789/fulldata2.npz')
row, col = data.nonzero()

CPU times: user 55.4 s, sys: 28.1 s, total: 1min 23s
Wall time: 1min 26s


In [3]:
def build_interaction(data):
    
    row, col = data.nonzero()
    user_ids = set(row)
    item_ids = set(col)
    dataset = Dataset()
    dataset.fit(
        (uid for uid in user_ids),
        (iid for iid in item_ids)
        )

    num_users, num_items = dataset.interactions_shape()
    print((num_users, num_items))
    
    zip_data = zip(row,col)
    (interactions, weights) = dataset.build_interactions(zip_data)
    
    return interactions


In [6]:
row.shape, col.shape

((1148025491,), (1148025491,))

In [9]:
%%time

user_ids = set(row)
item_ids = set(col)
dataset = Dataset()
dataset.fit(
	(uid for uid in user_ids),
	(iid for iid in item_ids)
	)

num_users, num_items = dataset.interactions_shape()



CPU times: user 3min 10s, sys: 11.9 s, total: 3min 22s
Wall time: 3min 21s


In [10]:
num_users, num_items

(67768573, 5195)

In [17]:
%%time
def subset_data(percent):
    num_row = int(num_users*percent)
    num_col = int(num_items*percent)
    return zip(row[:num_row],col[:num_col])
PERCENT=1
subset = subset_data(PERCENT)
# zip_data = zip(row,col)

CPU times: user 14 µs, sys: 2 µs, total: 16 µs
Wall time: 19.6 µs
Compiler : 1.03 s


In [8]:
%%time
#build training/testing sparse matrix
# in format of (uid, itemid) pair
(interactions, weights) = dataset \
    .build_interactions(subset)

CPU times: user 9.4 ms, sys: 2.91 ms, total: 12.3 ms
Wall time: 12.1 ms


In [10]:
%%time
# train - all items are positive signals
implicit_model = LightFM(no_components=RANK, loss=LOSS)
implicit_model.fit(interactions, epochs=EPOCHS, num_threads=4)

CPU times: user 6.45 s, sys: 3.33 s, total: 9.78 s
Wall time: 9.48 s


In [12]:
%%time
# train precision
print(precision_at_k(implicit_model, interactions, k=2).mean())

0.8379121
CPU times: user 1.69 s, sys: 1.14 s, total: 2.83 s
Wall time: 2.83 s


In [14]:
%%time
print(auc_score(implicit_model, interactions, num_threads=4, check_intersections=True).mean())

0.79692084
CPU times: user 1.56 s, sys: 795 ms, total: 2.36 s
Wall time: 1.56 s


In [15]:
implicit_model.user_embeddings

array([[-0.24156521,  0.043418  ],
       [-0.17189297, -0.00130402],
       [ 0.05117647, -0.03159123],
       ...,
       [ 0.2102966 ,  0.17916626],
       [ 0.02212979, -0.1849086 ],
       [-0.06897962, -0.11932024]], dtype=float32)

In [14]:
outfile = 'user_embed'
np.save(outfile, implicit_model.user_embeddings)

In [15]:
# pickle latent space
np.load('user_embed.npy')

array([[ 0.00307047, -0.01270851],
       [ 0.03538265,  0.096807  ],
       [-0.17943452, -0.13600814],
       ...,
       [ 0.23745805,  0.21371886],
       [-0.23371974,  0.22680877],
       [ 0.19771738, -0.21621765]], dtype=float32)

In [None]:
# embeding visualization
from sklearn.cluster import KMeans
from sklearn import manifold
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.dpi'] = "300"

def visual(embeding, k_cluster):
    tsne = manifold.TSNE(n_components=2, init='pca')
    latent_space = tsne.fit_transform(embeding)
    dbs = KMeans(n_clusters=k_cluster, init='k-means++').fit(embeding)
    plt.scatter(latent_space[:,0], latent_space[:,1], c=plt.cm.Accent(dbs.labels_))
    plt.show()


In [None]:
%%time
import random
embeding = np.load('user_embed.npy')
visual(embeding[:1000000], 5)