In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import coo_array, coo_matrix, issparse
from sklearn.decomposition import NMF, TruncatedSVD

In [None]:
# Load the raw event log and keep only the rows whose `event` column is "view".
# NOTE(review): absolute, user-specific path; consider a configurable data dir.
EVENTS_CSV = "/home/mbh/Downloads/events.csv"
events = pd.read_csv(EVENTS_CSV)
events = events.loc[events["event"] == "view"]
len(events)

In [None]:
# Iteratively drop visitors and items that appear in fewer than K rows.
# Removing one side can push the other side below K, so repeat until both
# minimum counts reach K.
K = 10
while True:
    vc_usr = events.visitorid.value_counts()
    vc_item = events.itemid.value_counts()
    # Written as a negated "<" test so that a NaN minimum (empty frame)
    # terminates the loop exactly like `while min < K` would.
    if not (vc_usr.min() < K or vc_item.min() < K):
        break
    keep_usr = vc_usr[vc_usr >= K].index
    keep_item = vc_item[vc_item >= K].index
    # Progress trace: smallest surviving visitor id and item id (ids, not counts).
    print(keep_usr.min(), keep_item.min())
    events = events[events.visitorid.isin(keep_usr) & events.itemid.isin(keep_item)]

# Re-encode both id columns as dense 0-based integer codes.  `assign` builds a
# new frame instead of writing into a filtered slice, which avoids pandas'
# SettingWithCopyWarning.
events = events.assign(
    visitorid=events.visitorid.factorize()[0],
    itemid=events.itemid.factorize()[0],
)
len(events)

In [None]:
def build_mat(df, shape=None):
    """Build a binary interaction matrix from (visitorid, itemid) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain integer `visitorid` and `itemid` columns; they are used
        directly as row and column indices.  Repeated pairs are collapsed, so
        every stored entry is 1.
    shape : tuple of int, optional
        Explicit `(n_rows, n_cols)`.  When omitted, scipy infers the shape from
        the largest index on each axis, which requires at least one pair.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with a 1 at every distinct (visitorid, itemid) pair.
    """
    pairs = df[["visitorid", "itemid"]].drop_duplicates()
    uid = pairs.visitorid.values
    iid = pairs.itemid.values
    return coo_matrix((np.ones_like(uid), (uid, iid)), shape=shape)

In [None]:
# Time-based split of the event log using hard-coded timestamp cutoffs.
mask = events.timestamp < 1438630203944
events_train = events[mask]
# NOTE(review): this selects the SAME rows as events_train (mask, not ~mask),
# so everything derived from events_test below is a subset of the training
# data.  If a held-out set was intended this should probably be events[~mask];
# note that the second cutoff below is earlier than the first, so it would
# then need to change as well -- confirm intent before editing.
events_test = events[mask]
# Fraction of events that fall before the first cutoff.
print(mask.mean())

# Split events_test again at an earlier cutoff: *_1 is before it, *_2 is at/after it.
mask = events_test.timestamp < 1434643203944
events_test_1 = events_test[mask]
events_test_2 = events_test[~mask]
# Fraction of events_test that falls before the second cutoff.
print(mask.mean())

In [None]:
# Interaction matrix for the training events, plus row- and column-compressed
# copies for later indexing.
R = build_mat(events_train)
Rcsr, Rcsc = R.tocsr(), R.tocsc()
print(R.shape)

In [None]:
class ALS:
    """Alternating least squares factorisation ``R ~ U @ V.T``.

    Each step solves the ridge-regularised normal equations for ``U`` with
    ``V`` fixed, then for ``V`` with the freshly updated ``U``.

    Parameters
    ----------
    n, m : int
        Number of rows and columns of the matrix to factorise.
    k : int
        Number of latent factors.
    alpha : float, default 0.5
        Multiplier of the identity added to the k x k Gram matrix in each solve.
    seed : int, optional
        When given, the factors are initialised from
        ``np.random.default_rng(seed)`` so runs are reproducible.  When omitted
        the global ``np.random`` state is used.
    """

    def __init__(self, n, m, k, alpha=0.5, seed=None) -> None:
        rng = np.random if seed is None else np.random.default_rng(seed)
        self.U = rng.uniform(0, 10, (n, k))
        self.V = rng.uniform(0, 10, (m, k))
        self.alpha = alpha
        self.k = k

    def rmse(self):
        """Return the root-mean-square reconstruction error over all n*m cells.

        Uses ``||R - U V^T||_F^2 = ||R||^2 - 2<R, U V^T> + ||U V^T||^2`` so the
        dense n x m reconstruction is never materialised.  Requires ``fit`` to
        have stored ``self.R``.
        """
        U, V, R = self.U, self.V, self.R
        if issparse(R):
            r_sq = R.multiply(R).sum()
        else:
            r_sq = np.sum(np.square(np.asarray(R)))
        # <R, U V^T> = sum((R V) * U)
        cross = np.sum(np.asarray(R @ V) * U)
        # ||U V^T||_F^2 = trace(U^T U V^T V) = sum((U^T U) * (V^T V))
        pred_sq = np.sum((U.T @ U) * (V.T @ V))
        # Clamp tiny negative values caused by floating-point cancellation.
        sse = max(float(r_sq - 2.0 * cross + pred_sq), 0.0)
        return float(np.sqrt(sse / (U.shape[0] * V.shape[0])))

    def fit(self, R, n_iter, supress=True):
        """Run ``n_iter`` alternating updates on ``R`` (dense or scipy sparse).

        When ``supress`` is False the RMSE is printed after every iteration.
        """
        self.R = R
        for i in range(n_iter):
            self._step()
            if not supress:
                print(f"iter {i + 1}/{n_iter}: rmse={self.rmse():.6f}")

    def _step(self):
        """One ALS sweep: update U in place, then V using the new U."""
        U, V, R, alpha = self.U, self.V, self.R, self.alpha
        ridge = alpha * np.eye(self.k)
        # X @ inv(G) == solve(G, X.T).T for symmetric G; solving avoids forming
        # the explicit inverse.
        self.U[:] = np.linalg.solve(V.T @ V + ridge, (R @ V).T).T
        # `U` aliases self.U, so this already sees the updated user factors.
        self.V[:] = np.linalg.solve(U.T @ U + ridge, (R.T @ U).T).T

In [None]:
# Factorise the training matrix with 64 latent factors for 150 ALS sweeps.
# alpha=0 disables the ridge term, so each sweep inverts the plain Gram matrix.
nuser, nitem = R.shape
model = ALS(nuser, nitem, 64, alpha=0)
model.fit(R, 150)
U, V = model.U, model.V
# Disabled alternative: sklearn NMF with 16 components.
#model = NMF(16)
#U = model.fit_transform(R)
#V = model.components_.T

In [None]:
# Sanity check: shapes of both factor matrices next to the interaction matrix.
U.shape, V.shape, R.shape

In [None]:
# Histogram of the L2 norm of each row of U, with a log-scaled count axis.
user_norms = np.linalg.norm(U, axis=1)
plt.hist(user_norms, bins=40)
plt.yscale("log")

In [None]:
# Row norm of U against the row sum of R, on log-log axes.
user_norms = np.linalg.norm(U, axis=1).ravel()
interaction_counts = np.asarray(R.sum(axis=1)).ravel()
plt.scatter(user_norms, interaction_counts)
plt.xscale("log")
plt.yscale("log")
plt.xlabel("l2 norm")
plt.ylabel("# interactions")

In [None]:
class UserRepr:
    """Least-squares projection of an interaction row onto fixed factors ``V``.

    Precomputes ``A = (V^T V + alpha I)^-1 V^T`` so that a row vector ``r``
    maps to the factor vector ``A @ r``.

    Parameters
    ----------
    V : ndarray of shape (m, k)
        Factor matrix whose rows line up with the entries of ``r``.
    alpha : float, default 0.0
        Ridge term added to the Gram matrix.  The default reproduces the
        unregularised projection; pass the value used when fitting ``V`` to
        match a regularised ALS update.
    """

    def __init__(self, V, alpha=0.0):
        gram = V.T @ V + alpha * np.eye(V.shape[1])
        # Solve instead of forming the explicit inverse; a singular Gram matrix
        # still raises numpy.linalg.LinAlgError.
        self.A = np.linalg.solve(gram, V.T)

    def __call__(self, r):
        """Return the length-k factor vector for ``r`` (dense or scipy sparse)."""
        return np.asarray(r.reshape(1, -1).dot(self.A.T)).reshape(-1)


class Recommender:
    """Ranks the rows of a factor matrix by inner product with a query vector."""

    def __init__(self, V):
        # Factor matrix; each row is one candidate to be scored.
        self.V = V

    def recommend(self, u, k=None):
        """Return row indices of ``V`` ordered by descending score ``V.dot(u)``.

        With ``k`` given only the first ``k`` indices are returned; with
        ``k=None`` the full ranking is returned.
        """
        ranking = np.argsort(self.V.dot(u))[::-1]
        return ranking if k is None else ranking[:k]

def ids2vec(ids, total):
    """Encode a collection of column ids as a 1 x ``total`` binary sparse row.

    Duplicate ids are collapsed so every stored entry is exactly 1 (matching
    the de-duplication done in ``build_mat``), and an empty ``ids`` yields an
    all-zero row.

    Parameters
    ----------
    ids : sequence of int
        Column indices to set; each must be in ``[0, total)``.
    total : int
        Number of columns of the returned row.

    Returns
    -------
    scipy.sparse.csc_matrix of shape (1, total)
    """
    # Force an integer dtype so an empty list does not become a float index array.
    cols = np.unique(np.asarray(ids, dtype=np.intp))
    rows = np.zeros_like(cols)
    vals = np.ones_like(cols)
    return coo_matrix((vals, (rows, cols)), shape=(1, total)).tocsc()

In [None]:
# Build the projection helper and the ranker from the same factor matrix V.
compute_usr = UserRepr(V)
rec = Recommender(V)

In [None]:
def recall(y_true, y_pred, k=50):
    """Recall@k: share of distinct true ids found in the first ``k`` predictions.

    Parameters
    ----------
    y_true : iterable
        Ground-truth ids; duplicates are ignored.
    y_pred : iterable
        Predicted ids in ranked order; only the first ``k`` are considered.
    k : int, default 50
        Cut-off.  The denominator is ``min(k, n_distinct_true)`` so a perfect
        top-k list scores 1.0.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when there is no ground truth or ``k <= 0``.
    """
    truth = set(y_true)
    if not truth or k <= 0:
        return 0.0
    top_k = set(list(y_pred)[:k])
    return len(truth & top_k) / min(k, len(truth))

In [None]:
# NOTE(review): this import belongs in the notebook's import cell.
from tqdm import tqdm

# Mean recall@50 over the visitors of events_test_2: each visitor's vector is
# projected from their train + test_1 items and scored against their test_2 items.
nuser, nitem = R.shape
z = []
df = events_test_2

# Group once up front instead of boolean-masking the full frames for every
# visitor (which is quadratic in the number of events).
pair_cols = ["visitorid", "itemid"]
gt_by_user = df.groupby("visitorid")["itemid"].agg(list).to_dict()
history_by_user = (
    pd.concat([events_train[pair_cols], events_test_1[pair_cols]])
    .groupby("visitorid")["itemid"]
    .agg(list)
    .to_dict()
)

for vid in (pbar := tqdm(df.visitorid.unique())):
    gt = gt_by_user[vid]
    interactions = history_by_user.get(vid, [])
    l0 = len(interactions)
    interactions = list(set(interactions))
    # delta is the number of repeated (duplicate) interactions that were removed.
    delta = l0 - len(interactions)
    # NOTE(review): this skips visitors with fewer than 5 *duplicate* events;
    # a minimum-history filter would test len(interactions) instead -- confirm.
    if delta < 5:
        continue
    u = compute_usr(ids2vec(interactions, nitem))
    r = rec.recommend(u,50)
    z.append(recall(gt, r))
    # Running mean of the per-visitor recall shown on the progress bar.
    pbar.set_description(f"{np.mean(z)}")

In [None]:
# NOTE(review): bare literal; presumably a metric value copied from an earlier
# run of the evaluation loop -- confirm and move to a markdown cell.
0.22822295698198428

In [None]:

def recall(Rcsc, i, r_, k=50):
    """Recall@k of a score vector against the non-zero columns of row ``i``.

    NOTE(review): this redefines the earlier ``recall(y_true, y_pred, k)`` with
    a different signature; re-running the evaluation loop after this cell will
    call this version instead.

    Parameters
    ----------
    Rcsc : scipy sparse matrix
        Interaction matrix; ``Rcsc[i]`` must yield a 1 x n row.
    i : int
        Row index to evaluate.
    r_ : array-like of length n
        Score per column; higher means ranked earlier.
    k : int, default 50
        Number of top-scoring columns to compare.

    Returns
    -------
    float
        ``|top_k & nonzero| / min(k, |nonzero|)``; 0.0 when the row has no
        non-zero entries or ``k <= 0``.
    """
    nz = Rcsc[i].nonzero()[1].tolist()
    if not nz or k <= 0:
        return 0.0
    rec = np.argsort(r_)[::-1][:k]
    return len(set(rec.tolist()) & set(nz)) / min(k, len(nz))
    
# NOTE(review): on a fresh kernel `i` is only assigned in a later cell and `x`
# is not defined in any visible cell (presumably the UserRepr instance
# `compute_usr`) -- confirm and reorder.
recall(Rcsc, i, V.dot(x(Rcsc[i])), k=500)

In [None]:
# Indices of the rows of U whose L2 norm exceeds the threshold.
thr = 0.001
norms = np.linalg.norm(U, axis=1)
np.nonzero(norms > thr)

In [None]:
# Pick one row index to inspect and show its factor vector from U.
i = 38
U[i]

In [None]:
# Apply `x` to row i of the CSR interaction matrix.
# NOTE(review): `x` is not defined in any visible cell; presumably the UserRepr
# instance `compute_usr` -- confirm.
x(Rcsr[i])

In [None]:
# Sum of squared differences between the stored factor row U[i] and x(Rcsr[i]).
((U[i] - x(Rcsr[i])) ** 2).sum()

In [None]:
# Mean squared difference between the scores V.dot(U[i]) and the densified
# row i of the interaction matrix.
((V.dot(U[i]) - np.array(Rcsc[i].todense()).reshape(-1))**2).mean()

In [None]:
# Same mean squared difference, but scoring with x(Rcsr[i]) instead of U[i].
((V.dot(x(Rcsr[i])) - np.array(Rcsc[i].todense()).reshape(-1))**2).mean()

In [None]:
# Refit with 16 factors for 50 sweeps.
# NOTE(review): alpha=-.5 is negative; ALS._step adds alpha * identity to the
# Gram matrix before inverting, so this subtracts from its diagonal rather than
# regularising -- confirm whether 0.5 was intended.
nuser, nitem = R.shape
model = ALS(nuser, nitem, 16, alpha=-.5)
model.fit(R, 50)

In [None]:
# Rebind the global factor matrices to the most recently fitted model.
U, V = model.U, model.V