In [1]:
!pip install -q tensorflow_addons

[?25l[K     |▎                               | 10 kB 23.7 MB/s eta 0:00:01[K     |▋                               | 20 kB 10.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 8.7 MB/s eta 0:00:01[K     |█▏                              | 40 kB 7.8 MB/s eta 0:00:01[K     |█▌                              | 51 kB 5.5 MB/s eta 0:00:01[K     |█▊                              | 61 kB 5.6 MB/s eta 0:00:01[K     |██                              | 71 kB 4.8 MB/s eta 0:00:01[K     |██▍                             | 81 kB 5.4 MB/s eta 0:00:01[K     |██▋                             | 92 kB 5.3 MB/s eta 0:00:01[K     |███                             | 102 kB 5.2 MB/s eta 0:00:01[K     |███▎                            | 112 kB 5.2 MB/s eta 0:00:01[K     |███▌                            | 122 kB 5.2 MB/s eta 0:00:01[K     |███▉                            | 133 kB 5.2 MB/s eta 0:00:01[K     |████▏                           | 143 kB 5.2 MB/s eta 0:00:01[K   

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import os
import heapq  
import gc
from tqdm import tqdm
import random
from sklearn.metrics import mean_squared_error

from tensorflow import keras
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import optimizers, callbacks, layers, losses
from tensorflow.keras.layers import Dense, Concatenate, Activation, Add, BatchNormalization, Dropout, Input, Embedding, Flatten, Multiply, Dot
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings('ignore')

In [3]:
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
gpus = tf.config.experimental.list_physical_devices('GPU')

In [4]:
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        print(e)

In [5]:
def mish(x):
    return x*tf.math.tanh(tf.math.softplus(x))

def leakyrelu(x, factor=0.2):
    return tf.maximum(x, factor*x)

## Matrix Factorization from scratch - SGD method

### Data Loading

In [6]:
!wget -q --show-progress -O movies.dat https://github.com/RecoHut-Datasets/movielens_1m/raw/main/ml1m_items.dat
!wget -q --show-progress -O ratings.dat https://github.com/RecoHut-Datasets/movielens_1m/raw/main/ml1m_ratings.dat



In [7]:
df = pd.read_csv('ratings.dat',
                     sep="\t",
                     header=None,
                     engine='python',
                     names=['userId', 'movieId', 'rating', 'time'])

df.head()

Unnamed: 0,userId,movieId,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
uuid = df['userId'].unique()
uiid = df['movieId'].unique()

In [9]:
um = pd.pivot_table(df, values='rating', index='userId', columns='movieId').fillna(0)

### RMSE eval

In [10]:
def rmse(true, pred):
    user_idx, item_idx = np.nonzero(true)
    trues = [true[i][j] for i, j in zip(user_idx, item_idx)]
    preds = [pred[i][j] for i, j in zip(user_idx, item_idx)]
    return np.sqrt(mean_squared_error(trues, preds))

### Algorithm

In [11]:
def matrix_fatorization(M, k, epochs, lr=0.01):
    n_user, n_item = M.shape
    
    U = np.random.normal(0., 1./k, (n_user, k))
    V = np.random.normal(0., 1./k, (n_item, k))
    
    u_idx, i_idx = np.nonzero(M)
    
    for e in (range(epochs)):
        for i, j in zip(u_idx, i_idx):
            e_ij = M[i][j] - np.dot(U[i,:], V[j,:].T)
            
            U[i, :] = U[i, :] + lr*(e_ij*V[j, :] - 0.01*U[i,:])
            V[j, :] = V[j, :] + lr*(e_ij*U[i, :] - 0.01*V[j,:])
            
        recon = np.dot(U, V.T)
        print(f'epochs: {e}:', rmse(M, recon))
    return U, V.T

In [12]:
U, V = matrix_fatorization(um.values, 16, 5)

epochs: 0: 3.2609513753652335
epochs: 1: 1.111538625423202
epochs: 2: 0.9512405092366881
epochs: 3: 0.901285507831611
epochs: 4: 0.8711893277319607


In [13]:
recon = np.dot(U, V)
rmse(um.values, recon)

0.8711893277319607

### Inference

In [14]:
def get_best(record, U, V=V, top_k=10):
    prev = np.nonzero(record[0])[0]
    candidates = np.argsort(-np.dot(U, V))
    
    res = []
    cnt = 0
    for c in candidates:
        if c not in prev:
            res.append(c)
            cnt += 1
        if cnt == top_k:
            return res
get_best(um.values, U[0], V, 10)

[309, 16, 316, 56, 1133, 648, 2698, 1035, 1066, 646]

## Matrix Factorization from scratch - ALS method

In [None]:
df = pd.read_csv('ratings.dat',
                     sep="\t",
                     header=None,
                     engine='python',
                     names=['userId', 'movieId', 'rating', 'time'])

df.head()

Unnamed: 0,userId,movieId,rating
0,0,1104,5.0
1,0,639,3.0
2,0,853,3.0
3,0,3177,4.0
4,0,2162,5.0


In [None]:
uuid = df['userId'].unique()
uiid = df['movieId'].unique()

In [None]:
def extract_from_df(df, n_positive):
    df_ = df.copy()
    rtd = []
    user_id = df['userId'].unique()
    for i in tqdm(user_id):
        rtd += list(np.random.choice(df[df['userId']==i]['movieId'].index, n_positive, replace=False))
    return rtd

In [None]:
rtd = extract_from_df(df, 1)
train = df.drop(rtd)
test = df.loc[rtd]

100%|██████████| 6040/6040 [00:08<00:00, 751.86it/s]


In [None]:
R = pd.pivot_table(train, index='userId', values='rating', columns='movieId').fillna(0)

P = np.where(R>0, 1, 0)
R = R.values
n_u = R.shape[0]
n_i = R.shape[1]

k = 20
alpha = 40
lamda = 150
epochs = 10
X = np.random.rand(n_u, k)*0.01
Y = np.random.rand(n_i, k)*0.01

C = (1 + alpha*R)

In [None]:
def loss_function(C, P, X, Y, r_lambda):
    predict_error = np.square(P - np.matmul(X, Y.T))
    
    regularization = r_lambda * (np.mean(np.square(X)) + np.mean(np.square(Y)))
    confidence_error = np.mean(C * predict_error)
    total_loss = confidence_error + regularization
    predict_error = np.mean(predict_error)
    return predict_error, confidence_error, regularization, total_loss

def update(x, y, p, c=C):
    xt = x.T
    yt = y.T
    
    for u in range(n_u):
        c_ = C[u, :]
        p_ = P[u, :]
        cu = np.diag(c_)
        
        ycy = y.T.dot(cu).dot(y)
        ycyi = ycy+lamda*np.identity(ycy.shape[0])
        ycp = y.T.dot(cu).dot(p_.T)
        
        x[u] = np.linalg.solve(ycyi, ycp)
        
    for i in range(n_i):
        c_ = C[:, i]
        p_ = P[:, i]
        ci = np.diag(c_)
        
        xcx = x.T.dot(ci).dot(x)
        xcxi = xcx+lamda*np.identity(xcx.shape[0])
        xcp = x.T.dot(ci).dot(p_.T)
        
        y[i] = np.linalg.solve(xcxi, xcp)
        
    return x, y

for e in tqdm(range(epochs)):
    X, Y = update(X, Y, C)
    predict_error, confidence_error, regularization, total_loss = loss_function(C, P, X, Y, lamda)
    print('----------------step %d----------------' %e)
    print("predict error: %f" % predict_error)
    print("confidence error: %f" % confidence_error)
    print("regularization: %f" % regularization)
    print("total loss: %f" % total_loss)

 10%|█         | 1/10 [40:18<6:02:42, 2418.01s/it]

----------------step 0----------------
predict error: 0.118668
confidence error: 1.577255
regularization: 165.541228
total loss: 167.118483


 20%|██        | 2/10 [1:20:30<5:21:58, 2414.76s/it]

----------------step 1----------------
predict error: 0.197786
confidence error: 0.356723
regularization: 54.014504
total loss: 54.371228


 30%|███       | 3/10 [1:59:40<4:38:16, 2385.23s/it]

----------------step 2----------------
predict error: 0.184873
confidence error: 0.305381
regularization: 33.388967
total loss: 33.694348


In [None]:
def eval_hit(X, y, df, test, user_id, item_ids, top_k):
    df = pd.concat([df, test])
    items = list(set(item_ids) - set(df[df['userId']==user_id][df['rating']==1]['movieId'].values))
    np.random.shuffle(items)
    items = items[:99]
    items.append(test[test['userId']==user_id]['movieId'].values[0])
    items = np.array(items).reshape(-1, 1)

    user = np.full(len(items), user_id).reshape(-1, 1)

    preds = np.dot(X[user_id], Y[items].squeeze(1).T)
    item_to_pred = {item: pred for item, pred in zip(items.flatten(), preds)}

    top_k = heapq.nlargest(top_k, item_to_pred, key=item_to_pred.get)
    
    if items[-1][0] in top_k:
            return 1
    return 0

def eval_NDCG(X, Y, df, test, user_id, item_ids, top_k):
    df = pd.concat([df, test])
    items = list(set(item_ids) - set(df[df['userId']==user_id][df['rating']==1]['movieId'].values))
    np.random.shuffle(items)
    items = items[:99]
    items.append(test[test['userId']==user_id]['movieId'].values[0])
    items = np.array(items).reshape(-1, 1)

    user = np.full(len(items), user_id).reshape(-1, 1)

    preds = np.dot(X[user_id], Y[items].squeeze(1).T)
    item_to_pred = {item: pred for item, pred in zip(items.flatten(), preds)}

    top_k = heapq.nlargest(top_k, item_to_pred, key=item_to_pred.get)
    
    for i, item in enumerate(top_k, 1):
        if item == test[test['userId']==user_id]['movieId'].values:
            return np.log(i) / np.log(i+2)
    return 0

def eval_hit_wrapper(X, Y, df, test, item_ids, top_k):
    def f(user_id):
        return eval_hit(X, Y, df, test, user_id, item_ids, top_k)
    return f

def eval_NDCG_wrapper(X, Y, df, test, item_ids, top_k):
    def f(user_id):
        return eval_NDCG(X, Y, df, test, user_id, item_ids, top_k)
    return f

In [None]:
hits10 = list(map(eval_hit_wrapper(X, Y, train, test, uiid, 10), uuid))
print(sum(hits10)/len(hits10))

In [None]:
ndcg10 = list(map(eval_NDCG_wrapper(X, Y, train, test, uiid, 10), uuid))
print(sum(ndcg10)/len(ndcg10))

---

In [16]:
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-11-28 14:44:13

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

tensorflow_addons: 0.15.0
keras            : 2.7.0
IPython          : 5.5.0
matplotlib       : 3.2.2
pandas           : 1.1.5
tensorflow       : 2.7.0
numpy            : 1.19.5
seaborn          : 0.11.2



---

**END**