### Матричные факторизации

In [1]:
# Install the third-party libraries that later cells import (implicit, lightfm, faiss).
!pip install implicit lightfm faiss



In [2]:
# Install the OpenBLAS and OpenMP development packages from the system package manager.
# NOTE(review): presumably native dependencies of the Python libraries used in this notebook — confirm.
!apt-get install libopenblas-dev
!apt-get install libomp-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libopenblas-dev is already the newest version (0.2.20+ds-4).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [3]:
# Download the MovieLens 1M archive, unpack it, move the *.dat files into the
# working directory, then delete both the archive and the extracted folder.
# NOTE(review): --no-check-certificate disables TLS certificate verification for this download.
!wget --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip
!mv ml-1m/*.dat .
!rm -r ml-1m*

--2021-03-18 19:13:12--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2021-03-18 19:13:13 (5.95 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [4]:
import implicit
import pandas as pd
import numpy as np
import scipy.sparse as sp

from lightfm.datasets import fetch_movielens

In [5]:
# Explicit ratings: the file has no header row and separates fields with '::'.
# The timestamp column is named so the columns line up, but it is not loaded.
ratings = pd.read_csv(
    'ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    usecols=['user_id', 'movie_id', 'rating'],
    engine='python',
)

In [6]:
# Movie metadata (id, title, '|'-joined categories); same header-less '::' format.
movie_info = pd.read_csv(
    'movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'name', 'category'],
    engine='python',
)

In [7]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [8]:
# Keep only the rows rated 4 or higher; these rows are the implicit-feedback
# interactions used to build the user/item matrix.
implicit_ratings = ratings[ratings.rating >= 4]

In [9]:
implicit_ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
3,1,3408,4
4,1,2355,5
6,1,1287,5
7,1,2804,5


In [10]:
# Build a sparse interaction matrix holding a 1 for every kept rating.
# The raw ids are used directly as row (user) and column (movie) indices and no
# explicit shape is passed, so the matrix size is inferred from the largest ids present.
users = implicit_ratings.user_id
movies = implicit_ratings.movie_id
user_item = sp.coo_matrix((np.ones_like(users), (users, movies)))
# Transposed (movie x user) CSR copy; this is the matrix handed to model.fit.
user_item_t_csr = user_item.T.tocsr()
# User x movie CSR copy; this is the matrix handed to the recommend() calls.
user_item_csr = user_item.tocsr()

In [11]:
model = implicit.als.AlternatingLeastSquares(factors=64, iterations=100, calculate_training_loss=True)



In [12]:
model.fit(user_item_t_csr)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [13]:
movie_info.head()

Unnamed: 0,movie_id,name,category
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
def get_similars(item_id, model):
    """Return formatted titles of the movies ``model`` reports as similar to ``item_id``.

    Every entry produced by ``model.similar_items(item_id)`` must expose the
    movie id at position 0. That id is looked up in the global ``movie_info``
    frame and the matching ``name`` rows are rendered with
    ``Series.to_string()``, so each string also carries the frame index.
    """
    rendered = []
    for hit in model.similar_items(item_id):
        matching_names = movie_info.loc[movie_info["movie_id"] == hit[0], "name"]
        rendered.append(matching_names.to_string())
    return rendered

In [15]:
get_similars(1, model)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 "2286    Bug's Life, A (1998)",
 '33    Babe (1995)',
 '2315    Babe: Pig in the City (1998)',
 '584    Aladdin (1992)',
 '3817    Went to Coney Island on a Mission From God... ...',
 '1526    Hercules (1997)',
 '2692    Iron Giant, The (1999)',
 '2252    Pleasantville (1998)']

Давайте теперь построим рекомендации для юзеров

Как мы видим, юзеру нравится фантастика, значит, и в рекомендациях ожидаем увидеть фантастику

In [16]:
def get_user_history(user_id, implicit_ratings):
    """Return formatted titles of every movie ``user_id`` has in ``implicit_ratings``.

    ``implicit_ratings`` needs ``user_id`` and ``movie_id`` columns. Each movie
    id of the user is looked up in the global ``movie_info`` frame and the
    matching ``name`` rows are rendered with ``Series.to_string()`` (the
    resulting strings therefore include the frame index).
    """
    own_rows = implicit_ratings["user_id"] == user_id
    liked_movie_ids = implicit_ratings.loc[own_rows, "movie_id"]
    return [
        movie_info.loc[movie_info["movie_id"] == liked_id, "name"].to_string()
        for liked_id in liked_movie_ids
    ]

In [17]:
get_user_history(4, implicit_ratings)

['3399    Hustler, The (1961)',
 '2882    Fistful of Dollars, A (1964)',
 '1196    Alien (1979)',
 '1023    Die Hard (1988)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '1959    Saving Private Ryan (1998)',
 '476    Jurassic Park (1993)',
 '1180    Raiders of the Lost Ark (1981)',
 '1885    Rocky (1976)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '3349    Thelma & Louise (1991)',
 '3633    Mad Max (1979)',
 '2297    King Kong (1933)',
 '1366    Jaws (1975)',
 '1183    Good, The Bad and The Ugly, The (1966)',
 '2623    Run Lola Run (Lola rennt) (1998)',
 '2878    Goldfinger (1964)',
 '1220    Terminator, The (1984)']

Получилось! 

Мы действительно порекомендовали пользователю фантастику и боевики; более того, встречаются продолжения тех фильмов, которые он высоко оценил

In [18]:
def get_recommendations(user_id, model):
    """Return formatted titles of the movies ``model`` recommends to ``user_id``.

    ``model.recommend`` is called with the user id and the global
    ``user_item_csr`` matrix. Every returned entry must expose the movie id at
    position 0; that id is looked up in the global ``movie_info`` frame and
    the matching ``name`` rows are rendered with ``Series.to_string()``.
    """
    rendered = []
    for pick in model.recommend(user_id, user_item_csr):
        matching_names = movie_info.loc[movie_info["movie_id"] == pick[0], "name"]
        rendered.append(matching_names.to_string())
    return rendered

In [19]:
get_recommendations(4, model)

['585    Terminator 2: Judgment Day (1991)',
 '1271    Indiana Jones and the Last Crusade (1989)',
 '1182    Aliens (1986)',
 '1284    Butch Cassidy and the Sundance Kid (1969)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '2502    Matrix, The (1999)',
 '1884    French Connection, The (1971)',
 '1892    Rain Man (1988)',
 '3402    Close Encounters of the Third Kind (1977)',
 '1179    Princess Bride, The (1987)']

Создадим модель, которая будет получать embeddings для пользователей и фильмов. Эта модель будет генерировать рекомендации и similars с помощью библиотеки **faiss** (на GPU работает супербыстро)

In [33]:
from typing import List
import faiss

class RecommendModel:
    """Nearest-neighbour recommender over precomputed user and item embeddings.

    One exact (flat) L2 ``faiss`` index is built for the user vectors and one
    for the item vectors. The public methods mirror the subset of the
    ``implicit`` model interface used in this notebook (``similar_items`` and
    ``recommend``), returning ``(index, distance)`` pairs ordered from the
    closest neighbour to the farthest.

    NOTE(review): candidates are ranked by squared L2 distance between the user
    vector and the item vectors, not by dot product — confirm this is the
    intended scoring for factors produced by ALS.
    """

    def __init__(self, user_embeddings: np.ndarray, item_embeddings: np.ndarray):
        """Build the user and item indexes.

        Args:
            user_embeddings: matrix of shape (n_users, d).
            item_embeddings: matrix of shape (n_items, d).

        Raises:
            AssertionError: if the two matrices have different widths.
        """
        assert user_embeddings.shape[1] == item_embeddings.shape[1]
        d = user_embeddings.shape[1]

        self.indices = {}
        self.embeddings = {}

        for name, embeddings in [
            ('user', user_embeddings),
            ('item', item_embeddings),
        ]:
            # faiss works on C-contiguous float32 matrices; coerce once here so
            # callers may pass float64 or non-contiguous arrays.
            embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
            self.embeddings[name] = embeddings
            self.indices[name] = faiss.IndexFlatL2(d)
            self.indices[name].add(embeddings)

    def find_similars(self, name: str, i: int, top_k: int = 10) -> List[tuple]:
        """Return up to ``top_k`` nearest neighbours of vector ``i`` in index ``name``.

        Args:
            name: ``'user'`` or ``'item'``.
            i: row of the query vector inside that embedding matrix.
            top_k: maximum number of neighbours to return.

        Returns:
            A list of ``(neighbour_index, distance)`` tuples, closest first.
            The query vector itself is part of the index, so it normally comes
            back as the first entry.
        """
        query = self.embeddings[name][i].reshape((1, -1))
        index = self.indices[name]
        # Never ask for more neighbours than the index holds: faiss pads the
        # missing slots with label -1.
        k = min(top_k, index.ntotal)
        if k <= 0:
            return []
        distances, neighbours = index.search(query, k)
        return [
            (int(neighbour), float(distance))
            for neighbour, distance in zip(neighbours[0], distances[0])
            if neighbour >= 0
        ]

    def similar_items(self, item_id: int, N: int = 10) -> List[tuple]:
        """Return the ``N`` items closest to ``item_id`` as ``(item, distance)`` pairs."""
        return self.find_similars('item', item_id, N)

    def recommend(self, user_id: int, data: sp.csr_matrix, N: int = 10) -> List[tuple]:
        """Return up to ``N`` items closest to the user that the user has not interacted with.

        Args:
            user_id: row of the user in the user embedding matrix and in ``data``.
            data: user x item interaction matrix; a non-zero entry marks an
                item the user has already interacted with.
            N: maximum number of recommendations.

        Returns:
            A list of ``(item_index, distance)`` tuples, closest first.
        """
        user_embedding = self.embeddings['user'][user_id].reshape((1, -1))
        seen = data[user_id, :].toarray().flatten()

        index = self.indices['item']
        # Ask for enough candidates that N unseen items remain even if every
        # already-seen item ranks ahead of them (a fixed candidate count can
        # leave heavy users with too few, or zero, recommendations).
        k = min(index.ntotal, N + int(np.count_nonzero(seen)))
        if k <= 0:
            return []
        distances, nearest = index.search(user_embedding, k)

        recs = [
            (int(item), float(distance))
            for item, distance in zip(nearest[0], distances[0])
            if item >= 0 and seen[item] == 0
        ]
        return recs[:N]

### Задание 1. Не используя готовые решения, реализовать SVD-разложение с помощью SGD на explicit-данных

### Задание 2. Не используя готовые решения, реализовать матричное разложение с помощью ALS на implicit-данных

In [21]:
from scipy.sparse.linalg import spsolve
from scipy import sparse
from tqdm.auto import tqdm

In [22]:
from numpy.random import default_rng
rng = default_rng()

def _als_sweep(confidence, fixed, gram, reg, out):
    """Re-solve every row of ``out`` while the other factor matrix stays fixed.

    For row ``r`` this solves
    ``(F^T F + F^T (C_r - I) F + reg * I) x = F^T C_r p_r``
    where ``F`` is ``fixed``, ``C_r - I`` holds the stored entries of
    ``confidence`` row ``r`` and ``p_r`` is 1 where that entry is positive.
    ``C_r - I`` is zero outside the stored entries, so only the rows of
    ``fixed`` that belong to stored entries are touched.

    Args:
        confidence: CSR matrix (rows to solve x fixed rows) holding ``alpha * r``.
        fixed: dense factor matrix that is held constant during the sweep.
        gram: ``fixed.T @ fixed`` computed from the current ``fixed``.
        reg: L2 regularisation strength added to the diagonal.
        out: dense factor matrix updated in place, one row per ``confidence`` row.
    """
    reg_I = reg * np.eye(fixed.shape[1])
    indptr, indices, values = confidence.indptr, confidence.indices, confidence.data
    for row in tqdm(range(confidence.shape[0]), leave=False):
        start, stop = indptr[row], indptr[row + 1]
        observed = fixed[indices[start:stop]]
        extra_conf = values[start:stop]  # entries of C - I
        preference = (extra_conf > 0).astype(fixed.dtype)

        lhs = gram + observed.T @ (observed * extra_conf[:, None]) + reg_I
        rhs = observed.T @ ((extra_conf + 1.0) * preference)
        out[row] = np.linalg.solve(lhs, rhs)


def implicit_als(sparse_data, alpha=40, iterations=10, reg=0.1, hidden=10, random_state=None):
    """Factorise an implicit-feedback matrix with alternating least squares.

    Confidence is modelled as ``1 + alpha * r`` and preference as ``r > 0``;
    user and item factors are updated in turn by solving one regularised
    least-squares problem per row.

    Args:
        sparse_data: user x item interaction matrix (scipy sparse or array-like).
        alpha: scale applied to the interactions to obtain the confidence.
        iterations: number of full user-then-item sweeps.
        reg: L2 regularisation strength added to every linear system.
        hidden: number of latent factors.
        random_state: optional seed for the factor initialisation; when
            ``None`` the module-level ``rng`` generator is used.

    Returns:
        ``(X, Y)``: CSR matrices of shape (n_users, hidden) and (n_items, hidden).
    """
    generator = rng if random_state is None else default_rng(random_state)

    user_conf = sparse.csr_matrix(sparse_data, dtype=np.float64) * alpha
    user_conf.sum_duplicates()
    # Item-major copy so the item sweep also reads contiguous CSR rows.
    item_conf = user_conf.T.tocsr()

    n_users, n_items = user_conf.shape
    X = generator.standard_normal((n_users, hidden)) * 0.001
    Y = generator.standard_normal((n_items, hidden)) * 0.001

    for _ in tqdm(range(iterations)):
        _als_sweep(user_conf, Y, Y.T @ Y, reg, X)
        # The Gram matrix must come from the freshly updated user factors.
        _als_sweep(item_conf, X, X.T @ X, reg, Y)

    return sparse.csr_matrix(X), sparse.csr_matrix(Y)

In [23]:
user_vecs, item_vecs = implicit_als(user_item_csr, iterations=4, hidden=20, alpha=40)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6041.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3953.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6041.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3953.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6041.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3953.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6041.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3953.0), HTML(value='')))




In [24]:
# Densify the sparse factor matrices returned by implicit_als and cast them to float32.
# NOTE(review): float32 is presumably required by the faiss indexes built in RecommendModel — confirm.
user_embeddings = user_vecs.toarray().astype(np.float32)
item_embeddings = item_vecs.toarray().astype(np.float32)

In [34]:
als_model = RecommendModel(user_embeddings, item_embeddings)

In [35]:
get_recommendations(4, als_model)

['1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '585    Terminator 2: Judgment Day (1991)',
 '1182    Aliens (1986)',
 '2502    Matrix, The (1999)',
 '1271    Indiana Jones and the Last Crusade (1989)',
 '847    Godfather, The (1972)',
 '453    Fugitive, The (1993)',
 '537    Blade Runner (1982)',
 '2847    Total Recall (1990)']

In [36]:
get_similars(1, als_model)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '1245    Groundhog Day (1993)',
 '2327    Shakespeare in Love (1998)',
 '2918    Who Framed Roger Rabbit? (1988)',
 '2928    Being John Malkovich (1999)',
 "2286    Bug's Life, A (1998)",
 '2647    Ghostbusters (1984)',
 '352    Forrest Gump (1994)',
 '1892    Rain Man (1988)']

### Задание 3. Не используя готовые решения, реализовать матричное разложение BPR на implicit-данных