In [1]:
import pandas as pd
import numpy as np
import gdown 
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sps
import pickle
from tqdm import tqdm
from joblib import Parallel, delayed

from lightfm import LightFM
from lightfm.evaluation import precision_at_k

## Загрузка данных

In [2]:
# Fetch the test transactions CSV from Google Drive into the working directory
test_csv_path = "test_transacrion_df.csv"
gdown.download(
    url="https://drive.google.com/uc?export=download&id=1Ud6jFto6e7FW5y0LxxwX8afpmTrvA0u-",
    output=test_csv_path,
    quiet=False,
)
# Read the downloaded file; its first column is used as the DataFrame index
test_df = pd.read_csv(test_csv_path, index_col=0)

Downloading...
From: https://drive.google.com/uc?export=download&id=1Ud6jFto6e7FW5y0LxxwX8afpmTrvA0u-
To: /home/noname/projects/hse_mlds_recsys_project/ML/test_transacrion_df.csv
100%|██████████| 52.4M/52.4M [00:05<00:00, 10.5MB/s]


In [3]:
# Fetch the pickled user_id encoder from Google Drive
encoder_path = "encoder.pkl"
gdown.download(
    url="https://drive.google.com/uc?export=download&id=1eodl9OlaYy3NTu9TpbtPLHNurXxeHlhh",
    output=encoder_path,
    quiet=False,
)
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this is only safe while the Drive file above is trusted.
with open(encoder_path, "rb") as encoder_file:
    encoder = pickle.load(encoder_file)

Downloading...
From: https://drive.google.com/uc?export=download&id=1eodl9OlaYy3NTu9TpbtPLHNurXxeHlhh
To: /home/noname/projects/hse_mlds_recsys_project/ML/encoder.pkl
100%|██████████| 800k/800k [00:00<00:00, 3.93MB/s]


In [4]:
# Fetch the ratings CSV from Google Drive and read it into a DataFrame
ratings_csv_path = "ratings.csv"
gdown.download(
    url="https://drive.google.com/uc?export=download&id=1epGrpzB8BEC2t5Od3hrL3x07B1VZIm3c",
    output=ratings_csv_path,
    quiet=False,
)
# The first CSV column is used as the DataFrame index
rating_df = pd.read_csv(ratings_csv_path, index_col=0)

Downloading...
From (uriginal): https://drive.google.com/uc?export=download&id=1epGrpzB8BEC2t5Od3hrL3x07B1VZIm3c
From (redirected): https://drive.google.com/uc?export=download&id=1epGrpzB8BEC2t5Od3hrL3x07B1VZIm3c&confirm=t&uuid=2482ca56-dd86-4d59-84df-f8fb6eabc502
To: /home/noname/projects/hse_mlds_recsys_project/ML/ratings.csv
100%|██████████| 211M/211M [00:18<00:00, 11.3MB/s] 


In [5]:
# Re-encode user ids with the loaded encoder.
# Keep an untouched copy of the test frame (original user ids) for later lookups.
init_test_df = test_df.copy()
# Test interactions: one row per distinct (encoded user, product) pair with rating 1.
# NOTE(review): this cell rebinds test_df and overwrites rating_df['user_id'], so
# running it twice feeds already-encoded ids back into the encoder.
test_df = (
    test_df
    .assign(user_id=encoder.transform(test_df['user_id']))
    [['user_id', 'product_id']]
    .drop_duplicates()
    .assign(rating=1)
)
rating_df['user_id'] = encoder.transform(rating_df['user_id'])

In [6]:
# Downcast the id/rating columns of both frames to the smallest integer dtype
# that can hold their values.
for frame in (rating_df, test_df):
    for column in ("user_id", "product_id", "rating"):
        frame[column] = pd.to_numeric(frame[column], downcast="integer")

In [7]:
def make_sparse(dataset: pd.DataFrame, shape: "tuple[int, int] | None" = None) -> sps.coo_matrix:
    """Build a sparse user x product matrix from an interactions frame.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns ``user_id`` (row index), ``product_id``
        (column index) and ``rating`` (stored value).
    shape : tuple of (n_users, n_products), optional
        Explicit matrix shape. When omitted (the default, matching the
        previous behaviour) scipy infers it from the largest ids present in
        ``dataset``, so two frames with different maximum ids produce
        matrices of different shapes. Pass the same ``shape`` for train and
        test data to keep them aligned.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with ``rating`` stored at ``(user_id, product_id)``.
    """
    row = dataset["user_id"].to_numpy()
    col = dataset["product_id"].to_numpy()
    data = dataset["rating"].to_numpy()
    # shape=None lets scipy infer the dimensions from the index arrays
    return sps.coo_matrix((data, (row, col)), shape=shape)

In [8]:
# Build the interaction matrices: ratings for training, test transactions for evaluation
train_sparse, test_sparse = (
    make_sparse(dataset=frame) for frame in (rating_df, test_df)
)

## Метрика качества

In [9]:
def mapk(model, test_interactions, k:int=10, num_threads:int=60):
    """Average LightFM precision@k over the evaluated users.

    Note: despite the name, this is the mean of per-user precision@k as
    returned by ``lightfm.evaluation.precision_at_k``; no rank-weighted
    average precision is computed here.

    Parameters
    ----------
    model : fitted LightFM model
    test_interactions : sparse matrix of held-out interactions
    k : int
        Cutoff passed to ``precision_at_k`` (previously ignored in favour of
        a hard-coded 10).
    num_threads : int
        Thread count passed to ``precision_at_k`` (previously ignored in
        favour of a hard-coded 60).

    Returns
    -------
    float
        Mean precision@k, or 0.0 when ``precision_at_k`` returns no scores.
    """
    p_at_k = np.asarray(
        precision_at_k(model=model,
                       test_interactions=test_interactions,
                       k=k,
                       num_threads=num_threads),
        dtype=np.float64,
    )
    if p_at_k.size == 0:
        # No per-user scores came back; avoid dividing by zero
        return 0.0
    # Accumulate in float64 regardless of the dtype returned by LightFM
    return float(p_at_k.mean())

## Модель

In [10]:
class LFM:
    """Wrapper around a LightFM model that serves top-k product recommendations.

    Attributes
    ----------
    model : LightFM (or compatible) model used for scoring and training.
    encoder : object whose ``transform`` maps raw user ids to the integer ids
        the model was trained on.
    rating_df : pd.DataFrame or None
        Frame with ``product_id`` and ``rating`` columns; it supplies the
        candidate items and the popularity ranking used for cold start.
    """

    def __init__(self, encoder, rating_df=None, model=None):
        """Store the collaborators; build a default WARP LightFM model if none is given."""
        if model is None:
            self.model = LightFM(no_components=10,
                                 loss='warp',
                                 random_state=42,
                                 learning_rate=0.01)
        else:
            self.model = model
        self.encoder = encoder
        self.rating_df = rating_df
        # Lazily filled cache of values derived from rating_df
        self._derived = None

    def _from_ratings(self, key, compute):
        """Memoise a value derived from ``rating_df``; reset when ``rating_df`` is rebound."""
        cache = getattr(self, "_derived", None)
        if cache is None or cache.get("source") is not self.rating_df:
            cache = {"source": self.rating_df}
            self._derived = cache
        if key not in cache:
            cache[key] = compute()
        return cache[key]

    def _rank_by_popularity(self):
        """Return product ids ordered by decreasing total rating."""
        totals = self.rating_df.groupby("product_id")["rating"].sum()
        # Take the index (actual product ids), not positions within the grouped result
        return totals.sort_values(ascending=False, kind="mergesort").index.to_numpy()

    def predict(self, users_to_recommend: list, k: int = 10):
        """Compute recommendations for several users, one after another.

        Returns a dict mapping each raw user id from ``users_to_recommend``
        to the list produced by ``recommend``.
        """
        return {uid: self.recommend(uid=uid, k=k) for uid in users_to_recommend}

    def cold_start(self):
        """Cold-start fallback: product ids sorted by decreasing popularity.

        Popularity is the sum of ``rating`` per ``product_id`` in
        ``rating_df``. Returns ``None`` when no ``rating_df`` was provided.
        """
        if self.rating_df is None:
            return None
        ranked = self._from_ratings("popular", self._rank_by_popularity)
        # Hand out a copy so callers cannot modify the cached ranking
        return ranked.copy()

    def recommend(self, uid: int, k: int):
        """Return up to ``k`` recommended product ids for raw user id ``uid``.

        If the encoder cannot transform ``uid`` the cold-start ranking is used
        (an empty list when ``rating_df`` is missing). Otherwise every product
        present in ``rating_df`` is scored by the model and the top ``k`` are
        returned as a list.

        Raises
        ------
        ValueError
            If the user is known to the encoder but ``rating_df`` is ``None``,
            because the candidate items cannot be determined.
        """
        # A failing transform is treated as "unknown user"; Exception (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        try:
            encoded_uid = self.encoder.transform(np.array([uid]))[0]
        except Exception:
            popular = self.cold_start()
            return [] if popular is None else popular[:k].tolist()

        if self.rating_df is None:
            raise ValueError("rating_df is required to recommend items for a known user")

        # The sorted item catalogue is computed once per rating_df, not on every call
        items = self._from_ratings(
            "items",
            lambda: np.sort(np.asarray(self.rating_df["product_id"].unique())),
        )
        if len(items) == 0:
            return []
        # LightFM documents user_ids/item_ids as int32 arrays of equal length
        scores = self.model.predict(
            user_ids=np.full(len(items), encoded_uid, dtype=np.int32),
            item_ids=items.astype(np.int32, copy=False),
        )
        top_items = items[np.argsort(-scores)][:k]
        return top_items.tolist()

    def fit(self, train_sparse, epochs, rounds, num_threads=60):
        """Train the model with ``rounds`` calls to ``fit_partial`` of ``epochs`` epochs each.

        ``train_sparse`` is passed both as the interactions and as
        ``sample_weight``, so its stored values act as per-interaction weights.
        """
        for _ in tqdm(range(rounds)):
            self.model.fit_partial(train_sparse,
                                   sample_weight=train_sparse,
                                   epochs=epochs,
                                   num_threads=num_threads)


In [11]:
# Инициализация
# Initialisation: a WARP-loss LightFM model wrapped in the LFM helper
model = LightFM(
    loss='warp',
    no_components=10,
    learning_rate=0.01,
    random_state=42,
)

lfm_model = LFM(model=model, encoder=encoder, rating_df=rating_df)

In [12]:
# Обучение
# Training: 5 rounds of 5 epochs each on the ratings matrix, using 60 threads
lfm_model.fit(
    train_sparse=train_sparse,
    rounds=5,
    epochs=5,
    num_threads=60,
)

100%|██████████| 5/5 [00:29<00:00,  6.00s/it]


In [13]:
# MAP@k
# Evaluate the trained model on the test interactions at cutoff 10
mapk(
    k=10,
    test_interactions=test_sparse,
    model=lfm_model.model,
)

0.08308500162132085

In [14]:
# Сохранение модели
# Persist the wrapped model object (lfm_model.model), not the LFM wrapper itself
with open("../app/models/lfm_model.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(lfm_model.model))

In [15]:
# Множество с рекоменлацией
# Products user 3 has in the test data (looked up by original, un-encoded user id)
relevant_set = set(
    init_test_df.loc[init_test_df["user_id"] == 3, "product_id"].to_numpy()
)
# Top-10 recommendations for the same user
predict_set = set(lfm_model.recommend(uid=3, k=10))
# Recommended products that are also relevant
relevant_set & predict_set

{21903, 47766}

In [16]:
# Предикт для списка пользователей
# Top-10 recommendations for several users in a single call
lfm_model.predict(
    k=10,
    users_to_recommend=[1, 3, 2917],
)

{1: [13176, 6184, 16797, 21137, 196, 12341, 43352, 13575, 8571, 39275],
 3: [21137, 24852, 13176, 21903, 26209, 47209, 47626, 47766, 39275, 16797],
 2917: [21137, 24852, 21903, 13176, 47209, 26209, 47766, 47626, 22935, 24964]}