In [2]:
import pandas as pd
import numpy as np
from os import path, environ

In [3]:
#data_dir='C://Users//asus//Desktop//Netology//Data//ml-latest-small'
#data_dir='C://Users//mrasskazov//Desktop//DS//Data//ml-latest-small'
data_dir='/home/spynal/Netology/Data/ml-latest-small/'
environ["DATA_DIR"] = "/home/spynal/Netology/Data/ml-latest-small//"
tags=pd.read_csv(data_dir+'/tags.csv')
ratings=pd.read_csv(data_dir+'/ratings.csv')
movies=pd.read_csv(data_dir+'/movies.csv')

In [24]:
tags['movie_id']=tags['movieId'].astype('category').cat.codes.copy()
ratings["movie_id"] = ratings["movieId"].astype("category").cat.codes.copy()
ratings["user_id"] = ratings["userId"].astype("category").cat.codes.copy()

Склеиваем данные по рейтингам и тегам, для получения одинаковой размерности

In [25]:
movies_tags = movies[["movieId"]].drop_duplicates().join(
    tags,
    on="movieId",
    rsuffix="codes",
    lsuffix="tags",
    sort=True
).fillna("None")[['tag','movieId']]
movies_tags.shape

(9125, 2)

In [26]:
movies_tags_rat = ratings[["movieId", 'movie_id']].drop_duplicates().join(
    movies_tags,
    on="movieId",
    rsuffix="codes",
    lsuffix="movies_tags",
    sort=True
).fillna("None")[["tag", 'movie_id']]
movies_tags_rat.shape

(9066, 2)

Создаем матрицу тэгов

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
tags_features=CountVectorizer().fit_transform(movies_tags_rat['tag'])

Добавляем movie_id

In [28]:
from scipy.sparse import hstack, identity

features = hstack([
    identity(movies_tags_rat.shape[0]),
    tags_features
])

Получаем матрицу схожести между фильмами по тэгам

In [29]:
from implicit.nearest_neighbours import CosineRecommender
model=CosineRecommender()
model.fit(features)

In [30]:
from sklearn.model_selection import train_test_split
train, test=train_test_split(ratings, test_size=0.2 )

Подготавливаем данные для MREC

In [11]:
test_file_name = "ml.test.0"
test[["user_id", 'movie_id','rating']].to_csv(
    path.join(data_dir,test_file_name),
    sep="\t",
    header=False,
    index=False
)
train_file_name = "ml.train.0"
train[["user_id",'movie_id','rating']].to_csv(
    path.join(data_dir,train_file_name),
    sep="\t",
    header=False,
    index=False
)

Получаем рекомендации

In [12]:
recs = features.T.tocsr()
with open((data_dir+"/recs/ml.train.0.recs.tsv"), "w") as output_file:
    for user_id in train["user_id"].unique():
        for movie_id, score in model.recommend(user_id, recs):
                output_file.write("%s\t%s\t%s\n" % (user_id, movie_id,score))

In [13]:
recs = features.T.tocsr()
with open((data_dir+"/recs/ml.test.0.recs.tsv"), "w") as output_file:
    for user_id in test["user_id"].unique():
        for movie_id, score in model.recommend(user_id, recs):
                output_file.write("%s\t%s\t%s\n" % (user_id, movie_id,score))

Оцениваем

In [14]:
!mrec_evaluate --input_format tsv --test_input_format tsv --train $DATA_DIR/ml.train.0 --recsdir $DATA_DIR/recs

[2017-12-09 23:20:11,313] INFO: processing /home/spynal/Netology/Data/ml-latest-small/ml.train.0...
None
mrr            0.0224 +/- 0.0000
prec@5         0.0073 +/- 0.0000
prec@10        0.0048 +/- 0.0000
prec@15        0.0032 +/- 0.0000
prec@20        0.0024 +/- 0.0000


In [15]:
!mrec_evaluate --input_format tsv --test_input_format tsv --train $DATA_DIR/ml.test.0 --recsdir $DATA_DIR/recs

[2017-12-09 23:20:12,580] INFO: processing /home/spynal/Netology/Data/ml-latest-small/ml.test.0...
None
mrr            0.0224 +/- 0.0000
prec@5         0.0073 +/- 0.0000
prec@10        0.0048 +/- 0.0000
prec@15        0.0032 +/- 0.0000
prec@20        0.0024 +/- 0.0000


Подготавливаем данные для LightFM

In [31]:
last_movie_id = ratings["movie_id"].max()
last_user_id = ratings["user_id"].max()

In [32]:
from scipy.sparse import csr_matrix

user_x_item = ratings[["user_id", "movie_id"]].as_matrix()
mean_rating = ratings["rating"].mean()
user_item_matrix = csr_matrix(
    (
        (ratings["rating"] > mean_rating).tolist(),
        (
            [pair[0] for pair in user_x_item],
            [pair[1] for pair in user_x_item],
        )
    ),
    shape=(last_user_id + 1, last_movie_id + 1),
    dtype=np.float32
)

In [33]:
np.random.seed(0)
test_indices = np.random.choice(
    range(user_item_matrix.nnz),
    replace=False,
    size=int(user_item_matrix.nnz * 0.2)
).tolist()
train_data = user_item_matrix.copy()
train_data.data[test_indices] = 0
train_data.eliminate_zeros()
print("размер обучающей выборки: {}".format(train_data.nnz))
test_data = user_item_matrix.copy()
test_data.data[:] = 0
test_data.data[test_indices] = user_item_matrix.data[test_indices]
test_data.eliminate_zeros()
print("размер тестовой выборки: {}".format(test_data.nnz))

размер обучающей выборки: 41289
размер тестовой выборки: 10279


Обучаем модель на Оценках

In [34]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k

model = LightFM(loss="warp")
model.fit(
    train_data,
    num_threads=4
)

train_mrr = auc_score(model, train_data).mean()
test_mrr = auc_score(model, test_data).mean()
train_prc = precision_at_k(model, train_data, k=10).mean()
test_prc = precision_at_k(model, test_data, k=10).mean()
train_rec = recall_at_k(model, train_data, k=10).mean()
test_rec = recall_at_k(model, test_data, k=10).mean()
print('ROC AUC: train %.2f, test %.2f, PRC: train %.2f, test %.2f, REC: train %.2f, test %.2f.' % (train_mrr, test_mrr,train_prc, test_prc,train_rec, test_rec))

ROC AUC: train 0.91, test 0.89, PRC: train 0.28, test 0.06, REC: train 0.08, test 0.06.


Получаем рекомендации

In [35]:
def get_recs_model(dataset):
    recs = model.predict(
        user_ids=np.array(dataset["user_id"].astype(np.int32).tolist()),
        item_ids=np.array(dataset["movie_id"].astype(np.int32).tolist()),
        num_threads=4
    )
    return recs

In [36]:
train_recs_model = get_recs_model(train)

Обучаем на оценки+тэги

In [40]:
hybrid = LightFM(loss="warp")
hybrid.fit(
    train_data,
    num_threads=4,
    item_features=features
)

train_mrr = auc_score(hybrid, train_data, item_features=features).mean()
test_mrr = auc_score(hybrid, test_data, item_features=features).mean()
train_prc = precision_at_k(hybrid, train_data, item_features=features,k=10).mean()
test_prc = precision_at_k(hybrid, test_data, item_features=features,k=10).mean()
train_rec = recall_at_k(hybrid, train_data, item_features=features,k=10).mean()
test_rec = recall_at_k(hybrid, test_data, item_features=features,k=10).mean()
print('ROC AUC: train %.2f, test %.2f, PRC: train %.2f, test %.2f, REC: train %.2f, test %.2f.' % (train_mrr, test_mrr,train_prc, test_prc,train_rec, test_rec))

ROC AUC: train 0.92, test 0.89, PRC: train 0.25, test 0.06, REC: train 0.07, test 0.06.


Получаем рекомендации

In [38]:
def get_recs_hybrid(dataset, features):
    recs = hybrid.predict(
        user_ids=np.array(dataset["user_id"].astype(np.int32).tolist()),
        item_ids=np.array(dataset["movie_id"].astype(np.int32).tolist()),
        item_features=features,
        num_threads=4
    )
    return recs

In [39]:
train_recs = get_recs_hybrid(train, features)

#### Вывод:
Самая лучшая модель получилась на оценки+тэги