In [2]:
import numpy as np
import pandas as pd

import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

import pickle



In [4]:
# Load the raw tables used throughout the notebook.
ratings = pd.read_csv("./data/ratings.csv")
books = pd.read_csv("./data/books.csv")
# The raw ./data/tags.csv is deliberately not read here: `tags` is loaded from
# ./data/tags_cleaned.csv before its first use, so the raw read was dead work.
book_tags = pd.read_csv("./data/book_tags.csv")

In [5]:
# Preview the first rows of the books table.
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [6]:
# Load the cleaned tag table; its tag_id column is used below to filter book_tags.
tags = pd.read_csv("./data/tags_cleaned.csv")

In [7]:
# Lookup table: goodreads_book_id -> book_id (the id used in ratings.csv).
mapper = {
    goodreads_id: internal_id
    for goodreads_id, internal_id in zip(books["goodreads_book_id"], books["book_id"])
}

In [8]:
# Keep only rows whose tag is present in the cleaned tag table. The explicit
# .copy() makes the result an independent frame, so adding a column below does
# not trigger pandas' SettingWithCopyWarning (assignment on a filtered slice).
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)].copy()
# Translate goodreads ids into book_id; an id missing from `mapper` raises KeyError.
book_tags['id'] = book_tags.goodreads_book_id.apply(lambda x: mapper[x])

In [9]:
# Preview the filtered book/tag table with the new `id` column.
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27
5,1,11743,9954,27
6,1,14017,7169,27
10,1,27199,3857,27


In [10]:
# Sparse interaction matrix: rows are user_id, columns are book_id, values are ratings.
ratings_coo = sparse.coo_matrix(
    (
        ratings["rating"],
        (ratings["user_id"], ratings["book_id"]),
    )
)

In [11]:
# Binary item-feature matrix: rows are book ids (`id`), columns are tag_id,
# with a 1 for every (book, tag) row of book_tags.
feature_ratings = sparse.coo_matrix(
    (
        [1] * len(book_tags),
        (book_tags["id"], book_tags["tag_id"]),
    )
)

In [12]:
# Number of CPU threads to use. Depends on the machine the notebook runs on.

NUM_THREADS = 8 

# Number of components of each embedding vector (passed to LightFM as no_components)
NUM_COMPONENTS = 60 

# Number of training epochs
NUM_EPOCHS = 10 

In [13]:
# Fixed seed so that the train/test split and the model initialisation can be
# repeated between runs. NOTE(review): with num_threads > 1 LightFM training may
# still not be bit-for-bit reproducible — confirm if exact repeatability matters.
RANDOM_SEED = 42

# Create the model
model = LightFM(
    learning_rate=0.05,
    loss='warp',
    no_components=NUM_COMPONENTS,
    random_state=np.random.RandomState(RANDOM_SEED),
)

# Split the dataset into training and test sets.
# random_train_test_split calls .shuffle() on random_state, so it must be a
# RandomState instance rather than a bare integer.
train, test = random_train_test_split(
    ratings_coo,
    test_percentage=0.2,
    random_state=np.random.RandomState(RANDOM_SEED),
)

# Train the model
model = model.fit(
    train,
    epochs=NUM_EPOCHS,
    num_threads=NUM_THREADS,
    item_features=feature_ratings,
)

In [14]:
# Top-10 recall and precision on the held-out split, each averaged with .mean().
recall_score = recall_at_k(
    model,
    test,
    k=10,
    item_features=feature_ratings,
    num_threads=NUM_THREADS,
).mean()
precision_score = precision_at_k(
    model,
    test,
    k=10,
    item_features=feature_ratings,
    num_threads=NUM_THREADS,
).mean()

print(recall_score, precision_score)

0.04059940961433687 0.087747075


In [15]:
# Persist the trained model to disk with the newest pickle protocol available.
with open('model.pkl', mode='wb') as file:
    pickle.dump(model, file, pickle.HIGHEST_PROTOCOL)

Извлечём эмбеддинги книг из обученной модели

In [16]:
# Ask the trained model for the per-item biases and embedding vectors,
# computed from the same item-feature matrix that was used for training.
(item_biases, item_embeddings) = model.get_item_representations(
    features=feature_ratings,
)

In [18]:
import nmslib

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [19]:
# Create the search graph (HNSW method, cosine-similarity space)
nms_idx = nmslib.init(space='cosinesimil', method='hnsw')

# Add all book embeddings to the graph, then build the index
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************


In [20]:
# Helper for querying the nearest-neighbour graph
def nearest_books_nms(book_id, index, n=10, embeddings=None):
    """Query ``index`` for the ``n`` nearest neighbours of one book.

    Args:
        book_id: Position of the query book's row in the embedding matrix.
        index: Object exposing ``knnQuery(vector, k=...)``.
        n: Number of neighbours to request.
        embeddings: Matrix the query vector is taken from. When omitted, the
            notebook-level ``item_embeddings`` is used, so existing
            two-argument calls behave exactly as before.

    Returns:
        Whatever ``index.knnQuery`` returns, unchanged.
    """
    if embeddings is None:
        # Fall back to the notebook global to stay backward-compatible.
        embeddings = item_embeddings
    return index.knnQuery(embeddings[book_id], k=n)

In [21]:
# Let's try producing recommendations for a specific book, for example
# George Orwell's novel "1984". Rows with a missing original_title never match.
books[books["original_title"].str.contains('1984', regex=False, na=False)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...


In [22]:
# The lookup above shows that this book has book_id 846.

# Search for similar books; only the first element of the query result is kept,
# and it is used below as the set of neighbour ids.
nbm = nearest_books_nms(book_id=846, index=nms_idx)[0]

In [23]:
# Show the similar books (author and original title only).
books.loc[books["book_id"].isin(nbm), ['authors', 'original_title']]

Unnamed: 0,authors,original_title
12,"George Orwell, Erich Fromm, Celâl Üster",Nineteen Eighty-Four
13,George Orwell,Animal Farm: A Fairy Story
27,William Golding,Lord of the Flies
47,Ray Bradbury,Fahrenheit 451
54,Aldous Huxley,Brave New World
270,Daniel Keyes,Flowers for Algernon
808,"Aldous Huxley, Christopher Hitchens",Brave New World/Brave New World Revisited
845,"George Orwell, Christopher Hitchens",Animal Farm & 1984
3251,Mark Twain,Tom Sawyer & Huckleberry Finn
8139,Aldous Huxley,Brave New World Revisited


In [24]:
# Persist the item embedding matrix with the newest pickle protocol available.
with open('item_embeddings.pkl', mode='wb') as file:
    pickle.dump(item_embeddings, file, pickle.HIGHEST_PROTOCOL)

In [25]:
# Look up the row for "The Silence of the Lambs" by plain substring match;
# rows with a missing original_title never match.
books[books["original_title"].str.contains('The Silence of the Lambs', regex=False, na=False)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


In [26]:
# Neighbours of book_id 209 (found by the lookup above), shown as author + title.
nbm = nearest_books_nms(book_id=209, index=nms_idx)[0]
books.loc[books["book_id"].isin(nbm), ['authors', 'original_title']]

Unnamed: 0,authors,original_title
208,Thomas Harris,The Silence of the Lambs
430,Thomas Harris,Red Dragon
767,Dennis Lehane,Shutter Island
1175,Dennis Lehane,Mystic River
1484,James Ellroy,The Black Dahlia
1801,Thomas Harris,Hannibal
3261,Patricia Highsmith,The Talented Mr. Ripley
4421,Thomas Harris,Hannibal Rising
5312,Scott B. Smith,A Simple Plan
9792,Patricia Highsmith,Strangers on a Train


In [27]:
# Reload the raw tables from disk and lower-case the `title` column.
ratings = pd.read_csv('data/ratings.csv')
books = pd.read_csv('data/books.csv').assign(
    title=lambda frame: frame["title"].str.lower()
)
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [28]:
# Display the book_id column of the books table.
books.book_id

0           1
1           2
2           3
3           4
4           5
        ...  
9995     9996
9996     9997
9997     9998
9998     9999
9999    10000
Name: book_id, Length: 10000, dtype: int64