In [11]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

import os

# Directory holding the goodbooks-10k CSV files. Set the GOODBOOKS_DIR
# environment variable to point at another location instead of editing this
# notebook; the original absolute path remains the default.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file paths by plain string concatenation (folder + 'ratings.csv').
folder = os.path.join(
    os.environ.get(
        'GOODBOOKS_DIR',
        'C:/Users/serge/Downloads/goodbooks-10k-master/goodbooks-10k-master/',
    ),
    '',
)

In [13]:
# Load the four raw goodbooks-10k tables from `folder`, in the order
# ratings, books, tags, book_tags.
ratings, books, tags, book_tags = (
    pd.read_csv(folder + csv_name)
    for csv_name in ('ratings.csv', 'books.csv', 'tags.csv', 'book_tags.csv')
)

In [14]:
# Load tags_cleaned.csv (columns tag_id / tag_name, per the preview in the next cell).
# NOTE(review): apart from that preview, no visible cell uses tags_clean — the
# book_tags filter further down uses `tags`; confirm which one was intended.
tags_clean = pd.read_csv(folder + 'tags_cleaned.csv')

In [15]:
# Bare last expression: the notebook renders the frame as a (truncated) table.
tags_clean

Unnamed: 0,tag_id,tag_name
0,509,19th-century
1,923,20th-century
2,941,21st-century
3,1499,abuse
4,1540,action
...,...,...
329,33114,young-adult
330,33121,young-adult-fantasy
331,33124,young-adult-fiction
332,33165,youth


Let's create a dictionary so we can look up **book_id** from **goodreads_book_id**.

In [16]:
# Lookup table: goodreads_book_id -> book_id, built row by row from `books`.
mapper = {
    goodreads_id: internal_id
    for goodreads_id, internal_id in zip(books.goodreads_book_id, books.book_id)
}

Add **book_id** to **book_tags** (stored in a new column named `id`).

In [18]:
# Keep only tag assignments whose tag_id appears in `tags`.
# NOTE(review): `tags_clean` is loaded earlier but never used; if the goal was to
# restrict item features to the cleaned tag list, this filter should use
# tags_clean.tag_id — confirm before changing, since it alters the model input.
# .copy() turns the filtered slice into an independent frame, so adding a column
# below does not raise pandas' SettingWithCopyWarning.
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)].copy()

# Translate goodreads_book_id to book_id via `mapper`. Using mapper.__getitem__
# keeps the original fail-fast behaviour: an unknown id raises KeyError instead
# of silently becoming NaN.
book_tags['id'] = book_tags.goodreads_book_id.map(mapper.__getitem__)

In [19]:
# Sparse matrix with the rating value stored at (user_id, book_id).
ratings_coo = sparse.coo_matrix(
    (ratings.rating, (ratings.user_id, ratings.book_id))
)

# Sparse indicator matrix with a 1 stored at (id, tag_id) for every row of
# book_tags; passed to LightFM as item features in later cells.
feature_ratings = sparse.coo_matrix(
    ([1] * len(book_tags), (book_tags.id, book_tags.tag_id))
)

In [21]:
# Tunable settings used by the training and evaluation cells.
NUM_THREADS = 8      # passed as num_threads to fit / precision_at_k / recall_at_k
NUM_COMPONENTS = 30  # passed as no_components (latent vector size) to LightFM
NUM_EPOCHS = 10      # passed as epochs to model.fit

# Naive Model

In [22]:
# Seed both the train/test split and the model initialisation; the original
# used unseeded random state, so the reported metrics changed on every run.
# NOTE(review): with num_threads > 1 the parallel training may still cause small
# run-to-run differences — verify if exact reproducibility matters.
RANDOM_SEED = 42

model = LightFM(
    learning_rate=0.05,
    loss='warp',
    no_components=NUM_COMPONENTS,
    random_state=RANDOM_SEED,
)

# A RandomState instance (rather than a bare int) is passed here so the call
# works regardless of whether the installed LightFM accepts integer seeds.
train, test = random_train_test_split(
    ratings_coo,
    test_percentage=0.2,
    random_state=np.random.RandomState(RANDOM_SEED),
)

# Train on the 80% split, using the book/tag matrix as item features.
model = model.fit(
    train,
    item_features=feature_ratings,
    epochs=NUM_EPOCHS,
    num_threads=NUM_THREADS,
)

In [23]:
# Mean precision@10 over the users in the held-out interactions.
prec_score = precision_at_k(
    model,
    test,
    k=10,
    item_features=feature_ratings,
    num_threads=NUM_THREADS,
).mean()

# Mean recall@10. The result gets its own name: the original assigned it to
# `recall_at_k`, which overwrote the imported function and made this cell fail
# with a "not callable" TypeError when executed a second time.
recall_score = recall_at_k(
    model,
    test,
    k=10,
    item_features=feature_ratings,
    num_threads=NUM_THREADS,
).mean()

# Same output order as before: recall first, then precision.
print(recall_score, prec_score)

0.006190289542626801 0.013340445


In [50]:
import pickle

# Persist the trained model next to the notebook.
filename = 'naive_model.pickle'

# The context manager closes (and flushes) the file even if pickling fails;
# the original left the handle returned by open() unclosed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Disabled example: reload the pickled model from disk.
# NOTE(review): `.score(X_test, Y_test)` is a scikit-learn-style call, and
# X_test / Y_test are not defined in any visible cell; to evaluate a reloaded
# model, reuse precision_at_k / recall_at_k as in the evaluation cell.
# Only unpickle files you created yourself — pickle.load can execute arbitrary code.
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)

# Improving Model

In [26]:
# Ask the trained model for its item representations given the book/tag
# feature matrix; the call returns a (biases, embeddings) pair.

item_biases, item_embeddings = model.get_item_representations(
    features=feature_ratings,
)

In [24]:
pip install nmslib

Collecting nmslibNote: you may need to restart the kernel to use updated packages.





  Downloading nmslib-2.1.1-cp38-cp38-win_amd64.whl (661 kB)
     ------------------------------------- 661.7/661.7 KB 10.5 MB/s eta 0:00:00
Collecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
     ------------------------------------- 188.5/188.5 KB 11.9 MB/s eta 0:00:00
Installing collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1





In [28]:
import nmslib

# Set up an HNSW index in nmslib's 'cosinesimil' space.
nms_idx = nmslib.init(space='cosinesimil', method='hnsw')

# Insert every row of item_embeddings (ids are the row positions), then build
# the index while printing progress.
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

In [32]:
# function to search books by book_id in our graph
def nearest_books_nms(book_id, index, n=10, embeddings=None):
    """Query `index` for the `n` nearest neighbours of one book.

    Parameters
    ----------
    book_id : int
        Position of the query book's vector inside `embeddings`.
    index : object
        Search index exposing ``knnQuery(vector, k=...)`` (e.g. the nmslib
        index built in this notebook).
    n : int, default 10
        Number of neighbours to request.
    embeddings : indexable, optional
        Item vectors to take the query vector from. When omitted, the
        notebook-level ``item_embeddings`` is used, which matches the
        original behaviour.

    Returns
    -------
    The value returned by ``index.knnQuery`` unchanged; callers in this
    notebook take element ``[0]`` of it as the neighbour ids.
    """
    if embeddings is None:
        # Backward-compatible fallback to the global defined in an earlier cell.
        embeddings = item_embeddings
    return index.knnQuery(embeddings[book_id], k=n)

In [33]:
# First two books whose original_title contains the literal text '1984'
# (rows with a missing title never match).
books[books.original_title.str.contains('1984', regex=False, na=False)].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...


In [45]:
# 846 is the book_id of "Animal Farm & 1984" found by the title search above.
# [0] keeps the first element of the query result, used below as neighbour ids.
nbm = nearest_books_nms(846,nms_idx)[0]

In [46]:
# Show the books whose book_id is among the returned neighbour ids.
# Note: filtering with isin() lists rows in `books` order, not nearest-first.
books[books.book_id.isin(nbm)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
13,14,7613,7613,2207778,896,452284244,9780452000000.0,George Orwell,1945.0,Animal Farm: A Fairy Story,...,1881700,1982987,35472,66854,135147,433432,698642,648912,https://images.gr-assets.com/books/1424037542m...,https://images.gr-assets.com/books/1424037542s...
54,55,5129,5129,3204877,515,60929871,9780061000000.0,Aldous Huxley,1932.0,Brave New World,...,1022601,1079135,20095,26367,60328,219895,389379,383166,https://images.gr-assets.com/books/1487389574m...,https://images.gr-assets.com/books/1487389574s...
102,103,7126,7126,391568,1310,140449264,9780140000000.0,"Alexandre Dumas, Robin Buss",1844.0,Le Comte de Monte-Cristo,...,555822,601220,15925,10381,22482,89980,183142,295235,https://images.gr-assets.com/books/1309203605m...,https://images.gr-assets.com/books/1309203605s...
129,130,2165,2165,69741,666,684830493,9780685000000.0,Ernest Hemingway,1952.0,The Old Man and the Sea,...,520630,574328,16716,28645,52583,136217,186747,170136,https://images.gr-assets.com/books/1329189714m...,https://images.gr-assets.com/books/1329189714s...
193,194,153747,153747,2409320,1823,142437247,9780142000000.0,"Herman Melville, Andrew Delbanco, Tom Quirk",1851.0,"Moby Dick; or, The Whale",...,358050,397963,11223,35983,52601,103625,102432,103322,https://images.gr-assets.com/books/1327940656m...,https://images.gr-assets.com/books/1327940656s...
352,353,12996,12996,995103,1053,743477553,9780743000000.0,William Shakespeare,1603.0,"The Tragedy of Othello, The Moor of Venice",...,238875,256757,4334,4281,16576,64922,92076,78902,https://images.gr-assets.com/books/1459795105m...,https://images.gr-assets.com/books/1459795105s...
705,706,54479,54479,4537271,1903,014044906X,9780140000000.0,"Jules Verne, Michael Glencross, Brian W. Aldiss",1872.0,Le tour du monde en quatre-vingts jours,...,117108,141132,4341,1273,6693,36897,55603,40666,https://images.gr-assets.com/books/1308815551m...,https://images.gr-assets.com/books/1308815551s...
713,714,12938,12938,2342136,1108,074348276X,9780743000000.0,William Shakespeare,1603.0,The Tragedie of King Lear,...,135448,147282,3079,2825,10502,36179,47682,50094,https://images.gr-assets.com/books/1331563731m...,https://images.gr-assets.com/books/1331563731s...
777,778,30597,30597,3043569,1206,451527887,9780452000000.0,"Victor Hugo, Walter J. Cobb",1831.0,Notre-Dame de Paris,...,119851,133421,3255,1952,6849,30636,47753,46231,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...


In [None]:
# Next example: Thomas Harris, "The Silence of the Lambs"

In [38]:
# First two books whose original_title contains the literal text below.
books[books.original_title.str.contains('The Silence of the Lambs', regex=False, na=False)].head(2) # book_id - 209

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


In [47]:
# Neighbour ids for book_id 209 ("The Silence of the Lambs"), then the titles
# of the matching rows in `books`.
nbm = nearest_books_nms(209, nms_idx)[0]
books.loc[books.book_id.isin(nbm), 'original_title']

3          To Kill a Mockingbird
4               The Great Gatsby
7         The Catcher in the Rye
25             The Da Vinci Code
102     Le Comte de Monte-Cristo
159           Great Expectations
208     The Silence of the Lambs
430                   Red Dragon
650        Le Fantôme de l'Opéra
1801                    Hannibal
Name: original_title, dtype: object

In [54]:
# Write the item embedding matrix to disk using the newest pickle protocol.
with open('item_embeddings.pickle', 'wb') as embeddings_file:
    pickle.dump(
        item_embeddings,
        embeddings_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )