# Stating work on project

##First part. 

Repeating work, shown in teacher's part

In [1]:
!pip install lightfm nmslib

Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/5e/fe/8864d723daa8e5afc74080ce510c30f7ad52facf6a157d4b42dec83dfab4/lightfm-1.16.tar.gz (310kB)
[K     |████████████████████████████████| 317kB 5.2MB/s 
[?25hCollecting nmslib
[?25l  Downloading https://files.pythonhosted.org/packages/be/77/aebbd03a32488024d2ae2230b47a28f6fa83c887318e673fa5d3234f7772/nmslib-2.1.1-cp37-cp37m-manylinux2010_x86_64.whl (13.5MB)
[K     |████████████████████████████████| 13.5MB 217kB/s 
Collecting pybind11<2.6.2
[?25l  Downloading https://files.pythonhosted.org/packages/00/84/fc9dc13ee536ba5e6b8fd10ce368fea5b738fe394c3b296cde7c9b144a92/pybind11-2.6.1-py2.py3-none-any.whl (188kB)
[K     |████████████████████████████████| 194kB 39.7MB/s 
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.16-cp37-cp37m-linux_x86_64.whl size=706129 sha256=1e8736434237344a488666761c00fec804e40

In [40]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from pathlib import Path
import pickle

In [3]:
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

## Load and clean data

In [28]:
DATA_FOLDER = Path("data")

ratings = pd.read_csv(DATA_FOLDER/"ratings.csv")
books = pd.read_csv(DATA_FOLDER/"books.csv")
tags = pd.read_csv(DATA_FOLDER/"tags_cleaned.csv")
book_tags = pd.read_csv(DATA_FOLDER/"book_tags.csv")


In [29]:
tags.head()

Unnamed: 0,tag_id,tag_name
0,509,19th-century
1,923,20th-century
2,941,21st-century
3,1499,abuse
4,1540,action


In [30]:
mapper = dict(zip(books.goodreads_book_id,books.book_id))

In [31]:
book_tags 

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716
...,...,...,...
999907,33288638,21303,7
999908,33288638,17271,7
999909,33288638,1126,7
999910,33288638,11478,7


In [32]:
book_tags = book_tags[book_tags.tag_id.isin(tags.tag_id)]

In [33]:
book_tags["id"] = book_tags.goodreads_book_id.apply(lambda book_id: mapper[book_id])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [34]:
ratings.dropna(inplace=True)

## Creating sparse matrices

In [35]:
ratings_coo = sparse.coo_matrix((ratings.rating, (ratings.user_id, ratings.book_id)))

In [36]:
book_tags_coo = sparse.coo_matrix(([1]*len(book_tags), (book_tags.id, book_tags.tag_id)))

# Training model

In [37]:
#число потоков нашего процессора
NUM_THREADS = 8 

#число параметров вектора 
NUM_COMPONENTS = 30 

#число эпох обучения
NUM_EPOCHS = 10 

In [39]:
#Создаём модель
model = LightFM(learning_rate=0.05, loss='warp', no_components=NUM_COMPONENTS)
 
#Разбиваем наш датасет на обучающую и тестовую выборки
train,test = random_train_test_split(ratings_coo, test_percentage=0.2, random_state=None)

#Обучаем модель
model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS,item_features =book_tags_coo, verbose=True)

Epoch: 100%|██████████| 10/10 [09:27<00:00, 56.74s/it]


In [43]:
#Тестируем нашу модель
prec_score = precision_at_k(
                     model,
                     test,
                     num_threads=NUM_THREADS,
                     k=10,
                     item_features=book_tags_coo).mean()
 
recall_at_k = recall_at_k(model,
                     test,
                     num_threads=NUM_THREADS,
                     k=10,
                     item_features=book_tags_coo).mean()

print(recall_at_k,prec_score)

0.041151734593119146 0.07723892


In [44]:
with open("lightfm.model","wb") as f_model:
  pickle.dump(model, f_model)

# Embadings

In [45]:
# Достаём эбмеддинги

item_biases, item_embeddings = model.get_item_representations(features=book_tags_coo)

In [46]:
import nmslib
 
#Создаём наш граф для поиска
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
 
#Начинаем добавлять наши книги в граф
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

In [47]:
#Вспомогательная функция для поиска по графу
def nearest_books_nms(book_id, index, n=10):
    nn = index.knnQuery(item_embeddings[book_id], k=n)
    return nn

In [48]:
books[books.original_title.str.find('1984')>=0].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,Animal Farm / 1984,eng,4.26,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...


In [49]:
nbm = nearest_books_nms(846,nms_idx)[0]

In [50]:
books[books.book_id.isin(nbm)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
12,13,5470,5470,153313,995,451524934,9780452000000.0,"George Orwell, Erich Fromm, Celâl Üster",1949.0,Nineteen Eighty-Four,1984,eng,4.14,1956832,2053394,45518,41845,86425,324874,692021,908229,https://images.gr-assets.com/books/1348990566m...,https://images.gr-assets.com/books/1348990566s...
13,14,7613,7613,2207778,896,452284244,9780452000000.0,George Orwell,1945.0,Animal Farm: A Fairy Story,Animal Farm,eng,3.87,1881700,1982987,35472,66854,135147,433432,698642,648912,https://images.gr-assets.com/books/1424037542m...,https://images.gr-assets.com/books/1424037542s...
47,48,4381,4381,1272463,507,307347974,9780307000000.0,Ray Bradbury,1953.0,Fahrenheit 451,Fahrenheit 451,spa,3.97,570498,1176240,30694,28366,64289,238242,426292,419051,https://images.gr-assets.com/books/1351643740m...,https://images.gr-assets.com/books/1351643740s...
54,55,5129,5129,3204877,515,60929871,9780061000000.0,Aldous Huxley,1932.0,Brave New World,Brave New World,eng,3.97,1022601,1079135,20095,26367,60328,219895,389379,383166,https://images.gr-assets.com/books/1487389574m...,https://images.gr-assets.com/books/1487389574s...
270,271,18373,18373,3337594,163,156030306,9780156000000.0,Daniel Keyes,1966.0,Flowers for Algernon,Flowers for Algernon,en-US,4.07,313044,336199,11328,4223,14882,65106,122462,129526,https://images.gr-assets.com/books/1367141311m...,https://images.gr-assets.com/books/1367141311s...
288,289,76620,76620,1357456,193,038039586X,9780380000000.0,Richard Adams,1972.0,Watership Down,"Watership Down (Watership Down, #1)",eng,4.05,292426,308373,10399,9158,15767,52906,102093,128449,https://images.gr-assets.com/books/1405136931m...,https://images.gr-assets.com/books/1405136931s...
808,809,5479,5479,39947767,38,60776099,9780061000000.0,"Aldous Huxley, Christopher Hitchens",1932.0,Brave New World/Brave New World Revisited,Brave New World / Brave New World Revisited,eng,4.16,108124,110115,1012,1215,3784,18335,39753,47028,https://images.gr-assets.com/books/1331315450m...,https://images.gr-assets.com/books/1331315450s...
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,Animal Farm / 1984,eng,4.26,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...
902,903,667,667,287946,460,452281253,9780452000000.0,Ayn Rand,1938.0,Anthem,Anthem,eng,3.62,95620,106766,7096,6095,10982,27984,34074,27631,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
8139,8140,5481,5481,16335101,80,60898526,9780061000000.0,Aldous Huxley,1958.0,Brave New World Revisited,Brave New World Revisited,en-US,3.93,11073,12286,714,231,691,2765,4567,4032,https://images.gr-assets.com/books/1410136964m...,https://images.gr-assets.com/books/1410136964s...


Найдите рекомендации для книги Thomas Harris, The Silence of the Lambs. Какие книги присутствуют в списке рекомендаций?

In [52]:
books[books.original_title.str.find('The Silence of the Lambs')>=0].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,"The Silence of the Lambs (Hannibal Lecter, #2)",eng,4.14,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


In [54]:
nbm = nearest_books_nms(209,nms_idx)[0]

In [55]:
books[books.book_id.isin(nbm)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,"The Silence of the Lambs (Hannibal Lecter, #2)",eng,4.14,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...
273,274,22034,22034,266624,259,451205766,9780451000000.0,"Mario Puzo, Robert Thompson, Peter Bart",1969.0,The Godfather,The Godfather,eng,4.36,256480,270386,5832,2930,5985,30009,83730,147732,https://images.gr-assets.com/books/1394988109m...,https://images.gr-assets.com/books/1394988109s...
430,431,28877,28877,925503,191,525945563,9780526000000.0,Thomas Harris,1981.0,Red Dragon,"Red Dragon (Hannibal Lecter, #1)",eng,4.01,194013,205433,3309,3012,7790,43235,80662,70734,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
767,768,21686,21686,1234227,134,038073186X,9780381000000.0,Dennis Lehane,2003.0,Shutter Island,Shutter Island,eng,4.07,113718,124032,6990,1636,4727,22089,49875,45705,https://images.gr-assets.com/books/1329269081m...,https://images.gr-assets.com/books/1329269081s...
981,982,40024,40024,2266643,70,812976142,9780813000000.0,Caleb Carr,1994.0,The Alienist,"The Alienist (Dr. Laszlo Kreizler, #1)",eng,4.05,96981,100908,4026,1798,4571,18715,37572,38252,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
1484,1485,21704,21704,434,93,446698873,9780447000000.0,James Ellroy,1987.0,The Black Dahlia,"The Black Dahlia (L.A. Quartet, #1)",en-US,3.75,61412,65404,1944,1999,4999,17641,23340,17425,https://images.gr-assets.com/books/1387048173m...,https://images.gr-assets.com/books/1387048173s...
1801,1802,32418,32418,2992500,132,99297701,9780099000000.0,Thomas Harris,1999.0,Hannibal,"Hannibal (Hannibal Lecter, #3)",eng,3.72,57569,63555,2098,2166,5811,17220,20844,17514,https://images.gr-assets.com/books/1327356556m...,https://images.gr-assets.com/books/1327356556s...
4421,4422,32416,32416,46673,94,385339410,9780385000000.0,Thomas Harris,2006.0,Hannibal Rising,"Hannibal Rising (Hannibal Lecter, #4)",en-US,3.44,22767,25973,1317,1468,3733,8087,7174,5511,https://images.gr-assets.com/books/1394208690m...,https://images.gr-assets.com/books/1394208690s...
5312,5313,21727,21727,593515,46,307279952,9780307000000.0,Scott B. Smith,1993.0,A Simple Plan,A Simple Plan,,3.91,18628,19650,986,478,1086,4239,7690,6157,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
9792,9793,15677,15677,1187279,77,393321983,9780393000000.0,Patricia Highsmith,1950.0,Strangers on a Train,Strangers on a Train,eng,3.82,9788,11500,1055,195,720,2843,4922,2820,https://images.gr-assets.com/books/1331234879m...,https://images.gr-assets.com/books/1331234879s...


## Save embedings

In [58]:
item_embeddings.shape

(10001, 30)

In [61]:
with open("embeddings.pkl", "wb") as f_embeddings:
  pickle.dump(item_embeddings, f_embeddings, protocol=pickle.HIGHEST_PROTOCOL)