# 1. Imports

In [21]:
#pip install lightfm
#pip install scikit_optimize
#pip install pandas-profiling

In [22]:
# import dependent libraries
import pandas as pd
import os
from scipy.sparse import csr_matrix
import numpy as np
from IPython.display import display_html
import warnings

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
%matplotlib inline

from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM
from skopt import forest_minimize

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(html_str.replace(
        'table', 'table style="display:inline"'), raw=True)

# 2. Data

In [23]:
%%time
books_metadata = pd.read_json('data/goodreads_books_poetry.json', lines=True)
interactions = pd.read_json('data/goodreads_interactions_poetry.json', lines=True)

CPU times: total: 33.4 s
Wall time: 34.2 s


# 3. Data Inspection & Preparation

In [24]:
books_metadata.sample(3)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
23231,8845263436,6,[],US,ita,"[{'count': '830', 'name': 'to-read'}, {'count'...",,False,3.83,,...,10,"I libri di Tolkien, Letteraria straniera",2009,https://www.goodreads.com/book/show/9707838-la...,https://images.gr-assets.com/books/1327356712m...,9707838,42,6538830,La Leggenda di Sigurd e Gudrún,La Leggenda di Sigurd e Gudrún
22411,XXXXOLP004,42,[],US,eng,"[{'count': '105', 'name': 'to-read'}, {'count'...",,False,3.14,,...,4,,2011,https://www.goodreads.com/book/show/13076893-p...,https://images.gr-assets.com/books/1321637322m...,13076893,58,18243823,People Who Don't Know Me Think I'm Somebody,People Who Don't Know Me Think I'm Somebody
21469,0060952571,8,[],US,eng,"[{'count': '25', 'name': 'to-read'}, {'count':...",,False,3.97,B003ZSHUJS,...,10,,1997,https://www.goodreads.com/book/show/1063570.Fa...,https://s.gr-assets.com/assets/nophoto/book/11...,1063570,31,1050210,Falling Water,Falling Water


In [25]:
books_metadata.shape

(36514, 29)

In [26]:
books_metadata.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

- Selecionando apenas algumas variáveis da nossa base de dados de livros:

In [27]:
books_metadata_selected = books_metadata[['book_id', 'average_rating', 'is_ebook', 'num_pages', 
                                          'publication_year', 'ratings_count', 'language_code']]
books_metadata_selected.sample(5)

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
2651,13553229,3.83,False,800.0,2008,159,ara
3434,27178990,3.88,False,312.0,2016,8,
6403,150251,4.28,False,260.0,2000,1570,en-US
5584,1127913,4.48,False,,1982,20,
1338,16174875,3.33,True,40.0,2012,143,eng


- Análise Exploratória Preliminar:

In [28]:
import pandas_profiling
import numpy as np

# replace blank cells with NaN
books_metadata_selected.replace('', np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_metadata_selected.replace('', np.nan, inplace=True)


In [29]:
# not taking book_id into the profiler report
profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages',
                                                                  'publication_year', 'ratings_count']])
profile.to_file('results/profiler_books_metadata_1.html')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

- Considerando os resultados anteriores, algumas transformações podem ser feitas:
    + Substituir os valores faltantes de variáveis categóricas com outros valores a fim de criar uma nova categoria
    + Converter valores de variáveis numéricas em intervalos discretos

In [30]:
# using pandas cut method to convert fields into discrete intervals
books_metadata_selected['num_pages'].replace(np.nan, -1, inplace=True)
books_metadata_selected['num_pages'] = pd.to_numeric(books_metadata_selected['num_pages'])
books_metadata_selected['num_pages'] = pd.cut(books_metadata_selected['num_pages'], bins=25)

# rounding ratings to nearest .5 score
books_metadata_selected['average_rating'] = books_metadata_selected['average_rating'].apply(lambda x: round(x*2)/2)

# using pandas qcut method to convert fields into quantile-based discrete intervals
books_metadata_selected['ratings_count'] = pd.qcut(books_metadata_selected['ratings_count'], 25)

# replacing missing values to year 2100
books_metadata_selected['publication_year'].replace(np.nan, 2100, inplace=True)

# replacing missing values to 'unknown'
books_metadata_selected['language_code'].replace(np.nan, 'unknown', inplace=True)


# convert is_ebook column into 1/0 where true=1 and false=0
books_metadata_selected['is_ebook'] = books_metadata_selected.is_ebook.map(
    lambda x: 1.0*(x == 'true'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_metadata_selected['num_pages'].replace(np.nan, -1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_metadata_selected['num_pages'] = pd.to_numeric(books_metadata_selected['num_pages'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_metadata_selected['num_pages'] = pd.cut(books_metadata_selected['num_pages'], bins=25)
A value

In [31]:
profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages', 
                                                        'publication_year', 'ratings_count']])
profile.to_file('./results/profiler_books_metadata_2.html')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
books_metadata_selected.sample(5)

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
4465,1245616,4.0,0.0,"(-11.961, 437.44]",1961,"(10.0, 12.0]",eng
5908,82362,3.5,0.0,"(-11.961, 437.44]",2005,"(29.0, 34.0]",eng
25121,4746103,3.5,0.0,"(-11.961, 437.44]",1990,"(12.0, 14.0]",per
13343,7932491,4.5,0.0,"(-11.961, 437.44]",2010,"(59.0, 73.0]",unknown
21995,764135,4.5,0.0,"(-11.961, 437.44]",1987,"(49.0, 59.0]",unknown


## Data Inspection & Preparation: Interactions data

In [48]:
interactions.columns.values

array(['user_id', 'book_id', 'review_id', 'is_read', 'rating',
       'review_text_incomplete', 'date_added', 'date_updated', 'read_at',
       'started_at'], dtype=object)

In [49]:
interactions.sample(5)

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
2598608,9410d2e023afb85c24206823958a874a,71654,52be467b884e4268f2567e87f458cda6,False,0,,Wed Jun 14 11:01:27 -0700 2017,Wed Jun 14 11:01:28 -0700 2017,,
1810316,1d2cede91667daf4b1cf2334d4bf2501,99713,b7ef86d8bcb91fe69f0a7bad02c0563d,True,5,,Wed Jun 06 20:38:27 -0700 2012,Wed Jun 06 20:38:27 -0700 2012,,
1094332,1166d9ed5beb38126be3cb4158cf0843,15479231,325b1a50b17abe47cd3616aa095e2552,True,5,"<a target=""_blank"" href=""http://youtu.be/siZgc...",Tue Dec 16 11:50:36 -0800 2014,Tue Dec 16 11:51:17 -0800 2014,,
459380,1f74f1f40beeedb19db82aabc9e27124,203220,32c345f6d62f8f28d1ea9127cae4074e,False,0,,Sat Feb 02 05:43:34 -0800 2013,Sat Feb 02 05:43:34 -0800 2013,,
762557,3c707803c0d7b4423e32e020f47ad281,11904233,52dc2571487c65c42d5f367f57bd092a,False,0,,Sat Nov 15 12:45:27 -0800 2014,Sat Nov 15 12:45:28 -0800 2014,,


In [50]:
interactions['is_read'].value_counts()

False    1420740
True     1313610
Name: is_read, dtype: int64

In [35]:
interactions.shape

(2734350, 10)

- Vamos selecionar apenas algumas variáveis, que exigem o mínimo de manipulação. Mas, é bom lembrar que, quanto mais informação melhor, para gerar bons sistemas de recomendação. 

In [51]:
# Limit the books metadata to selected fields
interactions_selected = interactions[['user_id', 'book_id', 'is_read', 'rating']]

# mapping boolean to string
booleanDictionary = {True: 'true', False: 'false'}
interactions_selected['is_read'] = interactions_selected['is_read'].replace(booleanDictionary)

interactions_selected.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_selected['is_read'] = interactions_selected['is_read'].replace(booleanDictionary)


Unnamed: 0,user_id,book_id,is_read,rating
1619044,b3b8ec8a22ea3b542c7f0deb797b393d,1420,True,5
1210164,c9911d89be00c3f06a2dfc6d45ee6ec2,22909597,False,0
443103,503044091e3321cfaf79e59f14fb02e4,137126,True,3
1911768,ccfad5b61800d59744e5a50fc2fe8822,16128472,True,5
1206120,23b7c6868d2dfe394b8395e86398eb26,8113299,True,3


In [37]:
profile = pandas_profiling.ProfileReport(interactions_selected[['is_read', 'rating']])
profile.to_file('results/profiler_interactions.html')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

- Algumas transformações a serem realizadas:
    + converter `is_read` para 1/0

In [52]:
# convert is_read column into 1/0 where true=1 amd false=0
interactions_selected['is_read'] = interactions_selected.is_read.map(
    lambda x: 1.0*(x=='true'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_selected['is_read'] = interactions_selected.is_read.map(


In [53]:
interactions_selected.sample(5)

Unnamed: 0,user_id,book_id,is_read,rating
2402235,8aefd78afa68a7bbb0d3073e95e4db29,2711,1.0,5
2682041,db6f0646226f0f95eefcebfbef28f294,30119,0.0,0
1729109,aa75da92b43f3083ac6a2a43b6b91adb,23841432,0.0,0
1870950,e0779fdff9a8822169931f61de9f510b,15997,1.0,5
2499496,8a6eb52145c155f461c8f8ec8dde76c1,346573,0.0,0


- Uma vez que temos dois campos denotando interação entre usuário e livro, vamos checar quantos data points temos em que o usuário não leu o livro, mas mesmo assim o avaliou. 

In [54]:
interactions_selected.groupby(['rating', 'is_read']).size().reset_index().pivot(columns='rating', index='is_read', values=0)

rating,0,1,2,3,4,5
is_read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,1420740.0,,,,,
1.0,84551.0,20497.0,64084.0,237942.0,405565.0,500971.0


- Podemos concluir que usuários com rating $\geq 1$ leram o livro. Então, vamos usar `ratings` coo o score final, dropar interações nas quais `is_read` é falso, e limitar os dados de interação a 5000 usuários aleatórios para limitar o tamanho dos dados em análises futuras. 

In [55]:
import random

interactions_selected = interactions_selected.loc[interactions_selected['is_read']==1, ['user_id', 'book_id', 'rating']]
interactions_selected = interactions_selected[interactions_selected['user_id'].isin(random.sample(list(interactions_selected['user_id'].unique()),
                                                                                                 k=5000))]
interactions_selected.sample(10)

Unnamed: 0,user_id,book_id,rating
1099100,660259203ff8748bbda74d7282288dd0,18295863,4
1874790,27ff98e9a56c8ad23b5c3b37d0e838ad,6871008,5
2353983,44a1e0fd79b3e49a854d15a88e562c84,147923,5
2255203,0920eaa05410c565793b5117600336dc,118389,3
845896,d503047f12dbf56e2715db4bd92de364,1382,3
2555372,3b33bcf1ffa90e8bc3f13c5c377f3c8d,72911,0
92098,94b604b8e6092eef24e27c0304b4d5fe,30119,4
880934,525cdb3ad7d0ddcf4f27a097ecdcd33a,9756378,3
1898078,59bca8b9aecd9af6b08755b1494b8480,9771670,4
1685597,9704ed4503e822cbb3d3fce71b5110e8,395090,5


In [56]:
interactions_selected.shape


(21086, 3)

## Data Preprocessing 

Agora, transformamos os dados disponíveis em matrizes esparsas que podem ser usadas em operações. Vamos criar o books_metadata, cada linha contém os pesos dos livros para cada feature. Antes, porém, vamos criar um dicionário de itens para referência futura.  

In [58]:
item_dict = {}
df = books_metadata[['book_id', 'title']].sort_values('book_id').reset_index()

for i in range(df.shape[0]):
    item_dict[(df.loc[i, 'book_id'])] = df.loc[i, 'title']

In [60]:
# dummify categorcial fetures
books_metadata_selected_transformed = pd.get_dummies(books_metadata_selected, columns = ['average_rating', 'is_ebook', 'num_pages',
                                                                                        'publication_year', 'ratings_count', 
                                                                                        'language_code'])

books_metadata_selected_transformed = books_metadata_selected_transformed.sort_values('book_id').reset_index().drop('index', axis=1)
books_metadata_selected_transformed.head(5)

Unnamed: 0,book_id,average_rating_0.0,average_rating_1.0,average_rating_1.5,average_rating_2.0,average_rating_2.5,average_rating_3.0,average_rating_3.5,average_rating_4.0,average_rating_4.5,...,language_code_tel,language_code_tgl,language_code_tha,language_code_tlh,language_code_tur,language_code_ukr,language_code_unknown,language_code_urd,language_code_vie,language_code_zho
0,234,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,236,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,241,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,244,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,254,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [61]:
# convert to csr (compressed sparse row) matrix
books_metadata_csr = csr_matrix(books_metadata_selected_transformed.drop('book_id', axis=1).values)
books_metadata_csr

<36514x357 sparse matrix of type '<class 'numpy.uint8'>'
	with 219084 stored elements in Compressed Sparse Row format>

- Agora, criamos uma matriz de interações, também com um dicionário para uso futuro de casos.

In [63]:
user_book_interaction = pd.pivot_table(interactions_selected, index='user_id', columns='book_id', values='rating')

# fill missing values with 0
user_book_interaction = user_book_interaction.fillna(0)

user_book_interaction.head()

book_id,234,236,244,254,284,285,286,290,291,292,...,35691576,35738258,35784849,35826743,35846198,35903748,35960350,36054248,36056405,36350410
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0019b485b2e71132a2cc6059cad87f26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00355d3fe92a6c0db46e20f152195fa7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
004f7fe78d7909a898fac66d7eac9ec2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00677ed28927d8f3b3984a00a800decf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00729e6ca632b3d3e442b4da2f1dc13c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
user_id = list(user_book_interaction.index)
user_dict = {}
counter = 0

for i in user_id:
    user_dict[i] = counter
    counter += 1

In [66]:
# convert to csr matrix
user_book_interaction_csr = csr_matrix(user_book_interaction.values)
user_book_interaction_csr

<5000x6343 sparse matrix of type '<class 'numpy.float64'>'
	with 19789 stored elements in Compressed Sparse Row format>

# Model Training

In [70]:
model = LightFM(loss='warp',
                random_state=2016,
                learning_rate=0.90,
                no_components=150,
                user_alpha=0.000005)

model = model.fit(user_book_interaction_csr,
                  epochs=100,
                  num_threads=16, verbose=False)

In [81]:
def sample_recommendation_user(model, interactions, user_id, user_dict,
                              item_dict, threshold=0, nrec_items=5, show=True):
    
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x, np.arange(n_items), item_features=books_metadata_csr))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id, :]\
                                [interactions.loc[user_id, :] > threshold].index).sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    
    if show == True:
        print("User: " + str(user_id))
        print("Known Likes: ")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter += 1
            
        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter += 1

In [83]:
sample_recommendation_user(model, user_book_interaction, '00677ed28927d8f3b3984a00a800decf', user_dict, item_dict)

User: 00677ed28927d8f3b3984a00a800decf
Known Likes: 
1- Milk and Honey
2- Selected Poems
3- Hamlet
4- The Odyssey

 Recommended Items:
1- King Me
2- Space, in Chains
3- Tin House: Rehab
4- Mi chica revolucionaria
5- The Psalms with Commentary
