In [12]:
import pandas as pd
import os
from scipy.sparse import csr_matrix
import numpy as np
from IPython.display import display_html
import warnings

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
%matplotlib inline
import lightfm
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM
from skopt import forest_minimize

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(html_str.replace(
        'table', 'table style="display:inline"'), raw = True)

In [15]:
books_metadata = pd.read_json('c://Users/pr1266/Desktop/recommander_system/goodreads_books_poetry.json', lines=True)
interactions = pd.read_json('c://Users/pr1266/Desktop/recommander_system/goodreads_interactions_poetry.json', lines=True)

In [16]:
books_metadata.columns.values

array(['isbn', 'text_reviews_count', 'series', 'country_code',
       'language_code', 'popular_shelves', 'asin', 'is_ebook',
       'average_rating', 'kindle_asin', 'similar_books', 'description',
       'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month',
       'edition_information', 'publication_year', 'url', 'image_url',
       'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'], dtype=object)

In [17]:
books_metadata.sample(2)
books_metadata.shape

(36514, 29)

In [18]:
books_metadata_selected = books_metadata[['book_id', 'average_rating', 'is_ebook', 'num_pages', 
                                          'publication_year', 'ratings_count', 'language_code']]
books_metadata_selected.sample(5)

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
23711,525111,3.97,False,132,2005,83,
12931,705822,4.25,False,256,2007,11,
22458,8075404,4.34,False,535,2010,209,
35759,13591348,4.64,False,84,2012,28,
28594,1226393,4.33,False,60,2001,3,


In [20]:
import pandas_profiling
import numpy as np

books_metadata_selected.replace('', np.nan, inplace=True)
profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages', 
                                                                  'publication_year', 'ratings_count']])
profile.to_file('profiler_books_metadata_1.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [21]:

books_metadata_selected['num_pages'].replace(np.nan, -1, inplace = True)
books_metadata_selected['num_pages'] = pd.to_numeric(books_metadata_selected['num_pages'])
books_metadata_selected['num_pages'] = pd.cut(books_metadata_selected['num_pages'], bins=25)
books_metadata_selected['average_rating'] = books_metadata_selected['average_rating'].apply(lambda x: round(x * 2) / 2)
books_metadata_selected['ratings_count'] = pd.qcut(books_metadata_selected['ratings_count'], 25)
books_metadata_selected['publication_year'].replace(np.nan, 2100, inplace = True)
books_metadata_selected['language_code'].replace(np.nan, 'unknown', inplace = True)
books_metadata_selected['is_ebook'] = books_metadata_selected.is_ebook.map(
    lambda x: 1.0 * (x == 'true'))

In [23]:
profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages', 
                                                        'publication_year', 'ratings_count']])
profile.to_file('profiler_books_metadata_2.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [24]:
books_metadata_selected.sample(5)

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
2474,268009,4.0,0.0,"(-11.961, 437.44]",2001,"(49.0, 59.0]",unknown
22195,11687703,4.0,0.0,"(-11.961, 437.44]",2001,"(14.0, 16.0]",per
3117,140025,4.0,0.0,"(-11.961, 437.44]",2005,"(94.0, 125.0]",unknown
19747,15724823,5.0,1.0,"(-11.961, 437.44]",2012,"(5.0, 7.0]",eng
149,6554190,3.0,0.0,"(-11.961, 437.44]",1986,"(25.0, 29.0]",per


In [25]:
interactions.columns.values

array(['user_id', 'book_id', 'review_id', 'is_read', 'rating',
       'review_text_incomplete', 'date_added', 'date_updated', 'read_at',
       'started_at'], dtype=object)

In [26]:
interactions.sample(5)

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
1791717,c5823767a1a164cd8e9d029f1806f2aa,23337740,b08dad6f71ab9893846defe32ebd2766,True,5,,Tue Jul 21 13:32:26 -0700 2015,Tue Jul 21 13:32:32 -0700 2015,,
3404,5b31fc1fa8ba1f4e48b3a075e267ad9e,1371,d8740a72d60bc3255852e071dd786400,False,0,,Fri Jan 24 20:40:06 -0800 2014,Fri Jan 24 20:40:06 -0800 2014,,
2710251,84d5040696c32e59f562f94608efc169,30119,2fb6d95565b0cfc10f1d9ebdaeb030ed,True,5,,Tue Jul 25 02:20:37 -0700 2017,Tue Jul 25 02:20:37 -0700 2017,,
1944986,7ac2e7e075d5ac72df1ca4ee9f7f0572,46199,56e0b29d347b19c79cb6548b1477bb61,True,3,,Fri Aug 05 12:43:56 -0700 2016,Fri Aug 05 12:43:57 -0700 2016,,
788209,d3962209cc0fa41f797885167a324fd6,30119,848bd52f65bbacda9d1f7f99e185d49d,False,0,,Thu Feb 13 16:01:41 -0800 2014,Thu Feb 13 16:01:41 -0800 2014,,


In [27]:
interactions.shape

(2734350, 10)

In [28]:
interactions_selected = interactions[['user_id', 'book_id', 'is_read', 'rating']]
booleanDictionary = {True: 'true', False: 'false'}
interactions_selected['is_read'] = interactions_selected['is_read'].replace(booleanDictionary)

interactions_selected.sample(5)

Unnamed: 0,user_id,book_id,is_read,rating
878462,89e92699758ef55cca1b9971f97c8601,13516581,True,2
104162,24b358882aa7425f300c80d4ea059c9d,6352248,False,0
465319,9e3d67e280ffb1f56c61c3de44ae2dea,118285,False,0
1518326,dbd2f42a7f4ae9852dbc32c48f658ac5,139864,False,0
908035,8fe45035cd2d98e1f1a555790f27c377,6990156,False,0


In [32]:
profile = pandas_profiling.ProfileReport(interactions_selected[['is_read', 'rating']])
profile.to_file('profiler_interactions.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=16.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [33]:
interactions_selected['is_read'] = interactions_selected.is_read.map(
    lambda x: 1.0*(x == 'true'))

In [34]:
interactions_selected.sample(10)

Unnamed: 0,user_id,book_id,is_read,rating
2055404,d1ccf24f9af4f7d12a2a8af9dabbaee5,18079805,0.0,0
16699,6c43241093fc8157425ff04abed4c0f8,1589254,1.0,5
742128,6f2de977a0e4f74a051e040b73611a28,157687,0.0,0
1736057,edac5c19d761cacec641d88544adf75c,1432,1.0,5
789311,7c19bea1f1ebf109514dff0b40de065e,12009713,0.0,0
233935,870caad4a81d2c2ad80798d7ab5c794f,16170625,1.0,5
526688,1a487fc0784807b98980b8aa1cef21a7,76546,0.0,0
2011979,c768e77bb40a3c214fbc2256ed630a53,6045449,1.0,2
1240768,7788c676f68044585ba9061fb438d629,23513349,0.0,0
1363603,9f26f842f72f47d1c849257605770292,676,1.0,5


In [35]:
interactions_selected.groupby(['rating', 'is_read']).size().reset_index().pivot(columns = 'rating', index = 'is_read', values = 0)

rating,0,1,2,3,4,5
is_read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,1420740.0,,,,,
1.0,84551.0,20497.0,64084.0,237942.0,405565.0,500971.0


In [36]:
import random

interactions_selected = interactions_selected.loc[interactions_selected['is_read']==1, ['user_id', 'book_id', 'rating']]
interactions_selected = interactions_selected[interactions_selected['user_id'].isin(random.sample(list(interactions_selected['user_id'].unique()), 
                                                                                                  k = 5000))]
interactions_selected.sample(10)

Unnamed: 0,user_id,book_id,rating
2542635,cdbb6210e92ce74536df17bfb8e55d54,23919,4
2346441,c46b4e1326ec5cf30685c7ecdeed92ca,32053,4
2479645,ae51eeb65d388b0adb8cdd3d87d3ba41,7824768,0
1770206,56b0b3202f94c48ee5c9154a11ee14aa,11747831,4
1517640,65bd653064c171b654adffc610502ee4,1381,5
410072,a648278f5d4eb1c1722888792b2958c4,508192,4
1682794,bb8a48d2e08411deb01351ba723d7bdb,1371,3
1176586,17a6ab8d07d16fad518b058882395631,2168850,5
2656237,2980cf0859c347e314d5401d3fe407bd,5289,5
751696,70bc0acefbfa9f2eaadff9c33a4ce7f8,89220,5


In [37]:
interactions_selected.shape

(23894, 3)

In [38]:
item_dict ={}
df = books_metadata[['book_id', 'title']].sort_values('book_id').reset_index()

for i in range(df.shape[0]):
    item_dict[(df.loc[i,'book_id'])] = df.loc[i,'title']

In [39]:
books_metadata_selected_transformed = pd.get_dummies(books_metadata_selected, columns = ['average_rating', 'is_ebook', 'num_pages', 
                                                                                         'publication_year', 'ratings_count', 
                                                                                         'language_code'])

books_metadata_selected_transformed = books_metadata_selected_transformed.sort_values('book_id').reset_index().drop('index', axis=1)
books_metadata_selected_transformed.head(5)

Unnamed: 0,book_id,average_rating_0.0,average_rating_1.0,average_rating_1.5,average_rating_2.0,average_rating_2.5,average_rating_3.0,average_rating_3.5,average_rating_4.0,average_rating_4.5,...,language_code_tel,language_code_tgl,language_code_tha,language_code_tlh,language_code_tur,language_code_ukr,language_code_unknown,language_code_urd,language_code_vie,language_code_zho
0,234,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,236,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,241,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,244,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,254,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [40]:
books_metadata_csr = csr_matrix(books_metadata_selected_transformed.drop('book_id', axis=1).values)
books_metadata_csr

<36514x357 sparse matrix of type '<class 'numpy.uint8'>'
	with 219084 stored elements in Compressed Sparse Row format>

In [41]:
user_book_interaction = pd.pivot_table(interactions_selected, index='user_id', columns='book_id', values='rating')
user_book_interaction = user_book_interaction.fillna(0)
user_book_interaction.head(10)

book_id,234,254,286,289,290,291,292,448,676,1371,...,35606560,35654589,35668923,35670989,35783117,35846198,35896040,35905478,36049650,36126998
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000200adc5009a722bdee406efe18ea6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000883382802f2d95a3dd545bb953882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001323f7d91674e9b8b3d4c4e7fd17fc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0021909de18ab01ec27517ea8dc0aa93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00265ed033c788fce8a86120e55cf253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002db785799846c260d69824af6dc36d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0036baac32a945baa66252ce5795373f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
004d435847c22f0325f5f4700b21b00d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
004fa6050c0916488f5da8d94b1654e5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005d83a471aed1691c8447b52ce4baaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
user_id = list(user_book_interaction.index)
user_dict = {}
counter = 0 
for i in user_id:
    user_dict[i] = counter
    counter += 1

In [43]:
user_book_interaction_csr = csr_matrix(user_book_interaction.values)
user_book_interaction_csr

<5000x7210 sparse matrix of type '<class 'numpy.float64'>'
	with 22313 stored elements in Compressed Sparse Row format>

In [44]:
model = LightFM(loss='warp',
                random_state = 2016,
                learning_rate = 0.90,
                no_components = 150,
                user_alpha = 0.000005)

model = model.fit(user_book_interaction_csr,
                  epochs = 100,
                  num_threads = 16, verbose = False)

In [45]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 5, show = True):
    
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items), item_features=books_metadata_csr))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index).sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print ("User: " + str(user_id))
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter += 1

        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter += 1

In [57]:
sample_recommendation_user(model, user_book_interaction, '004d435847c22f0325f5f4700b21b00d', user_dict, item_dict)

User: 004d435847c22f0325f5f4700b21b00d
Known Likes:
1- The Raven and Other Poems
2- Where the Sidewalk Ends
3- A Light in the Attic
4- The Iliad

 Recommended Items:
1- The Oresteia
2- Metamorphoses
3- Selected Poems
4- Foolsgold: Making Something from Nothing and Freeing Your Creative Process
5- The Waste Land
