Recommender model notes

Questions to be answered
a) What is LightFM?
b) Do we have feature importances / partial dependence plots with LightFM?
c) Do we need separate training and test datasets in the case of an unsupervised model?
d) Alternative methodologies available to develop model along with pros and cons
e) Possible issues with LightFM
f) What feature transformations/hyperparameters are best for LightFM?

A hybrid recommender system is a special type of recommender system that combines both content-based and collaborative filtering methods. Combining collaborative filtering and content-based filtering can be more effective in some cases. Hybrid approaches can be implemented in several ways: by making content-based and collaborative-based predictions separately and then combining them, or by adding content-based capabilities to a collaborative-based approach (and vice versa).

a) Detailed discussion of LightFM, along with its advantages and process, in this Kaggle notebook and paper:
https://www.kaggle.com/bond0071/lightfm-hybrid-recommendation-system
https://arxiv.org/pdf/1507.08439.pdf

Other good reads:
https://towardsdatascience.com/recommendation-system-in-python-lightfm-61c85010ce17

Credits: https://github.com/kapadias/mediumposts/blob/master/recommender/published_notebooks/recommendation_python_lightfm.ipynb



In [1]:
# !pip install lightfm --user

In [2]:
# !pip install scikit-optimize --user   (the `skopt` module is distributed as scikit-optimize)

In [3]:

# import dependent libraries
import pandas as pd
import os
from scipy.sparse import csr_matrix
import numpy as np
from IPython.display import display_html
import warnings

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
%matplotlib inline

from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM
# from skopt import forest_minimize

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each argument is converted with ``to_html()``; the resulting tables are
    concatenated into one HTML string and shown via
    ``display_html(..., raw=True)``.

    Args:
        *args: objects exposing ``to_html()`` (e.g. pandas DataFrames).
    """
    html_str = ''.join(df.to_html() for df in args)
    # Style only the opening ``<table`` tag. Replacing the bare word 'table'
    # also rewrote the closing ``</table>`` tag and any cell text that
    # happened to contain "table".
    display_html(html_str.replace(
        '<table', '<table style="display:inline"'), raw=True)


# update the working directory to the root of the project
# The data paths used below are relative to the folder that contains
# 'RecommenderModel'. Only step up when that folder is not visible from the
# current directory, so re-running this cell does not climb one more level
# each time.
if not os.path.isdir('RecommenderModel'):
    os.chdir('..')
# NOTE(review): this silences every warning for the rest of the session,
# including pandas' chained-assignment warnings.
warnings.filterwarnings("ignore")



In [4]:
# Show the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
# Confirm the working directory that the relative data paths will resolve against
os.getcwd()

'C:\\Users\\aditi\\OneDrive\\Desktop\\ALL STUDY\\Kaggle'

In [6]:
%%time
# Load the Goodreads poetry book metadata and the user-book interactions.
# lines=True reads each file as one JSON object per line.
books_metadata = pd.read_json('./RecommenderModel/Data/goodreads_books_poetry.json', lines=True)
interactions = pd.read_json('./RecommenderModel/Data/goodreads_interactions_poetry.json', lines=True)

Wall time: 1min 20s


In [7]:
# Quick look at the metadata: column names, two random rows, and (rows, columns)
books_metadata.columns.values
books_metadata.sample(2)
books_metadata.shape

array(['isbn', 'text_reviews_count', 'series', 'country_code',
       'language_code', 'popular_shelves', 'asin', 'is_ebook',
       'average_rating', 'kindle_asin', 'similar_books', 'description',
       'format', 'link', 'authors', 'publisher', 'num_pages',
       'publication_day', 'isbn13', 'publication_month',
       'edition_information', 'publication_year', 'url', 'image_url',
       'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'], dtype=object)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
17292,080706887X,111,[],US,eng,"[{'count': '1183', 'name': 'to-read'}, {'count...",,False,4.58,B00BVJHM14,...,,,,https://www.goodreads.com/book/show/65350.New_...,https://s.gr-assets.com/assets/nophoto/book/11...,65350,2133,6398751,"New and Selected Poems, Vol. 2","New and Selected Poems, Vol. 2"
1892,0942996046,2,[],US,,"[{'count': '11', 'name': 'to-read'}, {'count':...",,False,4.38,,...,1.0,,1986.0,https://www.goodreads.com/book/show/1330786.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,1330786,8,1320276,The Indian Never Had a Horse and Other Poems,The Indian Never Had a Horse and Other Poems


(36514, 29)

In [8]:
# Limit the books metadata to selected fields.
# .copy() makes this an independent frame, so the in-place clean-up and the
# column assignments in later cells act on it directly rather than on a
# selection of books_metadata (which triggers SettingWithCopyWarning).
books_metadata_selected = books_metadata[['book_id', 'average_rating', 'is_ebook', 'num_pages',
                                          'publication_year', 'ratings_count', 'language_code']].copy()
books_metadata_selected.sample(5)

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
21845,1429465,4.09,False,87,2004.0,22,
8008,22047498,4.09,False,82,,20,en-US
11253,1570130,4.5,False,63,1999.0,10,
6403,150251,4.28,False,260,2000.0,1570,en-US
17908,18775905,4.0,True,110,2013.0,1,eng


In [9]:
import pandas_profiling
import numpy as np

# replace blank cells with NaN
# (reassignment instead of inplace=True: same result, without mutating in
# place a frame that was selected out of books_metadata)
books_metadata_selected = books_metadata_selected.replace('', np.nan)

# not taking book_id into the profiler report
# profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages', 
#                                                                   'publication_year', 'ratings_count']])
# profile.to_file('./RecommenderModel/results/profiler_books_metadata_1.html')

In [10]:
# First rows after blank strings were replaced with NaN
books_metadata_selected.head()

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
0,16037549,3.83,False,80.0,1887,3,eng
1,22466716,3.83,False,128.0,2015,37,
2,926662,4.38,False,,2008,45,
3,926667,3.71,False,190.0,1964,115,
4,29065952,5.0,False,118.0,2015,9,eng


In [11]:
# Save the selected metadata columns to the working directory
# (default to_csv settings, so the DataFrame index is written as the first column)
books_metadata_selected.to_csv('books_metadata.csv')


In [None]:
# Turn the metadata columns into discrete / cleaned values.
# The column-level `df[col].replace(..., inplace=True)` calls were replaced by
# plain assignments: the chained in-place form works on an intermediate Series
# and is not guaranteed to update the frame (it does not under pandas
# copy-on-write).

# using pandas cut method to convert fields into discrete intervals
# (missing page counts are set to -1 before binning)
books_metadata_selected['num_pages'] = books_metadata_selected['num_pages'].replace(np.nan, -1)
books_metadata_selected['num_pages'] = pd.to_numeric(books_metadata_selected['num_pages'])
books_metadata_selected['num_pages'] = pd.cut(books_metadata_selected['num_pages'], bins=25)

# rounding ratings to nearest .5 score
books_metadata_selected['average_rating'] = books_metadata_selected['average_rating'].apply(lambda x: round(x*2)/2)

# using pandas qcut method to convert fields into quantile-based discrete intervals
books_metadata_selected['ratings_count'] = pd.qcut(books_metadata_selected['ratings_count'], 25)

# replacing missing values to year 2100
books_metadata_selected['publication_year'] = books_metadata_selected['publication_year'].replace(np.nan, 2100)

# replacing missing values to 'unknown'
books_metadata_selected['language_code'] = books_metadata_selected['language_code'].replace(np.nan, 'unknown')


# convert is_ebook column into 1/0 where true=1 and false=0
# Accepts the string 'true' as well as a real boolean True (and an already
# converted 1.0): comparing only against 'true' maps a boolean column to all
# zeros.
books_metadata_selected['is_ebook'] = books_metadata_selected['is_ebook'].map(
    lambda flag: 1.0 * (flag in (True, 'true')))

In [None]:
# First rows after the discretisation cell above
books_metadata_selected.head()
# profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages', 
#                                                         'publication_year', 'ratings_count']])
# profile.to_file('./RecommenderModel/results/profiler_books_metadata_2.html')

In [12]:
# Quick look at the interactions: column names, five random rows, and (rows, columns)
interactions.columns.values
interactions.sample(5)
interactions.shape

array(['user_id', 'book_id', 'review_id', 'is_read', 'rating',
       'review_text_incomplete', 'date_added', 'date_updated', 'read_at',
       'started_at'], dtype=object)

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
679725,ad49333737d28c4069a325773587534b,17937655,2d53bbb230d536f52228dd64162a0a26,False,0,,Sun Jan 05 12:15:59 -0800 2014,Sun Jan 05 12:15:59 -0800 2014,,
158171,3f35f6a56868e0c1e1659a7547d932b5,27494,c5150fe64822f0ce932a15bb8cebb0d2,True,4,Date read is an educated guess. I quite enjoye...,Wed Dec 19 16:04:48 -0800 2007,Wed Dec 19 16:07:23 -0800 2007,Mon Feb 01 00:00:00 -0800 1999,
2242131,5e9249908d98b966b58a29a42d2442cb,237794,7010ee5f68cb97cd5779f342fd5d91ee,True,3,,Sat Feb 16 05:19:10 -0800 2013,Sat Feb 16 05:19:17 -0800 2013,,
2271807,957d6014f74b5176e1cab9240de8ecf6,146208,82c4356eaacfed5ac831665961425275,True,4,,Wed Feb 20 19:16:52 -0800 2008,Wed Feb 20 19:16:52 -0800 2008,,
2484864,65c12e85653cdc59e0ec9728dba120fa,5867256,94324a8a605cb8ac9ace4647c1ba8d23,True,5,,Mon Sep 02 12:58:35 -0700 2013,Mon Sep 02 12:58:49 -0700 2013,Mon Sep 02 12:58:49 -0700 2013,


(2734350, 10)

In [20]:

# Limit the interactions to selected fields.
# .copy() gives an independent frame, so the column assignment below does not
# write into a selection of `interactions`.
interactions_selected = interactions[['user_id', 'book_id', 'is_read', 'rating']].copy()

# mapping boolean to string
booleanDictionary = {True: 'true', False: 'false'}
interactions_selected['is_read'] = interactions_selected['is_read'].replace(booleanDictionary)

interactions_selected.sample(5)
# profile = pandas_profiling.ProfileReport(interactions_selected[['is_read', 'rating']])
# profile.to_file('./RecommenderModel/results/profiler_interactions.html')

Unnamed: 0,user_id,book_id,is_read,rating
696616,446a78a97c434dca98e7ea176ee9fa1f,13625561,False,0
2571022,d41507942f298d4a9fe5b42b9e573ff7,29863352,False,0
634577,12f917ba157de3ec0bef08acfce4d000,15855683,True,5
966821,73a277acf8817f12f5a0d23ffba0b311,3322992,True,4
2263290,08320c0f71fced731c3c886e5d2b38c0,23513349,True,4


In [21]:
# convert is_read column into 1/0 where true=1 and false=0
# Besides the string 'true', a boolean True and an already converted 1.0 also
# map to 1.0, so running this cell a second time no longer turns every value
# into 0.0.
interactions_selected['is_read'] = interactions_selected['is_read'].map(
    lambda flag: 1.0 * (flag in (True, 'true')))
interactions_selected.sample(10)

Unnamed: 0,user_id,book_id,is_read,rating
2592636,1308b1cd9282e2fa86df9bc6d9713bf4,23780641,1.0,5
543221,bec52dcfb7218307476dec982ff9d76c,1246850,0.0,0
1194113,988f87d4afd617da6c592af77a730f1d,1381,0.0,0
70799,ac2824b924a7998b465850b610995657,35606560,0.0,0
1625575,339a20c85c9f0b5ae2b94a7c3f692f0d,47730,0.0,0
1377846,eae4ea576e19cc78f3ebfb1d628366ae,34714825,0.0,0
395985,e3555b1fc3f8d74964158c64eebb2d33,2547,0.0,0
2703259,fa45d06995de2cf72b9cf604328adf01,157985,1.0,5
1356421,7e2375c9fd9ecb137223189d428f9a62,146152,0.0,0
369195,362951a48642978f7ea95ac9e25a76ed,332226,0.0,0


In [22]:
# Count interactions per (rating, is_read) pair and lay them out as a table:
# rows = is_read, columns = rating
interactions_selected.groupby(['rating', 'is_read']).size().reset_index().pivot(columns='rating', index='is_read', values=0)

rating,0,1,2,3,4,5
is_read,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,1420740.0,,,,,
1.0,84551.0,20497.0,64084.0,237942.0,405565.0,500971.0


In [23]:

import random

# Sampling parameters; the fixed seed makes the selected users reproducible
# across kernel restarts.
N_SAMPLE_USERS = 1000
SAMPLE_SEED = 42

# Keep only rows flagged as read. 'is_read' is dropped from the result, so
# this cell needs a freshly built interactions_selected to be run again.
interactions_selected = interactions_selected.loc[interactions_selected['is_read']==1, ['user_id', 'book_id', 'rating']]

# Draw a random subset of users. k is capped at the number of distinct users
# because random.sample raises ValueError when k exceeds the population size.
unique_users = list(interactions_selected['user_id'].unique())
sampled_users = random.Random(SAMPLE_SEED).sample(
    unique_users, k=min(N_SAMPLE_USERS, len(unique_users)))
interactions_selected = interactions_selected[interactions_selected['user_id'].isin(sampled_users)]

interactions_selected.sample(10)

Unnamed: 0,user_id,book_id,rating
612187,a223e258f11871ac0fd2761c1777b07e,1383945,5
1350730,73c0df56edf216dd642dab26bcdecbd8,27822,5
1165094,13a6a4a45aed2d1fac8be1922c4ed390,780534,5
57945,1489911bb792e073cf8edf37bdb683ac,31435158,4
936537,72e3c14956b032cdb62fd1ff58cfd413,763426,3
2472485,271774ce7b882ae1445b659e18409e07,30118,4
1956676,89665f0397305dc80464671f91a99153,30341619,0
1562955,4d290e2a85b00448c24445b49fbcc64b,1384,2
737536,cf1caea8983d0cd0d9d49c7a7e866a6e,618944,3
132001,d7d78c3e0bf6bba1e619e55ae148acf0,22375630,5


In [24]:
# (rows, columns) left after filtering to read books and sampling users
interactions_selected.shape

(4529, 3)

In [25]:
# Save the sampled interactions to the working directory
# (default to_csv settings, so the DataFrame index is written as the first column)
interactions_selected.to_csv('interactions.csv')

In [None]:
# Lookup table book_id -> title, built from the full metadata frame.
# dict(zip(...)) replaces the row-by-row .loc loop: it produces the same
# mapping (a later row still wins when a book_id repeats) without one indexed
# lookup per book.
df = books_metadata[['book_id', 'title']].sort_values('book_id').reset_index()
item_dict = dict(zip(df['book_id'], df['title']))

In [None]:
# Show the size and a few entries instead of echoing the whole dictionary
# (it holds one entry per distinct book_id)
len(item_dict)
list(item_dict.items())[:5]

In [None]:
# Check the metadata frame that is about to be one-hot encoded
books_metadata_selected.head(5)

In [None]:
# dummify categorical features
books_metadata_selected_transformed = pd.get_dummies(books_metadata_selected, columns = ['average_rating', 'is_ebook', 'num_pages',
                                                                                         'publication_year', 'ratings_count',
                                                                                         'language_code'])

# order rows by book_id and renumber them from 0 (drop=True discards the old
# index instead of adding it as a column and dropping it afterwards)
books_metadata_selected_transformed = books_metadata_selected_transformed.sort_values('book_id').reset_index(drop=True)
books_metadata_selected_transformed.head(5)
books_metadata_selected_transformed.describe(include="all")
# Spot-check one indicator column. It is selected by name pattern because
# get_dummies names it 'publication_year_1887' or 'publication_year_1887.0'
# depending on the dtype of publication_year; attribute access raises
# AttributeError for the second spelling.
books_metadata_selected_transformed.filter(regex=r'^publication_year_1887(\.0)?$').head()

In [None]:

# Build a sparse (CSR) matrix from every column except book_id
feature_frame = books_metadata_selected_transformed.drop(columns='book_id')
books_metadata_csr = csr_matrix(feature_frame.values)
books_metadata_csr

In [None]:
# Users-by-books table of ratings; user/book pairs without a rating row
# are filled with 0
user_book_interaction = (
    interactions_selected
    .pivot_table(index='user_id', columns='book_id', values='rating')
    .fillna(0)
)

user_book_interaction.head(10)