In [1]:
import pandas as pd
import os
from scipy.sparse import csr_matrix
import numpy as np
from IPython.display import display_html
import warnings

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
%matplotlib inline
import lightfm
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM
from skopt import forest_minimize

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. pandas DataFrames). Their HTML
        is concatenated in the order given.

    Returns
    -------
    None
        The combined HTML is sent to ``display_html`` with ``raw=True``.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Patch only the opening ``<table`` tags. Replacing the bare word 'table'
    # would also rewrite ``</table>`` closing tags and any cell text that
    # happens to contain "table".
    display_html(
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True)



In [None]:
# Directory holding the two Goodreads poetry dumps. It defaults to the original
# location but can be redirected with the GOODREADS_DATA_DIR environment
# variable, so the notebook runs on other machines without editing this cell.
DATA_DIR = os.environ.get('GOODREADS_DATA_DIR', 'c://Users/pr1266/Desktop/recommander_system')

# Both files are JSON-lines (one record per line), hence lines=True.
books_metadata = pd.read_json(f'{DATA_DIR}/goodreads_books_poetry.json', lines=True)
interactions = pd.read_json(f'{DATA_DIR}/goodreads_interactions_poetry.json', lines=True)

In [None]:
# List the column names available in the raw book metadata.
books_metadata.columns.values

In [None]:
# Only the last expression of a cell is rendered automatically, so the sample
# has to be displayed explicitly; otherwise it is computed and discarded.
display(books_metadata.sample(2))
books_metadata.shape

In [None]:
# Keep only the metadata columns that are used as item features below.
# .copy() makes this an independent frame: it is modified in later cells, and
# modifying a bare column subset triggers pandas' SettingWithCopy warnings.
books_metadata_selected = books_metadata[['book_id', 'average_rating', 'is_ebook', 'num_pages',
                                          'publication_year', 'ratings_count', 'language_code']].copy()
books_metadata_selected.sample(5)

In [None]:
import pandas_profiling
import numpy as np

# Treat empty strings as missing values. The result is reassigned rather than
# written with inplace=True, so the name is bound to a fresh frame instead of
# mutating an object derived from books_metadata.
books_metadata_selected = books_metadata_selected.replace('', np.nan)

# Profile the raw (pre-binning) feature columns and save the report as HTML.
profile = pandas_profiling.ProfileReport(books_metadata_selected[['average_rating', 'is_ebook', 'num_pages',
                                                                  'publication_year', 'ratings_count']])
profile.to_file('profiler_books_metadata_1.html')

In [None]:

# Discretise / clean every feature column in one pass.
#
# The previous version used chained in-place calls such as
# ``frame['col'].replace(..., inplace=True)``. Those operate on a temporary
# Series and are not guaranteed to write back into the frame (with pandas
# Copy-on-Write they never do), so each column is rebuilt and assigned instead.
books_metadata_selected = books_metadata_selected.assign(
    # Missing page counts become -1, then the numeric values go into 25 equal-width bins.
    num_pages=lambda frame: pd.cut(pd.to_numeric(frame['num_pages'].fillna(-1)), bins=25),
    # Round ratings to the nearest 0.5; the vectorised form leaves NaN as NaN
    # instead of raising inside a per-row lambda.
    average_rating=lambda frame: (pd.to_numeric(frame['average_rating']) * 2).round() / 2,
    # 25 quantile buckets for the number of ratings.
    ratings_count=lambda frame: pd.qcut(frame['ratings_count'], 25),
    # Sentinel values for missing year / language.
    publication_year=lambda frame: frame['publication_year'].fillna(2100),
    language_code=lambda frame: frame['language_code'].fillna('unknown'),
    # 1.0 where the flag equals the string 'true', 0.0 otherwise.
    is_ebook=lambda frame: frame['is_ebook'].eq('true').astype(float),
)

In [None]:
# Profile the same feature columns again, now after cleaning and binning,
# and write the report to a second HTML file.
profiled_features = books_metadata_selected[
    ['average_rating', 'is_ebook', 'num_pages', 'publication_year', 'ratings_count']
]
profile = pandas_profiling.ProfileReport(profiled_features)
profile.to_file('profiler_books_metadata_2.html')

In [None]:
# Show five random rows of the transformed book features.
books_metadata_selected.sample(5)

In [None]:
# List the column names available in the raw interactions data.
interactions.columns.values

In [None]:
# Show five random rows of the raw interactions data.
interactions.sample(5)

In [None]:
# (rows, columns) of the raw interactions data.
interactions.shape

In [None]:
# Keep only the columns needed to build the user x book matrix.
# .copy() is required because a column is assigned on this frame right below;
# assigning on a bare column subset raises SettingWithCopyWarning.
interactions_selected = interactions[['user_id', 'book_id', 'is_read', 'rating']].copy()

# Spell the boolean flag as 'true' / 'false' strings before it is profiled.
booleanDictionary = {True: 'true', False: 'false'}
interactions_selected['is_read'] = interactions_selected['is_read'].replace(booleanDictionary)

interactions_selected.sample(5)

In [None]:
# Profile the two interaction signals and save the report as HTML.
interaction_signals = interactions_selected[['is_read', 'rating']]
profile = pandas_profiling.ProfileReport(interaction_signals)
profile.to_file('profiler_interactions.html')

In [None]:
# Turn the 'true' / 'false' strings into 1.0 / 0.0 (anything other than
# 'true' becomes 0.0).
interactions_selected['is_read'] = (interactions_selected['is_read'] == 'true').astype(float)

In [None]:
# Show ten random rows of the selected interaction columns.
interactions_selected.sample(10)

In [None]:
# Cross-tabulate: number of interactions for every (is_read, rating) pair,
# with is_read as rows and rating as columns.
(
    interactions_selected
    .groupby(['rating', 'is_read'])
    .size()
    .reset_index(name='n_rows')
    .pivot(index='is_read', columns='rating', values='n_rows')
)

In [None]:
import random

# Keep only interactions where the book was actually read.
read_interactions = interactions_selected.loc[interactions_selected['is_read']==1, ['user_id', 'book_id', 'rating']]

# Down-sample to at most 5000 users.
#  * A dedicated, seeded generator makes the subset (and everything trained on
#    it) reproducible across kernel restarts.
#  * min(...) guards against a ValueError when fewer than 5000 users exist.
candidate_users = list(read_interactions['user_id'].unique())
sampled_users = random.Random(2016).sample(candidate_users, k = min(5000, len(candidate_users)))

interactions_selected = read_interactions[read_interactions['user_id'].isin(sampled_users)]
interactions_selected.sample(10)

In [None]:
# (rows, columns) of the interactions kept after filtering and user sampling.
interactions_selected.shape

In [None]:
# Lookup table book_id -> title, used to print readable recommendations.
df = books_metadata[['book_id', 'title']].sort_values('book_id').reset_index()

# Build the mapping in a single pass over the two columns; the previous
# row-by-row ``df.loc[i, ...]`` loop did one scalar lookup per cell.
# As before, a repeated book_id keeps the title of its last row.
item_dict = dict(zip(df['book_id'], df['title']))

In [None]:
# One-hot encode every feature column; book_id is left untouched as the key.
dummy_columns = ['average_rating', 'is_ebook', 'num_pages',
                 'publication_year', 'ratings_count', 'language_code']
books_metadata_selected_transformed = pd.get_dummies(books_metadata_selected, columns = dummy_columns)

# Order rows by book_id and renumber them 0..n-1, discarding the old index.
books_metadata_selected_transformed = (
    books_metadata_selected_transformed
    .sort_values('book_id')
    .reset_index(drop=True)
)
books_metadata_selected_transformed.head(5)

In [None]:
# Sparse item-feature matrix: one row per book (in book_id order), one column
# per one-hot feature; the book_id key column itself is excluded.
feature_frame = books_metadata_selected_transformed.drop(columns='book_id')
books_metadata_csr = csr_matrix(feature_frame.to_numpy())
books_metadata_csr

In [None]:
# Dense user x book rating matrix; user/book pairs without a rating become 0.
user_book_interaction = (
    interactions_selected
    .pivot_table(index='user_id', columns='book_id', values='rating')
    .fillna(0)
)
user_book_interaction.head(10)

In [None]:
# Map every user id to the position of its row in the interaction matrix.
user_id = list(user_book_interaction.index)
user_dict = {uid: row_pos for row_pos, uid in enumerate(user_id)}

In [None]:
# Sparse (CSR) copy of the user x book matrix; this is what the model is fit on.
user_book_interaction_csr = csr_matrix(user_book_interaction.values)
user_book_interaction_csr

In [None]:
# Hyper-parameters of the LightFM model, collected in one place.
lightfm_params = dict(
    loss='warp',
    random_state=2016,
    learning_rate=0.90,
    no_components=150,
    user_alpha=0.000005,
)
model = LightFM(**lightfm_params)

# Fit on the interaction matrix only (no user or item feature matrices are passed).
model = model.fit(
    user_book_interaction_csr,
    epochs=100,
    num_threads=16,
    verbose=False,
)

In [None]:
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 5, show = True):
    """Rank unseen items for one user and optionally print them.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(user_index, item_indices)``.
    interactions : pandas.DataFrame
        User x item matrix; the index holds user ids, the columns item ids.
    user_id
        Label of the user in ``interactions.index``.
    user_dict : dict
        Maps a user id to the integer user index passed to ``model.predict``.
    item_dict : dict
        Maps an item id to a printable title.
    threshold : number, default 0
        Items whose value in the user's row exceeds this count as known.
    nrec_items : int, default 5
        Maximum number of recommendations.
    show : bool, default True
        When true, print the known items and the recommendations.

    Returns
    -------
    list
        Ids of the recommended items, best first.

    Raises
    ------
    KeyError
        If ``user_id`` is missing from ``user_dict`` / ``interactions`` or an
        item id is missing from ``item_dict``.
    """
    n_items = interactions.shape[1]
    user_x = user_dict[user_id]

    # Score every column of the interaction matrix for this user. No
    # item-feature matrix is passed: the function has no such parameter, and
    # the previously used global feature matrix is neither what the model was
    # fit with nor row-aligned with ``interactions.columns``.
    scores = pd.Series(model.predict(user_x, np.arange(n_items)),
                       index=interactions.columns)
    ranked_items = scores.sort_values(ascending=False).index.tolist()

    # Items the user already interacted with, highest id first.
    user_row = interactions.loc[user_id, :]
    known_items = user_row[user_row > threshold].index.sort_values(ascending=False).tolist()

    # Set membership keeps the filtering linear in the number of items.
    seen = set(known_items)
    return_score_list = [x for x in ranked_items if x not in seen][:nrec_items]

    known_titles = [item_dict[x] for x in known_items]
    recommended_titles = [item_dict[x] for x in return_score_list]

    if show:
        print("User: " + str(user_id))
        print("Known Likes:")
        for counter, title in enumerate(known_titles, start=1):
            print(f"{counter}- {title}")

        print("\n Recommended Items:")
        for counter, title in enumerate(recommended_titles, start=1):
            print(f"{counter}- {title}")

    # The ranked ids used to be computed and then dropped; hand them back.
    return return_score_list

In [None]:
# The users in the interaction matrix are a random subset, so the hard-coded
# example user is not guaranteed to be present; fall back to the first user
# of the matrix instead of failing with a KeyError.
example_user = '004d435847c22f0325f5f4700b21b00d'
if example_user not in user_dict:
    example_user = user_book_interaction.index[0]

sample_recommendation_user(model, user_book_interaction, example_user, user_dict, item_dict)