In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import glob
import sys
import re

from scipy import sparse #cut down on memory size

from sklearn.metrics.pairwise import pairwise_distances 
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the per-book description table (original note: total number of books 2999)
description_csv = '../datasets/book_hybrid_dataset/hybrid_description.csv'
df_description = pd.read_csv(description_csv)
df_description.head()

Unnamed: 0.1,Unnamed: 0,book_id,description,book_name
0,0,1,when harry potter and the half blood prince o...,Harry Potter and the Half-Blood Prince
1,1,2,there is a door at the end of a silent corrid...,Harry Potter and the Order of the Phoenix
2,2,3,harry potter s life is miserable his parents...,Harry Potter and the Sorcerer's Stone
3,3,4,the dursleys were so mean and hideous that su...,Harry Potter and the Chamber of Secrets
4,4,5,harry potter s third year at hogwarts is full...,Harry Potter and the Prisoner of Azkaban


In [125]:
# Drop 'Unnamed: 0' (presumably an index saved into the CSV) and 'book_id';
# only the description text and the book name are used further down.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
df_description = df_description.drop(columns=['Unnamed: 0', 'book_id'], errors='ignore')

In [126]:
# Use the book title as the row label so later distance matrices can be
# labelled by title.
df_description = df_description.set_index('book_name')

In [127]:
# Size of the description table before duplicate descriptions are removed
df_description.shape

(2616, 1)

In [128]:
# Remove rows whose description text repeats an earlier row (first one is kept).
# drop_duplicates compares column values only -- the book_name index is ignored --
# so different titles that share one description collapse to a single row.
df_description = df_description.drop_duplicates()

In [3]:
# Load the user ratings table (one row per user/book rating)
ratings_csv = '../datasets/book_hybrid_dataset/hybrid_ratings.csv'
df_rating = pd.read_csv(ratings_csv)
df_rating.head()

Unnamed: 0.1,Unnamed: 0,user_id,rating,book_name,book_id
0,0,11375137,5,Harry Potter and the Half-Blood Prince,1
1,1,4886576,5,Harry Potter and the Half-Blood Prince,1
2,2,14066678,5,Harry Potter and the Half-Blood Prince,1
3,3,68204342,5,Harry Potter and the Half-Blood Prince,1
4,4,13614926,5,Harry Potter and the Half-Blood Prince,1


In [131]:
# Drop the 'Unnamed: 0' column carried over from the CSV.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
df_rating = df_rating.drop(columns=['Unnamed: 0'], errors='ignore')

In [132]:
# Preview the ratings table after the column drop
df_rating.head()

Unnamed: 0,user_id,rating,book_name,book_id
0,11375137,5,Harry Potter and the Half-Blood Prince,1
1,4886576,5,Harry Potter and the Half-Blood Prince,1
2,14066678,5,Harry Potter and the Half-Blood Prince,1
3,68204342,5,Harry Potter and the Half-Blood Prince,1
4,13614926,5,Harry Potter and the Half-Blood Prince,1


In [133]:
# Look up a book's name from its id: show every rating row for book_id 1000
df_rating.loc[df_rating['book_id'].eq(1000)]

Unnamed: 0,user_id,rating,book_name,book_id
133120,2197345,3,Millionaire Women Next Door: The Many Journeys...,1000
133121,571436,2,Millionaire Women Next Door: The Many Journeys...,1000
133122,9408524,4,Millionaire Women Next Door: The Many Journeys...,1000
133123,967569,4,Millionaire Women Next Door: The Many Journeys...,1000
133124,1397170,3,Millionaire Women Next Door: The Many Journeys...,1000
...,...,...,...,...
133179,1515645,4,Millionaire Women Next Door: The Many Journeys...,1000
133180,3503636,2,Millionaire Women Next Door: The Many Journeys...,1000
133181,5881079,3,Millionaire Women Next Door: The Many Journeys...,1000
133182,141429,1,Millionaire Women Next Door: The Many Journeys...,1000


In [134]:
# (rows, columns) of the ratings table
df_rating.shape

(396932, 4)

In [135]:
# (rows, columns) of the description table
df_description.shape

(2526, 1)

## Item based - Recommender 1

In [136]:
# Book x user rating matrix for the item-based recommender: one row per
# book_name, one column per user_id, each cell holding that user's rating
# (NaN where the user did not rate the book).
# Original note: consider swapping so that user_id becomes the index.
pivot = df_rating.pivot_table(
    values='rating',
    index='book_name',
    columns='user_id',
)
pivot.head()

user_id,1,3,5,6,8,9,14,18,21,26,...,105717126,105719592,105725950,105728167,105736634,105738922,105757466,105757814,105782804,105823641
book_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""A"" Is for Zebra",,,,,,,,,,,...,,,,,,,,,,
"""The Earth is Flat"" and Other Great Mistakes",,,,,,,,,,,...,,,,,,,,,,
"1,000 Places to See in the U.S.A. Canada Before You Die",,,,,,,,,,,...,,,,,,,,,,
100 Albums That Changed Music: And 500 Songs You Need to Hear,,,,,,,,,,,...,,,,,,,,,,
100 Bullshit Jobs...And How to Get Them,,,,,,,,,,,...,,,,,,,,,,


In [137]:
# Missing ratings are filled with 0 and the result is stored in CSR form, giving
# a smaller matrix that is cheaper to run the distance computation on.
pivot_sparse = sparse.csr_matrix(pivot.fillna(value=0))

In [138]:
# Cosine similarity ranges from -1 to +1, with +1 the most similar.
# What is computed here is the cosine *distance* (1 - similarity): the values
# shown fall between 0 and 1, and 0 is the best possible match.
recommender = pairwise_distances(X=pivot_sparse, metric='cosine')
recommender

array([[0.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 1.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        0.99179174],
       [1.        , 1.        , 1.        , ..., 1.        , 0.99179174,
        0.        ]])

In [139]:
# Wrap the distance array in a DataFrame labelled by book title on both axes,
# so a book's distances can be looked up by name.
book_titles = pivot.index
recommender_df = pd.DataFrame(data=recommender, index=book_titles, columns=book_titles)
recommender_df.head()

book_name,"""A"" Is for Zebra","""The Earth is Flat"" and Other Great Mistakes","1,000 Places to See in the U.S.A. Canada Before You Die",100 Albums That Changed Music: And 500 Songs You Need to Hear,100 Bullshit Jobs...And How to Get Them,100 Years of Lynchings,1000 Cornish Place Names Explained,1000 Families,"1000 Questions, 1000 Réponses",1000 Record Covers,...,Zen and the Art of Motorcycle Maintenance: An Inquiry Into Values,Ziggy the Zebra,Zodiac,eBay PowerSeller Secrets: Insider Tips from eBay's Most Successful Sellers,eBay for Dummies,sliding home,Ángeles y demonios,"Ìsarà: A Voyage Around ""Essay""",حقيقة الخديعة,デスノート 1: (退屈) [Desu Nōto: Taikutsu]
book_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""A"" Is for Zebra",0.0,1.0,1.0,1.0,0.961031,0.9639,1.0,1.0,1.0,1.0,...,1.0,0.879509,1.0,0.939422,0.967257,1.0,1.0,1.0,1.0,1.0
"""The Earth is Flat"" and Other Great Mistakes",1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"1,000 Places to See in the U.S.A. Canada Before You Die",1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
100 Albums That Changed Music: And 500 Songs You Need to Hear,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
100 Bullshit Jobs...And How to Get Them,0.961031,1.0,1.0,1.0,0.0,0.975775,1.0,1.0,1.0,1.0,...,1.0,0.919144,1.0,0.959349,0.974512,1.0,1.0,1.0,1.0,1.0


In [140]:
# Sanity check: the distance frame should be square (books x books)
recommender_df.shape

(2428, 2428)

In [141]:
# Item-based recommendations with their distances (closer to 0 = better match).
# Position 0 of the sorted column is skipped: it is expected to be the book
# itself at distance 0.
hp_rating_distances = recommender_df['Harry Potter and the Half-Blood Prince'].sort_values()
hp_rating_distances.iloc[1:6]

book_name
Harry Potter and the Order of the Phoenix    0.454364
Harry Potter and the Goblet of Fire          0.531247
Harry Potter and the Prisoner of Azkaban     0.585688
Harry Potter and the Chamber of Secrets      0.626723
Harry Potter and the Sorcerer's Stone        0.704154
Name: Harry Potter and the Half-Blood Prince, dtype: float64

## Content based - Recommender 2


In [142]:
# Binary bag-of-words over the book descriptions.
#   stop_words='english' -> scikit-learn's built-in English list, i.e. the same
#                           words as ENGLISH_STOP_WORDS; the string form is used
#                           because recent releases reject a frozenset here
#   max_df=0.95          -> ignore terms found in more than 95% of descriptions
#   min_df=3             -> ignore terms found in fewer than 3 descriptions
#   binary=True          -> record presence (1/0) instead of term counts
cvec = CountVectorizer(stop_words='english', max_df=0.95, min_df=3, binary=True)
matrix = cvec.fit_transform(df_description['description'])

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# older installs that do not provide get_feature_names_out() yet.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# One row per description (same order as df_description), one column per term.
# toarray() yields a plain ndarray rather than the legacy np.matrix of todense().
df_matrix = pd.DataFrame(matrix.toarray(), columns=feature_names)

In [143]:
# Split the vocabulary by spelling: a term is kept as "English" when replacing
# every character outside a-z/A-Z leaves it unchanged, which sets aside numbers
# and non-Latin scripts. Latin-alphabet words from other languages
# (e.g. 'zum', 'zur') still pass this check.
english_columns, non_english_columns = [], []
for token in df_matrix.columns:
    letters_only = re.sub("[^a-zA-Z]", " ", token)
    bucket = english_columns if letters_only == token else non_english_columns
    bucket.append(token)

In [144]:
# Keep only the term columns that passed the letters-only check
df_clean = df_matrix.loc[:, english_columns]
df_clean

Unnamed: 0,aaron,abandon,abandoned,abandons,abbey,abducted,aber,abiding,abigail,abilities,...,zero,zeus,zinn,zonas,zone,zoo,zu,zukunft,zum,zur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2522,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2524,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [145]:
# CSR copy of the cleaned term matrix for the distance computation
pivot_sparse_2 = sparse.csr_matrix(df_clean.fillna(value=0))

Calculating the pairwise distance between all the books. The output is an array, which we convert into a dataframe in the following step.

In [146]:
# Cosine distance between every pair of descriptions (0 = identical term sets)
recommender_2 = pairwise_distances(X=pivot_sparse_2, metric='cosine')
recommender_2

array([[0.        , 0.81728593, 0.83381948, ..., 0.98442058, 0.91060455,
        0.97366597],
       [0.81728593, 0.        , 0.91622922, ..., 1.        , 0.98907544,
        0.96460038],
       [0.83381948, 0.91622922, 0.        , ..., 0.93333333, 0.9043659 ,
        0.94365638],
       ...,
       [0.98442058, 1.        , 0.93333333, ..., 0.        , 0.93479493,
        0.95774229],
       [0.91060455, 0.98907544, 0.9043659 , ..., 0.93479493, 0.        ,
        0.86773999],
       [0.97366597, 0.96460038, 0.94365638, ..., 0.95774229, 0.86773999,
        0.        ]])

In [147]:
# Square content-based distance frame, labelled by book title on both axes.
# The labels come from df_description, so this relies on the rows of the
# vectorised matrix still being in df_description's row order.
description_titles = df_description.index
recommender_df_2 = pd.DataFrame(
    data=recommender_2,
    index=description_titles,
    columns=description_titles,
)
recommender_df_2

book_name,Harry Potter and the Half-Blood Prince,Harry Potter and the Order of the Phoenix,Harry Potter and the Sorcerer's Stone,Harry Potter and the Chamber of Secrets,Harry Potter and the Prisoner of Azkaban,Harry Potter and the Goblet of Fire,The Harry Potter Collection,"Harry Potter Boxed Set, Books 1-5 (Harry Potter, #1-5)",NaN,Harry Potter Collection,...,Behind a Mask: The Unknown Thrillers of Louisa May Alcott,Rose in Bloom,Louisa May Alcott: A Biography: With an Introduction to the New Edition,Louisa May Alcott (Biography (a & E)),The Journals of Louisa May Alcott,"Louisa May Alcott and ""Little Women"": Biography, Critique, Publications, Poems, Songs, and Contemporary Relevance",The Secret Garden,My Secret Garden: Women's Sexual Fantasies,The Secret Garden,The Secret Garden
book_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Harry Potter and the Half-Blood Prince,0.000000,0.817286,0.833819,0.830337,0.841765,0.850104,0.863164,0.910874,0.889621,0.867804,...,0.919006,0.982582,0.911515,0.978993,0.953551,0.967156,0.929338,0.984421,0.910605,0.973666
Harry Potter and the Order of the Phoenix,0.817286,0.000000,0.916229,0.929825,0.817679,0.888057,0.938686,0.960064,0.981453,0.940765,...,1.000000,1.000000,1.000000,0.971761,0.953171,1.000000,0.984169,1.000000,0.989075,0.964600
Harry Potter and the Sorcerer's Stone,0.833819,0.916229,0.000000,0.832458,0.721901,0.857461,0.853615,0.936436,0.940959,0.874292,...,0.965342,0.981366,0.962136,0.977527,0.925464,0.982432,0.911808,0.933333,0.904366,0.943656
Harry Potter and the Chamber of Secrets,0.830337,0.929825,0.832458,0.000000,0.908839,0.865668,0.754744,0.880192,0.925811,0.743315,...,0.978225,0.976585,1.000000,1.000000,0.937561,1.000000,0.952506,0.979057,0.989075,0.964600
Harry Potter and the Prisoner of Azkaban,0.841765,0.817679,0.721901,0.908839,0.000000,0.864276,0.911501,0.930829,0.935751,0.914502,...,0.981142,0.979722,0.938194,0.975544,0.932408,0.980882,0.917739,0.981863,0.943234,0.969343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Louisa May Alcott and ""Little Women"": Biography, Critique, Publications, Poems, Songs, and Contemporary Relevance",0.967156,1.000000,0.982432,1.000000,0.980882,1.000000,0.974283,0.949748,0.953324,0.950310,...,0.890401,0.970537,0.880263,0.928933,0.862507,0.000000,0.880477,0.973648,0.917521,1.000000
The Secret Garden,0.929338,0.984169,0.911808,0.952506,0.917739,0.979797,0.963114,1.000000,0.949790,0.964365,...,0.960701,0.936613,0.957066,0.974518,0.887313,0.880477,0.000000,0.905509,0.812696,0.808337
My Secret Garden: Women's Sexual Fantasies,0.984421,1.000000,0.933333,0.979057,0.981863,1.000000,0.975602,1.000000,0.933579,0.976430,...,0.974006,0.944098,0.971602,1.000000,0.981366,0.973648,0.905509,0.000000,0.934795,0.957742
The Secret Garden,0.910605,0.989075,0.904366,0.989075,0.943234,0.972117,0.987273,1.000000,0.942253,0.987705,...,0.959322,0.956259,0.940746,0.947247,0.931959,0.917521,0.812696,0.934795,0.000000,0.867740


In [148]:
# Titles can repeat in the labels; keep only the first occurrence of each title
# on both axes so that a lookup by name returns a single row/column.
first_column_per_title = ~recommender_df_2.columns.duplicated()
first_row_per_title = ~recommender_df_2.index.duplicated(keep='first')
recommender_df_2 = recommender_df_2.loc[first_row_per_title, first_column_per_title]
# Confirm the row labels and column labels still line up one-to-one
recommender_df_2.index.tolist() == recommender_df_2.columns.tolist()

True

Output for content based recommender 

In [149]:
# Five closest books by description; position 0 of the sorted column is skipped
# because it is expected to be the book itself at distance 0.
hp_content_distances = recommender_df_2['Harry Potter and the Half-Blood Prince'].sort_values()
hp_content_distances.iloc[1:6]

book_name
Harry Potter and the Order of the Phoenix    0.817286
Harry Potter and the Chamber of Secrets      0.830337
Harry Potter and the Sorcerer's Stone        0.833819
Harry Potter and the Prisoner of Azkaban     0.841765
Harry Potter and the Goblet of Fire          0.850104
Name: Harry Potter and the Half-Blood Prince, dtype: float64

## Hybrid - Combining the results from Recommender 1 and Recommender 2

- Define a function that merges the two recommender systems together
- Take the five closest books from each recommender and concatenate the two lists
- For books that appear in both lists, average the two distances
- Return all of the resulting books (up to ten), sorted by distance with the closest first

In [156]:
def average_similar_books(book_name, top_n=5):
    """Blend the rating-based and content-based recommendations for one book.

    The ``top_n`` nearest neighbours of ``book_name`` are taken from each of
    the two notebook-level distance frames (``recommender_df`` for ratings,
    ``recommender_df_2`` for descriptions). A book suggested by both
    recommenders gets the mean of its two distances; a book suggested by only
    one of them keeps that recommender's distance unchanged.

    Parameters
    ----------
    book_name : str
        Title to look up; must be a column label in both distance frames.
    top_n : int, default 5
        Number of neighbours taken from each recommender.

    Returns
    -------
    pandas.Series
        Distances indexed by book title, sorted ascending (closest first),
        with at most ``2 * top_n`` entries.

    Raises
    ------
    KeyError
        If ``book_name`` is missing from either distance frame.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    def _nearest(distance_df):
        # Remove the queried book by label instead of skipping position 0 of the
        # sorted column: when another book is also at distance 0, the queried
        # book is not guaranteed to sort first and could otherwise end up in its
        # own recommendations.
        distances = distance_df[book_name].drop(labels=book_name, errors="ignore")
        # A stable sort keeps tied books in a reproducible order.
        return distances.sort_values(kind="stable").iloc[:top_n]

    rating_scores = _nearest(recommender_df)
    content_scores = _nearest(recommender_df_2)

    # Books proposed by both recommenders are scored by the mean distance.
    shared = rating_scores.index.intersection(content_scores.index)
    average_scores = (rating_scores.loc[shared] + content_scores.loc[shared]) / 2

    # Books proposed by a single recommender keep their original distance.
    book_results = pd.concat(
        [
            average_scores,
            rating_scores.drop(labels=shared),
            content_scores.drop(labels=shared),
        ],
        axis=0,
    ).sort_values(ascending=True)

    return book_results

- The closer a distance is to 0, the better the match
- Because of RAM limitations, only about 3000 books were considered in this case

In [157]:
# Hybrid recommendations for a web-design title
average_similar_books("Stylin' with CSS: A Designer's Guide")

book_name
HTML Utopia                                                                            0.725749
CSS Mastery: Advanced Web Standards Solutions                                          0.745110
HTML, XHTML, and CSS (Visual Quickstart Guide)                                         0.751931
Head First HTML with CSS & XHTML                                                       0.772356
Agile Web Development with Rails: A Pragmatic Guide                                    0.779279
Willem de Kooning: The Late Paintings, the 1980s                                       0.901753
Competing for Customers and Capital                                                    0.906795
The Language of New Media Design: Theory and Practice                                  0.921402
Doing Things with Things: The Design and Use of Everyday Objects                       0.931589
Eric Meyer on CSS: Mastering the Language of Web Design with Cascading Style Sheets    0.950244
Name: Stylin' with CSS: A Desi

In [158]:
# Hybrid recommendations for the same Harry Potter title used in the
# single-recommender examples
average_similar_books('Harry Potter and the Half-Blood Prince')

book_name
Harry Potter and the Order of the Phoenix    0.635825
Harry Potter and the Goblet of Fire          0.690675
Harry Potter and the Prisoner of Azkaban     0.713726
Harry Potter and the Chamber of Secrets      0.728530
Harry Potter and the Sorcerer's Stone        0.768987
Name: Harry Potter and the Half-Blood Prince, dtype: float64

In [57]:
#Saving the dataset as a csv to use in the flask app
#recommender_df.to_csv('../flask/assets/recommender_1.csv')

In [192]:
#Saving the dataset as a csv to use in the flask app
#recommender_df_2.to_csv('../flask/assets/recommender_2.csv')