In [24]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

In [11]:
# Record the pandas version this notebook was executed with.
pd.__version__

'1.3.5'

In [12]:
# Root directory of the book-recommendation dataset CSV files,
# given relative to the notebook's working directory.
DATA_DIR = "../input/book-recommendation-dataset/"

In [13]:
# Sanity check: show which files are available in DATA_DIR before loading them.
os.listdir(DATA_DIR)

['Ratings.csv', 'Users.csv', 'Books.csv']

In [14]:
# Load the book metadata table.
# low_memory=False makes pandas parse each column in a single pass so its dtype
# is inferred consistently. The recorded run of this cell emitted a load-time
# warning (presumably pandas' mixed-type DtypeWarning -- confirm on re-run).
# os.path.join builds the path whether or not DATA_DIR ends with a separator.
books_df = pd.read_csv(os.path.join(DATA_DIR, 'Books.csv'), low_memory=False)
books_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [15]:
# Load the raw ratings table (one row per User-ID / ISBN / Book-Rating).
ratings_df = pd.read_csv(os.path.join(DATA_DIR, 'Ratings.csv'))
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [16]:
# (rows, columns) of the raw ratings table.
ratings_df.shape

(1149780, 3)

In [17]:
# Load the user table (User-ID, Location, Age).
users_df = pd.read_csv(os.path.join(DATA_DIR, 'Users.csv'))
users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [18]:
# Attach book metadata to every rating. The inner join on ISBN keeps only
# ratings whose ISBN is present in books_df.
users_ratings = ratings_df.merge(books_df, how="inner", on='ISBN')
users_ratings.shape

(1031136, 10)

In [19]:
# Keep only the first 1000 merged rows (presumably to keep the dense
# pivot table and SVD below small -- confirm intent).
# NOTE(review): head() is positional, not a random sample. In the recorded
# output the resulting pivot table contains only 14 distinct ISBNs, so this
# subset is not representative of the full data set.
users_ratings = users_ratings.head(1000)

In [20]:
def normalize(pred_ratings):
    '''
    Min-max rescale pred_ratings so that its values span [0, 1].

    params:
        pred_ratings (numpy array) : The prediction ratings. Any object that
            offers .min() / .max() and elementwise arithmetic is accepted.

    returns:
        An object shaped like pred_ratings holding
        (pred_ratings - min) / (max - min). When every value is identical
        (max == min) all zeros are returned instead of dividing by zero.
    '''
    lowest = pred_ratings.min()
    spread = pred_ratings.max() - lowest

    # Constant input: dividing by a zero spread would yield NaN everywhere.
    # The scalar check leaves inputs whose min()/max() are not scalars
    # (e.g. a DataFrame, reduced per column) on the original code path.
    if np.ndim(spread) == 0 and spread == 0:
        # Multiplying by 0.0 gives float zeros with the input's shape and type.
        return (pred_ratings - lowest) * 0.0

    return (pred_ratings - lowest) / spread
  
def generate_prediction_df(mat, pt_df, n_factors):
    '''
    Build a normalized prediction table from a truncated singular value
    decomposition of the input matrix.

    The matrix is factorized with svds keeping n_factors singular values,
    multiplied back into a dense low-rank approximation, rescaled with
    `normalize`, and wrapped in a DataFrame labelled from pt_df.

    params:
        mat (CSR Matrix) : scipy csr matrix corresponding to the pivot table (pt_df)
        pt_df (DataFrame) : pandas dataframe which is a pivot table; its index
                            labels the rows of mat and its columns label the
                            columns of mat
        n_factors (Integer) : Number of singular values and vectors to compute.
                              Must be 1 <= n_factors < min(mat.shape).

    returns:
        DataFrame : the normalized predictions, transposed relative to pt_df
                    (rows are pt_df.columns, columns are pt_df.index)

    raises:
        ValueError : if n_factors is outside 1 <= n_factors < min(mat.shape)
    '''

    if not 1 <= n_factors < min(mat.shape):
        raise ValueError("Must be 1 <= n_factors < min(mat.shape)")

    # svds only accepts floating-point (or complex) input, so an integer
    # rating matrix is cast first. astype returns a new object; the caller's
    # matrix is left untouched.
    if not np.issubdtype(mat.dtype, np.inexact):
        mat = mat.astype(np.float64)

    # matrix factorization; the third factor is already transposed (Vt)
    u, s, vt = svds(mat, k = n_factors)

    # calculate pred ratings: u * s scales column j of u by s[j], which equals
    # u @ diag(s) without materializing the k x k diagonal matrix
    pred_ratings = normalize((u * s) @ vt)

    # convert to df, then transpose so each column holds one pt_df.index entry
    pred_df = pd.DataFrame(
        pred_ratings,
        columns = pt_df.columns,
        index = list(pt_df.index)
    ).transpose()
    return pred_df

def recommend_items(pred_df, usr_id, n_recs):
    '''
    Return the n_recs highest scoring items for one user.

    params:
        pred_df (DataFrame) : generated from `generate_prediction_df` function;
                              one column per user, one row per item
        usr_id (Integer) : The user you wish to get item recommendations for
        n_recs (Integer) : The number of recommendations you want for this user

    returns:
        DataFrame : up to n_recs rows ordered by descending score. The first
                    column holds the item labels taken from pred_df's index,
                    the score column is named 'sim'.

    raises:
        KeyError : if usr_id is not a column of pred_df
    '''

    if usr_id not in pred_df.columns:
        raise KeyError(f"user id {usr_id!r} is not a column of pred_df")

    # Sort once (the scores were previously sorted twice in the same order).
    # mergesort is a stable algorithm, so the handling of tied scores does not
    # depend on quicksort internals.
    ranked = pred_df[usr_id].sort_values(ascending = False, kind = 'mergesort')

    # Trim first, then move the item labels into a column and name the scores.
    rec_df = ranked.head(n_recs).reset_index().rename(columns = {usr_id : 'sim'})
    return rec_df

In [21]:
# User x ISBN rating matrix. pivot_table combines repeated (user, ISBN) pairs
# with its default aggregation (mean); pairs without a rating become 0.
pt_df = pd.pivot_table(
    users_ratings,
    values = 'Book-Rating',
    index = 'User-ID',
    columns = 'ISBN',
).fillna(0)

In [22]:
# Inspect the user x ISBN matrix. A 0.0 is either an actual rating of 0 or a
# (user, ISBN) pair with no rating that fillna(0) filled in.
pt_df

ISBN,0060517794,0155061224,034545104X,038550120X,0425115801,0446520802,0449006522,052165615X,0521795028,0553561618,055356451X,0786013990,0786014512,2080674722
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
243,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276747,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278026,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Sparse (CSR) copy of the pivot table's values, in the form svds consumes.
mat = csr_matrix(pt_df.to_numpy())

In [26]:
# Factorize with 10 latent factors. generate_prediction_df requires
# 1 <= n_factors < min(mat.shape); the recorded pivot table has 14 ISBN
# columns, so 13 is the largest value that would be accepted here.
pred_df = generate_prediction_df(mat, pt_df, 10)

In [28]:
# Preview the predictions: rows are ISBNs and columns are user ids, because
# generate_prediction_df transposes its result.
pred_df.head()

Unnamed: 0_level_0,243,496,638,645,741,1660,2010,2288,2313,3363,...,276727,276729,276733,276744,276746,276747,277427,278026,278418,278843
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0060517794,4.475272e-15,4.470271e-15,4.313216e-15,4.313216e-15,4.313216e-15,4.425075e-15,4.313216e-15,4.365245e-15,4.277741e-15,4.313216e-15,...,4.313216e-15,4.336115e-15,4.313216e-15,4.446977e-15,4.313216e-15,0.9,4.313216e-15,4.529291e-15,4.313216e-15,0.7
0155061224,4.313059e-15,4.313213e-15,4.313216e-15,4.313216e-15,4.313216e-15,4.313342e-15,4.313216e-15,4.313326e-15,4.313006e-15,4.313216e-15,...,4.313216e-15,4.313216e-15,4.313216e-15,4.313144e-15,4.313216e-15,4.31329e-15,4.313216e-15,4.313007e-15,4.313216e-15,4.313273e-15
034545104X,4.31077e-15,4.188484e-15,4.313216e-15,4.313216e-15,4.313216e-15,4.319028e-15,4.313216e-15,4.303108e-15,0.5,4.313216e-15,...,4.313216e-15,4.260779e-15,4.313216e-15,4.409728e-15,4.313216e-15,4.24936e-15,4.313216e-15,4.309839e-15,4.313216e-15,4.26355e-15
038550120X,4.245232e-15,4.330712e-15,4.313216e-15,4.313216e-15,4.313216e-15,4.230917e-15,4.313216e-15,4.294935e-15,4.382822e-15,4.313216e-15,...,4.313216e-15,4.353499e-15,4.313216e-15,0.7,4.313216e-15,4.485195e-15,4.313216e-15,4.22257e-15,4.313216e-15,4.446977e-15
0425115801,4.465299e-15,4.320294e-15,4.313216e-15,4.313216e-15,4.313216e-15,4.288083e-15,4.313216e-15,1.0,4.30814e-15,4.313216e-15,...,4.313216e-15,4.307755e-15,4.313216e-15,4.301011e-15,4.313216e-15,4.360042e-15,4.313216e-15,4.58359e-15,4.313216e-15,4.349637e-15
