In [2]:
# execute to import notebook styling for tables and width etc.
from IPython.core.display import HTML
import urllib.request
response = urllib.request.urlopen('https://raw.githubusercontent.com/DataScienceUWL/DS775v2/master/ds755.css')
HTML(response.read().decode("utf-8"));

<font size=18>Lesson 11 - Self-Assessment Solutions</font>

# <font color = "blue"> Self-Assessment: Setting up the File </font>

In [13]:
# load the data
import pandas as pd
import numpy as np
bx = pd.read_csv('./data/BX-Book-Ratings-3000.csv')
bx.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,6251,345370775,1
1,6251,044021145X,1
2,6251,312983271,1
3,6251,080410526X,1
4,6251,743418174,1


In [14]:
print("Mean book rating:     ", '%.2f' % bx['Book-Rating'].mean())

Mean book rating:      2.63


In [15]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = bx.copy()
y = bx['User-ID']

#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, stratify=y, random_state=42)

In [21]:
X_test.head()
y_test.head()

2664    235105
2466    227447
672      55548
994      95359
1597    135045
Name: User-ID, dtype: int64

In [4]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# <font color = "blue"> Self-Assessment: Baseline RMSE to Assess Model Performance </font>

In [9]:
#Define the baseline model to always return 5.
def baseline(user_id, book_rating):
    return 5

In [10]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-book tuples from the testing dataset
    id_pairs = zip(X_test['User-ID'], X_test['ISBN'])
    
    #Predict the rating for every user-book tuple
    y_pred = np.array([cf_model(user, book) for (user, book) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['Book-Rating'])
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [11]:
score(baseline)

4.062702986381795

# <font color = "blue"> Self-Assessment: Weighted Mean User-Based Filter </font>

In [132]:
#Build the ratings matrix using pivot_table function
#r_matrix = X_train.pivot_table(values='Book-Rating', index='User-ID', columns='ISBN')
r_matrix = X_train.pivot(values='Book-Rating', index='User-ID', columns='ISBN')

r_matrix.head()

ISBN,006101351X,014025448X,014028009X,034540288X,038079487X,043935806X,044021145X,044022165X,044023722X,044651652X,...,743418174,767902521,767905180,786868716,786881852,804106304,804114986,805063897,842329129,971880107
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6251,,,,1.0,,,1.0,,,,...,1.0,,,1.0,,,,,,
6575,,,,,,,,,,,...,,,,,,,,,,
7346,1.0,,,,,,,,,,...,,,,,,10.0,1.0,,,
11601,,1.0,,,,,,,,,...,,,,,,,,,,
11676,9.0,,,,,,,,,,...,,,,,,,,,,


In [133]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

In [134]:
r_matrix_dummy.head()

ISBN,006101351X,014025448X,014028009X,034540288X,038079487X,043935806X,044021145X,044022165X,044023722X,044651652X,...,743418174,767902521,767905180,786868716,786881852,804106304,804114986,805063897,842329129,971880107
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6251,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7346,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,1.0,0.0,0.0,0.0
11601,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11676,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [136]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head()

User-ID,6251,6575,7346,11601,11676,13552,14521,16795,21014,23768,...,238781,254465,258534,260897,261829,265115,265313,266226,269566,274308
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6251,1.0,0.019892,0.030961,0.005078,0.142988,0.048059,0.064752,0.01861,0.002779,0.035858,...,0.015235,0.023108,0.006664,0.026021,0.018473,0.029876,0.003219,0.052636,0.079578,0.04595
6575,0.019892,1.0,0.00154,0.022224,0.115155,0.00239,0.0,0.001018,0.132236,0.046586,...,0.075006,0.145943,0.014581,0.001582,0.0,0.012711,0.019368,0.014397,0.051694,0.0
7346,0.030961,0.00154,1.0,0.003931,0.193686,0.016909,0.019491,0.024488,0.002151,0.104082,...,0.043239,0.030895,0.0,0.044759,0.017873,0.256943,0.007474,0.112041,0.281033,0.032603
11601,0.005078,0.022224,0.003931,1.0,0.002773,0.030508,0.005024,0.031187,0.00388,0.00626,...,0.007092,0.002934,0.006204,0.407819,0.029023,0.046359,0.0,0.055132,0.020838,0.085563
11676,0.142988,0.115155,0.193686,0.002773,1.0,0.009544,0.21218,0.081307,0.022762,0.088125,...,0.033282,0.022946,0.0,0.018948,0.001261,0.010877,0.275982,0.079053,0.027163,0.043917


In [137]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, book_id):
    
    #Check if book_id exists in r_matrix and if there is overlap with other 
    # users, 
#    sim_scores = cosine_sim[user_id]
#    sss = sum(sim_scores)
    if book_id in r_matrix:
        
        #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]
        
        #Get the user ratings for the movie in question
        m_ratings = r_matrix[book_id]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
                
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        if sim_scores.sum()>0:
            wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
        else:  # user had zero cosine similarity with other users
            wmean_rating = 6.0
    
    else:
        #Default to a rating of 6.0 in the absence of any information
        wmean_rating = 6.0
    
    return wmean_rating

In [138]:
score(cf_user_wmean)

3.607093266358255

The RMSE with the user-based collaborative filter is 3.61 compared to 4.70 for the baseline model, so predicted ratings are more precise.  

# <font color = "blue"> Self-Assessment: Weighted Mean Item-Based Filter - Solution </font>

In [5]:
#Build the ratings matrix using pivot_table function
#r_matrix = X_train.pivot_table(values='Book-Rating', index='ISBN', columns='User-ID')
r_matrix_item = X_train.pivot(values='Book-Rating', index='ISBN', columns='User-ID')

r_matrix_item.head()

User-ID,6251,6575,7346,11601,11676,13552,14521,16795,21014,23768,...,238781,254465,258534,260897,261829,265115,265313,266226,269566,274308
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
006101351X,,,1.0,,9.0,,6.0,,,,...,,,,,,,,1.0,,1.0
014025448X,,,,1.0,,,,,,,...,,,,,,8.0,,,,
014028009X,,,,,,,,,,,...,,,,,,,,,,
034540288X,1.0,,,,,,,,,,...,,,,,,,,,,1.0
038079487X,,,,,,,,,,,...,,,,,,1.0,,,,


In [149]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_item_dummy = r_matrix_item.copy().fillna(0)

In [150]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim_item = cosine_similarity(r_matrix_item_dummy, r_matrix_item_dummy)

In [151]:
#Convert into pandas dataframe 
cosine_sim_item = pd.DataFrame(cosine_sim_item, index=r_matrix_item.index, columns=r_matrix_item.index)

cosine_sim_item.head(10)

ISBN,006101351X,014025448X,014028009X,034540288X,038079487X,043935806X,044021145X,044022165X,044023722X,044651652X,...,743418174,767902521,767905180,786868716,786881852,804106304,804114986,805063897,842329129,971880107
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
006101351X,1.0,0.005847,0.044958,0.035806,0.0,0.051717,0.036379,0.032858,0.044481,0.064775,...,0.032187,0.047741,0.009323,0.007262,0.006989,0.058881,0.043183,0.029235,0.006989,0.026854
014025448X,0.005847,1.0,0.00932,0.0,0.044639,0.0,0.031109,0.0,0.007245,0.0,...,0.033029,0.049897,0.0,0.0,0.0,0.0,0.0,0.0,0.071714,0.010206
014028009X,0.044958,0.00932,1.0,0.0,0.0,0.0,0.032617,0.0,0.0,0.034416,...,0.030783,0.006341,0.0,0.0,0.0,0.391058,0.18929,0.2796,0.0,0.007134
034540288X,0.035806,0.0,0.0,1.0,0.0,0.032827,0.0635,0.0,0.0,0.150756,...,0.044947,0.0,0.0,0.025351,0.0,0.0,0.0,0.0,0.048795,0.0
038079487X,0.0,0.044639,0.0,0.0,1.0,0.0,0.0,0.055744,0.17788,0.146524,...,0.051876,0.033748,0.063267,0.024639,0.0,0.049947,0.0,0.0,0.539464,0.0
043935806X,0.051717,0.0,0.0,0.032827,0.0,1.0,0.04169,0.005021,0.034955,0.197952,...,0.014754,0.328266,0.0,0.003329,0.032035,0.0,0.0,0.0,0.006407,0.131306
044021145X,0.036379,0.031109,0.032617,0.0635,0.0,0.04169,1.0,0.019424,0.005635,0.19146,...,0.031395,0.038806,0.066136,0.00322,0.006197,0.0,0.19146,0.0,0.037182,0.007938
044022165X,0.032858,0.0,0.0,0.0,0.055744,0.005021,0.019424,1.0,0.020357,0.023057,...,0.175295,0.0,0.039823,0.031018,0.074629,0.01048,0.046114,0.0,0.0,0.114708
044023722X,0.044481,0.007245,0.0,0.0,0.17788,0.034955,0.005635,0.020357,1.0,0.107019,...,0.087744,0.049298,0.057762,0.076484,0.017319,0.01216,0.05351,0.0,0.0,0.033276
044651652X,0.064775,0.0,0.034416,0.150756,0.146524,0.197952,0.19146,0.023057,0.107019,1.0,...,0.0,0.184257,0.078507,0.015287,0.029424,0.061978,0.090909,0.246183,0.0,0.0


In [152]:
#Item-Based Collaborative Filter using Weighted Mean Ratings
def cf_item_wmean(user_id, book):
    
    #Check if user exists in r_matrix
    if user_id in r_matrix_item:
        
        #Get the similarity scores for the item in question with every other item
        sim_scores = cosine_sim_item[book]
        
        #Get the user ratings for the book in question
        m_ratings = r_matrix_item[user_id]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        if sim_scores.sum() > 0:
            wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
        else: # the book has zero cosine similarity with other books
            wmean_rating = 6
    
    else:
        #Default to a rating of 6.0 in the absence of any information
        wmean_rating = 6.0
    
    return wmean_rating

In [153]:
score(cf_item_wmean)

3.4119539180908327

The weighted-mean item-based collaborative filter is the best so far at RMSE = 3.41.  The weighted-mean item-based collaborative filter had RMSE = 3.61 and the baseline model had RMSE = 4.70.  

# <font color = "blue"> Self-Assessment: kNN-Based Collaborative Filter - Solution  </font>

In [43]:
bx.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,6251,345370775,1
1,6251,044021145X,1
2,6251,312983271,1
3,6251,080410526X,1
4,6251,743418174,1


In [156]:
#Import the required classes and methods from the surprise library
from surprise import Reader, Dataset, KNNBasic

#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader()

#Create the dataset to be used for building the filter
#data = Dataset.load_from_df(ratings, reader)
data = Dataset.load_from_df(bx, reader)

#Define the algorithm object; in this case kNN
knn = KNNBasic()

#Replace "evaluate" with "cross_validate"
#Evaluate the performance in terms of RMSE
from surprise.model_selection import cross_validate
cross_validate(knn, data, measures=['RMSE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.4196  3.2729  3.5745  3.4041  3.6306  3.4603  0.1280  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.02    0.02    0.02    0.02    0.01    0.02    0.00    


{'test_rmse': array([3.41962002, 3.27289974, 3.57448864, 3.40405396, 3.63055715]),
 'fit_time': (0.0027573108673095703,
  0.002229928970336914,
  0.0025441646575927734,
  0.00127410888671875,
  0.0016109943389892578),
 'test_time': (0.02289295196533203,
  0.023328065872192383,
  0.02286076545715332,
  0.019240856170654297,
  0.014546394348144531)}

The RMSE for each model used so far are stated below ranked from best to worst:

- weighted-mean item-based collaborative filter: RMSE = 3.41 

- kNN-based collaborative filter: (average) RMSE = 3.44 (note that this one will vary slightly since it is based on randomly selected subsets in cross-validation)

- weighted-mean item-based collaborative filter: RMSE = 3.61 

- baseline model: RMSE = 4.70.  

# <font color = "blue"> Self-Assessment: SVD Filter - Solution  </font>

In [158]:
#Import SVD
from surprise import SVD

#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader()

#Create the dataset to be used for building the filter
#data = Dataset.load_from_df(ratings, reader)
data = Dataset.load_from_df(bx, reader)

#Define the SVD algorithm object
svd = SVD()

#Replace "evaluate" with "cross_validate"
#Evaluate the performance in terms of RMSE
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.2274  3.2124  3.3377  3.2269  3.1134  3.2236  0.0711  
Fit time          0.19    0.17    0.17    0.17    0.17    0.17    0.01    
Test time         0.00    0.00    0.00    0.01    0.00    0.00    0.00    


{'test_rmse': array([3.22744306, 3.21241507, 3.33765507, 3.2269326 , 3.11338327]),
 'fit_time': (0.19271016120910645,
  0.16982817649841309,
  0.1732008457183838,
  0.1691887378692627,
  0.16899585723876953),
 'test_time': (0.004391908645629883,
  0.0042819976806640625,
  0.004575014114379883,
  0.006760120391845703,
  0.004892110824584961)}

The RMSE for each model used so far are stated below ranked from best to worst:

- SVD filter: (average) RMSE = 3.23 (note that this one will vary slightly since it is based on randomly selected subsets in cross-validation)

- weighted-mean item-based collaborative filter: RMSE = 3.41 

- kNN-based collaborative filter: (average) RMSE = 3.44 (note that this one will vary slightly since it is based on randomly selected subsets in cross-validation)

- weighted-mean item-based collaborative filter: RMSE = 3.61 

- baseline model: RMSE = 4.70.  

# <font color = "blue"> Self-Assessment: Hybrid Recommender </font>

In [8]:
# load the data
import pandas as pd
import numpy as np
bx = pd.read_csv('./data/BX-Book-Ratings-3000.csv')

In [9]:
#Build the ratings matrix using pivot_table function
#r_matrix = X_train.pivot_table(values='Book-Rating', index='ISBN', columns='User-ID')
r_matrix_item = bx.pivot(values='Book-Rating', index='ISBN', columns='User-ID')

r_matrix_item.head()

User-ID,6251,6575,7346,11601,11676,13552,14521,16795,21014,23768,...,238781,254465,258534,260897,261829,265115,265313,266226,269566,274308
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
006101351X,,,1.0,,9.0,,6.0,,,,...,,,,,,,,1.0,,1.0
014025448X,,,,1.0,,,,,1.0,,...,,,,,,8.0,,,,
014028009X,,,,,,,,,,,...,,,,,,8.0,,,,
034540288X,1.0,,,,,,,,,,...,,,,,,,,,,1.0
038079487X,,,,,,,,,,1.0,...,,,,,,1.0,,,,


In [10]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_item_dummy = r_matrix_item.copy().fillna(0)

In [11]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim_item = cosine_similarity(r_matrix_item_dummy, r_matrix_item_dummy)

In [12]:
#Convert into pandas dataframe 
cosine_sim_item = pd.DataFrame(cosine_sim_item, index=r_matrix_item.index, columns=r_matrix_item.index)

cosine_sim_item.head(10)

ISBN,006101351X,014025448X,014028009X,034540288X,038079487X,043935806X,044021145X,044022165X,044023722X,044651652X,...,743418174,767902521,767905180,786868716,786881852,804106304,804114986,805063897,842329129,971880107
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
006101351X,1.0,0.00904,0.411609,0.068252,0.316978,0.06507,0.02129,0.061752,0.04349,0.21279,...,0.041756,0.230627,0.004612,0.036638,0.003855,0.370128,0.216256,0.04648,0.005387,0.026835
014025448X,0.00904,1.0,0.240276,0.0,0.037747,0.0,0.024101,0.0,0.007033,0.0,...,0.031987,0.364547,0.0,0.004102,0.056105,0.0,0.0,0.0,0.070571,0.097646
014028009X,0.411609,0.240276,1.0,0.0,0.220576,0.002096,0.199513,0.002304,0.0,0.175953,...,0.045171,0.216187,0.02942,0.033957,0.002732,0.220911,0.114955,0.094712,0.030547,0.00951
034540288X,0.068252,0.0,0.0,1.0,0.0,0.021668,0.034658,0.047619,0.070799,0.028868,...,0.048299,0.0,0.033787,0.020646,0.0,0.0,0.0,0.0,0.039467,0.098295
038079487X,0.316978,0.037747,0.220576,0.0,1.0,0.002744,0.002195,0.057299,0.188316,0.032907,...,0.055058,0.02301,0.034236,0.04707,0.039344,0.294372,0.137953,0.0,0.459897,0.006225
043935806X,0.06507,0.0,0.002096,0.021668,0.002744,1.0,0.022529,0.003095,0.032215,0.041282,...,0.012558,0.212561,0.0,0.032209,0.022027,0.00265,0.064362,0.011067,0.005131,0.178904
044021145X,0.02129,0.024101,0.199513,0.034658,0.002195,0.022529,1.0,0.032183,0.051529,0.033017,...,0.053567,0.020778,0.059721,0.023613,0.008808,0.027557,0.123541,0.053106,0.024621,0.332156
044022165X,0.061752,0.0,0.002304,0.047619,0.057299,0.003095,0.032183,1.0,0.065742,0.045363,...,0.370294,0.0,0.024133,0.289044,0.044376,0.020387,0.05658,0.012161,0.0,0.126379
044023722X,0.04349,0.007033,0.0,0.070799,0.188316,0.032215,0.051529,0.065742,1.0,0.036788,...,0.136782,0.038586,0.086115,0.074548,0.071975,0.00866,0.084122,0.00904,0.092209,0.041755
044651652X,0.21279,0.0,0.175953,0.028868,0.032907,0.041282,0.033017,0.045363,0.036788,1.0,...,0.089234,0.069225,0.017556,0.003576,0.014673,0.116527,0.068599,0.014744,0.0,0.008513


Note that the surprise package changed a bit since the book by Banik was published.  The code to train a model using cross validation has changed a bit as shown below.  Also, notice that we aren't splitting the data into test and training and testing sets, rather we're using the whole dataset for illustration.

In [75]:
#Build the SVD based Collaborative filter
from surprise import SVD, Reader, Dataset
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1,11))
ratings = pd.read_csv('./data/BX-Book-Ratings-3000.csv')
data = Dataset.load_from_df(ratings[['User-ID', 'ISBN', 'Book-Rating']], reader)

#data.split(n_folds=5)
#svd = SVD()
#trainset = data.build_full_trainset()
#svd.train(trainset)

algo = SVD()
trainset = data.build_full_trainset()
cross_validate(algo,data,cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.4209  3.1858  3.2525  3.1077  3.1686  3.2271  0.1073  
MAE (testset)     2.5153  2.3401  2.3772  2.3184  2.2951  2.3692  0.0779  
Fit time          0.19    0.16    0.19    0.17    0.19    0.18    0.01    
Test time         0.00    0.00    0.01    0.00    0.00    0.00    0.00    


{'test_rmse': array([3.42091181, 3.18582138, 3.25245198, 3.10773299, 3.1685569 ]),
 'test_mae': array([2.51528226, 2.34014543, 2.37715992, 2.31842354, 2.29508037]),
 'fit_time': (0.18690204620361328,
  0.15990519523620605,
  0.18990802764892578,
  0.1705338954925537,
  0.185960054397583),
 'test_time': (0.004264354705810547,
  0.004138946533203125,
  0.005401134490966797,
  0.004408121109008789,
  0.00418400764465332)}

In [76]:
def hybrid(user_id, isbn):
    
    # Obtain the index of the item that matches the isbn
    #idx = indices[isbn]
    
    #Extract the similarity scores and their corresponding index for every item from the cosine_sim matrix
    sim_scores = list(enumerate(cosine_sim_item[isbn]))
    
    #excluding the similarity score of the item with itself
    #del sim_scores[isbn]
    
    #Sort the (index, score) tuples in decreasing order of similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #Select the top 25 tuples, excluding the first 
    #(as it is the similarity score of the item with itself)
    sim_scores = sim_scores[1:25]
    
    #Store the cosine_sim indices of the top 25 items in a list
    item_indices = [i[0] for i in sim_scores]

    #Extract the metadata of the aforementioned items
    items = bx.iloc[item_indices][['ISBN']]
    
    #Compute the predicted ratings using the SVD filter
    items['est_rating'] = items['ISBN'].apply(lambda x: algo.predict(user_id, x).est)
    
    #Sort the items in decreasing order of predicted rating
    items = items.sort_values('est_rating', ascending=False)
    
    #Return the top 10 items as recommendations
    return items.head(10)

In [77]:
hybrid('31315', '440214041')

Unnamed: 0,ISBN,est_rating
107,60915544,4.325185
151,385504209,3.651876
130,60938455,3.408992
126,068484477X,3.044899
45,385484518,2.956411
173,70212570,2.817037
28,786868716,2.815441
152,440214041,2.798919
177,671027360,2.696707
43,312976275,2.674702


# <font color = "blue"> Self-Assessment: Type of Recommenders - Solution </font>

**Put the letter of the recommender system with the number of its description.**

1. c

2. e

3. a

4. f

5. b

6. d
