In [1]:
# Import all the necessary dependencies
import os
import numpy as np
import scipy as sp
import pandas as pd
import scipy.sparse
from mlxtend.frequent_patterns import apriori
import hashlib # for grading purposes

# You will be working with data from an Online Book Store. 

### Every time a customer buys a book, they can rate it, and the Book Store uses that data to create recommendations for future customers.

### In this exercise you will have the opportunity to help the Book Store team choose which books to display in different areas of the website.

## Task 1: Understanding the data

Data is available under the `./data/` folder. In this folder you will find 2 files:

* `BookRatings.csv` has the historical ratings given by the customers and represents all the books sold. 
* `BooksInfo.csv`: has the information about the main genre of the book. 

In [2]:
# Load the two raw tables; the paths are relative to the notebook's working directory.
# ratings: historical ratings given by customers; books_info: main genre per book.
ratings = pd.read_csv('data/BookRatings.csv')
books_info = pd.read_csv('data/BooksInfo.csv')

Look at the raw files and print out the first rows of each file (unrated)

In [3]:
#BookRatings
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,1155,451402103,8
1,1155,671689231,7
2,1155,671032658,9
3,1155,671701053,5
4,1155,451409256,9


In [4]:
#BooksInfo
books_info.head()

Unnamed: 0,ISBN,Genre
0,195153448,Social Science
1,2005018,Actresses
2,60973129,1940-1949
3,374157065,Medical
4,393045218,Design


### Task 1.1 EDA (unrated)
- check for Ratings with incomplete data, 
- check for the duplicate records in ratings 
- check for books without Genre

In [5]:
### Your answer

## Task 2: Rating Matrix

### Task 2.1: Create the ratings matrix

In [6]:
def make_ratings(data):
    """
    Build the dense user-by-item ratings matrix.

    Parameters
        data - the ratings dataframe; the columns 'User-ID', 'ISBN' and
               'Book-Rating' are read.

    Returns:
        R - (numpy.ndarray) float matrix with one row per distinct User-ID and
            one column per distinct ISBN, both in the sorted order produced by
            np.unique. A cell holds the rating given by that user to that book
            and 0 where no rating exists.
    """
    # return_inverse gives, for every input row, the position of its value
    # among the sorted unique values, i.e. the matrix row / column to fill.
    user_ids, row_of = np.unique(data['User-ID'], return_inverse=True)
    isbns, col_of = np.unique(data['ISBN'], return_inverse=True)

    shape = (user_ids.size, isbns.size)
    R = np.zeros(shape)
    R[row_of, col_of] = data['Book-Rating']
    return R

R = make_ratings(ratings)

f"We have {R.shape[0]} user and {R.shape[1]} items."

'We have 5693 user and 47711 items.'

In [7]:
expected_hash = '226ef8abe773e3aceec1c057383c1628959c25882846e686412ef7e1ff96873d'
assert hashlib.sha256(str(R.shape).encode()).hexdigest() == expected_hash

expected_hash_1 = '0729c13ebd725201c1445a00c825237d305ff650cd72f50e45259bd942a75ef4'
assert hashlib.sha256(str(R[0].sum()).encode()).hexdigest() == expected_hash_1

expected_hash_2 = 'f1e42019aecc858ffbcca7fddec511b761b474916fde37b1a6ff321a9b459330'
assert hashlib.sha256(str(R[:,0].sum()).encode()).hexdigest() == expected_hash_2

### Task 2.2: Convert the Ratings Matrix to a Sparse Representation

In [8]:
from scipy.sparse import csr_matrix

def get_csr(matrix):
    """
    Convert the ratings matrix to a Compressed Sparse Row representation.

    Parameters
        matrix - The Ratings Matrix.

    Returns
        H - (scipy.sparse.csr_matrix) the same values stored in CSR format
    """
    H = csr_matrix(matrix)
    return H
    
sparse_mat = get_csr(R)

In [9]:
expected_hash = '3068469d4140f3f5fd47d88d14718db567a2ed03bf28240202061d61ea56147c'
assert hashlib.sha256(str(sparse_mat).encode()).hexdigest() == expected_hash

### Task 2.3: Calculate the density score of the matrix

In [10]:
def get_density_score(matrix):
    """
    Fraction of cells in the ratings matrix that hold a non-zero rating.

    Parameters
        matrix - Ratings Matrix, either a dense array-like or a scipy.sparse matrix

    Returns:
        dense_score - (float) number of non-zero cells divided by the total
                      number of cells; 0.0 when the matrix has no cells.
    """
    if scipy.sparse.issparse(matrix):
        # .size on a sparse matrix is the number of stored entries, not the
        # number of cells, so the cell count has to come from the shape.
        n_cells = int(np.prod(matrix.shape))
        n_rated = matrix.count_nonzero()
    else:
        dense = np.asarray(matrix)
        n_cells = dense.size
        # count_nonzero avoids building index arrays and a copy of the values
        n_rated = np.count_nonzero(dense)

    if n_cells == 0:
        # no cells at all: report 0.0 instead of dividing by zero
        return 0.0
    return n_rated / n_cells
    
dense_score = get_density_score(R)
f"The Density Score is {dense_score}."

'The Density Score is 0.0004009664679853458.'

In [11]:
np.testing.assert_almost_equal(dense_score,0.0004,4)

## Task 3: Non-personalized Recommendations

### Task 3.0: Merge the 2 datasets (Rating and Books_info)

Merge the datasets Ratings and Books_info in order to have a dataframe that has the genre assigned to each book rating. If a book never had a rating, it will not appear in this dataframe.

Hint | You might need to use the function <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html">merge()</a>  and explore the parameter 'how'.



In [12]:
def get_book_ratings_df(ratings_, books_info_):
    """
    Attach the book information to every rating.

    Parameters
        ratings_ - DataFrame with one row per rating (must contain 'ISBN')
        books_info_ - DataFrame with the book information (must contain 'ISBN')

    Returns:
        book_ratings - DataFrame with every row of ratings_ plus the matching
                       columns of books_info_; ratings whose ISBN has no match
                       keep missing values in those columns.
    """
    # Left join: ratings drive the result, so books without ratings never appear.
    book_ratings = pd.merge(ratings_, books_info_, how='left', on='ISBN')
    return book_ratings

book_ratings = get_book_ratings_df(ratings, books_info)
book_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Genre
0,1155,451402103,8,True Crime
1,1155,671689231,7,Biography & Autobiography
2,1155,671032658,9,Fiction
3,1155,671701053,5,Biography & Autobiography
4,1155,451409256,9,Fiction


In [13]:
expected_hash = 'c1d3dbf9ef7fb86036e5c933ff8de7a66d67b7dd25508764451e3ac8c300f110'
assert hashlib.sha256(str(book_ratings.shape).encode()).hexdigest() == expected_hash

expected_hash_1 = '3c4340f3a5aa8a40da4f7a2dc2f3ef4645ba099b58e986d12bd5f65b709efb20'
assert hashlib.sha256(str(book_ratings['Book-Rating'].sum()).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '581cd6bccf7862e391ce07768616c8427d6cf9ddec881f6984e3cbd835379997'
assert hashlib.sha256((book_ratings[(book_ratings['ISBN']=='1558744150')&(book_ratings['User-ID']==48579)].reset_index()['Genre'][0]).encode()).hexdigest() == expected_hash_2

### Task 3.1: The 5 most popular Books on the store

The Book Store wants to display on the Homepage a collection of the most popular books in the store. Since we don't have information on purchases, we are going to use the ratings in a similar fashion.

Create a list with the top 5 most popular books in the store (the books with the most ratings). The values in the list should be ordered from the most popular to the least popular.

Hint | You might find it useful to use the following functions (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html ">groupby()</a> - to group the data 
- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.size.html">size()</a> -  to get the number of lines


In [14]:
def get_popular_books(df, n):
    """
    Rank books by how many rating rows they have.

    Parameters
        df - DataFrame with one row per rating (must contain 'ISBN')
        n - Integer, how many books to return

    Returns:
        (pandas.Index) the n ISBNs with the most rows in df, ordered from the
        most rated to the least rated
    """
    ratings_per_book = df.groupby(by='ISBN').size()
    most_rated = ratings_per_book.nlargest(n)
    return most_rated.index

top_5_popular_books = get_popular_books(ratings, 5)   

In [15]:
expected_hash = 'ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d'
assert hashlib.sha256(str(len(top_5_popular_books)).encode()).hexdigest() == expected_hash

expected_hash_1 = 'ecf0bb677736450811308765d0a80c698603dae939c42388f4f19880fa7dc704'
assert hashlib.sha256(str(top_5_popular_books[1]).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '6cf1c4943f89becc6f4a3d7013d542d14082edcb7038bc38792f2045419a556e'
assert hashlib.sha256(str(top_5_popular_books[4]).encode()).hexdigest() == expected_hash_2


### Task 3.2: Top 5 better rated books

The Book Store also wants to display on the Homepage a collection of the best-rated books in the store.

Create a list of the top 5 best-rated books among those with more than 10 ratings.

In [16]:
def get_top5_rates(data, n, k):
    """
    Rank books by mean rating, considering only books with enough ratings.

    Parameters
        data - DataFrame with ratings (must contain 'ISBN' and 'Book-Rating')
        n - Top-n items
        k - Minimum number of ratings; a book qualifies only with MORE than k rows

    Returns
        top_books - (pandas.Index) ISBNs of the top-n best mean rated books,
        best first. Only books with more than k ratings are considered.
    """
    # Aggregate just the rating column: averaging the whole frame would also
    # average User-ID and fails on text columns (e.g. Genre) in recent pandas.
    ratings_by_book = data.groupby(by='ISBN')['Book-Rating']
    mean_rating = ratings_by_book.mean()
    # size() counts rows, so a row still counts when its rating is missing
    n_ratings = ratings_by_book.size()

    eligible = mean_rating[n_ratings > k]
    return eligible.nlargest(n).index
    
top5_rates = get_top5_rates(ratings, 5, 10)

In [17]:
expected_hash = 'ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d'
assert hashlib.sha256(str(len(top5_rates)).encode()).hexdigest() == expected_hash

expected_hash_1 = '176e1ad48051114c46de83e1b5b55bf6bc21dbfce49a62ff352cfdef48ff6357'
assert hashlib.sha256(str(top5_rates[1]).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '20865e898050bb593da47f242377658bf3653fe9931bb645e6b1bbf29440d9f0'
assert hashlib.sha256(str(top5_rates[4]).encode()).hexdigest() == expected_hash_2

### Task 3.3: Loyal Customers

The Book Store wants to reward the customers that gave the most ratings on the website.

Create a list of the top 10 users with the most ratings.

In [18]:
def get_loyal_customers(df, n):
    """
    Rank customers by how many rating rows they have.

    Parameters
        df - DataFrame with one row per rating (must contain 'User-ID')
        n - Integer, how many customers to return

    Returns:
        (pandas.Index) the n User-IDs with the most rows in df, ordered from
        the most active to the least active
    """
    ratings_per_user = df.groupby(by='User-ID').size()
    most_active = ratings_per_user.nlargest(n)
    return most_active.index

top_10_loyal_customers = get_loyal_customers(ratings, 10)

In [19]:
expected_hash = '4a44dc15364204a80fe80e9039455cc1608281820fe2b24f1e5233ade6af1dd5'
assert hashlib.sha256(str(len(top_10_loyal_customers)).encode()).hexdigest() == expected_hash

expected_hash_1 = 'c182d826ceb2b42f749faf0dd41929c88dff7a57a6000e2e7d16e5229ca6640b'
assert hashlib.sha256(str(top_10_loyal_customers[1]).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '4df81fd140c781b33e9adde8d4bc1dbf520c4a2748f31f5abbe04182176580c6'
assert hashlib.sha256(str(top_10_loyal_customers[7]).encode()).hexdigest() == expected_hash_2

### Task 3.4: Just a Question - In which genre a user created more ratings?

Hint | You might find it useful to use the following functions (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html">sort_values()</a> -  to sort the data by the number of ratings

In [20]:
get_book_ratings_df(ratings, books_info).head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Genre
0,1155,451402103,8,True Crime
1,1155,671689231,7,Biography & Autobiography
2,1155,671032658,9,Fiction
3,1155,671701053,5,Biography & Autobiography
4,1155,451409256,9,Fiction


In [21]:
# Count the ratings per genre on the merged ratings/genre table and keep the
# genre with the highest count.
ratings_with_genre = get_book_ratings_df(ratings, books_info)
ratings_per_genre = ratings_with_genre.groupby(by='Genre').size()
genre_user_top_rating = ratings_per_genre.nlargest(1).index[0]

print(genre_user_top_rating)

Fiction


In [22]:
expected_hash = 'efa9a3729d47c5c47c0c763107f82dbeb8ba63e479274b2661edf418850791fb'
assert hashlib.sha256(str(genre_user_top_rating).encode()).hexdigest() == expected_hash

### Task 3.5: The Most popular Books by genre

The Book Store wants to display the most popular book by genre when the customer navigates to the genre tab.

Create a function that returns a DataFrame with only the most popular book per genre.

Hint | You might find it useful to use the following functions (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html">head()</a> - to get a restricted number of lines per group

In [23]:
def get_top1_popularity_genre(df):
    """
    Find the most rated book of every genre.

    Parameters
        df - DataFrame with one row per rating (must contain 'Genre' and 'ISBN')

    Returns:
        top_books_genre - DataFrame with columns 'Genre' and 'ISBN', one row
        per genre (sorted by genre) holding the ISBN with the most rating rows
        in that genre. Rows without a genre are ignored. When several books
        tie, the first ISBN in sorted order is kept.
    """
    # Single pass over the data: number of ratings for every (Genre, ISBN)
    # pair. groupby sorts the keys and drops rows whose Genre is missing, so
    # the frame no longer has to be re-scanned once per genre.
    pair_counts = (df.groupby(by=['Genre', 'ISBN'])
                     .size()
                     .reset_index(name='n_ratings'))

    if pair_counts.empty:
        return pair_counts[['Genre', 'ISBN']]

    # idxmax returns the first row holding the group maximum; rows are ordered
    # by (Genre, ISBN), so ties go to the first ISBN in sorted order.
    winners = pair_counts.groupby(by='Genre')['n_ratings'].idxmax()

    return pair_counts.loc[winners, ['Genre', 'ISBN']].reset_index(drop=True)
    

top_books_genre = get_top1_popularity_genre(book_ratings)

f"Number of genrers in the dataset book_ratings is {book_ratings['Genre'].nunique()} and Number of genrers in the dataset top_books_genre is {top_books_genre['Genre'].nunique()}."

'Number of genrers in the dataset book_ratings is 2543 and Number of genrers in the dataset top_books_genre is 2543.'

In [24]:
expected_hash = '9a85c6d41062f7ba7fd7c7130eb5975156f0fd04f93d74fc27778a6726d7c1f3'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='Fiction'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash

expected_hash_1 = '481b11af7b7f0cab7895d47507e7d85310dc49d4fc951117abecfbf7e23a28f2'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='poems'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash_1

expected_hash_2 = '3d89b8a0dd59309c672f7a1af89ba217cf9cba6213adecf1906d4f3992a85cc9'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='Biography & Autobiography'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash_2

### Task 3.6: Top 3 better average rated books by genre

The Book Store also wants to display the "Better books to read" in the genre tab.

Create a function that returns a DataFrame with only the top 3 books with the highest average rating in each genre. Don't forget to display the ISBN, Genre and rating.

In [25]:
def get_top3_rates_genre(df):
    """
    Find, for every genre, the 3 books with the highest mean rating.

    Parameters
        df - DataFrame with ratings and genre (must contain 'ISBN', 'Genre'
             and 'Book-Rating')

    Returns
        books - DataFrame with columns 'ISBN', 'Genre' and 'Book-Rating' (the
        mean rating of the book), at most 3 rows per genre, sorted by genre
    """
    # Average only the rating column. Averaging the whole frame also averaged
    # User-ID (which then had to be dropped) and failed on frames without that
    # column or with extra text columns.
    mean_ratings = df.groupby(by=['ISBN', 'Genre'])[['Book-Rating']].mean()

    return (mean_ratings
            .sort_values(by='Book-Rating', ascending=False)
            # rows are already ordered best-first, so head(3) keeps each genre's top 3
            .groupby(by='Genre').head(3)
            .reset_index(drop=False)
            .sort_values(by='Genre'))
    

top3_rates_genre = get_top3_rates_genre(book_ratings)
top3_rates_genre

Unnamed: 0,ISBN,Genre,Book-Rating
3809,0874494982,"""ABCs""",5.0
3305,1566192927,"""Aesops fables""",6.0
2151,0590479776,"""Aesops fables""",8.0
2336,0451525655,"""Aesops fables""",8.0
3650,0590448293,"""April Fools Day""",5.0
...,...,...,...
3555,033033347X,Young women,5.0
2442,0300072279,"Ypres, 3rd Battle of, 1917",8.0
705,0528816012,Zip code,10.0
2921,3407805810,avstrijska knjiÅ¾evnost - mladinska knjiÅ¾evno...,7.0


In [26]:
top3_rates_genre[top3_rates_genre['Genre']=='Fiction']

Unnamed: 0,ISBN,Genre,Book-Rating
0,1929001037,Fiction,10.0
9,671448331,Fiction,10.0
8,312305311,Fiction,10.0


In [27]:
expected_hash = '250302a44bedd984034e258ba47827a340db357e8553b4d85ff573d894329123'
assert hashlib.sha256(str(top3_rates_genre.shape).encode()).hexdigest() == expected_hash

expected_hash_1 = '1e56c660887ba75c099588c47bf90b565fe315821214b14f1255a73cab988ed5'
assert hashlib.sha256(str(round(top3_rates_genre['Book-Rating'].sum(),0)).encode()).hexdigest() == expected_hash_1

### Task 4: Most common groups of books

The Book Store wants to display groups of 3 books that users usually rate together.

Create a function that returns the 3 most common 3-book sets that users rate together, for a minimum support of 0.3%, ordered by support.

In [28]:
def get_apriori_booksets(R, min_support=0.003, n=3, m=3):
    """
    Find the most frequent sets of books that users rate together.

    Parameters
        R - Ratings matrix (users x books); a cell > 0 means the user rated the book
        min_support - minimum support passed to apriori
        n - number of book sets to return
        m - number of books per set (also used as apriori's max_len)

    Returns
        DataFrame with the n itemsets of exactly m books with the highest
        support, ordered by decreasing support. Columns: 'index' (row position
        in the apriori output), 'support' and 'itemsets' (column positions of R).
    """
    # apriori expects a boolean user x item table: True where a rating exists
    rated = pd.DataFrame(R > 0)
    frequent_sets = apriori(rated, min_support, max_len=m)

    # max_len only caps the size, so smaller sets must be filtered out explicitly
    has_m_books = frequent_sets['itemsets'].apply(len) == m

    return (frequent_sets[has_m_books]
            .sort_values(by='support', ascending=False)
            .reset_index()
            .head(n))

get_3_booksets = get_apriori_booksets(R, min_support=0.003, n=3, m=3)

In [29]:
expected_hash = '89274fc6d8d6e3864b90500aeb82f76719a006d11ac2787d67bac8245a5e8e46'
assert hashlib.sha256(str(get_3_booksets.shape).encode()).hexdigest() == expected_hash

assert 16018 in get_3_booksets.reset_index()['itemsets'][0]
assert 15979 in get_3_booksets.reset_index()['itemsets'][0]
assert 16130 in get_3_booksets.reset_index()['itemsets'][1]