In [80]:
import numpy as np
import pandas as pd
import sklearn

In [81]:
bookRatings = pd.read_csv("dataset/ratings.csv")
bookRatings.head(10)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
5,2,26,4
6,2,315,3
7,2,33,4
8,2,301,5
9,2,2686,5


In [82]:
bookList = pd.read_csv("dataset/books.csv")
bookList.head(10)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
5,6,11870085,11870085,16827462,226,525478817,9780525000000.0,John Green,2012.0,The Fault in Our Stars,...,2346404,2478609,140739,47994,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...
6,7,5907,5907,1540236,969,618260307,9780618000000.0,J.R.R. Tolkien,1937.0,The Hobbit or There and Back Again,...,2071616,2196809,37653,46023,76784,288649,665635,1119718,https://images.gr-assets.com/books/1372847500m...,https://images.gr-assets.com/books/1372847500s...
7,8,5107,5107,3036731,360,316769177,9780317000000.0,J.D. Salinger,1951.0,The Catcher in the Rye,...,2044241,2120637,44920,109383,185520,455042,661516,709176,https://images.gr-assets.com/books/1398034300m...,https://images.gr-assets.com/books/1398034300s...
8,9,960,960,3338963,311,1416524797,9781417000000.0,Dan Brown,2000.0,Angels & Demons,...,2001311,2078754,25112,77841,145740,458429,716569,680175,https://images.gr-assets.com/books/1303390735m...,https://images.gr-assets.com/books/1303390735s...
9,10,1885,1885,3060926,3455,679783261,9780680000000.0,Jane Austen,1813.0,Pride and Prejudice,...,2035490,2191465,49152,54700,86485,284852,609755,1155673,https://images.gr-assets.com/books/1320399351m...,https://images.gr-assets.com/books/1320399351s...


In [83]:
# find any missing column
column = ['user_id', 'book_id', 'rating']
for columns in column:
    missing = bookRatings[columns].isnull().value_counts()
    print(missing)

# drops missing columns if any
bookRatings = bookRatings.dropna(how='any')

False    5976479
Name: user_id, dtype: int64
False    5976479
Name: book_id, dtype: int64
False    5976479
Name: rating, dtype: int64


In [84]:
noOfRatings = len(bookRatings)
noOfBooks = bookRatings['book_id'].nunique()
noOfUsers = bookRatings['user_id'].nunique()

print("Number of ratings: ", noOfRatings)
print("Number of unique books: ", noOfBooks)
print("Number of unique users: ", noOfUsers)
print("Average number of ratings per user: ", round(noOfRatings/noOfUsers, 2))
print("Average number of ratings per book: ", round(noOfRatings/noOfBooks, 2))

Number of ratings:  5976479
Number of unique books:  10000
Number of unique users:  53424
Average number of ratings per user:  111.87
Average number of ratings per book:  597.65


In [85]:
# Using collaborative filtering to recommend a user a book 
# Assuming that similar users like similar things

In [86]:
# Transforming data into user-item matrix
# No information about user or item is required 
from scipy.sparse import csr_matrix

def generateSparseMatrix(df):
    """ Generates a sparse matrix from a pandas dataframe.
    
    Args:
        df: pandas dataframe
    
    Returns:
        matrix: a sparse matrix
        userMap: dict. maps user id to user indices
        userMapInv: dict. maps user indices to user id
        bookMap: dict. maps book id to book indices
        bookMapInv: dict. maps book indices to book id
    """
    noOfUsers = df['user_id'].nunique()
    noOfBooks = df['book_id'].nunique()

    userMap = dict(zip(np.unique(df["user_id"]), list(range(noOfUsers))))
    bookMap = dict(zip(np.unique(df["book_id"]), list(range(noOfBooks))))
    
    userMapInv = dict(zip(list(range(noOfUsers)), np.unique(df["user_id"])))
    bookMapInv = dict(zip(list(range(noOfBooks)), np.unique(df["book_id"])))
    
    userIndex = [userMap[i] for i in df['user_id']]
    bookIndex = [bookMap[i] for i in df['book_id']]

    matrix = csr_matrix((df["rating"], (bookIndex, userIndex)), shape=(noOfBooks, noOfUsers))
    
    return matrix, userMap, bookMap, userMapInv, bookMapInv

In [87]:
matrix, userMap, bookMap, userMapInv, bookMapInv = generateSparseMatrix(bookRatings)

In [88]:
sparsity = matrix.count_nonzero()/(matrix.shape[0]*matrix.shape[1])

print("Matrix sparsity: ", round(sparsity*100,2))


Matrix sparsity:  1.12


In [89]:
from scipy.sparse import save_npz

save_npz('data/user_item_matrix.npz', matrix)


In [90]:
import pickle
userAndBookMaps = {"userMap": userMap, "bookMap":bookMap, "userMapInv": userMapInv, "bookMapInv": bookMapInv}

mappingFile = open("data/mappings.pkl", "wb")
pickle.dump(userAndBookMaps, mappingFile)
mappingFile.close()

In [91]:
from sklearn.neighbors import NearestNeighbors

def findSimilarBooks(bookId, matrix, n=10, metric='cosine', show_distance=False):
    """
    Finds k-nearest neighbours for a given movie id.
    
    Args:
        bookId: id of the movie of interest
        matrix: user-item utility matrix
        n: number of similar movies to retrieve
        metric: distance metric for kNN calculations
    
    Returns:
        list of k similar movie ID's
    """
    neighbourIds = []
    
    
    mappingFile = open("data/mappings.pkl", "rb")
    mapping = pickle.load(mappingFile)
    mappingFile.close()
    bMap = mapping["bookMap"]
    bMapInv = mapping["bookMapInv"]
    
    bookIndex = bMap[bookId]
    bookVector = matrix[bookIndex]
    n+=1
    kNN = NearestNeighbors(n_neighbors=n, algorithm="brute", metric=metric)
    kNN.fit(matrix)
    if isinstance(bookVector, (np.ndarray)):
        bookVector = bookVector.reshape(1,-1)
    neighbour = kNN.kneighbors(bookVector, return_distance=show_distance)
    for i in range(0,n):
        bookIndex = neighbour.item(i)
        neighbourIds.append(bMapInv[bookIndex])
    neighbourIds.pop(0)
    return neighbourIds

In [95]:
bookTitles = dict(zip(bookList['book_id'], bookList['original_title']))

bookId = 10

from scipy.sparse import load_npz
testMatrix = load_npz('data/user_item_matrix.npz')
similarIds = findSimilarBooks(bookId, testMatrix, n=20)
bookTitle = bookTitles[bookId]

print("People who read ", bookTitle, " also liked:")
for i in similarIds:
    print(bookTitles[i])

People who read  Pride and Prejudice  also liked:
Jane Eyre
Sense and Sensibility
Little Women
To Kill a Mockingbird
Emma
The Great Gatsby
Wuthering Heights
An Excellent conceited Tragedie of Romeo and Juliet
Harry Potter and the Philosopher's Stone
Persuasion
Het Achterhuis: Dagboekbrieven 14 juni 1942 - 1 augustus 1944
The Catcher in the Rye
Memoirs of a Geisha
Nineteen Eighty-Four
The Hobbit or There and Back Again
Gone with the Wind
The Hunger Games
Animal Farm: A Fairy Story
Twilight
Of Mice and Men 
