1. Implement collaborive filtering recommender system (memory based) to give at least five recommendation to a user/item
    - User-based
    - Item-based
    
2. (Bonus) Wrap the implementations into function
    - parameters: user_id/item_id, number of top n users/items to consider, number of recommendation
    - return value: collection of recommendation

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
books = pd.read_csv('book1-100k.csv')
books = books[:20000]
ratings = pd.read_csv('user_rating_0_to_1000.csv')
ratings = ratings[:20000]

In [3]:
books.head()

Unnamed: 0,Id,Name,RatingDist1,pagesNumber,RatingDist4,RatingDistTotal,PublishMonth,PublishDay,Publisher,CountsOfReview,PublishYear,Language,Authors,Rating,RatingDist2,RatingDist5,ISBN,RatingDist3
0,1,Harry Potter and the Half-Blood Prince (Harry ...,1:9896,652,4:556485,total:2298124,16,9,Scholastic Inc.,28062,2006,eng,J.K. Rowling,4.57,2:25317,5:1546466,,3:159960
1,2,Harry Potter and the Order of the Phoenix (Har...,1:12455,870,4:604283,total:2358637,1,9,Scholastic Inc.,29770,2004,eng,J.K. Rowling,4.5,2:37005,5:1493113,0439358078,3:211781
2,3,Harry Potter and the Sorcerer's Stone (Harry P...,1:108202,309,4:1513191,total:6587388,1,11,Scholastic Inc,75911,2003,eng,J.K. Rowling,4.47,2:130310,5:4268227,,3:567458
3,4,Harry Potter and the Chamber of Secrets (Harry...,1:11896,352,4:706082,total:2560657,1,11,Scholastic,244,2003,eng,J.K. Rowling,4.42,2:49353,5:1504505,0439554896,3:288821
4,5,Harry Potter and the Prisoner of Azkaban (Harr...,1:10128,435,4:630534,total:2610317,1,5,Scholastic Inc.,37093,2004,eng,J.K. Rowling,4.57,2:24849,5:1749958,043965548X,3:194848


In [4]:
# create rating list with movie titles
books_ratings = pd.merge(
    left= ratings,
    right= books[['Id','Name','Authors']],
    how='inner',
    left_on='Name',
    right_on='Name'
)

books_ratings['Rating'].replace(['it was amazing', 'really liked it', 'liked it', 'it was ok', 'did not like it', 'This user doesnt have any rating'], [5, 4, 3, 2, 1, 1], inplace= True)

books_ratings.head()

Unnamed: 0,ID,Name,Rating,Id,Authors
0,1,Agile Web Development with Rails: A Pragmatic ...,5,45,Dave Thomas
1,18,Agile Web Development with Rails: A Pragmatic ...,4,45,Dave Thomas
2,35,Agile Web Development with Rails: A Pragmatic ...,5,45,Dave Thomas
3,96,Agile Web Development with Rails: A Pragmatic ...,4,45,Dave Thomas
4,124,Agile Web Development with Rails: A Pragmatic ...,4,45,Dave Thomas


In [5]:
user_book_matrix = books_ratings.pivot_table(index='ID', columns='Name',values='Rating', fill_value=0)

user_book_id_matrix = books_ratings.pivot_table(index='ID', columns='Id',values='Rating', fill_value=0)

In [6]:
books_authors = books_ratings[['Id', 'Authors']].copy()
books_authors = books_authors.set_index('Id')
books_authors = books_authors['Authors'].apply(pd.Series).stack()

books_feature_matrix = pd.get_dummies(books_authors).sum(level=0)
books_feature_matrix.head()

Unnamed: 0_level_0,A.C. Weisbecker,A.S. Byatt,Abigail Thomas,Adam Gopnik,Adam Hochschild,Adam Smith,Adrienne Sharp,Aeschylus,Aesop,Agatha Christie,...,Wisława Szymborska,Wole Soyinka,Yann Martel,Zadie Smith,Zilpha Keatley Snyder,Zora Neale Hurston,Zoë Heller,bell hooks,Åsne Seierstad,Éric-Emmanuel Schmitt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
books_ratings['ID'].value_counts().head()

284    683
327    363
166    337
338    288
5      265
Name: ID, dtype: int64

# Content-based Recommendation

In [8]:
curr_user = 5
curr_book = 3143

In [9]:
# current user read book 
curr_user_read_book = books_ratings[books_ratings['ID'] == curr_user]['Id']

In [10]:
# current user read book rating
curr_user_read_book_rating = books_ratings[books_ratings['ID'] == curr_user][['Id', 'Rating']]

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_matrix = pd.DataFrame(
    data=cosine_similarity(books_feature_matrix),
    columns=books_feature_matrix.index.tolist(),
    index=books_feature_matrix.index.tolist()
)

cosine_matrix.head()

Unnamed: 0,45,8695,4197,21,960,15245,5,1202,3750,27457,...,16451,16875,5207,12936,14352,2811,13152,17208,22283,28777
45,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8695,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4197,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
960,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Memory Based Filtering

## user based

In [12]:
# which user we'll give a recommendation to
#curr_user

# create user similarity matrix
user_sim_score = pd.DataFrame(
    data=cosine_similarity(user_book_matrix),
    index=user_book_matrix.index.tolist(),
    columns=user_book_matrix.index.tolist()
)

# get users with similar preference with current user
user_sim_with_curr = user_sim_score[curr_user].sort_values(ascending=False).reset_index().rename(columns={'index' : 'id', curr_user:'sim_score'})

top_n = 3

top_n_sim_user = user_sim_with_curr[: top_n +1]

# get books who have been read by top n similar users
top_n_book_matrix = user_book_matrix[user_book_matrix.index.isin(top_n_sim_user['id'])].T

# have been user read but did not read by current user
user_id_tmp = top_n_book_matrix.columns.tolist()
user_id_tmp.remove(curr_user)

curr_user_unread_books = top_n_book_matrix[
    (top_n_book_matrix[curr_user] == 0)
    & 
      (top_n_book_matrix[user_id_tmp[0]] > 0)
    & (top_n_book_matrix[user_id_tmp[1]] > 0)
    & (top_n_book_matrix[user_id_tmp[2]] > 0)    
]

# order and filter
curr_user_unread_books['est_rating_by_curr_user'] = curr_user_unread_books.sum(axis=1) / top_n

curr_user_unread_books.sort_values(by='est_rating_by_curr_user', ascending=False)[:10]['est_rating_by_curr_user']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  curr_user_unread_books['est_rating_by_curr_user'] = curr_user_unread_books.sum(axis=1) / top_n


Name
The Westing Game                                              5.000000
1984                                                          4.666667
A Tale of Two Cities                                          4.333333
Charlie and the Great Glass Elevator (Charlie Bucket, #2)     4.333333
Island of the Blue Dolphins                                   4.333333
Twelfth Night                                                 4.333333
Harry Potter and the Chamber of Secrets (Harry Potter, #2)    3.333333
A Heartbreaking Work of Staggering Genius                     2.666667
Name: est_rating_by_curr_user, dtype: float64

In [25]:
user_sim_with_curr

Unnamed: 0,id,sim_score
0,5,1.000000
1,284,0.313211
2,327,0.288622
3,198,0.263835
4,166,0.249932
...,...,...
171,105,0.000000
172,96,0.000000
173,92,0.000000
174,89,0.000000


In [23]:
top_n_sim_user

Unnamed: 0,id,sim_score
0,5,1.0
1,284,0.313211
2,327,0.288622
3,198,0.263835


## item based

In [21]:
# which book we'd like to recommend to a user
curr_book_title = books[books['Id'] == curr_book]['Name'].values[0]

# create book similarity matrix
book_sim_score = pd.DataFrame(
    data=cosine_similarity(user_book_matrix.T),
    index= user_book_matrix.T.index.tolist(),
    columns=user_book_matrix.T.index.tolist()
)

# get books which are similarly preferenced as current book
books_sim_with_curr = book_sim_score[curr_book_title].sort_values(ascending=False).reset_index().rename(columns={'index' : 'title', curr_book_title: 'sim_score'})

top_n = 3
top_n_sim_book = books_sim_with_curr[:top_n + 1]

# # get users who have read top n books but haven't read current books
# top_n_user_matrix = user_book_matrix.T[user_book_matrix.T.index.isin(top_n_sim_book['title'])].T

# # the book has been read by the user but curr book didnt
# movie_tmp = top_n_user_matrix.columns.tolist()
# movie_tmp.remove(curr_book_title)

# curr_books_unread_users = top_n_user_matrix[
#       (top_n_user_matrix[curr_book_title] == 0)
#     & 
#       (top_n_user_matrix[movie_tmp[0]] > 0)
#     & (top_n_user_matrix[movie_tmp[1]] > 0)
#     & (top_n_user_matrix[movie_tmp[2]] > 0)    
# ]

In [22]:
top_n_sim_book

Unnamed: 0,title,sim_score
0,Kiffe Kiffe Tomorrow,1.0
1,Ballet,1.0
2,Drina Dances in Switzerland,1.0
3,Drina Dances in Paris,1.0


In [24]:
books_sim_with_curr

Unnamed: 0,title,sim_score
0,Kiffe Kiffe Tomorrow,1.0
1,Ballet,1.0
2,Drina Dances in Switzerland,1.0
3,Drina Dances in Paris,1.0
4,Drina Dances in Madeira,1.0
...,...,...
2453,The Boleyn Inheritance (The Plantagenet and Tu...,0.0
2454,"The Body in the Library (Miss Marple, #3)",0.0
2455,"The Body Farm (Kay Scarpetta, #5)",0.0
2456,The Body,0.0


In [26]:
curr_book_title

'Drina Dances in Exile'