In [63]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# 1. Import data
All of our raw data is stored in an S3 bucket. In the first block, we import the readers' ratings data and the book title data from S3.

In [67]:
# Read the ratings data. ISBN is pinned to str so identifiers keep their
# leading zeros (and any trailing 'X') regardless of pandas' type inference.
df = pd.read_csv('s3://niu00056-msba6330/Final Project/BookRatings.csv', sep=';', dtype={'ISBN': str})
# Read the book data, again forcing ISBN to str.
book_name = pd.read_csv('s3://niu00056-msba6330/Final Project/Books.csv', sep=';', dtype={'ISBN': str})
# Keep only the columns needed to look up a title. The explicit .copy() makes
# the two-column frame independent of the full table, so the column assignment
# below does not raise SettingWithCopyWarning.
book_name = book_name[['ISBN', 'Title']].copy()
# Prefix every ISBN with 'book_' so it matches the 'book_'-prefixed row labels
# given to the ratings matrix later in the notebook.
book_name['ISBN'] = 'book_' + book_name['ISBN']

# 2. Tidy data into book rating format
For our item-based recommender system, the input needs to be in pivot format, with user IDs as the column names, book IDs as the index, and the ratings as the values.

In [68]:
# Drop rows whose Rating is 0, then keep users that have rated at least 3 books.
# NOTE(review): presumably a Rating of 0 marks an implicit / missing rating — confirm.
temp = df[df['Rating'] != 0]
s = temp.groupby('User-ID').Rating.count()
unique_id = s[s >= 3].index.tolist()
# Take a reproducible random subset of at most 1000 users. The sample size is
# capped at the number of eligible users because sampling without replacement
# raises ValueError when asked for more items than are available.
np.random.seed(1)
user_index = np.random.choice(unique_id, min(1000, len(unique_id)), replace=False)
df = temp[temp['User-ID'].isin(user_index)]

In [69]:
# Reshape the long ratings table into a book x user matrix (rows are ISBNs,
# columns are user IDs). A (book, user) pair with no rating is stored as 0.
df = df.pivot(index='ISBN', columns='User-ID', values='Rating').fillna(0)

In [70]:
# Prefix the labels so user columns and book rows are easy to tell apart; the
# book labels then line up with the 'book_'-prefixed ISBNs in book_name.
df.columns = list(map('user_{}'.format, df.columns))
df.index = ['book_' + isbn for isbn in df.index]

In [71]:
# Working copy of the ratings matrix: book_recommender writes predicted ratings
# into df1 while df keeps the observed ratings.
df1 = df.copy(deep=True)

# 3. Recommender System
Our recommender system is an item-based recommender that uses KNN as the estimator. By finding books that are similar to one another, we can recommend books to different users based on their reading history. At the same time, we can also predict the rating a specific user would give to various books.

In [78]:
#Recommended Books
def _book_title(isbn):
  """Return the title stored in ``book_name`` for a 'book_'-prefixed ISBN.

  When several rows match, the first one is used; when none match, the
  string 'Unknown' is returned.
  """
  titles = book_name.loc[book_name['ISBN'] == isbn, 'Title'].values
  return str(titles[0]) if len(titles) else 'Unknown'


def recommend_books(user, num_recommended_books):
  """Print the books ``user`` has rated and the top predicted unrated books.

  Reads the notebook-level frames ``df`` (observed ratings), ``df1``
  (ratings with predictions filled in) and ``book_name`` (ISBN -> title).

  Parameters
  ----------
  user : str
      Column label of the user in ``df`` / ``df1`` (e.g. 'user_243').
  num_recommended_books : int
      Maximum number of recommendations to print.

  Raises
  ------
  KeyError
      If ``user`` is not a column of ``df``.
  """
  observed = df[user]

  print('The list of the Books {} Has Read \n'.format(user))
  for isbn in observed[observed > 0].index:
    print('{} - {}'.format(isbn, _book_title(isbn)))
  print('\n')

  # Candidate books are those the user has not rated (0 in df). Their
  # predicted ratings are read from df1 with one positional mask instead of
  # rebuilding index/column lists for every book.
  unrated = (observed == 0).values
  candidates = zip(df.index[unrated], df1[user].values[unrated])
  # sorted() is stable, so books with equal predicted ratings keep their
  # original row order.
  ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

  print('The list of the Recommended Books \n')
  for rank, (isbn, predicted_rating) in enumerate(ranked[:num_recommended_books], start=1):
    print('{}: {} - {} - predicted rating:{}'.format(rank, isbn, _book_title(isbn), predicted_rating))

In [79]:
#Recommender System
# The observed ratings live in 'df'; 'df1' (created as df.copy()) receives the predictions.
def book_recommender(user, num_neighbors, num_recommendation):
  """Predict ``user``'s ratings for unrated books and print recommendations.

  A brute-force cosine NearestNeighbors model is fitted on the rows of ``df``
  (one row per book). For every book the user has not rated (0 in ``df``),
  the prediction is the similarity-weighted average of the user's ratings on
  that book's neighbours, using only neighbours the user has rated. When no
  neighbour has been rated, or the similarities sum to 0, the prediction is 0.
  Predictions are written into the user's column of ``df1`` and the result is
  printed through ``recommend_books``.

  Parameters
  ----------
  user : str
      Column label of the user in ``df`` (e.g. 'user_243').
  num_neighbors : int
      Number of neighbours requested per book. The book itself is normally
      among them and is dropped, leaving ``num_neighbors - 1`` candidates.
  num_recommendation : int
      Number of recommendations to print.

  Raises
  ------
  ValueError
      If ``user`` is not a column of ``df``.
  """
  ratings = df.values

  knn = NearestNeighbors(metric='cosine', algorithm='brute')
  knn.fit(ratings)
  distances, indices = knn.kneighbors(ratings, n_neighbors=num_neighbors)

  user_index = df.columns.tolist().index(user)
  # Pull the user's column out once so the loops below read from a NumPy
  # array rather than performing a scalar .iloc lookup per access.
  user_ratings = ratings[:, user_index]

  unrated_rows = []
  predictions = []
  for m in range(len(user_ratings)):
    if user_ratings[m] != 0:
      continue  # already rated: nothing to predict

    sim_books = indices[m].tolist()
    book_distances = distances[m].tolist()

    if m in sim_books:
      # Drop the book itself from its own neighbour list.
      self_position = sim_books.index(m)
      del sim_books[self_position]
      del book_distances[self_position]
    else:
      # The book is not among its own neighbours (e.g. ties at distance 0);
      # trim to the same num_neighbors - 1 candidates.
      sim_books = sim_books[:num_neighbors-1]
      book_distances = book_distances[:num_neighbors-1]

    # Similarity-weighted average over the neighbours the user has rated.
    numerator = 0
    denominator = 0
    for neighbor, distance in zip(sim_books, book_distances):
      neighbor_rating = user_ratings[neighbor]
      if neighbor_rating != 0:
        similarity = 1 - distance
        numerator = numerator + similarity*neighbor_rating
        denominator = denominator + similarity

    predicted_r = numerator/denominator if denominator > 0 else 0

    unrated_rows.append(m)
    predictions.append(predicted_r)

  # Write all predictions for this user in one assignment.
  if unrated_rows:
    df1.iloc[unrated_rows, user_index] = predictions
  recommend_books(user, num_recommendation)

# 4. Demo

In [77]:
# Demo: recommend 5 books for user_243 using 5 neighbours per book.
book_recommender('user_243', num_neighbors=5, num_recommendation=5)

The list of the Books user_243 Has Read 

book_0060915544 - The Bean Trees
book_0060977493 - The God of Small Things
book_0140272100 - Vanished
book_0316601950 - The Pilot's Wife : A Novel
book_0316776963 - Me Talk Pretty One Day
book_0316899984 - River, Cross My Heart
book_0375400117 - Memoirs of a Geisha
book_0385316895 - Legacy of Silence
book_0385720106 - A Map of the World
book_0425163407 - Unnatural Exposure
book_044023722X - A Painted House
book_0446364800 - The General's Daughter
book_0446606383 - The Midnight Club
book_0449006522 - Manhattan Hunt Club
book_0553580388 - The Patient
book_0786863986 - A Monk Swimming
book_0803251718 - Crazy Horse
book_155874262X - Chicken Soup for the Soul (Chicken Soup for the Soul)


The list of the Recommended Books 

1: book_038097438X - Paradise Fever: Growing Up in the Shadow of the New Age - predicted rating:10.0
2: book_0452281784 - Bad Heir Day - predicted rating:10.0
3: book_0670889202 - Penny Dreadful - predicted rating:10.0
4: book_00