In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.spatial.distance import cosine

In [None]:
# Read the three input CSV files and give their columns readable names.
#   books_data   - information about the books
#   users_data   - information about the users
#   ratings_data - the ratings users gave to books
#
# The files are ';'-separated and latin-1 encoded. Malformed rows are skipped.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0 (it
# raises TypeError there); `on_bad_lines='warn'` is the replacement that keeps
# the old behaviour of skipping the row and reporting it.

bookfile = '../input/BX-Books.csv'
books_data = pd.read_csv(bookfile, sep=';', on_bad_lines='warn', encoding="latin-1")
books_data.columns = ['Book_ID', 'Book_Title', 'Book_Author', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
userfile = '../input//BX-Users.csv'
users_data = pd.read_csv(userfile, sep=';', on_bad_lines='warn', encoding="latin-1")
users_data.columns = ['User_ID', 'Location', 'Age']
ratings_data = pd.read_csv('../input/BX-Book-Ratings.csv', sep=';', on_bad_lines='warn', encoding="latin-1")
ratings_data.columns = ['User_ID', 'Book_ID', 'Book_Rating']

In [None]:
# Report the (rows, columns) shape of each of the three raw tables.
for frame in (books_data, users_data, ratings_data):
    print(frame.shape)

In [None]:
# Preview the first five rows of the book table (image-URL columns still present here).
print(books_data.head(n=5))

In [None]:
# Drop the book-cover image URL columns, which are not used in the analysis.
# Re-assigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second run finds the columns already gone and does nothing
# rather than raising KeyError.
books_data = books_data.drop(columns=['imageUrlS', 'imageUrlM', 'imageUrlL'], errors='ignore')

In [None]:
# Display the full book table (all rows, all columns) as the cell output.
books_data.loc[:, :]

In [None]:
# Count how many ratings each book received and list the most-rated books first.
ratings_count = ratings_data.groupby('Book_ID')['Book_Rating'].count().to_frame()
ratings_countsorted = ratings_count.sort_values(by='Book_Rating', ascending=False)
ratings_countsorted.head()

In [None]:
# Keep only the ratings whose Book_ID also appears in books_data, then print
# the shape before and after so the number of removed rows is visible.
ratings_valid = ratings_data.loc[ratings_data['Book_ID'].isin(books_data['Book_ID'])]
print(ratings_data.shape)
print(ratings_valid.shape)

In [None]:


# Split the validated ratings on the rating value: non-zero ratings go to
# ratings_explicit, zero ratings go to ratings_implicit.
ratings_explicit = ratings_valid.loc[ratings_valid['Book_Rating'].ne(0)]
ratings_implicit = ratings_valid.loc[ratings_valid['Book_Rating'].eq(0)]
print(ratings_explicit.shape)
print(ratings_implicit.shape)

In [None]:
# Per user: mean explicit rating ('Book_Rating') and number of explicit
# ratings ('numofratings'); print the 10 users with the most ratings.
rating_count = (
    ratings_explicit.groupby(['User_ID'])['Book_Rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Book_Rating', 'count': 'numofratings'})
)
highestrateduser = rating_count.sort_values(by='numofratings', ascending=False).head(10)
print(highestrateduser)


In [None]:
import seaborn as sns
# Bar chart of how many times each explicit rating value was given.
sns.countplot(x='Book_Rating', data=ratings_explicit)
plt.show()
# Observation from the original analysis: higher ratings are more common
# among users, and 8 is the rating given the largest number of times.

In [None]:
# Top 10 popular books: among books with more than 100 explicit ratings and a
# mean explicit rating above 8, take the 10 with the highest mean rating.
rating_count = (
    ratings_explicit.groupby(['Book_ID'])['Book_Rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Book_Rating', 'count': 'numofratings'})
)
top_rated = rating_count[rating_count['numofratings'].gt(100) & rating_count['Book_Rating'].gt(8)]
top10popular = top_rated.sort_values(by='Book_Rating', ascending=False).head(10)
print(" Top 10 Popular books recommended are")

print(top10popular)

In [None]:
# Attach title/author/publisher details to the top-10 list by matching its
# Book_ID index against the Book_ID column of books_data.
pd.merge(top10popular, books_data, left_index=True, right_on='Book_ID')

In [None]:
# User-based recommender system.
# Order the explicit ratings by User_ID (descending), then Book_ID (ascending).
ratings_explicit = ratings_explicit.sort_values(by=['User_ID', 'Book_ID'], ascending=[False, True])
print(ratings_explicit.shape)

In [None]:
# Shrink the data before building the rating matrix:
#   1. keep only books that have at least 100 explicit ratings;
#   2. of what remains, keep only rating values that occur at least 100 times.
counts1 = ratings_explicit['Book_ID'].value_counts()
ratings_explicit = ratings_explicit.loc[ratings_explicit['Book_ID'].isin(counts1.index[counts1 >= 100])]
counts = ratings_explicit['Book_Rating'].value_counts()
ratings_explicit = ratings_explicit.loc[ratings_explicit['Book_Rating'].isin(counts.index[counts >= 100])]
print(ratings_explicit.shape)

In [None]:
# Build the user x book rating matrix: one row per User_ID, one column per
# Book_ID, each cell holding the rating (pivot_table's default aggregation is
# the mean). Cells for books a user has not rated are NaN.
RatingMatrix = ratings_explicit.pivot_table(values='Book_Rating',
                                            index=['User_ID'], columns=['Book_ID'])
print(RatingMatrix.shape)
RatingMatrix.head()

In [None]:
def FindSimilaritydistance(userA,userB):
    """Return the mean-centred cosine similarity between two users' ratings.

    Despite the historical name, the value returned is a *similarity*
    (1 = identical taste, 0 = unrelated, -1 = opposite): the caller ranks
    users by this value in descending order and uses it as a weight.

    Parameters
    ----------
    userA, userB : array-like of float
        Rating vectors of equal length, one entry per book, with NaN where
        the user has not rated the book.

    Returns
    -------
    float
        Cosine similarity computed over the books both users rated, after
        subtracting each user's own mean rating. ``0.0`` is returned when
        the users share no rated book, or when either user's centred
        ratings on the shared books are all zero (similarity undefined).
    """
    userA = np.asarray(userA, dtype=float)
    userB = np.asarray(userB, dtype=float)

    # Books rated by both users. This must be tested on the raw ratings:
    # after mean-centring, a "> 0" test would keep only above-average ratings.
    commonbooks = ~np.isnan(userA) & ~np.isnan(userB)
    if not commonbooks.any():
        # No book in common: nothing to compare.
        return 0.0

    # Centre each user on their own overall mean so that generous and harsh
    # raters become comparable, then restrict to the shared books.
    centredA = userA[commonbooks] - np.nanmean(userA)
    centredB = userB[commonbooks] - np.nanmean(userB)

    normA = np.linalg.norm(centredA)
    normB = np.linalg.norm(centredB)
    if normA == 0 or normB == 0:
        # A zero vector has no direction; avoid a 0/0 -> NaN result.
        return 0.0

    return float(np.dot(centredA, centredB) / (normA * normB))

In [None]:
# Find similar user and predict the user rating
def SimilarUserRatings(RecommendUser,K):
    """Predict a rating for every book for ``RecommendUser`` from K neighbours.

    Every other user in the module-level ``RatingMatrix`` is scored against
    ``RecommendUser`` with ``FindSimilaritydistance``; the K highest-scoring
    users are the neighbours. For each book the prediction is::

        mean(target user's ratings)
            + sum(score_v * (rating_v - mean_v)) / sum(|score_v|)

    where the numerator runs over the neighbours who rated the book and the
    denominator over all K neighbours, so a book rated by few neighbours
    stays close to the target user's own mean.

    Parameters
    ----------
    RecommendUser : label in ``RatingMatrix.index``
        The user to predict for. A missing label raises ``KeyError``.
    K : int
        Number of neighbours to use.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``RatingMatrix.columns`` (the book ids) with a single
        ``'Book_Rating'`` column holding the predicted rating.
    """
    target_ratings = RatingMatrix.loc[RecommendUser]

    # Score every *other* user; the target user would otherwise always take
    # one of the K neighbour slots without contributing any new information.
    other_users = RatingMatrix.index.drop(RecommendUser)
    similarities = pd.Series(
        [FindSimilaritydistance(target_ratings, RatingMatrix.loc[user])
         for user in other_users],
        index=other_users, dtype=float, name='Similarity')

    # K nearest neighbours = K highest scores. Undefined (NaN) scores are
    # discarded so they can never be used as weights.
    knearest = similarities.dropna().sort_values(ascending=False).iloc[:K]

    neighbour_ratings = RatingMatrix.loc[knearest.index]

    # Each neighbour's ratings relative to that neighbour's own mean rating.
    deviations = neighbour_ratings.sub(neighbour_ratings.mean(axis=1), axis=0)

    # Only books a neighbour actually rated contribute (NaN > 0 is False).
    rated = neighbour_ratings > 0
    weighted_deviation = deviations.where(rated).mul(knearest, axis=0).sum(axis=0)

    # Normalise by the neighbours' total weight so the adjustment stays on
    # the rating scale. When the total is 0 every weighted term is 0 too.
    total_weight = knearest.abs().sum()
    if total_weight > 0:
        weighted_deviation = weighted_deviation / total_weight

    # Start from the target user's own average and apply the adjustment.
    predictbookRating = (weighted_deviation + target_ratings.mean()).to_frame('Book_Rating')

    return predictbookRating

In [None]:
def Recommendations(RecommendUser,N,K=10):
    """Return the top ``N`` book recommendations for ``RecommendUser``.

    Parameters
    ----------
    RecommendUser : int
        User id; must be an integer (Python ``int`` or numpy integer) that
        is present in ``RatingMatrix.index``.
    N : int
        Number of books to recommend.
    K : int, optional
        Number of neighbours passed to ``SimilarUserRatings`` (default 10).

    Returns
    -------
    pandas.DataFrame or int
        The matching rows of ``books_data`` for the recommended books,
        best first, indexed from 1. If the user id is invalid a message is
        printed and ``0`` is returned.
    """
    # Accept numpy integers as well (ids read from a pandas index are
    # numpy.int64, not int); bool is excluded although it subclasses int.
    is_integer_id = (isinstance(RecommendUser, (int, np.integer))
                     and not isinstance(RecommendUser, bool))
    if not is_integer_id or RecommendUser not in RatingMatrix.index:
        print ( "User id should be integer and  part of the list of users ")
        return 0

    predictbookRating = SimilarUserRatings(RecommendUser, K)

    # Books the user has already rated are not recommended again.
    user_ratings = RatingMatrix.loc[RecommendUser]
    booksalreadyread = list(user_ratings[user_ratings > 0].index)
    candidates = predictbookRating.drop(booksalreadyread)

    # Highest predicted rating first, keep the first N.
    topRecommendations_sorted = candidates.sort_values(
        by='Book_Rating', ascending=False).iloc[:N]

    # Look up the book details by Book_ID. The predicted rating is dropped
    # and the index (taken from books_data by the merge) is replaced by a
    # 1-based rank.
    topRecommendations = (
        topRecommendations_sorted
        .merge(books_data, left_index=True, right_on='Book_ID')
        .drop(columns=['Book_Rating'])
        .reset_index(drop=True)
    )
    topRecommendations.index = topRecommendations.index + 1

    return topRecommendations
    

In [None]:
 # Print the top 10 recommended books for user 248718.
 print(Recommendations(248718, 10))