In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno
import warnings 
warnings.filterwarnings("ignore")

pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [None]:
# System
import warnings
import os
warnings.filterwarnings("ignore")
%matplotlib inline

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the three Book-Crossing tables (semicolon-delimited, ISO-8859-1 encoded).
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0; malformed rows are still dropped.
bx_read_opts = dict(on_bad_lines="skip", delimiter=';', encoding='ISO-8859-1')
user=pd.read_csv("/kaggle/input/bookcrossing/bx-csv-dump/BX-Users.csv", **bx_read_opts)
books=pd.read_csv("/kaggle/input/bookcrossing/bx-csv-dump/BX-Books.csv", **bx_read_opts)
ratings=pd.read_csv("/kaggle/input/bookcrossing/bx-csv-dump/BX-Book-Ratings.csv", **bx_read_opts)

In [None]:
user.head()

In [None]:
books.head()

In [None]:
ratings.head()

In [None]:
data = pd.merge(books, ratings, on='ISBN', how='left')


In [None]:
data.head()

In [None]:
data=pd.merge(data, user, on='User-ID', how='left')

In [None]:
data.head()

In [None]:
data.drop(["Image-URL-S","Image-URL-M","Image-URL-L"],axis=1,inplace=True)

In [None]:
data.head()

In [None]:
#missingno.matrix(data)

In [None]:
data.shape

In [None]:
data["User-ID"].unique().shape

In [None]:
data[data.ISBN=="034545104X"].head()

In [None]:
# To simplify the problem we also drop the Age column

data.drop("Age",axis=1,inplace=True)

In [None]:
data.head()

## Cleaning the Year Column

In [None]:
data["Year-Of-Publication"].unique()

In [None]:
data[data["Year-Of-Publication"]=="DK Publishing Inc"]

In [None]:
# The year field of this row holds a publisher name; substitute the year 2000.
# Assign the result back instead of calling an in-place method on a column
# selection, which does not update `data` when pandas copy-on-write is active.
data["Year-Of-Publication"] = data["Year-Of-Publication"].replace({"DK Publishing Inc": 2000})

In [None]:
data["Year-Of-Publication"].unique()

In [None]:
data[data["Year-Of-Publication"]=="Gallimard"]

In [None]:
# The year field of this row holds a publisher name; substitute the year 2003.
# Assign the result back instead of calling an in-place method on a column
# selection, which does not update `data` when pandas copy-on-write is active.
data["Year-Of-Publication"] = data["Year-Of-Publication"].replace({"Gallimard": 2003})

In [None]:
data[data["Year-Of-Publication"]==0].head()

In [None]:
data["Year-Of-Publication"].mode()

In [None]:
# A year of 0 is a placeholder; substitute 2002.
# NOTE(review): 2002 is presumably the mode shown by the previous cell - confirm.
# Assign the result back instead of calling an in-place method on a column
# selection, which does not update `data` when pandas copy-on-write is active.
data["Year-Of-Publication"] = data["Year-Of-Publication"].replace({0: 2002})

In [None]:
data["Year-Of-Publication"].unique()

In [None]:
data["Year-Of-Publication"]=data["Year-Of-Publication"].astype(int)

In [None]:
# Cap publication years at 2016: any later year is replaced by 2016.
# Vectorised with Series.clip instead of a Python loop; `list1` stays a plain
# list because the following cells iterate over it and assign it back.
list1 = data["Year-Of-Publication"].clip(upper=2016).tolist()


In [None]:
for i in list1:
    if i>2016:
        print(i)

In [None]:
data["Year-Of-Publication"]=list1

In [None]:
data["Year-Of-Publication"].value_counts().sort_index().head()

In [None]:
# Floor publication years at 1376: any earlier year is replaced by 1376.
# Vectorised with Series.clip instead of a Python loop; `list2` stays a plain
# list because the next cell assigns it back to the column.
list2 = data["Year-Of-Publication"].clip(lower=1376).tolist()

In [None]:
data["Year-Of-Publication"]=list2

In [None]:
data["Year-Of-Publication"].unique()

In [None]:
data.head()

## Cleaning the Publisher column

In [None]:
data.Publisher.isnull().sum()

In [None]:
data.Publisher.unique()

In [None]:
data["Publisher"].sort_values().head()

In [None]:
data[data.Publisher.isnull()]

In [None]:
# Fill the publisher at row position 824289.
# A single .iloc call on the frame avoids chained assignment
# (`data.Publisher.iloc[...] = ...`), which can write to a temporary copy.
data.iloc[824289, data.columns.get_loc("Publisher")] = "Editions P. Terrail"

In [None]:
# Fill the publisher at row position 824598.
# A single .iloc call on the frame avoids chained assignment
# (`data.Publisher.iloc[...] = ...`), which can write to a temporary copy.
data.iloc[824598, data.columns.get_loc("Publisher")] = "Editions P. Terrail"

In [None]:
data.head()

In [None]:
data.isnull().sum()

## Cleaning the Book Author Column

In [None]:
data[data["Book-Author"].isnull()]

In [None]:
data["Book-Author"].value_counts().sort_values(ascending=False).head()

In [None]:
# Fill the author at row position 929219.
# NOTE(review): "Stephen King" looks like it was taken from the most frequent
# authors listed in the previous cell - confirm it is right for this book.
# A single .iloc call on the frame avoids chained assignment, which can write
# to a temporary copy.
data.iloc[929219, data.columns.get_loc("Book-Author")] = "Stephen King"

In [None]:
data["Book-Author"].isnull().sum()

In [None]:
data["Book-Author"].head()

In [None]:
# Dropping the Location column

In [None]:
data.drop("Location",axis=1,inplace=True)

In [None]:
data.head()

## Cleaning the User Id column

In [None]:
data["User-ID"].isnull().sum()

#### Missing user ids cannot be filled in meaningfully, so we drop the rows that contain null values

In [None]:
data.isnull().sum()

In [None]:
data.dropna(inplace=True)

In [None]:
data.isnull().sum()

In [None]:
data.shape

#### Now We are left with around 10 lakh rows of data to work with

In [None]:
data.head()

## Let's start building the recommender, but first let's look at the distribution of ratings

In [None]:
# Pass the ratings as `x=`: newer seaborn releases treat the first positional
# argument as `data`, so a positional Series no longer gives one bar per rating.
sns.countplot(x=data["Book-Rating"])

In [None]:
data["Book-Rating"].value_counts()

### We assume that a rating of "0" means the user has not rated the book, so let's replace it with a missing value (None/NaN)

In [None]:
data.head()

In [None]:
ratng=data.copy()

In [None]:
# Mark the implicit "0" ratings as missing.
# Use np.nan rather than None: with older pandas `replace(0.0, None)` means
# "no replacement value given" and forward-fills the zeros with the previous
# row's rating. The result is assigned back because an in-place method on a
# column selection does not update `ratng` under copy-on-write.
ratng["Book-Rating"] = ratng["Book-Rating"].replace(0, np.nan)

In [None]:
ratng.head()

In [None]:
ratng[ratng["Book-Rating"]==0]

In [None]:
# NOTE(review): this overwrites the first row's rating with a hard-coded 5 -
# presumably a workaround for a zero left behind by the replace step above;
# confirm it is still wanted.
# A single .iloc call on the frame avoids chained assignment, which can write
# to a temporary copy.
ratng.iloc[0, ratng.columns.get_loc("Book-Rating")] = 5

In [None]:
### Let's check the distribution again

In [None]:
# Pass the ratings as `x=`: newer seaborn releases treat the first positional
# argument as `data`, so a positional Series no longer gives one bar per rating.
sns.countplot(x=ratng["Book-Rating"])

# Recommender 1 : Popularity Based

## For our first recommender we make recommendations using only those books that have been rated by users

In [None]:
# Popularity table: sum the ratings per (ISBN, title, author) and keep the
# ten largest totals as a one-column DataFrame.
rec1 = (
    ratng
    .groupby(["ISBN", "Book-Title", "Book-Author"])["Book-Rating"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

In [None]:
rec1

# The above DataFrame shows the 10 books with the highest total rating (the sum of all ratings each book received); these are recommended as the popular choice

#                              ------------------------------------------------------------------------------------------------------------------------------

# Recommender 2 : Collaborative Filtering Based 

#### The main idea behind user-based collaborative filtering (UB-CF) is that people with similar characteristics share similar tastes.
#### For example, suppose we want to recommend a movie to our friend Bob, and Bob and I have seen many movies together and rated them almost identically.
#### It makes sense to assume that we will continue to like similar movies in the future, and to use this similarity to recommend movies.

In [None]:
top_index=ratng["User-ID"].value_counts().sort_values(ascending=False).head(500).index
top_index

In [None]:
df=ratng[ratng["User-ID"].isin(top_index)]
df.head()

In [None]:
df.head()

In [None]:
idcount=df["User-ID"].value_counts()

In [None]:
idcount.shape

In [None]:
df121=df[df["User-ID"].isin(idcount[idcount>=1500].index)]

In [None]:
zxc=df121.groupby("Book-Title")["Book-Rating"].sum().reset_index()
zxc.head()

In [None]:
zxc=zxc[zxc["Book-Rating"]>200]

In [None]:
zxc["Book-Rating"].max()

In [None]:
df_mat=df121[df121["Book-Title"].isin(zxc["Book-Title"])]

In [None]:
matrix=df_mat.pivot(index="User-ID",columns="ISBN",values="Book-Rating")

In [None]:
matrix.fillna(0,inplace=True)

In [None]:
matrix

## Importing the library to calculate the cosine similarity between users

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# User-to-user cosine similarity computed from the rows of `matrix`.
cos_sim = cosine_similarity(matrix)

# A user's similarity with themselves would be 1; it is set to 0 so that a
# user never ranks as their own closest match in the lookups below.
np.fill_diagonal(cos_sim, 0)

# Label both axes with the user ids so similarities can be looked up by id.
rec_cos = pd.DataFrame(cos_sim, index=matrix.index, columns=matrix.index)
rec_cos.head()

#### Validating our result

In [None]:
df_mat[df_mat["User-ID"]==16795.0][["Book-Title","Book-Rating"]].head()

In [None]:
df_mat[df_mat["User-ID"]==135149.0][["Book-Title","Book-Rating"]].head()

### Building a function to show the top n users that are most similar to the input user

In [None]:
def sim(userid,n):
    """Print the ids of the `n` users most similar to `userid`.

    Reads the `userid` column of the global `rec_cos` similarity table,
    orders it from most to least similar and prints the first `n` index
    labels as a NumPy array. Nothing is returned.
    """
    similarity_to_user = rec_cos[userid]
    ranked = similarity_to_user.sort_values(ascending=False)
    nearest_ids = ranked.head(n).index
    print(np.array(nearest_ids))

In [None]:
print(np.array(rec_cos[98391.0].sort_values(ascending=False).head(10).index))

In [None]:
sim(98391.0,20)        # NOTE(review): the id is written as a float - presumably because User-ID became float when the left merge introduced NaNs; confirm whether the ".0" is actually required

In [None]:
def book_recommender(userid=None, n=None, n_neighbors=5):
    """Recommend books to a user from the ratings of their most similar users.

    Picks the ``n_neighbors`` users with the highest cosine similarity to
    ``userid`` in the global ``rec_cos`` table, averages those users' ratings
    per title using the global ``df_mat`` frame, and returns the titles with
    the highest mean rating.

    Parameters
    ----------
    userid : int or float, optional
        Id of the user to recommend for. Asked for interactively when omitted.
    n : int, optional
        Number of titles to return. Asked for interactively when omitted.
    n_neighbors : int, default 5
        Number of most-similar users whose ratings are pooled.

    Returns
    -------
    pandas.Series
        Up to ``n`` book titles, ordered by descending mean neighbour rating.

    Raises
    ------
    KeyError
        If ``userid`` is not a column of ``rec_cos``.
    """
    # Prompt only for what the caller did not supply, so the function can also
    # be driven non-interactively (e.g. under Restart & Run All).
    if userid is None:
        print()
        print()
        userid = int(input("Enter the user id to whom you want to recommend : "))
    if n is None:
        print()
        print()
        n = int(input("Enter how many books you want to recommend : "))
        print()
        print()

    if userid not in rec_cos.columns:
        raise KeyError(f"User id {userid!r} is not present in the similarity matrix")

    # Rank the other users by similarity; the user is excluded explicitly so
    # they can never be picked as their own neighbour on zero-similarity ties.
    similarities = rec_cos[userid]
    similarities = similarities[similarities.index != userid]
    neighbour_ids = similarities.sort_values(ascending=False).head(n_neighbors).index

    # Pool every rating made by the neighbours. Selecting with isin works for
    # any number of neighbours, and avoids DataFrame.append, which was removed
    # in pandas 2.0.
    neighbour_ratings = df_mat.loc[
        df_mat["User-ID"].isin(neighbour_ids), ["Book-Title", "Book-Rating"]
    ]

    mean_rating_by_title = (
        neighbour_ratings
        .groupby("Book-Title")["Book-Rating"]
        .mean()
        .sort_values(ascending=False)
    )
    return pd.Series(mean_rating_by_title.index).head(n)

In [None]:
book_recommender()

# This is only a basic recommender system and much more work still has to be done 