In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis

In [None]:
# import all necessary library..
import numpy as np # for array calculations
import pandas as pd # for dataframe manipulations

# for visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# render figures inline so they appear directly below the plotting cell
%matplotlib inline



In [None]:
# Load the Goodreads books table.
# Malformed rows (wrong number of fields) are skipped with a warning instead of
# aborting the whole load.
books_csv_path = '/kaggle/input/goodreadsbooks/books.csv'
try:
    # pandas >= 1.3: `on_bad_lines` replaces the removed `error_bad_lines` flag;
    # 'warn' matches what `error_bad_lines=False` used to do (skip + warn).
    df = pd.read_csv(books_csv_path, on_bad_lines='warn')
except TypeError:
    # Older pandas does not accept `on_bad_lines`; fall back to the legacy flag.
    df = pd.read_csv(books_csv_path, error_bad_lines=False)
df.head()

## Descriptive Analysis

In [None]:
# List the column (feature) names available in the dataframe.
df.columns

### Features Description:

* bookID
    Contains the unique ID for each book/series
* title
    contains the titles of the books
* authors
    contains the author of the particular book
* average_rating 
    the average rating of the books, as decided by the users
* ISBN (ISBN-10) 
    The 10-digit ISBN, which identifies a book's edition and publisher
* ISBN 13 
    The new format for ISBN, implemented in 2007. 13 digits
* language_code 
    Tells the language for the books
* Num_pages 
    Contains the number of pages for the book
* ratings_count 
    Contains the number of ratings given for the book
* text_reviews_count 
    Has the count of reviews left by users

In [None]:
# Count the missing (null) values in each column.
df.isnull().sum()

### There are no NaN values in any of the columns.

In [None]:
# Summarise the dataframe: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Visualization

### Top 15 Rated Books

In [None]:
# Keep only books with more than one million ratings, so that a handful of
# votes cannot push a book to the top, then rank them by average rating.
# `sort_values` returns a new frame, so its result has to be assigned;
# otherwise `head(15)` shows the rows in their original, unsorted order.
top_fifteen = (
    df[df['ratings_count'] > 1000000]
    .sort_values(by='average_rating', ascending=False)
)
top_fifteen.head(15)

The table above lists the top-rated books among those with more than one million ratings. We saw that the maximum rating in our dataframe was 5.0, but no book in the result above has a 5.0 rating. This is because we filtered the books by their number of ratings, making sure that every book in the result has a substantial number of ratings. A book with only 1 or 2 ratings can easily have an average of 5.0; we want to avoid such books, hence this filtering. 

### Let's go ahead and visualize this outcome in form of a graph.






In [None]:
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were removed later; pick whichever name this installation
# provides and fall back to the default style if neither exists.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(whitegrid_style)
plt.figure(figsize=(10, 10))

# Best-rated books among those that passed the ratings-count filter.
data = top_fifteen.sort_values(by='average_rating', ascending=False).head(15)
gr = sns.barplot(x="average_rating", y="title", data=data, palette="CMRmap_r")
gr.set_title("Top rated books with more than one million ratings")
gr.set_xlabel("Average rating")
gr.set_ylabel("Title")

# Print each bar's rating just past its right end, vertically centred on the bar.
for bar in gr.patches:
    gr.text(bar.get_width() + .05, bar.get_y() + bar.get_height() / 2,
            str(bar.get_width()), fontsize=10, color='k', va='center')
plt.show()

### Top 15 authors present in our data

In [None]:
# Count the titles recorded for every author, then keep the 15 authors with
# the most entries, indexed by author name.
titles_per_author = df.groupby('authors')['title'].count().reset_index()
top_15_authors = (
    titles_per_author
    .sort_values('title', ascending=False)
    .head(15)
    .set_index('authors')
)
top_15_authors.head(15)

Let's go ahead and take a look at the top 15 authors present in our data. We rank them by the number of books they have written, counting only the books that are present in the data.

In [None]:

plt.figure(figsize=(15,10))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=top_15_authors['title'], y=top_15_authors.index, palette='CMRmap_r')

ax.set_title("Top 15 authors with most books")
ax.set_xlabel("Total number of books")
# Write the book count next to the end of every bar.
for i in ax.patches:
    ax.text(i.get_width()+.2, i.get_y()+.2,str(round(i.get_width())), fontsize=15,color='black')
plt.show()

According to our graphs, Stephen King and P.G. Wodehouse have the most books in the data. Both authors have 40 books in our data set, followed by Rumiko Takahashi and Orson Scott Card.

## Relationship between average rating and ratings count

In [None]:
# Scatter plot of the number of ratings (x) against the average rating (y).
# NOTE(review): `sizes` is passed without a `size` variable, so it presumably
# has no visible effect -- confirm against the seaborn documentation.
relplot_options = dict(
    x="ratings_count",
    y="average_rating",
    color='#95a3c3',
    sizes=(400, 600),
    height=7,
    marker='o',
)
ax = sns.relplot(data=df, **relplot_options)

### Language Distribution

In [None]:
# Number of books per language code, with the count written above each bar.
plt.figure(figsize=(15, 7))
ax = sns.countplot(x=df.language_code, data=df)
for bar in ax.patches:
    count = bar.get_height()
    label_position = (bar.get_x()-0.05, count+100)
    ax.annotate(str(count), label_position)

As you can see here, most of the books are in English, so in the feature selection step we could remove the non-English rows from the dataframe to improve accuracy.

### Top 15 publishers

In [None]:
# Count the titles recorded for every publisher, then keep the 15 publishers
# with the most entries, indexed by publisher name.
titles_per_publisher = df.groupby('publisher')['title'].count().reset_index()
top_15_publisher = (
    titles_per_publisher
    .sort_values('title', ascending=False)
    .head(15)
    .set_index('publisher')
)
top_15_publisher.head(15)

Let's visualize these counts to learn more about the publishers.

In [None]:

plt.figure(figsize=(15,10))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=top_15_publisher['title'], y=top_15_publisher.index, palette='CMRmap_r')

ax.set_title("Top 15 publishers with most books")
ax.set_xlabel("Total number of books")
# Write the book count next to the end of every bar.
for i in ax.patches:
    ax.text(i.get_width()+.2, i.get_y()+.2,str(round(i.get_width())), fontsize=15,color='black')
plt.show()

## Distribution of average_rating

In [None]:
# Make sure the ratings are floats before plotting their distribution.
df.average_rating = df.average_rating.astype(float)
fig, ax = plt.subplots(figsize=[15,10])
if hasattr(sns, 'histplot'):
    # `distplot` is deprecated; a density histogram with a KDE overlay is
    # its documented replacement.
    sns.histplot(df['average_rating'], kde=True, stat='density', ax=ax)
else:
    # Older seaborn releases that predate `histplot`.
    sns.distplot(df['average_rating'],ax=ax)
ax.set_title('Average rating distribution for all books',fontsize=20)
ax.set_xlabel('Average rating',fontsize=13)
plt.show()

It almost follows a Gaussian distribution curve, so it is well suited for model training.

After comparing the average rating with the different columns, we can go ahead with using the language and the ratings count for our recommender system. The other columns weren't making much sense, and using them might not help us much, so we can omit them.

# Feature Engineering.

In [None]:
# Recall which columns are available before engineering features.
df.columns

### 1. Imputation

In [None]:
# Maximum tolerated share of missing values per column / per row.
threshold = 0.7

# Columns: keep those whose share of missing values is below the threshold.
column_null_rate = df.isnull().mean()
df = df[df.columns[column_null_rate < threshold]]

# Rows: same rule, evaluated on the frame left after the column filter.
row_null_rate = df.isnull().mean(axis=1)
df = df.loc[row_null_rate < threshold]

In [None]:
# Preview the dataframe after the missing-value filter.
df.head()

### 2.Handling Outliers

In [None]:
# correlation between the features
corrmat = df.corr() 
  
f, ax = plt.subplots(figsize =(9, 8)) 
sns.heatmap(corrmat, ax = ax, cmap ="YlGnBu", linewidths = 0.1) 

Here you can see that text_reviews_count is highly correlated with ratings_count, so you can use either of these features.

In [None]:
# Work on a copy so the feature columns added below do not modify `df`.
df2 =df.copy()

We will now create a new column called 'rating_between'. We will divide our average rating column into various categories such as rating between 0 and 1, 1 and 2 and so on. This will work as one of the features that we will feed to our model so that it can make better predictions.

In [None]:
# Label every book with the one-point band its average rating falls into.
# The first band is closed on both ends ([0, 1]); the others are (low, high].
avg_rating = df2['average_rating']
for low in range(5):
    high = low + 1
    above_low = (avg_rating >= low) if low == 0 else (avg_rating > low)
    in_band = above_low & (avg_rating <= high)
    df2.loc[in_band, 'rating_between'] = f"between {low} and {high}"

In [None]:
# Preview the copy, now including the `rating_between` column.
df2.head()

In [None]:
# One-hot encode the rating band: one indicator column per distinct label.
rating_df = pd.get_dummies(df2['rating_between'])
rating_df.head()

In [None]:
# One-hot encode the language code: one indicator column per distinct code.
l_code_df = pd.get_dummies(df2['language_code'])
l_code_df.head()

In [None]:
# Assemble the model inputs side by side: language indicators, rating-band
# indicators and the two numeric columns.
feature_parts = [
    l_code_df,
    rating_df,
    df2['average_rating'],
    df2['ratings_count'],
]
features = pd.concat(feature_parts, axis=1)
features.head()

Now that we have our features ready, we will use the Min-Max scaler to scale these values down. It rescales every feature to the range 0 to 1 using that feature's minimum and maximum, which reduces the bias towards features with large raw values (such as the ratings count) so that no single feature dominates the distance computation.

## Model Building

In [None]:
# import necessary pakages for k-nearest-neighbour

from sklearn.cluster import KMeans
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Learn each column's minimum and maximum, then rescale the feature matrix
# with them; `features` is rebound to the scaled result.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(features)
features = min_max_scaler.transform(features)

In [None]:
# Fit a ball-tree nearest-neighbour index on the scaled features, then look up
# the 6 nearest rows of every book: `dist` holds the distances and `idlist`
# the matching row positions.
model = neighbors.NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(features)
dist, idlist = model.kneighbors(features)

In [None]:
def book_recommendation_engine(book_name):
    """Return the titles of the books closest to ``book_name`` in feature space.

    Relies on two notebook-level objects: ``df2`` (the book table) and
    ``idlist`` (the neighbour indices returned by ``model.kneighbors``).
    ``idlist`` holds row *positions*, so every lookup here is positional and
    does not depend on the index labels of ``df2``.

    Parameters
    ----------
    book_name : str
        Exact title to look up. When several rows share the title, the first
        one is used.

    Returns
    -------
    list
        Titles of the neighbouring books, in the order stored in ``idlist``.
        The queried book is usually part of the list, because every point is
        among its own nearest neighbours.

    Raises
    ------
    IndexError
        If no book with that exact title exists in ``df2``.
    """
    # Positions (not index labels) of the rows whose title matches.
    matches = np.flatnonzero((df2['title'] == book_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No book titled {book_name!r} found in the dataset")
    row_position = matches[0]
    # Map the neighbours' row positions back to titles.
    return df2['title'].iloc[idlist[row_position]].tolist()

In [None]:
# Example query: titles of the nearest neighbours of 'Little Women'.
book_list_name = book_recommendation_engine('Little Women')
book_list_name

## Working well

              with the help of these wonderful notebooks
             1. https://www.kaggle.com/aayushmishra1512ll
              
             2. https://www.kaggle.com/snanilim
             
              Krish Naik sir.. thanks to all of you!!!