In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the Goodreads books table. Rows that cannot be parsed are dropped
# (with a warning) instead of aborting the whole load.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` is the replacement that keeps the skip-and-warn behaviour.
data = pd.read_csv("/kaggle/input/goodreadsbooks/books.csv", on_bad_lines="warn")
data.head()

Columns Description:

* **bookID**: the unique ID for each book/series
* **title**: the title of the book
* **authors**: the author(s) of the book
* **average_rating**: the average rating of the book, as given by users
* **ISBN**: the ISBN-10 number, which identifies a book's edition and publisher
* **ISBN 13**: the newer 13-digit ISBN format, implemented in 2007
* **language_code**: the language of the book
* **num_pages**: the number of pages in the book
* **ratings_count**: the number of ratings given to the book
* **text_reviews_count**: the number of text reviews left by users

In [None]:
# Count missing values per column.
data.isna().sum()

# Goodreads Books Analysis

## Top 20 Highest-Rated Books (with more than 1,000,000 ratings)

In [None]:
# Keep only books with more than one million ratings, then take the
# 20 with the highest average rating.
top_books = (
    data.loc[data['ratings_count'] > 1000000]
    .sort_values(by='average_rating', ascending=False)
    .head(20)
)

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))

color = sns.color_palette("Set2")
ax = sns.barplot(x="average_rating", y="title", data=top_books, palette=color)

# Write each bar's value just past the end of the (horizontal) bar.
for bar in ax.patches:
    bar_value = bar.get_width()
    ax.text(bar_value + .05, bar.get_y() + 0.5, str(bar_value), fontsize=10, color='k')
plt.show()

## Top 20 Most-Voted Books

In [None]:
# The 20 books with the largest number of ratings.
top_vote = data.sort_values(by='ratings_count', ascending=False).iloc[:20]

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))

color = sns.color_palette("Set2")
ax = sns.barplot(x="ratings_count", y="title", data=top_vote, palette=color)

# Write each bar's value just past the end of the (horizontal) bar.
for bar in ax.patches:
    bar_value = bar.get_width()
    ax.text(bar_value + .05, bar.get_y() + 0.5, str(bar_value), fontsize=10, color='k')
plt.show()

## Top rated but not in top voted

In [None]:
# Titles that appear in the top-rated list but not in the top-voted list.
rated_titles = set(top_books['title'].values)
voted_titles = set(top_vote['title'].values)
list(rated_titles.difference(voted_titles))

## Top voted but not in top rated

In [None]:
# Titles that appear in the top-voted list but not in the top-rated list.
voted_titles = set(top_vote['title'].values)
rated_titles = set(top_books['title'].values)
list(voted_titles.difference(rated_titles))

## Relationship between rating and vote

In [None]:
# Scatter of average rating against number of ratings for every book.
ax = sns.relplot(
    data=data,
    x="ratings_count",
    y="average_rating",
    color='#95a3c3',
    sizes=(100, 200),
    height=7,
    marker='o',
)

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# Work on an independent copy so feature columns added later do not touch `data`.
new_data = data.copy(deep=True)

In [None]:
def fun_only_author(text):
    """Return the part of ``text`` before the first '/' (the first listed author).

    If ``text`` contains no '/', it is returned unchanged.
    """
    first_author, _sep, _rest = text.partition('/')
    return first_author

In [None]:
# Derive the first listed author for every book.
new_data['only_author'] = new_data['authors'].apply(fun_only_author)

In [None]:
# Sum of average ratings per author, counting each (author, title) pair once.
unique_books = new_data.drop_duplicates(subset=['only_author', 'title'], keep='first')
total_rating = unique_books.groupby(by=['only_author']).agg({'average_rating': ['sum']})
total_rating.columns = ['total_rating']
total_rating = total_rating.reset_index()
total_rating = total_rating.sort_values(by=['total_rating'], ascending=False)
total_rating

In [None]:
# Number of distinct titles per author, most prolific first.
titles_by_author = new_data.groupby(by=['only_author'])
total_book = titles_by_author.agg({'title': ['nunique']})
total_book.columns = ['total_book']
total_book = total_book.reset_index()
total_book = total_book.sort_values(by=['total_book'], ascending=False)
total_book

In [None]:
# Combine the per-author book counts and rating sums, then compute the mean
# rating per distinct title.
avg_author = total_book.merge(total_rating, on='only_author', how='outer')
rating_per_book = avg_author['total_rating'] / avg_author['total_book']
avg_author['average_rating'] = rating_per_book.round(2)
# Only authors with more than 26 distinct titles are kept.
prolific = avg_author['total_book'] > 26
avg_author = avg_author[prolific].sort_values(by=['average_rating'], ascending=False)
avg_author

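## Weighted Rating calculation

The score used below is $\frac{v}{v+m} R + \frac{m}{v+m} C$, where $v$ is the book's number of ratings, $R$ is its average rating, $m$ is the minimum number of ratings required (here the 90th percentile of `ratings_count`), and $C$ is the mean rating across all books.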

<img src="https://miro.medium.com/max/736/1*fGziZl2Do-VyQXSCPq_Y2Q.png" />

In [None]:
# One row per (author, title) pair, reduced to the columns the weighted
# rating needs and re-indexed from zero.
vote_columns = ['only_author', 'title', 'average_rating', 'ratings_count']
total_vote = (
    new_data.drop_duplicates(subset=['only_author', 'title'], keep='first')
    .reset_index()
    .loc[:, vote_columns]
)
total_vote

In [None]:
# C: mean of the average ratings over all de-duplicated books.
C = total_vote['average_rating'].mean()
C

In [None]:
# m: the 90th percentile of ratings_count, used as the minimum-votes cutoff.
m = total_vote['ratings_count'].quantile(q=0.9)
m

In [None]:
# Keep only books with at least `m` ratings.
# `.copy()` gives an independent frame: a later cell assigns a new column to
# it, which on a boolean-filtered slice can trigger SettingWithCopyWarning.
total_vote = total_vote[total_vote['ratings_count'] >= m].copy()
total_vote.head()

In [None]:
def weighted_rating(x, m=m, C=C):
    """Blend a book's own average rating with the global mean rating.

    ``x`` must support ``x['ratings_count']`` and ``x['average_rating']``.
    The book's rating is weighted by v / (v + m) and ``C`` by m / (v + m),
    where v is the ratings count. ``m`` and ``C`` default to the notebook-level
    values that exist when this function is defined.
    """
    votes = x['ratings_count']
    rating = x['average_rating']
    own_part = votes / (votes + m) * rating
    prior_part = m / (m + votes) * C
    return own_part + prior_part

In [None]:
# `weighted_rating` only uses column lookups and arithmetic, so it can be
# applied to the whole frame at once instead of row by row. `assign` returns a
# new frame, which also avoids assigning a column onto a filtered slice.
total_vote = total_vote.assign(score=weighted_rating(total_vote))

In [None]:
# Keep the 20 books with the highest weighted score.
total_vote = total_vote.sort_values(by='score', ascending=False).iloc[:20]
total_vote

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))

color = sns.color_palette("Set2")
ax = sns.barplot(x="score", y="title", data=total_vote, palette=color)

# Write each bar's value just past the end of the (horizontal) bar.
for bar in ax.patches:
    bar_value = bar.get_width()
    ax.text(bar_value + .05, bar.get_y() + 0.5, str(bar_value), fontsize=10, color='k')
plt.title("Top 20 Weighted Rating Books")
plt.show()

## Authors with the Most Books

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))

color = sns.color_palette("Set2")
ax = sns.barplot(x="total_book", y="only_author", data=avg_author, palette=color)

# Write each bar's value just past the end of the (horizontal) bar.
for bar in ax.patches:
    bar_value = bar.get_width()
    ax.text(bar_value + .05, bar.get_y() + 0.5, str(bar_value), fontsize=10, color='k')
plt.show()

## Average Rating per Author

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))

color = sns.color_palette("Set2")
ax = sns.barplot(x="average_rating", y="only_author", data=avg_author, palette=color)

# Write each bar's value just past the end of the (horizontal) bar.
for bar in ax.patches:
    bar_value = bar.get_width()
    ax.text(bar_value + .05, bar.get_y() + 0.5, str(bar_value), fontsize=10, color='k')
plt.show()

## Language Distribution

In [None]:
plt.figure(figsize=(15, 7))
ax = sns.countplot(x=data.language_code, data=data)
# Label each bar with its count, placed a little above the top of the bar.
for bar in ax.patches:
    count = bar.get_height()
    ax.annotate(str(count), (bar.get_x() - 0.05, count + 100))

## Top 20 Books by Number of Pages

In [None]:
# The 20 longest books. The column name is used with its two leading spaces.
top_pages = data.sort_values(by='  num_pages', ascending=False).iloc[:20]

In [None]:
sns.set(style="darkgrid")
plt.figure(figsize=(10, 10))

color = sns.color_palette("Set2")
ax = sns.barplot(x="  num_pages", y="title", data=top_pages, palette=color)

# Write each bar's value just past the end of the (horizontal) bar.
for bar in ax.patches:
    bar_value = bar.get_width()
    ax.text(bar_value + .05, bar.get_y() + 0.5, str(bar_value), fontsize=10, color='k')
plt.show()

## Relationship between rating and pages

In [None]:
# Scatter of page count against average rating for every book.
ax = sns.relplot(
    data=data,
    x="average_rating",
    y="  num_pages",
    color='#95a3c3',
    sizes=(100, 200),
    height=7,
    marker='o',
)

## Relationship between pages and ratings_count

In [None]:
# Scatter of number of ratings against page count for every book.
ax = sns.relplot(
    data=data,
    x="  num_pages",
    y="ratings_count",
    color='#95a3c3',
    sizes=(100, 200),
    height=7,
    marker='o',
)

## Word Cloud of All Titles

In [None]:
# Distinct book titles, used as the text for the word cloud.
title_value = data['title'].unique()

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
plt.subplots(figsize=(15,15))

# Join every distinct title into one string and build the cloud from it.
all_titles_text = " ".join(title_value)
wordcloud = WordCloud(
    background_color='#000',
    width=650,
    height=550,
    stopwords=STOPWORDS,
).generate(all_titles_text)

plt.imshow(wordcloud)
plt.axis('off')
# NOTE(review): the figure is saved before the caption below is added, so
# graph.png will not contain the caption - confirm this is intended.
plt.savefig('graph.png')

plt.figtext(.5,.91,'Data cloud of All title', color='#062175', fontsize=25, ha='center')
plt.show()

The recommendation engine below borrows heavily from this notebook: [https://www.kaggle.com/hoshi7/goodreads-analysis-and-recommending-books](https://www.kaggle.com/hoshi7/goodreads-analysis-and-recommending-books)

# Recommendation Engine

In [None]:
# Number of distinct first authors.
distinct_authors = new_data['only_author'].unique()
len(distinct_authors)

## create new feature

In [None]:
# Label each book with the unit-wide rating bucket its average rating falls in.
# Buckets are (lower, upper], except the first one which also includes 0.
ratings = new_data['average_rating']
for lower in range(5):
    upper = lower + 1
    above_lower = (ratings >= lower) if lower == 0 else (ratings > lower)
    in_bucket = above_lower & (ratings <= upper)
    new_data.loc[in_bucket, 'rating_between'] = f"between_{lower}_to_{upper}"

In [None]:
# Check the new column on the first two rows.
new_data.head(n=2)

In [None]:
# Build an (n_books, 2) array: column 0 is average_rating, column 1 is ratings_count.
trial = new_data[['average_rating', 'ratings_count']]
data_model = np.column_stack((trial['average_rating'], trial['ratings_count']))
data_model

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow Method
# Fit KMeans for 1 to 40 clusters and record each model's inertia.

score = []
x = data_model
for cluster in range(1, 41):
    kmeans = KMeans(n_clusters=cluster, init="k-means++", random_state=40).fit(x)
    score.append(kmeans.inertia_)

In [None]:
# Plot the recorded inertia against the number of clusters.
plt.figure(figsize=(15, 10))
# Take the x-axis from len(score) so it always matches the values collected
# by the elbow loop, rather than repeating its hard-coded upper bound.
plt.plot(range(1, len(score) + 1), score)
plt.title('The Elbow Method')
# Axis labels were previously commented out; a figure should stand on its own.
plt.xlabel('no of clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
# One indicator column per rating bucket.
rating_labels = new_data['rating_between']
rating_between_df = rating_labels.str.get_dummies(sep=",")
rating_between_df.head()

In [None]:
# One indicator column per language code.
language_labels = new_data['language_code']
lang_df = language_labels.str.get_dummies(sep=",")
lang_df.head()

In [None]:
# Feature table: rating-bucket indicators, language indicators, and the two
# numeric columns, side by side.
feature_blocks = [
    rating_between_df,
    lang_df,
    new_data['average_rating'],
    new_data['ratings_count'],
]
engine_features = pd.concat(feature_blocks, axis=1)
engine_features.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the feature table, then replace the table with its
# scaled version.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(engine_features)
engine_features = min_max_scaler.transform(engine_features)

In [None]:
from sklearn import neighbors

In [None]:
# Nearest-neighbour index returning 6 neighbours per query.
# NOTE(review): it is queried below with the same points it is fitted on, so
# presumably one of the 6 is the book itself - confirm.
engine_model = neighbors.NearestNeighbors(
    n_neighbors=6,
    algorithm='ball_tree',
)

In [None]:
# Build the neighbour index from the scaled features.
engine_model.fit(X=engine_features)

In [None]:
# For every book: distances to, and row positions of, its 6 nearest neighbours.
dist, idlist = engine_model.kneighbors(X=engine_features, return_distance=True)

In [None]:
def book_recommendation_engine(book_name):
    """Return the titles of the books nearest to ``book_name``.

    Finds the first row of ``new_data`` whose ``title`` equals ``book_name``
    and returns the titles of the rows listed for it in ``idlist``.

    Parameters
    ----------
    book_name : str
        Exact title to look up.

    Returns
    -------
    list
        Titles of the neighbouring rows, in the order given by ``idlist``.

    Raises
    ------
    IndexError
        If no row of ``new_data`` has this title.
    """
    # `idlist` holds row positions, so look the book up by position and read
    # titles back by position. Mixing in index labels (.index / .loc) only
    # works when `new_data` happens to have a 0..n-1 index.
    matches = np.flatnonzero((new_data['title'] == book_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no book titled {book_name!r} found in new_data")
    neighbour_positions = idlist[matches[0]]
    return new_data['title'].iloc[neighbour_positions].tolist()

In [None]:
# Example query: recommendations for a single title.
book_list_name = book_recommendation_engine(
    book_name='The Da Vinci Code (Robert Langdon  #2)'
)
book_list_name

#### The results are not perfect, but I am happy with them. I think we would need each book's category (genre) to make the recommendations better. Try it yourself and let me know if you find a book of your choice.