In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import os

In [None]:
# Load the Goodreads catalogue, indexed by bookID. Rows that cannot be parsed
# (too many fields) are dropped with a warning instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`. Try the current keyword first and fall back to
# the legacy one so the cell runs on either side of that change.
try:
    books = pd.read_csv('../input/books.csv', on_bad_lines='warn', index_col='bookID')
except TypeError:
    # Older pandas: read_csv() does not know `on_bad_lines`.
    books = pd.read_csv('../input/books.csv', error_bad_lines=False, index_col='bookID')

The Goodreads books dataset contains information about 13714 books.<br>
For each book it includes the title, author(s), average rating on Goodreads, ISBN, ISBN13, language, number of pages, number of ratings and number of text reviews.

In [None]:
# Preview the first five rows to check that the columns were parsed as expected.
books.head(n=5)

Types of data:
1. Continuous numeric - average_rating
1. Discrete numeric - # num_pages, ratings_count and text_reviews_count
1. Categorical - authors, language_code
1. Text - title

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
books.info()

Mean, min, max, standard deviation and distribution of numerical data.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
numeric_cols = ['average_rating', '# num_pages', 'ratings_count', 'text_reviews_count']
books[numeric_cols].describe()

* A rating backed by a larger rating count should carry more weight than the same rating backed by a smaller rating count. As the rating count increases, the spread of the individual ratings across the rating scale also increases. Hence, a book with a higher rating count weighs more than a book with a lower rating count and the same rating.
* The number of ratings of each book can be used as the weight when calculating the weighted mean of the average rating.
* The weighted mean provides a more realistic estimate.

In [None]:
# Mean of average_rating over all books, weighting each book by its ratings_count.
overall_weighted_rating = np.average(books['average_rating'], axis=0, weights=books['ratings_count'])
print('weighted(rating_count) mean ratings: ', overall_weighted_rating)

In [None]:
# Side-by-side box plots for every book: average rating (left, green) and
# number of pages (right, red).
fg, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))

# (column, colour, panel title) for each subplot, left to right.
box_panels = [
    ('average_rating', 'g', 'Average Rating'),
    ('# num_pages', 'r', 'Number of Pages'),
]
for panel_ax, (column, colour, heading) in zip(ax, box_panels):
    sns.boxplot(y=books[column], data=books, ax=panel_ax, color=colour)
    panel_ax.set_title(heading)

plt.show()

There is a huge range in the number of pages. Outliers with more than 2000 pages are mostly collections of books from a series.
<br> Books with fewer than 2000 pages make up <b>99.20%</b> of all books. Hence, it is acceptable to drop books with more than 2000 pages for some of the calculations.

In [None]:
# Keep only books whose page count is strictly between 0 and 2000, then report
# what percentage of the catalogue survives the filter.
page_counts = books['# num_pages']
has_sane_page_count = page_counts.gt(0) & page_counts.lt(2000)
valid = books[has_sane_page_count]

kept_percent = len(valid)/len(books)*100
print(kept_percent, '% books')

After removing the outlier books with more than 2000 pages, the distribution of the number of pages looks better.

In [None]:
# Same pair of box plots as for the full catalogue, restricted to the
# page-count-filtered `valid` frame.
fg, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))

# (column, colour, panel title) for each subplot, left to right.
box_panels = [
    ('average_rating', 'g', 'Average Rating'),
    ('# num_pages', 'r', 'Number of Pages'),
]
for panel_ax, (column, colour, heading) in zip(ax, box_panels):
    sns.boxplot(y=valid[column], data=valid, ax=panel_ax, color=colour)
    panel_ax.set_title(heading)

plt.show()

In [None]:
# Distribution plots of average rating (left, green) and number of pages
# (right, red) for the page-count-filtered books.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot /
# displot replace it) - confirm the installed seaborn version before upgrading.
fg, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

# (column, colour, panel title) for each subplot, left to right.
dist_panels = [
    ('average_rating', 'g', 'Average Rating'),
    ('# num_pages', 'r', 'Number of Pages'),
]
for panel_ax, (column, colour, heading) in zip(ax, dist_panels):
    sns.distplot(valid[column], ax=panel_ax, color=colour)
    panel_ax.set_title(heading)

plt.show()

In [None]:
# Bivariate density of average rating against number of pages for the
# page-count-filtered books, drawn as filled contours.
# NOTE(review): the positional second variable and the `shade` / `shade_lowest`
# keywords follow the older seaborn kdeplot signature - confirm against the
# installed seaborn version.
plt.figure(figsize=(10, 10))
rating_values = valid['average_rating']
page_values = valid['# num_pages']
sns.kdeplot(rating_values, page_values, cmap='Blues', shade=True, shade_lowest=True)
plt.show()

We can see there is a high positive correlation between ratings_count and text_reviews_count.

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
# with the colour scale fixed to [-1, 1] and centred on 0.
corr_columns = ['average_rating', '# num_pages', 'ratings_count', 'text_reviews_count']
correlation = books[corr_columns].corr()
sns.heatmap(correlation, annot=True, vmin=-1, vmax=1, center=0)
plt.show()

There are books in 30 different languages in total. English appears in 4 variants: eng, en-US, en-GB and en-CA.

In [None]:
# Distinct language codes present in the catalogue, in order of first appearance.
pd.unique(books['language_code'])

More than 90% of the books are in English or one of its variants.

In [None]:
# Number of books per language code as a one-column frame, most frequent first.
lang_freq_table = books['language_code'].value_counts().to_frame()
lang_freq_table

In [None]:
# Pie chart of the share of books per language code.
lang_freq_table.plot.pie(subplots=True, figsize=(10, 10))
plt.show()

Languages which have more than 200 books have very similar interquartile ranges for average rating.

In [None]:
# One box per language code showing the spread of average rating (all books).
plt.figure(figsize=(20, 10))
sns.boxplot(x=books['language_code'], y=books['average_rating'])
plt.show()

Books written in the <b>enm</b> language are significantly longer compared to books in other languages. This may not be true in general, since we have just 3 samples for the <b>enm</b> language.
<br>Books in the <b>jpn</b> language have a very narrow distribution of number of pages even though we have 63 samples. That implies that books in the <b>jpn</b> language have approximately the same number of pages.

In [None]:
# One box per language code showing the spread of page counts, using only the
# page-count-filtered books.
plt.figure(figsize=(20, 10))
sns.boxplot(x=valid['language_code'], y=valid['# num_pages'])
plt.show()

In [None]:
# Per-language medians for languages with more than 10 books.
#
# Counts are taken straight from the Series returned by value_counts() rather
# than from a column of a frequency DataFrame: the name of that column differs
# between pandas versions ('language_code' before 2.0, 'count' from 2.0 on), so
# looking it up by name is fragile.
lang_counts = books['language_code'].value_counts()
cons_lang = lang_counts[lang_counts > 10].index

# numeric_only=True restricts the median to numeric columns. Older pandas
# dropped text columns (title, authors, ...) silently; pandas >= 2.0 raises a
# TypeError on them unless this is passed explicitly.
lang_wise = (
    books[books['language_code'].isin(cons_lang)]
    .groupby('language_code')
    .median(numeric_only=True)
)

# Assignment aligns on the language-code index, so row order does not matter.
lang_wise['book_count'] = lang_counts.loc[cons_lang]

Since many languages have very few samples (10 or fewer), here are the language-wise medians of average rating, number of pages, ratings count and text reviews count for the languages which have more than 10 samples.

In [None]:
# Show the per-language table with the book count first, followed by the medians.
lang_summary_cols = ['book_count', 'average_rating', '# num_pages', 'ratings_count', 'text_reviews_count']
lang_wise[lang_summary_cols]

Since more than 80% of the books are in English, the overall central-tendency estimate (median) is heavily influenced by the median of the English-language books.

The median rating of all books (3.96) is similar to the median rating of books in English.
<br>The overall median rating (3.96) is shown as a vertical line in the plot.

In [None]:
# Horizontal bars: median average rating per language, highest first, with the
# median over all books drawn as a vertical reference line.
#
# The reference value is computed from `books` instead of being hard-coded, so
# it cannot drift out of sync with the data. axvline spans the whole axis
# height, so the line covers the first and last bars as well. Sorting into a
# new frame leaves `lang_wise` itself untouched, which keeps the cell
# re-runnable without side effects.
rating_ranked = lang_wise.sort_values('average_rating', ascending=False)
overall_median_rating = books['average_rating'].median()

plt.figure(figsize=(20,10))
a = sns.barplot(x='average_rating', y=rating_ranked.index, data=rating_ranked, orient='h')
a.axvline(overall_median_rating, linewidth=2)
plt.show()

The median number of pages of all books is shown as a vertical line at 301 in the plot.
<br>The <b>zho</b> language has books with fewer pages, whereas the <b>mul</b> language has books with more pages, compared to books in other languages.

In [None]:
# Horizontal bars: median page count per language, highest first, with the
# median over all books drawn as a vertical reference line.
#
# The reference value is computed from `books` instead of being hard-coded, so
# it cannot drift out of sync with the data. axvline spans the whole axis
# height, so the line covers the first and last bars as well. Sorting into a
# new frame leaves `lang_wise` itself untouched.
pages_ranked = lang_wise.sort_values('# num_pages', ascending=False)
overall_median_pages = books['# num_pages'].median()

plt.figure(figsize=(20,10))
a = sns.barplot(x='# num_pages', y=pages_ranked.index, data=pages_ranked, orient='h')
a.axvline(overall_median_pages, linewidth=2)
plt.show()

Clearly, English books have far more ratings than books in any other language.

In [None]:
# Horizontal bars: median ratings count per language, highest first, with the
# median over all books drawn as a vertical reference line.
#
# The reference value is computed from `books` instead of being hard-coded, so
# it cannot drift out of sync with the data. axvline spans the whole axis
# height, so the line covers the first and last bars as well. Sorting into a
# new frame leaves `lang_wise` itself untouched.
ratings_count_ranked = lang_wise.sort_values('ratings_count', ascending=False)
overall_median_ratings_count = books['ratings_count'].median()

plt.figure(figsize=(20,10))
a = sns.barplot(x='ratings_count', y=ratings_count_ranked.index, data=ratings_count_ranked, orient='h')
a.axvline(overall_median_ratings_count, linewidth=2)
plt.show()

Again, English books have far more text reviews than books in any other language.

In [None]:
# Horizontal bars: median text-review count per language, highest first, with
# the median over all books drawn as a vertical reference line.
#
# The reference value is computed from `books` instead of being hard-coded, so
# it cannot drift out of sync with the data. axvline spans the whole axis
# height, so the line covers the first and last bars as well. Sorting into a
# new frame leaves `lang_wise` itself untouched.
text_reviews_ranked = lang_wise.sort_values('text_reviews_count', ascending=False)
overall_median_text_reviews = books['text_reviews_count'].median()

plt.figure(figsize=(20,10))
a = sns.barplot(x='text_reviews_count', y=text_reviews_ranked.index, data=text_reviews_ranked, orient='h')
a.axvline(overall_median_text_reviews, linewidth=2)
plt.show()

In [None]:
# Ratings-count-weighted mean of average_rating for each language in cons_lang,
# collected into a one-column frame indexed by language code.
new_dict = {}

for lang in cons_lang:
    in_language = books['language_code'] == lang
    new_dict[lang] = np.average(
        a=books.loc[in_language, 'average_rating'],
        axis=0,
        weights=books.loc[in_language, 'ratings_count'],
    )

wgt_mean = pd.DataFrame.from_dict(new_dict, orient='index', columns=['wgt_mean_rating'])

As mentioned above, the weighted mean is a better estimate of central tendency in this case.
<br>Here are the language-wise weighted mean ratings of the books. The ratings count is used as the weight.

In [None]:
# Order the languages from highest to lowest weighted mean rating and display them.
wgt_mean = wgt_mean.sort_values(by='wgt_mean_rating', ascending=False)
wgt_mean

<b>zho, mul, jpn,</b> and <b>ita</b> have a higher weighted mean rating than the overall weighted mean.

In [None]:
# Horizontal bars: weighted mean rating per language, with the weighted mean
# over all books drawn as a vertical reference line.
#
# The reference value is computed from `books` (ratings_count as weights)
# instead of being hard-coded, and axvline spans the full axis height, so the
# line no longer depends on the length of the unrelated `lang_wise` frame.
overall_wgt_mean_rating = np.average(books['average_rating'], weights=books['ratings_count'])

plt.figure(figsize=(20,10))
a = sns.barplot(x='wgt_mean_rating', y=wgt_mean.index, data=wgt_mean, orient='h')
a.axvline(overall_wgt_mean_rating, linewidth=2)
plt.show()

Word cloud of books' titles

In [None]:
# Word cloud of the 100 most frequent words across all book titles.
# Titles are joined with a space: joining with an empty string glues the last
# word of one title to the first word of the next, which produces bogus tokens
# and distorts the word frequencies.
text = ' '.join(books['title'])
wc = WordCloud(max_font_size=70, max_words=100, background_color='white').generate(text)
plt.figure(figsize=(16,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

Word cloud of authors' names.

In [None]:
# Word cloud of the 100 most frequent words across the authors column.
# Entries are joined with a space: joining with an empty string glues the end
# of one author entry to the start of the next, which produces bogus tokens
# and distorts the word frequencies.
text = ' '.join(books['authors'])
wc = WordCloud(max_font_size=70, max_words=100, background_color='white').generate(text)
plt.figure(figsize=(16,10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Number of books per authors entry as a one-column frame, most frequent first.
auth_freq_table = books['authors'].value_counts().to_frame()

In [None]:
# Per-author means of the numeric columns, plus the number of books per author.
#
# Counts are taken straight from the Series returned by value_counts() rather
# than from a column of a frequency DataFrame: the name of that column differs
# between pandas versions ('authors' before 2.0, 'count' from 2.0 on), so
# looking it up by name is fragile.
auth_counts = books['authors'].value_counts()

# The threshold of 0 keeps every author; raise it to restrict the analysis to
# authors with more books.
cons_auth = auth_counts[auth_counts > 0].index

# numeric_only=True restricts the mean to numeric columns. Older pandas dropped
# text columns silently; pandas >= 2.0 raises a TypeError on them unless this
# is passed explicitly.
authors_wise = (
    books[books['authors'].isin(cons_auth)]
    .groupby('authors')
    .mean(numeric_only=True)
)

# Assignment aligns on the authors index, so row order does not matter.
authors_wise['book_count'] = auth_counts.loc[cons_auth]

In [None]:
# Keep only the columns used below, with the book count first.
author_summary_cols = ['book_count', 'average_rating', '# num_pages', 'ratings_count', 'text_reviews_count']
authors_wise = authors_wise[author_summary_cols]

<b>Top 10 authors according to number of books written by them</b>

In [None]:
# Ten authors with the most books; ties on book count are ordered by higher
# mean average rating. The slice is 10 rows so that the result matches the
# "Top 10" heading and the variable name (it previously returned 12 rows).
top_10_auth_book_count = (
    authors_wise
    .sort_values(['book_count', 'average_rating'], ascending=False)
    .head(10)
)
top_10_auth_book_count

<b>Top 10 authors according to average rating of their books</b>
<br>Only authors who have written more than 5 books and whose books have an average ratings count of more than 100000 are considered.

In [None]:
# Ten best-rated authors among those with more than 5 books and a mean ratings
# count above 100000. Ties on average rating fall through to the remaining sort
# keys in the order listed.
is_prolific = authors_wise['book_count'] > 5
is_widely_rated = authors_wise['ratings_count'] > 1e5
author_rank_keys = ['average_rating', 'book_count', 'ratings_count', '# num_pages', 'text_reviews_count']

top_10_authors = (
    authors_wise[is_prolific & is_widely_rated]
    .sort_values(author_rank_keys, ascending=False)
    .head(10)
)
top_10_authors

<b>Top 10 books according to average rating</b>
<br>Only books with a ratings count of more than 100000 are considered.

In [None]:
# Ten best-rated books among those with more than 100000 ratings. Ties on
# average rating fall through to the remaining sort keys in the order listed.
# The slice is 10 rows so that the result matches the "Top 10" heading and the
# variable name (it previously returned 11 rows).
top_10_books = (
    books[books['ratings_count'] > 1e5]
    .sort_values(['average_rating', 'ratings_count', '# num_pages', 'text_reviews_count'], ascending=False)
    .head(10)
)
top_10_books

**Please add your comments on this kernel.**
<br>**If you like this kernel, do not forget to vote.**