In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the right libraries


In [None]:
import pandas as pd
# Show up to 100 columns when displaying a frame. The fully qualified option
# name is used because the bare 'max_columns' pattern matches more than one
# option in recent pandas releases, and set_option rejects ambiguous patterns.
pd.set_option('display.max_columns', 100)
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
from collections import Counter
from wordcloud import WordCloud
from datetime import datetime
import nltk
from nltk.corpus import stopwords

# Loading Data

In [None]:
# Read the bestsellers CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = '../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Preview the last rows of the freshly loaded frame.
df.tail()

# Data cleaning

In [None]:
# Print the frame's summary (columns, non-null counts, dtypes) to check for missing data.
df.info()

There seems to be no missing data in this dataset. 

In [None]:
# Keep a single row per title: the first occurrence of each 'Name' is retained.
df = df.drop_duplicates(subset='Name', keep='first')
df.tail()

I decided to keep only one row per title: books that appear under the same title in more than one year are reduced to their first occurrence.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the columns; used below to spot anomalies such as a minimum price of 0.
df.describe()

Using the describe function can reveal some anomalies from the very start. Look at the minimum price: some books are priced at 0, i.e. they are basically free, and those are anomalies.

In [None]:
# Display the rows whose price is exactly 0.
df.loc[df['Price'].eq(0)]

In [None]:
# Drop the rows priced at 0. The explicit copy makes the result an independent
# frame, so the column assignments in later cells do not write into a slice
# of the previous frame (which triggers SettingWithCopyWarning).
df = df[df['Price'] != 0].copy()

Books that are unpriced should be removed from the data set. 

# Data Visualisation 

**Price**

In [None]:
# Three side-by-side distributions of price: raw, log1p-transformed and log-transformed.
fig = plt.figure(figsize=(15, 5))

# (values to plot, explicit x-axis label or None to keep the default label)
panels = [
    (df['Price'], None),
    (np.log1p(df['Price']), 'log1p(price)'),
    (np.log(df['Price']), 'log(price)'),
]

for position, (values, xlabel) in enumerate(panels, start=1):
    fig.add_subplot(1, 3, position)
    sns.distplot(values)
    if xlabel is not None:
        plt.xlabel(xlabel)

plt.tight_layout()
plt.show()

I plotted a total of three distributions of price. As seen from the first plot, the distribution is skewed. Therefore, I applied a log transform (log1p in the second plot, log in the third) to obtain a more even distribution.

In [None]:
# Add the log1p-transformed price as the 'log(price)' column. `assign` returns
# a new frame instead of writing into `df` in place, so this cell works without
# a SettingWithCopyWarning even when `df` is a filtered view of an earlier frame.
df = df.assign(**{'log(price)': np.log1p(df['Price'])})

I added a log(price) column (computed as log1p of the price) to our dataframe.

**Name**

In [None]:
def preprocess(rows):
    """Normalise a title for word-level analysis.

    The value is converted to ``str`` and lower-cased, then every character
    that is not an ASCII letter or an underscore is replaced by a single
    space. Replacement is one-for-one, so runs of punctuation or digits
    become runs of spaces.

    Parameters
    ----------
    rows : object
        The value to clean (one title).

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    lowered = str(rows).lower()
    return "".join(
        char if ("a" <= char <= "z" or "A" <= char <= "Z" or char == "_") else " "
        for char in lowered
    )

# Store the cleaned version of every title and preview it.
cleaned_titles = df['Name'].map(preprocess)
df['process_name'] = cleaned_titles
df['process_name'].head()

Firstly, I needed to clean the titles. Then, we are able to obtain character length and word length. 

In [None]:
# Character length of each cleaned title, ignoring leading/trailing whitespace.
df['name_character_length'] = df['process_name'].str.strip().str.len()
# Word count of each cleaned title. Splitting on any run of whitespace (rather
# than on a single space) avoids counting the empty strings that appear wherever
# punctuation or digits were replaced by consecutive spaces.
df['name_word_length'] = df['process_name'].str.split().str.len()

In [None]:
# 3x2 grid: distributions of title length (row 1), then title length against
# price (row 2) and against log(price) (row 3).
fig = plt.figure(figsize=(15, 10))

length_columns = ['name_character_length', 'name_word_length']
distribution_titles = [
    'Distribution of character length in name',
    'Distribution of word length in name',
]

for position, (column, title) in enumerate(zip(length_columns, distribution_titles), start=1):
    fig.add_subplot(3, 2, position)
    sns.distplot(df[column])
    plt.title(title)

position = 3
for price_column in ['Price', 'log(price)']:
    for column in length_columns:
        fig.add_subplot(3, 2, position)
        # x and y are passed by keyword: current seaborn releases no longer
        # accept them positionally.
        sns.scatterplot(x=df[column], y=df[price_column])
        position += 1

plt.tight_layout()
plt.show()


Most titles have around 25 characters and about a total of 5 words. I attempted to visualise the relationship between price and words. However, as seen from the scatterplot, there seems to be no clear relationship.

In [None]:
# Collect every token of the cleaned titles, leaving out English stop words.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    # The NLTK stop-word corpus is not installed locally: fetch it once and retry.
    nltk.download('stopwords', quiet=True)
    english_stopwords = stopwords.words('english')

# Set membership is constant-time; the list would be scanned once per token.
stopword_lookup = set(english_stopwords)

# Titles are split on a single space, so runs of spaces yield empty-string
# tokens; they are kept here, exactly as before.
name_words = [
    token
    for title in df['process_name'].values
    for token in title.split(' ')
    if token not in stopword_lookup
]

In [None]:
# Report how many tokens were collected and how many of them are distinct.
distinct_words = set(name_words)
print("Total no of words : ", len(name_words))
print("Total unique words : ", len(distinct_words))

In [None]:
# Word cloud of the 100 most frequent title tokens, sized by frequency.
name_reqs = dict(Counter(name_words).most_common(100))

plt.figure(figsize=(20, 20))
cloud_builder = WordCloud(
    width=800,
    height=600,
    min_font_size=10,
    background_color='white',
)
wordcloud = cloud_builder.generate_from_frequencies(name_reqs)
plt.imshow(wordcloud)
plt.tight_layout()

From the wordcloud, you could see the most common words. 

In [None]:
# Table of the 30 most frequent title tokens.
list_word = Counter(name_words).most_common(30)
df_words = pd.DataFrame(list_word, columns=['word', 'frequency'])
# Remove the empty-string token (an artefact of splitting on single spaces) by
# value. Dropping by `df.index[0]` relied on the first label of a different
# frame happening to match the row that holds the empty token.
df_words = df_words[df_words['word'] != '']
df_words.head()

In [None]:
# Working frame for the per-word analysis. It is copied because indicator
# columns are added to it afterwards; writing into a column subset of `df`
# would raise SettingWithCopyWarning.
temp_df = df[['process_name', 'Price', 'log(price)']].copy()

In [None]:
# Add one 0/1 indicator column per frequent word, marking the titles that contain it.
words_list = df_words['word'].tolist()

# Tokenise each title once. Membership is tested against whole tokens, because
# a substring test on the raw title would also flag words that merely contain
# the target (e.g. "art" inside "party").
title_tokens = temp_df['process_name'].str.split().apply(set)

for word in words_list:
    temp_df[word] = title_tokens.apply(lambda tokens: 1 if word in tokens else 0)

temp_df.head()

In [None]:
# For every frequent word, gather the prices of the titles flagged with it and
# record their mean and median.
flagged_prices = [temp_df.loc[temp_df[word] == 1, 'Price'] for word in words_list]

words = {
    'word': list(words_list),
    'mean_price': [prices.mean() for prices in flagged_prices],
    'median_price': [prices.median() for prices in flagged_prices],
}
    

In [None]:
# Turn the per-word statistics into a frame with one row per word.
words = pd.DataFrame(data=words)

In [None]:
# Mean (top) and median (bottom) price of the books containing each frequent
# word, with the corresponding overall statistic drawn as a dotted reference line.
fig = plt.figure(figsize=(15, 10))

fig.add_subplot(2, 1, 1)
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.barplot(
    x=words['word'],
    y=words['mean_price'],
    label="average price of the books with words",
    order=words['word'],
)
plt.axhline(df['Price'].mean(), linestyle=":", label="average mean price of all the books")
plt.xticks(rotation=45)
plt.title("Plot showing average price of books with most frequent words")
plt.legend()

fig.add_subplot(2, 1, 2)
sns.barplot(
    x=words['word'],
    y=words['median_price'],
    label="median price of the books with words",
)
# This line is the overall median, so the legend entry says so.
plt.axhline(df['Price'].median(), linestyle=":", label="median price of all the books")
plt.xticks(rotation=45)
plt.title("Plot showing Median price of books with most frequent words")
plt.legend()

plt.tight_layout()
plt.show()

The visualisations above show the average price of the books containing each of the most frequent words. Books that have different editions are more expensive on average, perhaps because they are textbooks (which usually have many editions printed) and are usually already more expensive.

In [None]:
# Overlay the overall log-price distribution with the log1p of the per-word
# mean and median prices.
plt.figure(figsize=(15, 5))

curves = [
    (df['log(price)'], "Price distribution of overall dataset"),
    (np.log1p(words['mean_price']), "Average Price distribution records with frequent words"),
    (np.log1p(words['median_price']), "Median Price distribution of records with frequent words"),
]
for values, legend_label in curves:
    sns.distplot(values, label=legend_label)

plt.legend()
plt.grid(linestyle=":")
plt.xlabel("Price Distribution")
plt.title("Price Distribution of books having top 30 most frequent words in name vs Overall data")
plt.show()

From the distribution above, there is no clear relationship that price is affected by the words used. 

**Author**

In [None]:
# Mean book price per author.
df_author = df[['Author', 'Price']]

# One grouped pass replaces re-filtering the whole frame for every author.
# sort=False keeps authors in order of first appearance, as `unique()` did.
mean_price_by_author = df_author.groupby('Author', sort=False)['Price'].mean()

authors = {
    'author': mean_price_by_author.index.tolist(),
    'mean_price': mean_price_by_author.tolist(),
}

In [None]:
# Build the author table and order it from the highest to the lowest mean price.
authors = (
    pd.DataFrame(authors)
    .sort_values(by='mean_price', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Mean price per author, with the overall mean and median as reference lines.
plt.figure(figsize=(100, 100))
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.barplot(x=authors['author'], y=authors['mean_price'], label='mean price for each author')
# Distinct colours and line styles keep the two reference lines apart in the legend.
plt.axhline(df['Price'].mean(), color='tab:red', linestyle='-', label='overall mean price')
plt.axhline(df['Price'].median(), color='black', linestyle=':', label='overall median price')
plt.xticks(rotation=90)
plt.title('Average price of books sold by each author')
plt.legend()
plt.tight_layout()
plt.show()




In [None]:
# The ten authors with the highest mean price (the table is sorted in descending order).
authors.head(10)

**User Rating**

In [None]:
# Distribution of the 'User Rating' column.
fig = plt.figure(figsize=(15, 5))
user_ratings = df['User Rating']
sns.distplot(user_ratings)
plt.title('Distribution of user ratings')
plt.tight_layout()
plt.show()

In [None]:
# Price against user rating, on the raw scale (top) and after log1p on both axes (bottom).
fig = plt.figure(figsize=(15, 5))

fig.add_subplot(2, 1, 1)
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.scatterplot(x=df['User Rating'], y=df['Price'])
plt.title('price against user rating')

fig.add_subplot(2, 1, 2)
sns.scatterplot(x=np.log1p(df['User Rating']), y=np.log1p(df['Price']))
plt.title('log(price) against log(user rating)')

plt.tight_layout()

Price seems to increase with rating, but this may not be conclusive, as the data is also clustered at the higher ratings.

**Reviews**

In [None]:
# Distribution of the 'Reviews' column.
fig = plt.figure(figsize=(15, 5))
review_counts = df['Reviews']
sns.distplot(review_counts)
plt.title('Distribution of reviews')
plt.tight_layout()
plt.show()

In [None]:
# Price against number of reviews, on the raw scale (top) and after log1p on both axes (bottom).
fig = plt.figure(figsize=(15, 5))

fig.add_subplot(2, 1, 1)
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.scatterplot(x=df['Reviews'], y=df['Price'])
plt.title('price against reviews')

fig.add_subplot(2, 1, 2)
sns.scatterplot(x=np.log1p(df['Reviews']), y=np.log1p(df['Price']))
plt.title('log(price) against log(reviews)')

plt.tight_layout()

Again, there seems to be no relationship between reviews and price, as seen from the scatter plot. Is there any way to draw relationships? Am I doing this wrong?

**Genre**

There are only 2 main genres. Fiction and non-fiction. There are more non-fiction books than fiction books in this dataset.

In [None]:
# Share of each genre in the dataset.
genre_counts = df['Genre'].value_counts()

plt.figure(figsize=(15, 5))
# Labels come from the counts themselves, so each slice is always named after
# the genre it represents, whatever order value_counts() returns.
plt.pie(genre_counts, labels=genre_counts.index, autopct="%.1f%%")
plt.title('distribution of genres')
plt.tight_layout()


We can also group by genre to count the books in each genre and to compute the mean and median number of reviews per book.

In [None]:
# Per-genre count, mean and median of the 'Reviews' column.
reviews_by_genre = df.groupby('Genre')['Reviews']
genre = reviews_by_genre.agg(count='count', mean='mean', median='median')
genre

In [None]:
# Mean and median number of reviews per genre, one bar chart each.
fig = plt.figure(figsize=(15, 5))

panels = [
    ('mean', 'mean reviews across genres'),
    ('median', 'median reviews across genres'),
]
for position, (statistic, title) in enumerate(panels, start=1):
    fig.add_subplot(1, 2, position)
    sns.barplot(y=statistic, x=genre.index, data=genre)
    plt.title(title)

plt.tight_layout()

# TODO: find a way to combine both statistics in a single seaborn chart.

In [None]:
# Spread of the number of reviews within each genre: box plot (left) and violin plot (right).
fig = plt.figure(figsize=(15, 5))

for position, draw in enumerate((sns.boxplot, sns.violinplot), start=1):
    fig.add_subplot(1, 2, position)
    draw(x=df['Genre'], y=df['Reviews'])

plt.tight_layout()

**Year**

In this section, I attempted to use a pivot table for the first time. Indexing by year and genre, we can get the average of each of the other features.

In [None]:
# Mean of every numeric feature for each (year, genre) pair.
# Only numeric columns are aggregated: asking for the mean of text columns
# (titles, authors) raises a TypeError on pandas 2.x. 'Year' is left out
# because it is part of the index.
numeric_columns = df.select_dtypes(include='number').columns.difference(['Year']).tolist()
pivot = pd.pivot_table(df, index=['Year', 'Genre'], values=numeric_columns, aggfunc='mean')
pivot

Passing `values = 'Price'` restricts the pivot table to the price column; with the default aggregation, each cell is the mean price for that year and genre.

In [None]:
# Price pivot: one row per year, one column per genre.
pivot = df.pivot_table(values='Price', index='Year', columns='Genre')
pivot

It is also possible to better visualise a pivot table through a heatmap, which I thought was pretty cool.

In [None]:
# Heatmap of the year-by-genre price pivot, with each cell annotated by its value.
plt.figure(figsize=(15, 5))
sns.heatmap(data=pivot, annot=True)
plt.tight_layout()