In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
# The bundled seaborn styles were renamed with a 'seaborn-v0_8' prefix in matplotlib 3.6
# and the old 'seaborn' alias was removed afterwards; use whichever name this install has.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

In [None]:
# Load the bestsellers CSV from a path relative to the notebook's working directory,
# then show (rows, columns) as a quick sanity check.
df = pd.read_csv('../input/amazon-top-50-bestselling-books-2009-2019/bestsellers with categories.csv')
df.shape

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.head()

In [None]:
# Share of rows per genre.
# The slice labels come from the value_counts() index itself, so every label is
# guaranteed to sit on its own count. (Passing genre.unique() as labels pairs
# first-appearance order with frequency order, which only matches by coincidence.)
genre = df['Genre']
genre.value_counts().plot(kind='pie', autopct='%.2f')
plt.show()

In [None]:
# Distribution of user ratings, one polygon per genre.
sns.histplot(data=df, x='User Rating', hue='Genre', element='poly')
plt.show()

In [None]:
# User rating of every row plotted against its year, coloured by genre.
plt.figure(figsize=(15, 5))
sns.scatterplot(data=df, x='Year', y='User Rating', hue='Genre')
years = [*range(2009, 2020)]
plt.xticks(ticks=years)
plt.title('User Ratings (2009-2019)')
plt.show()

# **Top 10 Most Reviewed Books**

In [None]:
# Ten rows with the highest review counts (the frame still contains repeated titles here).
top10_alltime = df.loc[:, ['Name', 'Author', 'Reviews', 'User Rating']]
top10_alltime.sort_values(by='Reviews', ascending=False).head(10)

## **As we can see, the dataset contains duplicate values.**
## **Let's remove them.**

In [None]:
# Keep one row per book title; when a title occurs on several rows only the first
# occurrence is retained (keep='first' is the pandas default, spelled out here).
df.drop_duplicates(subset=['Name'], keep='first', inplace=True)
df.shape

In [None]:
# Ten most-reviewed books, now computed on the de-duplicated frame.
top10_alltime = df.loc[:, ['Name', 'Author', 'Reviews', 'User Rating']]
top10_alltime.sort_values(by='Reviews', ascending=False).head(10)

In [None]:
# Distinct values of the Year column in ascending order; the per-year cells below
# iterate over this list.
years = sorted(df['Year'].unique())
years

# **Number of Books in each year**

In [None]:
# Group the rows by Year once; later cells fetch a single year's rows with get_group().
year_group = df.groupby('Year')

In [None]:
# For every year: the Fiction / Non Fiction counts, and the total number of rows.
# Both lists are ordered like `years`.
genre = [year_group.get_group(yr)['Genre'].value_counts() for yr in years]
num_of_books = [counts.sum() for counts in genre]

In [None]:
# Bar chart of how many rows each year contributes.
plt.figure(figsize=(7, 7))
sns.barplot(x=years, y=num_of_books)
plt.ylabel('Number of Books')
plt.xlabel('Year')
plt.show()

# **Genre Ratio of each year**

In [None]:
def genre_ratio_year_wise():
    """Draw one pie chart per year showing that year's genre split.

    Reads two notebook-level lists that must be ordered identically:
    ``years`` (the year labels) and ``genre`` (one ``value_counts`` Series per
    year). Each chart is titled with the year and captioned with the total
    number of books counted for it. Returns nothing; the charts are shown as a
    side effect.
    """
    # zip walks both lists in lock-step, replacing the hand-maintained counter.
    for year, counts in zip(years, genre):
        plt.title(f'{year} books.')
        plt.pie(counts, autopct='%.2f', labels=counts.index)
        # Separator added so the caption no longer reads e.g. "Total Books30".
        plt.xlabel(f'Total Books: {counts.sum()}')
        plt.show()

        print('\n')


In [None]:
genre_ratio_year_wise()

# **Top 5 Most Reviewed Books of each year**

In [None]:
def top_5_books_year_wise():
    """Return a dict mapping each year to its five most-reviewed books.

    Uses the notebook-level ``years`` list and ``year_group`` groupby object.
    Each value is a DataFrame restricted to the descriptive columns and sorted
    by 'Reviews' in descending order.
    """
    wanted = ['Name', 'Author', 'Reviews','User Rating','Year','Genre']
    return {
        yr: year_group.get_group(yr)[wanted].sort_values('Reviews', ascending=False)[:5]
        for yr in years
    }

In [None]:
top5 = top_5_books_year_wise()

# **Top 5 books in 2009**

In [None]:
top5[2009]

# **Top 5 books in 2015**

In [None]:
top5[2015]

# **Top 5 books in 2019**

In [None]:
top5[2019]

**Similarly, you can check the other years as well.**

# **Book Recommendation**

In [None]:
df.columns

### **Features used for Recommending a book**

In [None]:
# Preview of the columns left after dropping 'Name' and 'Year'. The result is not
# assigned, so `df` itself is unchanged.
# NOTE(review): 'Year' therefore stays in `df` and is part of what the model is
# fitted on further down - confirm whether it is meant to be a feature.
df.drop(['Name','Year'], axis=1).columns

In [None]:
# Total number of rows belonging to authors that appear at least 5 times.
sum(df['Author'].value_counts()[df['Author'].value_counts() >= 5])

In [None]:
# Total number of rows belonging to authors that appear fewer than 5 times.
sum(df['Author'].value_counts()[df['Author'].value_counts() <5])

# **Grouping authors who wrote fewer than 5 books under "Others"**

In [None]:
# Pool every author with fewer than 5 rows into a single 'Others' label.
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# df['Author'] selection is chained in-place mutation, which pandas warns about and
# which leaves `df` untouched once Copy-on-Write is active.
author_counts = df['Author'].value_counts()
others = author_counts[author_counts < 5].index
df['Author'] = df['Author'].where(~df['Author'].isin(others), 'Others')
df['Author'].value_counts()

In [None]:
# One indicator column per remaining author label; show five random rows as a check.
author = pd.get_dummies(df['Author'])
author.sample(n=5)

In [None]:
author.shape

In [None]:
# The text column is no longer needed once it has been one-hot encoded into `author`.
df.drop(columns=['Author'], inplace=True)

In [None]:
# Append the author indicator columns side by side (axis=1); rows are matched on the index.
df = pd.concat([df,author],axis=1)

In [None]:
df.head()

In [None]:
# Encode Genre numerically: Non Fiction -> 0, Fiction -> 1.
# The encoded column is assigned back rather than mutated through
# df['Genre'].replace(..., inplace=True), which is chained in-place mutation and does
# not update `df` under pandas Copy-on-Write. Values that are already encoded pass
# through unchanged, so re-running the cell is harmless; the explicit cast avoids
# relying on replace()'s deprecated silent downcast from object to int.
genre_codes = {'Non Fiction': 0, 'Fiction': 1}
df['Genre'] = df['Genre'].map(lambda g: genre_codes.get(g, g)).astype('int64')
df['Genre'].unique()

In [None]:
# Separate one-column frame of titles, used to look a book up by row number.
book_names = df[['Name']].copy()
book_names.shape

In [None]:
# Renumber the rows 0..n-1 so a title can be fetched by its row position. The length
# comes from the frame itself instead of a hard-coded row count, so the cell keeps
# working if the dataset or the de-duplication step changes.
book_names = book_names.reset_index(drop=True)

In [None]:
# Use the title as the row label so a book's features can be fetched with df.loc[title].
# Guarded so that re-running the cell (when 'Name' is already the index) does not raise.
if 'Name' in df.columns:
    df = df.set_index('Name')
df.head()

# **Feature Scaling**

In [None]:
#cols to scale
cols = df.columns[:4]
cols

In [None]:
# Fit a min-max scaler on the selected columns and keep the rescaled values as an
# array whose column order matches `cols`.
clf = MinMaxScaler()
scalled = clf.fit_transform(df[cols])

In [None]:
# Write each scaled column back into the frame; column i of `scalled` belongs to cols[i].
for i, col in enumerate(cols):
    df[col] = scalled[:, i]

In [None]:
df.head()

In [None]:
def getRecommendation_books_for(book_name,no_of_books=6,get_similarity_rate=False):
    """Recommend the books whose feature rows lie closest to the given book.

    Neighbours are found with a Manhattan-distance nearest-neighbour search over
    the notebook-level feature frame ``df`` (indexed by book title).

    Parameters
    ----------
    book_name : one-element Series (e.g. ``book_names.loc[339]``) or str
        The book to find recommendations for. A sequence is reduced to its
        first element; a plain string is used as the title directly.
    no_of_books : int, default 6
        Maximum number of recommendations to return.
    get_similarity_rate : bool, default False
        When True, also report the cosine similarity of every recommendation.

    Returns
    -------
    list of str
        Recommended titles, nearest first, when ``get_similarity_rate`` is False.
    pandas.DataFrame
        Columns 'Recommended Books' and 'Similarity', most similar first, when
        ``get_similarity_rate`` is True.

    Raises
    ------
    KeyError
        If the title is not present in the index of ``df``.
    """
    # .iloc is used because plain [0] on a label-indexed Series is deprecated
    # positional access.
    title = book_name if isinstance(book_name, str) else pd.Series(book_name).iloc[0]

    # One extra neighbour is requested because the query book comes back as its own
    # match; never ask for more neighbours than there are rows.
    kn = NearestNeighbors(n_neighbors=min(no_of_books + 1, len(df)), metric='manhattan')
    kn.fit(df)
    print(f'Similar Books for "{title}":')

    # kneighbors returns row positions within `df`, whose index holds the titles, so
    # the names are read straight from the index. The query is removed by name rather
    # than by assuming it is always the first hit.
    query = df.loc[[title]]
    _, indices = kn.kneighbors(query)
    nearest_books = [name for name in df.index[indices.flatten()] if name != title][:no_of_books]

    if not get_similarity_rate:
        return nearest_books

    # One vectorised call gives the similarity of the query to every recommendation.
    if nearest_books:
        sim_rates = cosine_similarity(query, df.loc[nearest_books])[0]
    else:
        sim_rates = []
    recommended_books = pd.DataFrame({'Recommended Books':nearest_books,'Similarity':sim_rates})
    # sort_values returns a new frame, so its result must be the value returned
    # (the sort was previously computed and thrown away). Most similar comes first.
    return recommended_books.sort_values(by='Similarity', ascending=False).reset_index(drop=True)

# **Predicting Recommended Books**

In [None]:
getRecommendation_books_for(book_names.loc[339],5,True)

In [None]:
getRecommendation_books_for(book_names.loc[34],5,True)

In [None]:
getRecommendation_books_for(book_names.loc[10],5)

In [None]:
getRecommendation_books_for(book_names.loc[199],5,True)