# 1. About Dataset

![](https://images.unsplash.com/photo-1507842217343-583bb7270b66?ixlib=rb-1.2.1&w=1000&q=80)

**The dataset contains information about book titles, authors, publishers, users and their ratings. The goal is to find out which book you should read next (there were very few free recommendation systems that suggest books, last I checked), look up the details of the books you have read, and create word clouds from the books you want to read — in short, to explore the dataset in every useful way.**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

**1.1 import Libraries**

In [None]:
# Interactive plotting: plotly in notebook mode, plus cufflinks (used further down
# through `.iplot()` on a pandas object).
import plotly.graph_objs as go
from plotly.offline import  init_notebook_mode, iplot
init_notebook_mode(connected=True)
import cufflinks as cf
# NOTE(review): go_offline() is immediately followed by set_config_file(offline=False, ...);
# the two settings look contradictory — confirm which mode is intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
# Static plotting and word clouds.
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS
# NOTE(review): `plotly_express` is the standalone package; presumably `plotly.express`
# is the maintained equivalent — verify before changing.
import plotly_express as px
%matplotlib inline

In [None]:
import re
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

**1.2 Read CSV Data**

In [None]:
# goodreads data

books_csv_path = '/kaggle/input/goodbooks-10k/books.csv'
try:
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines='skip'` is the replacement for skipping malformed rows.
    books_data = pd.read_csv(books_csv_path, on_bad_lines='skip')
except TypeError:
    # Older pandas does not accept `on_bad_lines`; fall back to the legacy flag.
    books_data = pd.read_csv(books_csv_path, error_bad_lines=False)
tags_data = pd.read_csv('/kaggle/input/goodbooks-10k/book_tags.csv')
ratings_data = pd.read_csv('/kaggle/input/goodbooks-10k/ratings.csv')
book_tags = pd.read_csv('/kaggle/input/goodbooks-10k/tags.csv')

# book crossing data
# All three BX files are ';'-separated and latin-1 encoded; their own header row is
# skipped (skiprows=1) and replaced by the column names given below.
bx_read_options = dict(sep=';', encoding='latin-1', low_memory=False, skiprows=1)

user_cols = ['user_id', 'location', 'age']
cross_users_data = pd.read_csv('../input/bookcrossing-dataset/Book reviews/BX-Users.csv',
                               names=user_cols, **bx_read_options)
book_cols = ['isbn', 'book_title' ,'book_author','year_of_publication', 'publisher', 'img_s', 'img_m', 'img_l']
cross_books_data = pd.read_csv('../input/bookcrossing-dataset/Book reviews/BX-Books.csv',
                               names=book_cols, **bx_read_options)
rating_cols = ['user_id', 'isbn', 'rating']
cross_ratings_data = pd.read_csv('../input/bookcrossing-dataset/Book reviews/BX-Book-Ratings.csv',
                                 names=rating_cols, **bx_read_options)

In [None]:
# Preview the first rows of the Goodreads books table loaded from books.csv.
books_data.head()

In [None]:
# Preview the first rows of the Book-Crossing books table loaded from BX-Books.csv.
cross_books_data.head()

In [None]:
# Drop identifier, per-star count and image columns that later cells do not use.
# errors='ignore' keeps this cell safe to re-run once the columns are already gone
# (without it a second run raises KeyError).
unused_book_columns = ['id', 'best_book_id', 'work_id', 'isbn', 'isbn13', 'title','work_ratings_count',
                       'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
                       'image_url','small_image_url']
books_data = books_data.drop(columns=unused_book_columns, errors='ignore')

**1.3 Drop unnecessary data**

In [None]:
# Remove Goodreads rows that contain any missing value.
books_data = books_data.dropna()
# Remove the three cover-image URL columns from the Book-Crossing books table.
# errors='ignore' makes the cell re-runnable after the columns have been dropped.
cross_books_data = cross_books_data.drop(columns=['img_s', 'img_m', 'img_l'], errors='ignore')

**1.4 Drop Duplicates from all the dataset**

In [None]:
# De-duplicate every table on its natural key.
# NOTE(review): keep=False discards *every* row of a duplicated group rather than
# keeping one representative — confirm this is intended.
ratings_data = (
    ratings_data
    .sort_values("user_id")
    .drop_duplicates(subset=["user_id", "book_id"], keep=False)
)
books_data = books_data.drop_duplicates(subset='original_title', keep=False)
book_tags = book_tags.drop_duplicates(subset='tag_id', keep=False)
tags_data = tags_data.drop_duplicates(subset=['tag_id', 'goodreads_book_id'], keep=False)
cross_ratings_data = cross_ratings_data.drop_duplicates(subset=["user_id", "isbn"], keep=False)
cross_books_data = cross_books_data.drop_duplicates(subset='book_title', keep=False)

**1.5 clean the text**

In [None]:
def clean_text(text):
    """Normalise a piece of text for matching and display.

    The value is converted with ``str()`` and lower-cased, then, in this order:
    text inside square brackets, URLs (``http(s)://...`` or ``www....``),
    angle-bracket tags, punctuation, newlines, and any word containing a digit
    are removed. Whitespace around removed pieces is left untouched, so the
    result may contain doubled or trailing spaces.

    Parameters
    ----------
    text : object
        Value to clean; non-strings (e.g. ``None`` or NaN) are stringified first.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings are used for every pattern: sequences such as "\[" or "\S" in a
    # normal string literal are invalid escapes and warn on recent Python versions.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Normalise every Book-Crossing title with clean_text.
cross_books_data['book_title'] = cross_books_data['book_title'].apply(clean_text)

In [None]:
# Attach each rating to its book (inner join on ISBN) and order the rows by ISBN.
merge_data = (
    cross_books_data
    .merge(cross_ratings_data, on='isbn')
    .sort_values('isbn', ascending=True)
)
merge_data.head()

# 2. Data Visualisation

**2.1 Good Reads Visualisation**

2.1.1 Top Rated

In [None]:
# Rank books by average rating (highest first) and chart the best 25.
top_rated = books_data.sort_values(by='average_rating', ascending=False)
tf_top_rated = top_rated.iloc[:25]
fig = px.bar(
    tf_top_rated,
    x="average_rating",
    y="original_title",
    title='Top Rated Books and Their Ratings',
    orientation='h',
    color='original_title',
    width=1500,
    height=700,
)
fig.show()

In [None]:
# Same 25 books as a treemap, tile area proportional to the average rating.
fig = px.treemap(
    tf_top_rated,
    path=['original_title'],
    values='average_rating',
    title='Top Rated Books',
    width=1000,
    height=700,
)
fig.show()

2.1.2 Popular Book

In [None]:
# Rank books by number of ratings (highest first) and chart the top 50.
top_popular = books_data.sort_values(by='ratings_count', ascending=False)
fifty_top_popular = top_popular.iloc[:50]
fig = px.bar(
    fifty_top_popular,
    x="ratings_count",
    y="original_title",
    title='Top Popular Books',
    orientation='h',
    color='original_title',
    width=1500,
    height=700,
)
fig.show()

In [None]:
# Treemap of the 50 most-rated books, tile area proportional to the ratings count.
fig = px.treemap(
    fifty_top_popular,
    path=['original_title'],
    values='ratings_count',
    title='Popular Books',
    width=1000,
    height=700,
)
fig.show()

2.1.3 Top Popular Authors

In [None]:
# Authors of the 50 best-rated books (taken from `top_rated`, i.e. ordered by
# average rating), shown as a treemap sized by average rating.
# NOTE(review): the chart is titled 'Popular Authors' but the ranking is by rating,
# not by ratings count — confirm the intended metric.
fifty_top_authors = top_rated.iloc[:50]
fig = px.treemap(
    fifty_top_authors,
    path=['authors'],
    values='average_rating',
    title='Popular Authors',
    width=1000,
    height=700,
)
fig.show()

2.1.4 Top author(frequencies of books)

In [None]:
# Count books per author string and chart the 50 most frequent.
# The frame is sorted ascending so that .tail(50) holds the largest counts.
top_author_counts = (
    books_data['authors']
    .value_counts()
    .rename_axis('value')
    .reset_index(name='count')
    .sort_values('count')
)
fig = px.bar(top_author_counts.tail(50), x="count", y="value", title='Top Authors', orientation='h', color='value',
             width=1000, height=700)
fig.show()

**2.2 Cross Book Visualisation**

2.2.1 Top Years of Publishing

In [None]:
# Count rated books per publication year and chart the 50 most frequent years.
cross_typ = (
    merge_data['year_of_publication']
    .value_counts()
    .rename_axis('value')
    .reset_index(name='count')
)
# Cast to str before appending the suffix: concatenating ' year' onto a numeric
# (or mixed) year value would raise TypeError.
cross_typ['value'] = cross_typ['value'].astype(str) + ' year'
# Ascending sort so that .tail(50) holds the largest counts.
cross_typ = cross_typ.sort_values('count')
fig = px.bar(cross_typ.tail(50), x="count", y="value", title='Top Years of Publishing', orientation='h', color='value',
             width=1000, height=700)
fig.show()

2.2.2 top authors(frequency of book)

In [None]:
# Count rating rows per author in the merged Book-Crossing data and chart the top 50.
# The frame is sorted ascending so that .tail(50) holds the largest counts.
cross_author_counts = (
    merge_data['book_author']
    .value_counts()
    .rename_axis('value')
    .reset_index(name='count')
    .sort_values('count')
)
fig = px.bar(cross_author_counts.tail(50), x="count", y="value", title='Top Authors', orientation='h', color='value',
             width=1000, height=700)
fig.show()

2.2.3 Top Books

In [None]:
# Count rating rows per (cleaned) title in the merged Book-Crossing data and chart the top 20.
# The frame is sorted ascending so that .tail(20) holds the largest counts.
top_book_counts = (
    merge_data['book_title']
    .value_counts()
    .rename_axis('value')
    .reset_index(name='count')
    .sort_values('count')
)
fig = px.bar(top_book_counts.tail(20), x="count", y="value", title='Top Books', orientation='h', color='value',
             width=1000, height=700)
fig.show()

2.2.4 Let's see Rating Distribution

In [None]:
# Bar chart of how often each rating value occurs in the merged Book-Crossing data.
rating_counts = merge_data['rating'].value_counts()
rating_counts.iplot(
    kind='bar',
    xTitle='Rating',
    yTitle='Counts',
    title='Rating Distribution',
    color='blue',
)

**2.3 Wordclouds**

In [None]:
# Stop words that the word clouds should ignore.
stop_words = set(STOPWORDS)

# One long space-separated string per column; each feeds one word cloud.
# Goodreads columns are joined directly; Book-Crossing columns are cast to str first.
author_string = " ".join(books_data['authors'])
title_string = " ".join(books_data['original_title'])

cross_author_string = " ".join(merge_data['book_author'].astype(str))
cross_title_string = " ".join(merge_data['book_title'].astype(str))
cross_publisher_string = " ".join(merge_data['publisher'].astype(str))

In [None]:
def wordcloud(string):
    """Draw a word cloud of ``string`` on a new 16x8 matplotlib figure.

    Words in the module-level ``stop_words`` set are excluded. ``random_state``
    is fixed at 21, so the layout is repeatable for the same input.

    Parameters
    ----------
    string : str
        Space-separated text to visualise. Note that inside this function the
        name shadows the standard-library ``string`` module.

    Returns
    -------
    None
        The image is drawn on the current matplotlib figure.
    """
    wc = WordCloud(width=800, height=500, mask=None, random_state=21,
                   max_font_size=110, stopwords=stop_words).generate(string)
    # The figure handle is not needed afterwards, so it is not bound to a name.
    plt.figure(figsize=(16, 8))
    plt.axis('off')
    plt.imshow(wc)

In [None]:
# Word cloud of Goodreads author names.
wordcloud(author_string)

In [None]:
# Word cloud of Goodreads original titles.
wordcloud(title_string)

In [None]:
# Word cloud of Book-Crossing author names (one occurrence per rating row).
wordcloud(cross_author_string)

In [None]:
# Word cloud of cleaned Book-Crossing titles (one occurrence per rating row).
wordcloud(cross_title_string)

In [None]:
# Word cloud of Book-Crossing publisher names (one occurrence per rating row).
wordcloud(cross_publisher_string)

# 3. Content-based recommendation

**This method uses attributes of the content to recommend similar content. It doesn’t have a cold-start problem because it works through attributes or tags of the content, such as book title, authors or rating, so that new book can be recommended right away.**

In [None]:
# Keep only the three attributes used for content-based matching, all as strings
# so that they can be concatenated into one text field.
content_data = books_data[['original_title', 'authors', 'average_rating']].astype(str)

In [None]:
# Single text field per book: "<title> <authors> <average rating>".
content_data['content'] = (
    content_data['original_title']
    + ' ' + content_data['authors']
    + ' ' + content_data['average_rating']
)

In [None]:
# Renumber rows 0..n-1 so that row positions line up with the similarity matrices.
# drop=True avoids adding the old index as a stray 'index' column and keeps the
# cell re-runnable (a plain reset_index() fails once the inserted columns exist).
content_data = content_data.reset_index(drop=True)
# Lookup table: original title -> row position in content_data.
indices = pd.Series(content_data.index, index=content_data['original_title'])

**3.1 Content-based recommendation by author**

In [None]:
#removing stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(content_data['authors'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

*The advantage of TF-IDF encoding is that it will weigh a term (a tag for a book in our example) according to the importance of the term within the document: The more frequently the term appears, the larger its weight will be. At the same time, it weighs the item inversely to the frequency of this term across the entire dataset: It will emphasise terms that are relatively rare occurrences in the general dataset but of importance to the specific content at hand. That means that words such as ‘is’, ‘are’, ‘by’ or ‘a’ which are likely to show up in every book content but aren’t useful for our user-recommendation, will be weighed less than words that are more unique to the content that we are recommending.*

**Compute the cosine similarity matrix**

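*We are going to use a simple similarity-based method called cosine similarity. Because TF-IDF vectors are L2-normalised by default, their plain dot product (`linear_kernel`) is already the cosine similarity, so that is what the next cell computes.*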

In [None]:
# Pairwise dot products between all TF-IDF author rows (book x book matrix).
# Presumably equal to cosine similarity because TfidfVectorizer normalises rows
# by default — TODO confirm if a non-default `norm` is ever passed.
cosine_sim_author = linear_kernel(tfidf_matrix, tfidf_matrix)

**Author-wise recommendation**

In [None]:
def get_recommendations_books(title, cosine_sim=cosine_sim_author):
    """Return the titles of the 10 books most similar to ``title``.

    Parameters
    ----------
    title : str
        A value of ``content_data['original_title']``; it is looked up in the
        module-level ``indices`` series to find the book's row position.
    cosine_sim : array-like, optional
        Book-by-book similarity matrix whose rows and columns follow the row
        order of ``content_data``. Defaults to the author-based matrix.

    Returns
    -------
    list of str
        Up to 10 titles, most similar first. The queried book is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]

    # Pair every other book's row position with its similarity to the query book.
    # The query row is excluded explicitly: dropping "the first sorted entry"
    # is wrong whenever another book ties with the top score (e.g. same author),
    # because the stable sort may then place that book, not the query, first.
    # The caller-supplied matrix is used here (previously the argument was ignored).
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]

    # Highest similarity first; ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the 10 best matches.
    book_indices = [pos for pos, _ in sim_scores[:10]]

    return list(content_data['original_title'].iloc[book_indices])

In [None]:
def author_book_shows(book):
    """Print every item of the iterable ``book`` on its own line."""
    for title in book:
        print(title)

In [None]:
# Author-based recommendations for 'The Hobbit'.
books1 = get_recommendations_books('The Hobbit', cosine_sim_author)
author_book_shows(books1)

In [None]:
# Author-based recommendations for 'Shadow Kiss'.
books2 =get_recommendations_books('Shadow Kiss', cosine_sim_author)
author_book_shows(books2)

In [None]:
# Author-based recommendations for 'Harry Potter and the Goblet of Fire'.
books3 = get_recommendations_books('Harry Potter and the Goblet of Fire', cosine_sim_author)
author_book_shows(books3)

**3.2 Content-based filtering on multiple features**

In [None]:
# Bag-of-words counts over the combined "title authors rating" text,
# ignoring English stop words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(content_data['content'])

# Book-by-book cosine similarity of the count vectors.
cosine_sim_content = cosine_similarity(count_matrix, count_matrix)

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim_content):
    """Return the titles of the 10 books most similar to ``title``.

    Parameters
    ----------
    title : str
        A value of ``content_data['original_title']``; it is looked up in the
        module-level ``indices`` series to find the book's row position.
    cosine_sim : array-like, optional
        Book-by-book similarity matrix whose rows and columns follow the row
        order of ``content_data``. Defaults to the combined-content matrix.

    Returns
    -------
    list of str
        Up to 10 titles, most similar first. The queried book is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]

    # Pair every other book's row position with its similarity to the query book.
    # The query row is excluded explicitly instead of dropping the first sorted
    # entry, which would be wrong if another book ties with the top score.
    # The caller-supplied matrix is used here (previously the argument was ignored).
    sim_scores = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]

    # Highest similarity first; ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the 10 best matches.
    book_indices = [pos for pos, _ in sim_scores[:10]]

    return list(content_data['original_title'].iloc[book_indices])

In [None]:
def book_shows(book):
    """Print every item of the iterable ``book`` on its own line."""
    for title in book:
        print(title)

In [None]:
# Combined-content recommendations for 'The Hobbit'.
books4 = get_recommendations('The Hobbit', cosine_sim_content)
book_shows(books4)

In [None]:
# Combined-content recommendations for 'Shadow Kiss'.
books5 =get_recommendations('Shadow Kiss', cosine_sim_content)
book_shows(books5)

In [None]:
# Combined-content recommendations for 'The Two Towers'.
books6 =get_recommendations('The Two Towers', cosine_sim_content)
book_shows(books6)

In [None]:
# Combined-content recommendations for 'Harry Potter and the Goblet of Fire'.
books7 = get_recommendations('Harry Potter and the Goblet of Fire', cosine_sim_content)
book_shows(books7)

# 4. Collaborative recommendation

**In collaborative filtering, items (here, books) are recommended based on how similar your user profile is to other users’ profiles: the method finds the users who are most similar to you and then recommends items that they have shown a preference for. This method suffers from the so-called cold-start problem: if a book is new, no one has rated or read it yet, so it will not appear in your list of recommended books, even if you would love it.**

In [None]:
# Keep only the first 40000 rows (by position) for the collaborative-filtering section.
merge_data = merge_data.iloc[:40000]

**pivot table**

In [None]:
# User x book matrix of ratings: one row per user_id, one column per book_title,
# cells aggregated with pivot_table's default and missing combinations filled with 0.
book_rating = merge_data.pivot_table(
    index='user_id',
    columns='book_title',
    values='rating',
    fill_value=0,
)
book_rating

**Find the correlation between books**

In [None]:
# Correlation coefficients between books: the pivot is transposed so that each
# book becomes one row (variable) and each user one observation.
# NOTE(review): books whose ratings are constant across users presumably produce
# NaN entries here — verify before relying on the scores.
book_corr = np.corrcoef(book_rating.T)

In [None]:
# Dimensions of the book-by-book correlation matrix.
book_corr.shape

In [None]:
# Column labels of the pivot (the book titles), in column order; position i in
# book_titles corresponds to row/column i of book_corr.
book_list = list(book_rating)
# Independent copy, built directly instead of appending item by item in a loop.
book_titles = list(book_list)

**Define Recommendation function**

In [None]:
def get_recommendation_collabarative(books_list):
    """Rank every book by its summed correlation with the books in ``books_list``.

    Parameters
    ----------
    books_list : iterable of str
        Titles the user likes; each must be an entry of the module-level
        ``book_titles`` list.

    Returns
    -------
    list of (str, float)
        One ``(title, score)`` pair per entry of ``book_titles``, highest score
        first. Titles from ``books_list`` are included in the result.

    Raises
    ------
    ValueError
        If a title in ``books_list`` is not in ``book_titles``.
    """
    similar_books = np.zeros(book_corr.shape[0])

    for book in books_list:
        book_index = book_titles.index(book)
        # Undefined correlations (NaN) are treated as 0: a single NaN would
        # otherwise make the summed score NaN and the final sort unreliable.
        similar_books += np.nan_to_num(book_corr[book_index])

    # Pair each title with its accumulated score and order by score, descending.
    book_preferences = list(zip(book_titles, similar_books))
    return sorted(book_preferences, key=lambda pair: pair[1], reverse=True)

**make a book list**

In [None]:
# Seed titles for the collaborative recommender. They are lowercase and without
# punctuation, presumably to match the clean_text-normalised `book_title` values —
# verify that each one is present in `book_titles`.
list_of_books = ['one hundred years of solitude',
                 'stardust',
                 'mogs christmas',
                 'dragonmede',
                 'twopence to cross the mersey',
                 'the candywine development']

In [None]:
# (title, score) pairs for every book, best match first.
books8 = get_recommendation_collabarative(list_of_books)

**Top similar books (collaborative)**

In [None]:
# Print the 9 best-ranked titles that are not already in the user's own list.
# Iterating over books8 directly (rather than indexing with a counter) means the
# loop simply stops if fewer than 9 unseen titles exist, instead of raising IndexError.
n = 0
for similar_books_to_read, _score in books8:
    if similar_books_to_read in list_of_books:
        continue
    print(similar_books_to_read)
    n += 1
    if n == 9:
        break

# If you found this kernel helpful, please upvote it.