<a href="https://colab.research.google.com/github/sarthak-314/Book-Recommender-System/blob/master/TF-IDF%20Based%20Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **TF-IDF FILTERING**
***

In [None]:
import pandas as pd
import os

%cd /content/drive/My\ Drive

/content/drive/My Drive


In [None]:
# Load the goodreads-10k CSV files into dataframes.
# DF_PATH is relative to the working directory selected by the `%cd` above.
DF_PATH = './Colab Notebooks/CADABRA/Recommender System/goodreads-10k'


def _load_csv(name):
    """Read `<name>.csv` from DF_PATH into a DataFrame."""
    return pd.read_csv(os.path.join(DF_PATH, name + '.csv'))


books = _load_csv('books')
book_tags = _load_csv('book_tags')
ratings = _load_csv('ratings')
tags = _load_csv('tags')
to_read = _load_csv('to_read')

***
## **Common Tags Recommender**: Recommend books with similar tags
Recommend books with similar tags where the tags are weighted by IDF.

In [None]:
# Corpus-level totals derived from the book/tag table.
# Number of distinct books that carry at least one tag row
TOTAL_BOOKS = book_tags['goodreads_book_id'].nunique()
# Number of distinct tags that occur in the table
TOTAL_TAGS = book_tags['tag_id'].nunique()
# Grand total of `count` over all tags (per-tag totals, then summed);
# used as the numerator of the IDF weight
TOTAL_TAG_COUNTS_SUM = book_tags.groupby('tag_id')['count'].sum().sum()

In [None]:
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [None]:
book_tags.tag_id.value_counts().describe()

count    34252.000000
mean        29.192806
std        277.254715
min          1.000000
25%          1.000000
50%          1.000000
75%          5.000000
max       9983.000000
Name: tag_id, dtype: float64

At least half of the tags appear on only one book (the median number of books per tag is 1), so we have to be extra careful with noise from rare tags.

In [None]:
import numpy as np
# Total `count` of each tag, summed over every book it is attached to
tag_counts = book_tags.groupby('tag_id')['count'].sum()
# Added to a tag's total before taking the ratio, so that very rare tags
# (most tags occur on a single book) do not receive an extreme weight
NOISE_REDUCE_TERM = 25


def get_idf(tag):
    """Return the smoothed IDF weight of a tag.

    Parameters
    ----------
    tag : int
        A tag_id present in the index of ``tag_counts``.

    Returns
    -------
    float
        ``log(TOTAL_TAG_COUNTS_SUM / (tag_counts[tag] + NOISE_REDUCE_TERM))``
        (natural logarithm).

    Raises
    ------
    KeyError
        If ``tag`` is not in ``tag_counts``.
    """
    tag_count = tag_counts[tag]
    return np.log(TOTAL_TAG_COUNTS_SUM / (tag_count + NOISE_REDUCE_TERM))


# The weight depends only on the tag, so compute it once per distinct tag and
# map it onto the rows instead of calling get_idf for every book/tag row.
idf_by_tag = pd.Series({tag: get_idf(tag) for tag in tag_counts.index}, dtype=float)
book_tags['idf'] = book_tags.tag_id.map(idf_by_tag)

In [None]:
book_tags.head(5)

Unnamed: 0,goodreads_book_id,tag_id,count,idf
0,1,30574,167697,0.394859
1,1,11305,37174,4.075187
2,1,11557,34173,3.836834
3,1,8717,12986,3.325655
4,1,33114,12716,4.727339


In [None]:
# Attach the tag names from `tags` to every row of book_tags.
# No join key is given, so pandas joins on the columns both frames share.
book_tags = pd.merge(book_tags, tags)
book_tags.head(3)

Unnamed: 0,goodreads_book_id,tag_id,count,idf,tag_name
0,1,30574,167697,0.394859,to-read
1,2,30574,24549,0.394859,to-read
2,3,30574,496107,0.394859,to-read


In [None]:
def get_tags_by_title(title):
    """Return the tag rows of the book with the given title.

    Parameters
    ----------
    title : str
        Exact value to look up in the ``title`` column of ``books``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``book_tags`` belonging to that book, without the
        ``goodreads_book_id`` column, sorted by ``count`` in descending
        order and re-indexed from 0.

    Raises
    ------
    KeyError
        If no row of ``books`` has this title.
    """
    matching_ids = books.loc[books.title == title, 'goodreads_book_id']
    if matching_ids.empty:
        raise KeyError('No book with title {!r}'.format(title))
    # Pick the first match explicitly: calling int() on a whole Series is
    # deprecated in pandas and fails outright when several books share a title.
    goodreads_book_id = int(matching_ids.iloc[0])
    book_rows = book_tags.loc[book_tags.goodreads_book_id == goodreads_book_id]
    return (
        book_rows
        .drop('goodreads_book_id', axis=1)
        .sort_values(by='count', ascending=False)
        .reset_index(drop=True)
    )

In [None]:
get_tags_by_title('The Hunger Games (The Hunger Games, #1)').head()

Unnamed: 0,tag_id,count,idf,tag_name
0,11557,50755,3.836834,favorites
1,8717,35418,3.325655,currently-reading
2,33114,25968,4.727339,young-adult
3,11743,13819,4.036309,fiction
4,10064,12985,6.609202,dystopian


In [None]:
def compare_books(tags_A, tags_B):
    """Return a tag-based similarity score for two books.

    The score is the weighted mass of the tags the two books share divided by
    the weighted mass of all their tags:

        sum over shared tags of (count_A + count_B) * idf
        -------------------------------------------------
        sum(count_A * idf_A) + sum(count_B * idf_B)

    Shared tags are found by joining on ``tag_name`` and are weighted with the
    ``idf`` value taken from ``tags_A``.

    Parameters
    ----------
    tags_A, tags_B : pandas.DataFrame
        Tag tables with ``tag_name``, ``count`` and ``idf`` columns.

    Returns
    -------
    float or int
        The similarity score; 0 when the books share no tag, when the
        denominator is zero, or when the required columns are missing or
        not numeric.
    """
    merged = tags_A.merge(tags_B, on='tag_name')
    if merged.empty:
        return 0
    try:
        common_tf = ((merged['count_x'] + merged['count_y']) * merged['idf_x']).sum()
        total_tf = (tags_A['count'] * tags_A['idf']).sum() + (tags_B['count'] * tags_B['idf']).sum()
    except (KeyError, TypeError):
        # Best effort for malformed tag tables. Deliberately not a bare
        # `except`, which would also swallow KeyboardInterrupt and make a long
        # scan over all books impossible to stop.
        return 0
    if total_tf == 0:
        # Avoid returning NaN/inf (or dividing by zero) for all-zero weights
        return 0
    return common_tf / total_tf

In [None]:
# Sanity check: a sequel should score far higher than an unrelated non-fiction book
book1 = 'The Hunger Games (The Hunger Games, #1)'
book2 = 'Catching Fire (The Hunger Games, #2)'
book3 = 'Thinking, Fast and Slow'

x = get_tags_by_title(book1)
y = get_tags_by_title(book2)
z = get_tags_by_title(book3)

# Titles and score are wrapped in ANSI escape codes so they print in green
for other_title, other_tags in ((book2, y), (book3, z)):
    print('Similarity between \x1b[32m{}\x1b[0m and \x1b[32m{}\x1b[0m is : \x1b[32m{:.4f}\x1b[0m'.format(book1, other_title, compare_books(x, other_tags)))

Similarity between [32mThe Hunger Games (The Hunger Games, #1)[0m and [32mCatching Fire (The Hunger Games, #2)[0m is : [32m0.8831[0m
Similarity between [32mThe Hunger Games (The Hunger Games, #1)[0m and [32mThinking, Fast and Slow[0m is : [32m0.3065[0m


The similarity between two books is a value between 0 and 1, where 1 means they are the same book sharing all the tags and 0 means they are completely different with no common tags.

In [None]:
def get_n_similar_books(title, n):
    """Return the titles of the books whose tags are most similar to a book.

    Every other distinct title in ``books`` is scored against the query book
    with ``compare_books`` and the best-scoring titles are returned.

    Parameters
    ----------
    title : str
        Title of the query book; it is never part of the result.
    n : int
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to ``n`` titles, most similar first.
    """
    if n <= 0:
        return []
    tags_for_book = get_tags_by_title(title)

    def similarity_with_title(candidate_title):
        try:
            return compare_books(tags_for_book, get_tags_by_title(candidate_title))
        except TypeError:
            # A candidate whose tags cannot be looked up or compared is
            # treated as not similar instead of aborting the whole scan.
            return 0

    # Only the title itself has to be present: dropping every row with a
    # missing value in any column would silently remove valid candidates.
    candidate_titles = books['title'].dropna()
    # Exclude the query book by name rather than assuming it ranks first.
    candidate_titles = candidate_titles[candidate_titles != title].drop_duplicates()
    if candidate_titles.empty:
        return []

    similarities = candidate_titles.apply(similarity_with_title)
    # Stable sort keeps the original book order among equally similar titles
    top_labels = similarities.sort_values(ascending=False, kind='mergesort').head(n).index
    # The index holds labels, so select with .loc (not positional .iloc)
    return candidate_titles.loc[top_labels].tolist()

In [None]:
#Let's get the recommendations
get_n_similar_books('The Hunger Games (The Hunger Games, #1)', n=5)

['Mockingjay (The Hunger Games, #3)',
 'The Hunger Games Trilogy Boxset (The Hunger Games, #1-3)',
 'Divergent (Divergent, #1)',
 'Insurgent (Divergent, #2)']

The TF-IDF filtering works, boista!

**Same Author Recommender** - Recommend the books by the same author

In [None]:
def same_author_recommender(title):
    """Return the titles of all books that share the authors of a book.

    The ``authors`` value of every row titled ``title`` is looked up, and a
    book is selected when one of those values occurs as a substring of its
    own ``authors`` field (converted with ``str``). The query book therefore
    appears in the result as well.

    NOTE(review): the lookup uses each ``authors`` cell as a whole, so for a
    query book with several names in that field only rows containing the
    entire string match.

    Parameters
    ----------
    title : str
        Exact title to look up in ``books``.

    Returns
    -------
    numpy.ndarray
        Titles of the matching books in ``books`` order; empty when the title
        is unknown.
    """
    wanted_authors = books.loc[books.title == title].authors.values

    def shares_author(authors_cell):
        cell_text = str(authors_cell)
        return any(wanted in cell_text for wanted in wanted_authors)

    is_match = books.authors.apply(shares_author)
    return books[is_match].title.values

In [None]:
same_author_recommender('The Hunger Games (The Hunger Games, #1)')

array(['The Hunger Games (The Hunger Games, #1)',
       'Catching Fire (The Hunger Games, #2)',
       'Mockingjay (The Hunger Games, #3)',
       'The Hunger Games Trilogy Boxset (The Hunger Games, #1-3)',
       'Gregor the Overlander (Underland Chronicles, #1)',
       'Gregor and the Code of Claw (Underland Chronicles, #5)',
       'Gregor and the Curse of the Warmbloods (Underland Chronicles, #3)',
       'Gregor and the Prophecy of Bane (Underland Chronicles, #2)',
       'Gregor and the Marks of Secret (Underland Chronicles, #4)'],
      dtype=object)