In [1]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the book catalogue from description.csv into a DataFrame.
# The file is decoded as latin-1 rather than pandas' default UTF-8.
book_description = pd.read_csv(
    'description.csv',
    encoding='latin-1',
)

In [3]:
# Preview the first five rows (5 is also pandas' default for head()).
book_description.head(n=5)

Unnamed: 0,book_id,name,description
0,4833.0,The Glass Castle,"A tender, moving tale of unconditional love in..."
1,590.0,"Night (The Night Trilogy, #1)","Born into a Jewish ghetto in Hungary, as a chi..."
2,4264.0,"Angela's Ashes (Frank McCourt, #1)",Imbued on every page with Frank McCourt's asto...
3,3361.0,"Eat, Pray, Love","A celebrated writer's irresistible, candid, an..."
4,4535.0,Into Thin Air: A Personal Account of the Mount...,A bank of clouds was assembling on the not-so-...


In [4]:
# Missing descriptions become empty strings so that every entry handed to the
# vectorizer is a str. This overwrites the 'description' column in place;
# re-running the cell is harmless because fillna('') is idempotent.
book_description['description'] = book_description['description'].fillna('')

# TF-IDF vectorizer that drops scikit-learn's built-in English stop words;
# all other settings are left at their defaults.
books_tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the descriptions and encode them in one step.
# The result is a sparse document-term matrix (one row per description).
book_description_matrix = books_tfidf.fit_transform(
    book_description['description']
)


In [7]:
# Jupyter only echoes the value of a cell's *last* expression, so a bare
# `.shape` followed by another statement is evaluated and thrown away.
# Print it explicitly so the matrix dimensions (documents x vocabulary terms)
# actually appear in the output.
print(book_description_matrix.shape)

# Sparse-matrix printout: one "(row, column)  weight" line per stored entry.
print(book_description_matrix)

  (0, 3752)	0.049955073644452376
  (0, 3253)	0.06987145007848536
  (0, 1809)	0.06450244729893673
  (0, 996)	0.06450244729893673
  (0, 2625)	0.06450244729893673
  (0, 3887)	0.06069307920354963
  (0, 3599)	0.07393466934825911
  (0, 1630)	0.06987145007848536
  (0, 128)	0.06450244729893673
  (0, 1006)	0.06450244729893673
  (0, 1041)	0.051514708328613906
  (0, 2002)	0.06987145007848536
  (0, 3754)	0.06987145007848536
  (0, 1736)	0.06987145007848536
  (0, 2105)	0.04508581281997952
  (0, 285)	0.06987145007848536
  (0, 1834)	0.051514708328613906
  (0, 2178)	0.06069307920354963
  (0, 3187)	0.05773830375361193
  (0, 1487)	0.05773830375361193
  (0, 412)	0.06450244729893673
  (0, 4076)	0.06987145007848536
  (0, 3675)	0.06987145007848536
  (0, 1466)	0.06450244729893673
  (0, 3440)	0.06987145007848536
  :	:
  (142, 4087)	0.15890859974694924
  (142, 3991)	0.16888241155386877
  (142, 1589)	0.16888241155386877
  (142, 1537)	0.16888241155386877
  (142, 2214)	0.16888241155386877
  (142, 3727)	0.168882411

In [6]:
# Pairwise dot products between every pair of TF-IDF rows. TfidfVectorizer
# L2-normalises each row by default (and the vectorizer above does not
# override `norm`), so the plain dot product is the cosine similarity.
cosine_similarity = linear_kernel(
    X=book_description_matrix,
    Y=book_description_matrix,
)

In [10]:
# Row-position -> index-label lookup table. It is kept because other cells may
# read it, but `recommend` no longer consults it: the similarity matrix and
# `.iloc` are both addressed by row position, so translating a position into
# an index label and then using that label as a position is only correct when
# the DataFrame has a default 0..n-1 index.
indices = pd.Series(book_description['name'].index)


def recommend(index, cosine_sim=cosine_similarity, top_n=5):
    """Return the names of the books most similar to the book at row ``index``.

    Parameters
    ----------
    index : int
        Zero-based row position of the query book in ``book_description``
        (the same position addresses its row in ``cosine_sim``).
    cosine_sim : 2-D array, optional
        Pairwise similarity scores; row ``i`` holds the scores of book ``i``
        against every book. Defaults to the ``cosine_similarity`` matrix as it
        exists when this cell is executed (the default is bound at definition
        time, so re-run this cell after recomputing the matrix).
    top_n : int, optional
        How many recommendations to return. Defaults to 5.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``book_description['name']``, ordered from
        most to least similar. Ties keep ascending row order. The query book
        itself is never included.

    Raises
    ------
    IndexError
        If ``index`` is not a valid non-negative row position of ``cosine_sim``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f'top_n must be non-negative, got {top_n}')
    if not 0 <= index < len(cosine_sim):
        raise IndexError(
            f'index {index} is out of range for {len(cosine_sim)} books'
        )

    scores = cosine_sim[index]

    # Exclude the query book explicitly. Dropping "whatever sorts first" is
    # wrong when the book does not have the top score against itself: an empty
    # description yields an all-zero row, and a duplicate description at a
    # lower row position ties with it and sorts ahead of it.
    candidates = [pos for pos in range(len(scores)) if pos != index]

    # list.sort is stable even with reverse=True, so equal scores stay in
    # ascending row order.
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    return book_description['name'].iloc[candidates[:top_n]]


print('Top five books similar to the book at index:', 2)
print(recommend(2))

Top five books similar to the book at index: 2
6                                 Running with Scissors 
29                            The Diary of a Young Girl 
116    It's St. Patrick's Day (Turtleback School & Li...
11     Persepolis: The Story of a Childhood (Persepol...
20     Maus I: A Survivor's Tale: My Father Bleeds Hi...
Name: name, dtype: object


In [11]:
# Second example: the five closest matches for the book at row 6.
print(recommend(index=6))

2            Angela's Ashes (Frank McCourt, #1) 
9         A Child Called "It" (Dave Pelzer, #1) 
21         Wild Swans: Three Daughters of China 
22    A Long Way Gone: Memoirs of a Boy Soldier 
29                    The Diary of a Young Girl 
Name: name, dtype: object
