In [22]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [11]:
books = pd.read_csv('books.csv',encoding = "ISO-8859-1")

In [12]:
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [13]:
books.shape
books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

In [14]:
ratings = pd.read_csv('ratings.csv', encoding = "ISO-8859-1")
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [17]:
book_tags = pd.read_csv('book_tags.csv', encoding = "ISO-8859-1")
book_tags.head()
tags = pd.read_csv('tags.csv')
tags.tail()

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


In [18]:
tags_join_DF = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_DF.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [20]:
to_read = pd.read_csv('to_read.csv')
to_read.head()

Unnamed: 0,user_id,book_id
0,1,112
1,1,235
2,1,533
3,1,1198
4,1,1874


TfidfVectorizer function from scikit-learn, which transforms text to feature vectors that can be used as input to estimator.

Cosine Similarity to calculate a numeric value that denotes the similarity between two books.

In [23]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [63]:
# Build a 1-dimensional array with book titles
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Function that get book recommendations based on the cosine similarity score of book authors
def authors_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return books.iloc[book_indices]

In [65]:
pf = authors_recommendations('The Hobbit').head(20)

In [66]:
pf['corpus']

18      J.R.R. Tolkien to-read fantasy favorites curre...
154     J.R.R. Tolkien to-read fantasy favorites curre...
160     J.R.R. Tolkien to-read fantasy favorites curre...
188     J.R.R. Tolkien to-read fantasy favorites curre...
963     J.R.R. Tolkien to-read fantasy favorites curre...
4975    J.R.R. Tolkien, Christopher Tolkien to-read fa...
2308    J.R.R. Tolkien, Christopher Tolkien, Alan Lee ...
610     J.R.R. Tolkien, Christopher Tolkien, Ted Nasmi...
8271    Robert Foster, J.R.R. Tolkien to-read fantasy ...
1128    John D. Rateliff, J.R.R. Tolkien to-read fanta...
465     Chuck Dixon, J.R.R. Tolkien, David Wenzel, Sea...
0       Suzanne Collins to-read fantasy favorites curr...
1       J.K. Rowling, Mary GrandPrÃ© to-read fantasy f...
2       Stephenie Meyer to-read fantasy favorites curr...
3       Harper Lee to-read favorites currently-reading...
4       F. Scott Fitzgerald to-read favorites currentl...
5       John Green to-read favorites currently-reading...
7       J.D. S

In [27]:
#Recommandation using tags and authors

#Recommendation of books using the authors and tags attributes 
#for better results. Creating corpus of features and calculating the 
#TF-IDF on the corpus of attributes for gettings better recommendations.

In [29]:
books_with_tags = pd.merge(books, tags_join_DF, left_on='book_id', right_on='goodreads_book_id', how='inner')

temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...
2,3,to-read fantasy favorites currently-reading yo...
3,5,to-read fantasy favorites currently-reading yo...
4,6,to-read fantasy young-adult fiction harry-pott...


In [30]:
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,to-read fantasy favorites currently-reading yo...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,to-read favorites currently-reading young-adul...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,to-read favorites currently-reading young-adul...


In [31]:
books['corpus'] = (pd.Series(books[['authors', 'tag_name']]
                .fillna('')
                .values.tolist()
                ).str.join(' '))

In [58]:
indices1 = pd.Series(books.index, index=books['title'])

tf_corpus = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)

# Build a 1-dimensional array with book titles
titles = books['title']
indices = pd.Series(books.index, index=books['title'])

# Function that get book recommendations based on the cosine similarity score of books tags
def corpus_recommendations(title):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    book_indices = [i[0] for i in sim_scores]
    return books.iloc[book_indices]



In [59]:
corpus_recommendations("Twilight (Twilight, #1)")

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name,corpus
51,52,428263,428263,2675454,185,316160202,9780316000000.0,Stephenie Meyer,2007.0,Eclipse,...,35216,83094,124293,260763,309358,399134,https://images.gr-assets.com/books/1361038355m...,https://images.gr-assets.com/books/1361038355s...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
48,49,49041,49041,3203964,194,316160199,9780316000000.0,Stephenie Meyer,2006.0,"New Moon (Twilight, #2)",...,44020,102837,160660,294207,290612,350684,https://images.gr-assets.com/books/1361039440m...,https://images.gr-assets.com/books/1361039440s...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
991,992,3090465,3090465,6440505,36,739352350,9780739000000.0,"Stephenie Meyer, Ilyana Kadushin, Matt Walters",2005.0,The Twilight Saga,...,3641,7478,7649,16077,23209,42398,https://images.gr-assets.com/books/1327930511m...,https://images.gr-assets.com/books/1327930511s...,to-read fantasy favorites currently-reading yo...,"Stephenie Meyer, Ilyana Kadushin, Matt Walters..."
833,834,4502877,4502877,4551869,1,,,Stephenie Meyer,2008.0,Midnight Sun (Partial Draft),...,9189,6209,9089,23237,35433,64149,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
731,732,7937462,7937462,11342428,93,031612558X,9780316000000.0,Stephenie Meyer,2009.0,The Short Second Life of Bree Tanner,...,9885,7314,21317,50829,43629,36680,https://images.gr-assets.com/books/1274335680m...,https://images.gr-assets.com/books/1274335680s...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
1618,1619,8726744,8726744,14194990,11,031613290X,9780316000000.0,Stephenie Meyer,2005.0,,...,335,2262,2306,6572,10599,36050,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,to-read fantasy currently-reading young-adult ...,Stephenie Meyer to-read fantasy currently-read...
4087,4088,3609763,3609763,3652514,35,316043125,9780316000000.0,Stephenie Meyer,2011.0,The Twilight Saga: The Official Guide,...,851,631,1246,4040,5055,14120,https://images.gr-assets.com/books/1344265906m...,https://images.gr-assets.com/books/1344265906s...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
2020,2021,690926,690926,3187048,5,316003727,9780316000000.0,Stephenie Meyer,2007.0,"The Twilight Collection (Twilight, #1-3)",...,1127,4560,3613,6924,8607,18679,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
72,73,1656001,1656001,3328799,161,316068047,9780316000000.0,Stephenie Meyer,2008.0,The Host,...,39778,44215,62501,154906,227180,288758,https://images.gr-assets.com/books/1318009171m...,https://images.gr-assets.com/books/1318009171s...,to-read fantasy favorites currently-reading yo...,Stephenie Meyer to-read fantasy favorites curr...
219,220,3609760,3609760,3652511,20,316043133,9780316000000.0,Mark Cotta Vaz,,Twilight: The Complete Illustrated Movie Compa...,...,532,6862,11019,48008,67939,158071,https://images.gr-assets.com/books/1352539022m...,https://images.gr-assets.com/books/1352539022s...,to-read fantasy favorites currently-reading yo...,Mark Cotta Vaz to-read fantasy favorites curre...


In [60]:
df = corpus_recommendations("The Fault in Our Stars")

In [62]:
df['corpus']

73      John Green to-read favorites currently-reading...
274     John Green to-read favorites currently-reading...
87      John Green to-read favorites young-adult ficti...
3654    Morgan Matson to-read favorites currently-read...
474     Gayle Forman to-read favorites currently-readi...
323     Rainbow Rowell to-read favorites currently-rea...
362     Stephanie Perkins to-read favorites currently-...
163     Rainbow Rowell to-read favorites currently-rea...
381     John Green, David Levithan to-read favorites c...
67      Stephen Chbosky to-read favorites currently-re...
148     Gayle Forman to-read fantasy favorites current...
2453    Jenny Downham to-read favorites currently-read...
434     Lauren Oliver to-read fantasy favorites curren...
1574    John Green, Maureen Johnson, Lauren Myracle to...
829     Kody Keplinger to-read favorites currently-rea...
1296    Jennifer E. Smith to-read favorites currently-...
11      Veronica Roth to-read fantasy favorites curren...
146     Jay As