In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
def load_files():
    books = pd.read_csv('../goodbooks-10k/books.csv', encoding = "ISO-8859-1")
    ratings = pd.read_csv('../goodbooks-10k/ratings.csv', encoding = "ISO-8859-1")
    book_tags = pd.read_csv('../goodbooks-10k/book_tags.csv', encoding = "ISO-8859-1")
    tags = pd.read_csv('../goodbooks-10k/tags.csv')
    return (books, ratings, book_tags, tags)

In [4]:
books, ratings, book_tags, tags = load_files()

In [5]:
joined_tags = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')

In [6]:
joined_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [7]:
books_with_tags = pd.merge(books, joined_tags, left_on='book_id', right_on='goodreads_book_id', how='inner')


In [8]:
def create_book_corpus(books, ratings, book_tags, tags):
    joined_tags = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
    books_with_tags = pd.merge(books, joined_tags, left_on='book_id', right_on='goodreads_book_id', how='inner')
    temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
    books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')
    books['corpus'] = (pd.Series(books[['authors', 'tag_name']]
                        .fillna('')
                        .values.tolist()
                        ).str.join(' '))
    tf_corpus = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
    cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)
    # Build a 1-dimensional array with ids of books
    return cosine_sim_corpus

In [9]:
cosine_sim_corpus = create_book_corpus(books, ratings, book_tags, tags)

In [10]:
# Function that get book recommendations based on the cosine similarity score of books tags and authors
def corpus_recommendations(id):
    sim_scores = list(enumerate(cosine_sim_corpus[id-1]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:101]
    book_indices = [i[0] for i in sim_scores]
    return books['title'].iloc[book_indices].head(10) #To give top 10 recommended books

In [11]:
print(corpus_recommendations(4))

7                  The Catcher in the Rye
4                        The Great Gatsby
31                        Of Mice and Men
27                      Lord of the Flies
130                   The Grapes of Wrath
129               The Old Man and the Sea
781                         The Awakening
467          Their Eyes Were Watching God
57     The Adventures of Huckleberry Finn
128       One Flew Over the Cuckoo's Nest
Name: title, dtype: object


In [12]:
books.head(10)

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
5,6,11870085,11870085,16827462,226,525478817,9780525000000.0,John Green,2012.0,The Fault in Our Stars,...,2346404,2478609,140739,47994,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m...,https://images.gr-assets.com/books/1360206420s...
6,7,5907,5907,1540236,969,618260307,9780618000000.0,J.R.R. Tolkien,1937.0,The Hobbit or There and Back Again,...,2071616,2196809,37653,46023,76784,288649,665635,1119718,https://images.gr-assets.com/books/1372847500m...,https://images.gr-assets.com/books/1372847500s...
7,8,5107,5107,3036731,360,316769177,9780317000000.0,J.D. Salinger,1951.0,The Catcher in the Rye,...,2044241,2120637,44920,109383,185520,455042,661516,709176,https://images.gr-assets.com/books/1398034300m...,https://images.gr-assets.com/books/1398034300s...
8,9,960,960,3338963,311,1416524797,9781417000000.0,Dan Brown,2000.0,Angels & Demons,...,2001311,2078754,25112,77841,145740,458429,716569,680175,https://images.gr-assets.com/books/1303390735m...,https://images.gr-assets.com/books/1303390735s...
9,10,1885,1885,3060926,3455,679783261,9780680000000.0,Jane Austen,1813.0,Pride and Prejudice,...,2035490,2191465,49152,54700,86485,284852,609755,1155673,https://images.gr-assets.com/books/1320399351m...,https://images.gr-assets.com/books/1320399351s...


In [13]:
print(corpus_recommendations(505))

3285                  Tribulation Force (Left Behind, #2)
3613                            Nicolae (Left Behind, #3)
4288                           Apollyon (Left Behind, #5)
3917    Soul Harvest: The World Takes Sides (Left Behi...
5029                        Desecration (Left Behind, #9)
4541                          Assassins (Left Behind, #6)
4547                           The Mark (Left Behind, #8)
6822    Armageddon: The Cosmic Battle of the Ages (Lef...
5242                       The Remnant (Left Behind, #10)
4631                     The Indwelling (Left Behind, #7)
Name: title, dtype: object


In [14]:
print(corpus_recommendations(3)) #Twilight

51                                 Eclipse (Twilight, #3)
48                                New Moon (Twilight, #2)
991                    The Twilight Saga (Twilight, #1-4)
833                         Midnight Sun (Twilight, #1.5)
731     The Short Second Life of Bree Tanner: An Eclip...
1618    The Twilight Saga Complete Collection  (Twilig...
4087    The Twilight Saga: The Official Illustrated Gu...
2020             The Twilight Collection (Twilight, #1-3)
72                                The Host (The Host, #1)
219     Twilight: The Complete Illustrated Movie Compa...
Name: title, dtype: object


In [15]:
results = pd.DataFrame(columns=['id','recommendations'])

In [16]:
for i in corpus_recommendations(6703): 
    results= results.append({'id': 6703,'recommendations':i},ignore_index=True)

In [17]:
results

Unnamed: 0,id,recommendations
0,6703,"The Soulkeepers (The Soulkeepers, #1)"
1,6703,"The Mind Readers (Mind Readers, #1)"
2,6703,Long Time Coming
3,6703,"Branded (Fall of Angels, #1)"
4,6703,The Lake (The Lake Trilogy #1)
5,6703,"Fire Burn and Cauldron Bubble (Jolie Wilkins, #1)"
6,6703,"Maid for the Billionaire (Legacy Collection, #1)"
7,6703,"Trouble in Mudbug (Ghost-in-Law, #1)"
8,6703,"Captured (The Captive, #1)"
9,6703,Twenty-Eight and a Half Wishes (Rose Gardner M...


In [18]:
for i in corpus_recommendations(6646): 
    results= results.append({'id': 6646,'recommendations':i},ignore_index=True)

In [19]:
for i in corpus_recommendations(8072): 
    results= results.append({'id': 8072,'recommendations':i},ignore_index=True)

In [20]:
for i in corpus_recommendations(7487): 
    results= results.append({'id': 7487,'recommendations':i},ignore_index=True)

In [21]:
for i in corpus_recommendations(4829): 
    results= results.append({'id': 4829,'recommendations':i},ignore_index=True)

In [22]:
results

Unnamed: 0,id,recommendations
0,6703,"The Soulkeepers (The Soulkeepers, #1)"
1,6703,"The Mind Readers (Mind Readers, #1)"
2,6703,Long Time Coming
3,6703,"Branded (Fall of Angels, #1)"
4,6703,The Lake (The Lake Trilogy #1)
5,6703,"Fire Burn and Cauldron Bubble (Jolie Wilkins, #1)"
6,6703,"Maid for the Billionaire (Legacy Collection, #1)"
7,6703,"Trouble in Mudbug (Ghost-in-Law, #1)"
8,6703,"Captured (The Captive, #1)"
9,6703,Twenty-Eight and a Half Wishes (Rose Gardner M...


In [23]:
results.to_csv('results.csv')

In [26]:
results = pd.DataFrame(columns=['id','recommendations'])

In [28]:
for i in books.id:
    for j in corpus_recommendations(i): 
        results= results.append({'id': i,'recommendations':j},ignore_index=True)