# Libraries

In [1]:
import numpy as np
import pandas as pd
import ast
import difflib
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import pickle
import requests
from bs4 import BeautifulSoup

# Making the DataFrame with CSV Files

In [2]:
# books = pd.concat(
#     map(pd.read_csv, ['Datasets/sf_aliens.csv', 'Datasets/sf_alternate_history.csv',
#                      'Datasets/sf_alternate_universe.csv', 'Datasets/sf_apocalyptic.csv',
#                      'Datasets/sf_cyberpunk.csv','Datasets/sf_dystopia.csv',
#                      'Datasets/sf_hard.csv','Datasets/sf_military.csv',
#                      'Datasets/sf_robots.csv','Datasets/sf_space_opera.csv',
#                      'Datasets/sf_steampunk.csv', 'Datasets/sf_time_travel.csv']), ignore_index=True)

In [3]:
# Read each dataset CSV into its own frame and stack them into a single
# table with a fresh 0..n-1 index (currently only the "aliens" file is used).
books = pd.concat(
    [pd.read_csv(csv_path) for csv_path in ['Datasets/sf_aliens.csv']],
    ignore_index=True,
)

In [5]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Book_Title           1250 non-null   object 
 1   Original_Book_Title  1250 non-null   object 
 2   Author_Name          1250 non-null   object 
 3   Edition_Language     1250 non-null   object 
 4   Rating_score         1250 non-null   float64
 5   Rating_votes         1250 non-null   int64  
 6   Review_number        1250 non-null   int64  
 7   Book_Description     1249 non-null   object 
 8   Year_published       1250 non-null   int64  
 9   Genres               1250 non-null   object 
 10  url                  1250 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 107.5+ KB


In [6]:
books.head(-5)

Unnamed: 0,Book_Title,Original_Book_Title,Author_Name,Edition_Language,Rating_score,Rating_votes,Review_number,Book_Description,Year_published,Genres,url
0,Obsidian,Obsidian,Jennifer L. Armentrout,English,4.17,236780,18161,Starting over sucks.When we moved to West Virg...,2011,"{'Young Adult': 3439, 'Fantasy (Paranormal) ':...",https://www.goodreads.com/book/show/12578077-o...
1,Onyx,Onyx,Jennifer L. Armentrout,English,4.27,153429,10497,BEING CONNECTED TO DAEMON BLACK SUCKS… Thanks ...,2012,"{'Young Adult': 2271, 'Fantasy (Paranormal) ':...",https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,The 5th Wave,Rick Yancey,English,4.03,400600,29990,"After the 1st wave, only darkness remains. Aft...",2013,"{'Young Adult': 5436, 'Science Fiction': 3327,...",https://www.goodreads.com/book/show/16101128-t...
3,The Host,The Host,Stephenie Meyer,English,3.84,915026,41673,Melanie Stryder refuses to fade away. The eart...,2008,"{'Young Adult': 4529, 'Science Fiction': 4285,...",https://www.goodreads.com/book/show/1656001.Th...
4,Opal,Opal,Jennifer L. Armentrout,,4.27,129006,9463,No one is like Daemon Black.When he set out to...,2012,"{'Young Adult': 1855, 'Fantasy (Paranormal) ':...",https://www.goodreads.com/book/show/13362536-opal
...,...,...,...,...,...,...,...,...,...,...,...
1240,Spice ‘n’ Solace,English,K.C. Burn,English,3.74,887,142,The Galactic Alliance’s most important negotia...,2011,"{'Romance (M M Romance) ': 105, 'Science Ficti...",https://www.goodreads.com/book/show/10324166-s...
1241,Enemy Overnight,\n 1419922564\n ...,Robin L. Rotham,English,3.86,778,41,The last time Jasmine King was aboard the Hept...,2009,"{'Erotica (BDSM) ': 21, 'Science Fiction': 21,...",https://www.goodreads.com/book/show/7049022-en...
1242,The Absolute,The Absolute,K.A. Applegate,English,3.87,1696,55,The Yeerks have taken over units of the Nation...,2001,"{'Science Fiction': 62, 'Young Adult': 47, 'Fa...",https://www.goodreads.com/book/show/363390.The...
1243,The Familiar,The Familiar,K.A. Applegate,English,3.67,1939,67,Jake wakes up one morning to find he is sudden...,2000,"{'Science Fiction': 70, 'Young Adult': 55, 'Fi...",https://www.goodreads.com/book/show/363352.The...


# Removing Nulls and Duplicates

In [7]:
books.isnull().sum()

Book_Title             0
Original_Book_Title    0
Author_Name            0
Edition_Language       0
Rating_score           0
Rating_votes           0
Review_number          0
Book_Description       1
Year_published         0
Genres                 0
url                    0
dtype: int64

In [8]:
books.duplicated().sum()

0

In [9]:
# Drop every row that has a missing value in any column (mutates `books` in place).
books.dropna(inplace=True)

In [10]:
books.isnull().sum()

Book_Title             0
Original_Book_Title    0
Author_Name            0
Edition_Language       0
Rating_score           0
Rating_votes           0
Review_number          0
Book_Description       0
Year_published         0
Genres                 0
url                    0
dtype: int64

In [11]:
# Remove rows that are identical across all columns (mutates `books` in place;
# pandas keeps the first occurrence by default).
books.drop_duplicates(inplace=True)

In [12]:
books.duplicated().sum()

0

# Keeping Only the Needed Information

In [13]:
# Keep only the columns the recommender uses. `.copy()` makes `books` an
# independent frame rather than a slice of the loaded table, so the in-place
# de-duplication and column assignments in later cells act on a real frame
# instead of triggering pandas' chained-assignment (SettingWithCopy) handling.
books = books[['Book_Title','Author_Name','Book_Description','Genres','url']].copy()

In [14]:
books.iloc[0].Genres

"{'Young Adult': 3439, 'Fantasy (Paranormal) ': 2652, 'Fantasy': 2545, 'Romance': 2507, 'Science Fiction (Aliens) ': 1648, 'Science Fiction': 1170, 'Romance (Paranormal Romance) ': 849, 'Fantasy (Supernatural) ': 494, 'Fantasy (Urban Fantasy) ': 411, 'Fiction': 347}"

# Removing Duplicates by Title and Author

In [15]:
books.duplicated(subset = ['Book_Title','Author_Name']).sum()

1

In [16]:
# Treat rows sharing both title and author as the same book and drop the
# repeats in place (pandas keeps the first occurrence by default).
books.drop_duplicates(subset = ['Book_Title','Author_Name'], inplace=True)

In [17]:
books.duplicated(subset = ['Book_Title','Author_Name']).sum()

0

In [18]:
# Renumber the rows 0..n-1 after the drops above, discarding the old index.
books = books.reset_index(drop = True)

In [19]:
books.head(-5)

Unnamed: 0,Book_Title,Author_Name,Book_Description,Genres,url
0,Obsidian,Jennifer L. Armentrout,Starting over sucks.When we moved to West Virg...,"{'Young Adult': 3439, 'Fantasy (Paranormal) ':...",https://www.goodreads.com/book/show/12578077-o...
1,Onyx,Jennifer L. Armentrout,BEING CONNECTED TO DAEMON BLACK SUCKS… Thanks ...,"{'Young Adult': 2271, 'Fantasy (Paranormal) ':...",https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,Rick Yancey,"After the 1st wave, only darkness remains. Aft...","{'Young Adult': 5436, 'Science Fiction': 3327,...",https://www.goodreads.com/book/show/16101128-t...
3,The Host,Stephenie Meyer,Melanie Stryder refuses to fade away. The eart...,"{'Young Adult': 4529, 'Science Fiction': 4285,...",https://www.goodreads.com/book/show/1656001.Th...
4,Opal,Jennifer L. Armentrout,No one is like Daemon Black.When he set out to...,"{'Young Adult': 1855, 'Fantasy (Paranormal) ':...",https://www.goodreads.com/book/show/13362536-opal
...,...,...,...,...,...
1238,Spice ‘n’ Solace,K.C. Burn,The Galactic Alliance’s most important negotia...,"{'Romance (M M Romance) ': 105, 'Science Ficti...",https://www.goodreads.com/book/show/10324166-s...
1239,Enemy Overnight,Robin L. Rotham,The last time Jasmine King was aboard the Hept...,"{'Erotica (BDSM) ': 21, 'Science Fiction': 21,...",https://www.goodreads.com/book/show/7049022-en...
1240,The Absolute,K.A. Applegate,The Yeerks have taken over units of the Nation...,"{'Science Fiction': 62, 'Young Adult': 47, 'Fa...",https://www.goodreads.com/book/show/363390.The...
1241,The Familiar,K.A. Applegate,Jake wakes up one morning to find he is sudden...,"{'Science Fiction': 70, 'Young Adult': 55, 'Fi...",https://www.goodreads.com/book/show/363352.The...


# Making Functions to Apply later

In [20]:
def genres_extract(n):
    """Convert a stringified genre dict into a list of compact genre tags.

    ``n`` is the text of a Python dict literal whose keys are genre names
    (e.g. ``"{'Young Adult': 3439, 'Fantasy (Paranormal) ': 2652}"``).
    The keys are returned in dict order with every "(", ")" and space
    character removed, so each genre becomes a single token
    (``['YoungAdult', 'FantasyParanormal']``). The dict values are ignored.
    """
    strip_chars = str.maketrans("", "", "() ")
    genre_votes = ast.literal_eval(n)
    return [genre.translate(strip_chars) for genre in genre_votes.keys()]

In [21]:
def spl(n):
    """Split text into words, treating "." and "," as word separators.

    Periods and commas are turned into spaces before splitting on
    whitespace, so sentences glued together without a space
    ("sucks.When") still come apart. Other punctuation is left attached
    to its word.
    """
    separators = str.maketrans({".": " ", ",": " "})
    return n.translate(separators).split()

In [22]:
def auth_spl(n):
    """Collapse an author name into a single token wrapped in a list.

    Spaces and periods are deleted ("Jennifer L. Armentrout" becomes
    "JenniferLArmentrout") and the result is split on any remaining
    whitespace, so a normal name yields a one-element list and an empty
    name yields ``[]``.
    """
    dropped = str.maketrans("", "", " .")
    return n.translate(dropped).split()

In [23]:
def cover_scrape(url, timeout=10):
    """Fetch a book page and return the address of its cover image.

    The first run of digits in ``url`` is extracted (presumably the
    Goodreads book id, as in ``.../book/show/12578077-obsidian``). The
    page is then downloaded and the ``src`` of the first ``<img>`` whose
    address contains that digit string is returned.

    Parameters
    ----------
    url : str
        Address of the book page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang a long scraping run indefinitely.

    Returns
    -------
    str or None
        The matching image address, or ``None`` when ``url`` contains no
        digits or no image on the page references the digit string.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (including on timeout).
    """
    code = ""
    for ch in url:
        if ch.isnumeric():
            code += ch
        elif code:
            # The first digit run has ended; ignore any later digits.
            break

    # Without an id, `"" in src` would be true for every image and the
    # first <img> on the page would be returned as the "cover".
    if not code:
        return None

    response = requests.get(url, timeout=timeout)
    html_page = BeautifulSoup(response.text, 'html.parser')

    for image in html_page.find_all("img"):
        image_url = image.get("src")
        if image_url is not None and code in image_url:
            return image_url
    return None

# Applying the Functions

In [24]:
# Replace each stringified genre dict with a list of compact genre tags.
# Not re-runnable: genres_extract parses the original string form, so run
# this cell only once per fresh load of `books`.
books['Genres'] = books['Genres'].apply(genres_extract)

In [25]:
books.head()

Unnamed: 0,Book_Title,Author_Name,Book_Description,Genres,url
0,Obsidian,Jennifer L. Armentrout,Starting over sucks.When we moved to West Virg...,"[YoungAdult, FantasyParanormal, Fantasy, Roman...",https://www.goodreads.com/book/show/12578077-o...
1,Onyx,Jennifer L. Armentrout,BEING CONNECTED TO DAEMON BLACK SUCKS… Thanks ...,"[YoungAdult, FantasyParanormal, Romance, Fanta...",https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,Rick Yancey,"After the 1st wave, only darkness remains. Aft...","[YoungAdult, ScienceFiction, ScienceFictionDys...",https://www.goodreads.com/book/show/16101128-t...
3,The Host,Stephenie Meyer,Melanie Stryder refuses to fade away. The eart...,"[YoungAdult, ScienceFiction, Fantasy, Romance,...",https://www.goodreads.com/book/show/1656001.Th...
4,Opal,Jennifer L. Armentrout,No one is like Daemon Black.When he set out to...,"[YoungAdult, FantasyParanormal, Romance, Fanta...",https://www.goodreads.com/book/show/13362536-opal


In [26]:
# Tokenize each description into a list of words.
# Not re-runnable: spl calls str.replace, which the resulting lists do not have.
books['Book_Description'] = books['Book_Description'].apply(spl)

In [27]:
books.head()

Unnamed: 0,Book_Title,Author_Name,Book_Description,Genres,url
0,Obsidian,Jennifer L. Armentrout,"[Starting, over, sucks, When, we, moved, to, W...","[YoungAdult, FantasyParanormal, Fantasy, Roman...",https://www.goodreads.com/book/show/12578077-o...
1,Onyx,Jennifer L. Armentrout,"[BEING, CONNECTED, TO, DAEMON, BLACK, SUCKS…, ...","[YoungAdult, FantasyParanormal, Romance, Fanta...",https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,Rick Yancey,"[After, the, 1st, wave, only, darkness, remain...","[YoungAdult, ScienceFiction, ScienceFictionDys...",https://www.goodreads.com/book/show/16101128-t...
3,The Host,Stephenie Meyer,"[Melanie, Stryder, refuses, to, fade, away, Th...","[YoungAdult, ScienceFiction, Fantasy, Romance,...",https://www.goodreads.com/book/show/1656001.Th...
4,Opal,Jennifer L. Armentrout,"[No, one, is, like, Daemon, Black, When, he, s...","[YoungAdult, FantasyParanormal, Romance, Fanta...",https://www.goodreads.com/book/show/13362536-opal


In [28]:
# Collapse each author name into a one-element list holding a single token,
# so it can be concatenated with the other list columns.
# Not re-runnable: auth_spl calls str.replace, which the resulting lists do not have.
books['Author_Name'] = books['Author_Name'].apply(auth_spl)

In [29]:
books.head()

Unnamed: 0,Book_Title,Author_Name,Book_Description,Genres,url
0,Obsidian,[JenniferLArmentrout],"[Starting, over, sucks, When, we, moved, to, W...","[YoungAdult, FantasyParanormal, Fantasy, Roman...",https://www.goodreads.com/book/show/12578077-o...
1,Onyx,[JenniferLArmentrout],"[BEING, CONNECTED, TO, DAEMON, BLACK, SUCKS…, ...","[YoungAdult, FantasyParanormal, Romance, Fanta...",https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,[RickYancey],"[After, the, 1st, wave, only, darkness, remain...","[YoungAdult, ScienceFiction, ScienceFictionDys...",https://www.goodreads.com/book/show/16101128-t...
3,The Host,[StephenieMeyer],"[Melanie, Stryder, refuses, to, fade, away, Th...","[YoungAdult, ScienceFiction, Fantasy, Romance,...",https://www.goodreads.com/book/show/1656001.Th...
4,Opal,[JenniferLArmentrout],"[No, one, is, like, Daemon, Black, When, he, s...","[YoungAdult, FantasyParanormal, Romance, Fanta...",https://www.goodreads.com/book/show/13362536-opal


# Making the "tags" Column

In [30]:
# All three columns hold Python lists at this point, so `+` concatenates them
# row by row: author token, then description words, then genre tags.
books['tags'] = books['Author_Name'] + books['Book_Description'] + books['Genres']

In [31]:
books.head()

Unnamed: 0,Book_Title,Author_Name,Book_Description,Genres,url,tags
0,Obsidian,[JenniferLArmentrout],"[Starting, over, sucks, When, we, moved, to, W...","[YoungAdult, FantasyParanormal, Fantasy, Roman...",https://www.goodreads.com/book/show/12578077-o...,"[JenniferLArmentrout, Starting, over, sucks, W..."
1,Onyx,[JenniferLArmentrout],"[BEING, CONNECTED, TO, DAEMON, BLACK, SUCKS…, ...","[YoungAdult, FantasyParanormal, Romance, Fanta...",https://www.goodreads.com/book/show/13047090-onyx,"[JenniferLArmentrout, BEING, CONNECTED, TO, DA..."
2,The 5th Wave,[RickYancey],"[After, the, 1st, wave, only, darkness, remain...","[YoungAdult, ScienceFiction, ScienceFictionDys...",https://www.goodreads.com/book/show/16101128-t...,"[RickYancey, After, the, 1st, wave, only, dark..."
3,The Host,[StephenieMeyer],"[Melanie, Stryder, refuses, to, fade, away, Th...","[YoungAdult, ScienceFiction, Fantasy, Romance,...",https://www.goodreads.com/book/show/1656001.Th...,"[StephenieMeyer, Melanie, Stryder, refuses, to..."
4,Opal,[JenniferLArmentrout],"[No, one, is, like, Daemon, Black, When, he, s...","[YoungAdult, FantasyParanormal, Romance, Fanta...",https://www.goodreads.com/book/show/13362536-opal,"[JenniferLArmentrout, No, one, is, like, Daemo..."


In [32]:
# Working table for the recommender: title, combined tags and page URL.
# `.copy()` detaches it from `books`, so the column assignments in the cells
# below modify an independent frame instead of a slice (this avoids pandas'
# SettingWithCopyWarning and leaves `books` untouched).
books_data = books[['Book_Title','tags','url']].copy()

In [33]:
books_data.head(-1)

Unnamed: 0,Book_Title,tags,url
0,Obsidian,"[JenniferLArmentrout, Starting, over, sucks, W...",https://www.goodreads.com/book/show/12578077-o...
1,Onyx,"[JenniferLArmentrout, BEING, CONNECTED, TO, DA...",https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,"[RickYancey, After, the, 1st, wave, only, dark...",https://www.goodreads.com/book/show/16101128-t...
3,The Host,"[StephenieMeyer, Melanie, Stryder, refuses, to...",https://www.goodreads.com/book/show/1656001.Th...
4,Opal,"[JenniferLArmentrout, No, one, is, like, Daemo...",https://www.goodreads.com/book/show/13362536-opal
...,...,...,...
1242,Turning Point,"[LisanneNorman, Cut, off, from, Earth, by, ali...",https://www.goodreads.com/book/show/148282.Tur...
1243,The Color of Distance,"[AmyThomson, Juna, is, the, sole, survivor, of...",https://www.goodreads.com/book/show/466371.The...
1244,Aliens: Genocide,"[DavidBischoff, Their, queen, is, dead, and, t...",https://www.goodreads.com/book/show/2186184.Al...
1245,The Pool of Fire,"[JohnChristopher, Alternate, cover, edition, c...",https://www.goodreads.com/book/show/80491.The_...


In [34]:
# Renumber rows 0..n-1 (old index discarded); the similarity matrix built
# later is addressed by these positions.
books_data = books_data.reset_index(drop = True)

In [35]:
books_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1248 entries, 0 to 1247
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Book_Title  1248 non-null   object
 1   tags        1248 non-null   object
 2   url         1248 non-null   object
dtypes: object(3)
memory usage: 29.4+ KB


In [36]:
# Flatten each token list into one space-separated string.
books_data['tags'] = books_data['tags'].map(" ".join)

In [37]:
# Lower-case the tag strings so matching is case-insensitive.
books_data['tags'] = books_data['tags'].map(str.lower)

In [38]:
books_data.head(-1)

Unnamed: 0,Book_Title,tags,url
0,Obsidian,jenniferlarmentrout starting over sucks when w...,https://www.goodreads.com/book/show/12578077-o...
1,Onyx,jenniferlarmentrout being connected to daemon ...,https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,rickyancey after the 1st wave only darkness re...,https://www.goodreads.com/book/show/16101128-t...
3,The Host,stepheniemeyer melanie stryder refuses to fade...,https://www.goodreads.com/book/show/1656001.Th...
4,Opal,jenniferlarmentrout no one is like daemon blac...,https://www.goodreads.com/book/show/13362536-opal
...,...,...,...
1242,Turning Point,lisannenorman cut off from earth by alien conq...,https://www.goodreads.com/book/show/148282.Tur...
1243,The Color of Distance,amythomson juna is the sole survivor of a team...,https://www.goodreads.com/book/show/466371.The...
1244,Aliens: Genocide,davidbischoff their queen is dead and the hive...,https://www.goodreads.com/book/show/2186184.Al...
1245,The Pool of Fire,johnchristopher alternate cover edition can be...,https://www.goodreads.com/book/show/80491.The_...


# Removing Stop Words

In [39]:
import ssl

# The NLTK downloader goes through urllib's default HTTPS context. This cell
# swaps in an unverified context (presumably to work around local certificate
# errors) -- but only for the duration of the download. The original context
# is restored afterwards so that certificate verification is not left disabled
# for the rest of the kernel session.
_default_https_context = getattr(ssl, "_create_default_https_context", None)
_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _default_https_context is not None and _unverified_https_context is not None:
    ssl._create_default_https_context = _unverified_https_context
try:
    stopwords_downloaded = nltk.download('stopwords')
finally:
    if _default_https_context is not None:
        ssl._create_default_https_context = _default_https_context

# Show the downloader's result (True on success) as the cell output.
stopwords_downloaded

[nltk_data] Downloading package stopwords to /Users/saman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# English stop words from NLTK, stored as a set; removestop() tests each
# word for membership in it.
en_stops = set(stopwords.words('english'))
print(en_stops)

{'he', 'or', 'this', 'who', 'each', 's', "don't", "hasn't", 'very', 'yourself', 'me', 'wasn', 'as', 'into', 'them', 'is', "doesn't", "won't", 'few', 'had', 'will', 'from', 'before', 'to', 'because', 'both', 'over', "needn't", 'any', 'my', 'where', "you'll", 'itself', 'there', 'most', 'd', 'whom', 'been', 'after', 'why', 'that', 'won', 'needn', 'should', "shouldn't", 'on', 'haven', 'own', 'they', 'once', 'shan', 'too', "couldn't", 'then', 'such', 'here', 'other', "hadn't", 'out', 'themselves', 'about', 'mightn', 'him', 'yourselves', 'was', 'only', 'down', 'off', 'when', 'no', 'shouldn', 'these', 'doesn', "it's", 'y', 'those', "haven't", "isn't", 'have', 'a', 'up', 'same', 'now', "mustn't", 'but', 'ma', 'your', 'do', 'her', 'his', 'hers', "should've", 'aren', 'in', 'ain', 'which', 'having', 'between', 'so', 'theirs', 'm', 'has', 'yours', 'it', 'until', "shan't", "weren't", 'be', 'mustn', 'further', 'couldn', 'how', 're', 'hasn', 'doing', 'ours', 'for', 'all', "that'll", "she's", 'don', '

In [41]:
def removestop(n):
    """Return ``n`` with every word found in ``en_stops`` removed.

    The text is split on whitespace, words that appear in the module-level
    ``en_stops`` set are discarded (the comparison is exact, so the text
    should already be lower-cased), and the survivors are re-joined with
    single spaces.
    """
    return " ".join(word for word in n.split() if word not in en_stops)

In [42]:
books_data['tags'][0]

'jenniferlarmentrout starting over sucks when we moved to west virginia right before my senior year i’d pretty much resigned myself to thick accents dodgy internet access and a whole lot of boring… until i spotted my hot neighbor with his looming height and eerie green eyes things were looking up and then he opened his mouth daemon is infuriating arrogant stab-worthy we do not get along at all but when a stranger attacks me and daemon literally freezes time with a wave of his hand well something… unexpected happens the hot alien living next door marks me you heard me alien turns out daemon and his sister have a galaxy of enemies wanting to steal their abilities and daemon’s touch has me lit up like the vegas strip the only way i’m getting out of this alive is by sticking close to daemon until my alien mojo fades if i don’t kill him first that is youngadult fantasyparanormal fantasy romance sciencefictionaliens sciencefiction romanceparanormalromance fantasysupernatural fantasyurbanfant

In [43]:
# Strip English stop words from every tag string (tags were lower-cased above,
# which the exact-match lookup in removestop relies on).
books_data['tags'] = books_data['tags'].apply(removestop)

In [44]:
books_data.head(-1)

Unnamed: 0,Book_Title,tags,url
0,Obsidian,jenniferlarmentrout starting sucks moved west ...,https://www.goodreads.com/book/show/12578077-o...
1,Onyx,jenniferlarmentrout connected daemon black suc...,https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,rickyancey 1st wave darkness remains 2nd lucky...,https://www.goodreads.com/book/show/16101128-t...
3,The Host,stepheniemeyer melanie stryder refuses fade aw...,https://www.goodreads.com/book/show/1656001.Th...
4,Opal,jenniferlarmentrout one like daemon black set ...,https://www.goodreads.com/book/show/13362536-opal
...,...,...,...
1242,Turning Point,lisannenorman cut earth alien conquerors human...,https://www.goodreads.com/book/show/148282.Tur...
1243,The Color of Distance,amythomson juna sole survivor team surveyors m...,https://www.goodreads.com/book/show/466371.The...
1244,Aliens: Genocide,davidbischoff queen dead hive mind left flound...,https://www.goodreads.com/book/show/2186184.Al...
1245,The Pool of Fire,johnchristopher alternate cover edition found ...,https://www.goodreads.com/book/show/80491.The_...


In [45]:
books_data.head(5)

Unnamed: 0,Book_Title,tags,url
0,Obsidian,jenniferlarmentrout starting sucks moved west ...,https://www.goodreads.com/book/show/12578077-o...
1,Onyx,jenniferlarmentrout connected daemon black suc...,https://www.goodreads.com/book/show/13047090-onyx
2,The 5th Wave,rickyancey 1st wave darkness remains 2nd lucky...,https://www.goodreads.com/book/show/16101128-t...
3,The Host,stepheniemeyer melanie stryder refuses fade aw...,https://www.goodreads.com/book/show/1656001.Th...
4,Opal,jenniferlarmentrout one like daemon black set ...,https://www.goodreads.com/book/show/13362536-opal


In [46]:
# This usually takes a lot of time: cover_scrape performs one HTTP request per row.
# Note: the 'url' column is overwritten here -- afterwards it holds whatever
# cover_scrape returned (an image address, or None when no matching <img> was
# found) rather than the book page URL.
books_data['url'] = books_data['url'].apply(cover_scrape)

In [47]:
books_data.head(-1)

Unnamed: 0,Book_Title,tags,url
0,Obsidian,jenniferlarmentrout starting sucks moved west ...,https://i.gr-assets.com/images/S/compressed.ph...
1,Onyx,jenniferlarmentrout connected daemon black suc...,https://i.gr-assets.com/images/S/compressed.ph...
2,The 5th Wave,rickyancey 1st wave darkness remains 2nd lucky...,https://i.gr-assets.com/images/S/compressed.ph...
3,The Host,stepheniemeyer melanie stryder refuses fade aw...,https://i.gr-assets.com/images/S/compressed.ph...
4,Opal,jenniferlarmentrout one like daemon black set ...,https://i.gr-assets.com/images/S/compressed.ph...
...,...,...,...
1242,Turning Point,lisannenorman cut earth alien conquerors human...,https://i.gr-assets.com/images/S/compressed.ph...
1243,The Color of Distance,amythomson juna sole survivor team surveyors m...,https://i.gr-assets.com/images/S/compressed.ph...
1244,Aliens: Genocide,davidbischoff queen dead hive mind left flound...,https://i.gr-assets.com/images/S/compressed.ph...
1245,The Pool of Fire,johnchristopher alternate cover edition found ...,https://i.gr-assets.com/images/S/compressed.ph...


# Exporting the created CSV File (Optional)

In [48]:
# Save the processed table (title, tags, scraped url) to x.csv in the
# working directory, without writing the row index as a column.
books_data.to_csv('x.csv',index = False)

In [49]:
books_data['tags']

0       jenniferlarmentrout starting sucks moved west ...
1       jenniferlarmentrout connected daemon black suc...
2       rickyancey 1st wave darkness remains 2nd lucky...
3       stepheniemeyer melanie stryder refuses fade aw...
4       jenniferlarmentrout one like daemon black set ...
                              ...                        
1243    amythomson juna sole survivor team surveyors m...
1244    davidbischoff queen dead hive mind left flound...
1245    johnchristopher alternate cover edition found ...
1246    larryniven safety mankind aliens called moties...
1247    larryniven "this rousing sequel classic ringwo...
Name: tags, Length: 1248, dtype: object

# Vectorizing

In [50]:
# TF-IDF vectorizer with scikit-learn's default settings.
vec = TfidfVectorizer()

In [51]:
# Learn the vocabulary from the tag strings and encode every book as one row
# of a sparse TF-IDF matrix (row i corresponds to books_data row i).
feature_v = vec.fit_transform(books_data['tags'])

In [52]:
print(feature_v)

  (0, 5392)	0.03066035550137788
  (0, 5258)	0.06816100129620213
  (0, 5256)	0.07033139577421904
  (0, 12110)	0.0331341962388734
  (0, 12444)	0.01856817690647368
  (0, 12445)	0.02005480788107261
  (0, 12099)	0.023824652723204748
  (0, 5249)	0.020200617621693163
  (0, 5255)	0.02800216759070968
  (0, 16109)	0.045772660891402805
  (0, 5473)	0.04492968279120888
  (0, 7977)	0.06816100129620213
  (0, 4224)	0.07369832523669542
  (0, 5198)	0.12787191851736585
  (0, 9388)	0.12264024240098485
  (0, 2703)	0.07707570620374016
  (0, 13602)	0.12264024240098485
  (0, 581)	0.07165940294372014
  (0, 5984)	0.07339267741258312
  (0, 15688)	0.04795614729389517
  (0, 13709)	0.12264024240098485
  (0, 15372)	0.11858223914069946
  (0, 8421)	0.04707956231002475
  (0, 8481)	0.13524555186210918
  (0, 14608)	0.07497503302817808
  :	:
  (1247, 8201)	0.24718917821480876
  (1247, 8611)	0.27422125725617585
  (1247, 12153)	0.27422125725617585
  (1247, 3096)	0.22015709917344173
  (1247, 14392)	0.18314386260444585
  (124

In [53]:
# Pairwise cosine similarity between all book vectors: similarity[i, j]
# compares book i with book j, giving a dense square (n_books x n_books) matrix.
similarity = cosine_similarity(feature_v)

In [54]:
print(similarity)

[[1.         0.39635347 0.04520418 ... 0.00796935 0.00377249 0.00449293]
 [0.39635347 1.         0.03556547 ... 0.02261726 0.01050099 0.01741799]
 [0.04520418 0.03556547 1.         ... 0.05382372 0.01616429 0.01048684]
 ...
 [0.00796935 0.02261726 0.05382372 ... 1.         0.01284045 0.02679165]
 [0.00377249 0.01050099 0.01616429 ... 0.01284045 1.         0.10455315]
 [0.00449293 0.01741799 0.01048684 ... 0.02679165 0.10455315 1.        ]]


In [55]:
print(similarity.shape)

(1248, 1248)


# Finding the Book

In [69]:
bookname = "gravity " #user input

In [70]:
# Plain Python list of all titles, used as the candidate pool for fuzzy matching.
titleslist = books_data['Book_Title'].tolist()

In [71]:
# Fuzzy-match the user's input against the known titles and take the best hit.
# get_close_matches returns an empty list when nothing is similar enough, so
# fail with an explicit message instead of an opaque IndexError.
matches = difflib.get_close_matches(bookname, titleslist)
if not matches:
    raise ValueError(f"No book title close to {bookname!r} was found in the dataset")
closematch = matches[0]
print(closematch)

Gravity


In [72]:
# Row label of the first book whose title equals the matched title. Because
# books_data was re-indexed 0..n-1, this label is also the book's row/column
# position in the similarity matrix.
cond = books_data['Book_Title'] == closematch
index = books_data.index[cond][0]
print(index)

30


# Extracting similarity score for that Book

In [73]:
# Pair every book's row position with its similarity to the chosen book:
# [(0, score_0), (1, score_1), ...].
similarity_score = list(enumerate(similarity[index]))

In [74]:
print(similarity_score)

[(0, 0.02993614989464271), (1, 0.04892691455606883), (2, 0.02852659148012477), (3, 0.008543027574379983), (4, 0.026850531310336278), (5, 0.02603030260253778), (6, 0.04764682831013372), (7, 0.022161748615361996), (8, 0.008760806196944847), (9, 0.054510276281180194), (10, 0.06204931299171919), (11, 0.03473113037598872), (12, 0.016568542656681125), (13, 0.007285273768992565), (14, 0.025732508249869206), (15, 0.005809217768647441), (16, 0.038654047241063555), (17, 0.022803363554890706), (18, 0.008802744865562463), (19, 0.02601421843336329), (20, 0.02034212350748339), (21, 0.0365488815030757), (22, 0.018600704835989308), (23, 0.03690569091088004), (24, 0.025090288513197008), (25, 0.021417695817101234), (26, 0.02636299640063382), (27, 0.02358807109141803), (28, 0.034237587611122965), (29, 0.01265808288147004), (30, 1.0000000000000002), (31, 0.03290727015600754), (32, 0.04495232863421622), (33, 0.0386975136558105), (34, 0.023866808264841256), (35, 0.030244836720032506), (36, 0.019919571291715

In [75]:
# Order the (position, score) pairs from most to least similar. The chosen
# book is compared with itself as well, so it appears with a score of ~1.0.
sorted_sim = sorted(similarity_score, key = lambda x:x[1], reverse = True)

In [76]:
print(sorted_sim)

[(30, 1.0000000000000002), (431, 0.44046423460404455), (166, 0.27145342752472434), (1101, 0.17635409466953972), (1189, 0.15511731720804964), (1230, 0.14135676793446966), (108, 0.14118463905195977), (1139, 0.12418069570592412), (1094, 0.09496557342050148), (1098, 0.09000901990766665), (1061, 0.08884381913492914), (50, 0.085139059566272), (92, 0.07334308083731496), (263, 0.0729593755336789), (239, 0.07053342660342687), (897, 0.06802002351141062), (1204, 0.06716110429539862), (1241, 0.06491569275797092), (196, 0.06460550437544071), (556, 0.06362511612808824), (1099, 0.06295757424261132), (686, 0.062095106188789044), (10, 0.06204931299171919), (777, 0.06190124106218305), (715, 0.06172678550993223), (237, 0.0612905086506011), (881, 0.06069330945317959), (785, 0.06066971702761683), (86, 0.060538363356398686), (1127, 0.06047593633840689), (581, 0.060253108805028315), (1179, 0.05988125377439448), (948, 0.05929809608809786), (328, 0.058745816469108286), (259, 0.05857654932994534), (350, 0.05835

# Extracting top 5 Similar Books

In [77]:
# Row positions of the chosen book followed by its TOP_N most similar books.
# sorted_sim is ordered best-first and starts with the chosen book itself,
# hence the "+ 1". Slicing replaces the manual counter loop and no longer
# overwrites the global `index` of the chosen book.
TOP_N = 5
l = [book_index for book_index, _score in sorted_sim[:TOP_N + 1]]

In [78]:
print(l)

[30, 431, 166, 1101, 1189, 1230]


# Finished Result

In [79]:
# Print the chosen book (first entry of `l`, prefixed with a label) followed
# by the suggested titles. The title is read by column name: integer keys on
# a label-indexed row (`row[0]`) are deprecated positional access in pandas 2.x.
print('Books suggested for you:','\n')
for position, i in enumerate(l):
    if position == 0:
        print('Chosen Book: ',end="")
    print(books_data.iloc[i]['Book_Title'])

Books suggested for you: 

Chosen Book: Gravity
Collide
Hover
Enemy Within
Imprisoned
Crash


# Pickle Dump

In [67]:
# Persist the processed table as a plain dict. The `with` block guarantees the
# file is flushed and closed even if pickling fails.
with open('books.pkl','wb') as books_file:
    pickle.dump(books_data.to_dict(), books_file)

In [68]:
# Persist the similarity matrix. The `with` block guarantees the file is
# flushed and closed even if pickling fails.
with open('similarity.pkl','wb') as similarity_file:
    pickle.dump(similarity, similarity_file)