# Model Building

### Getting Ready

Importing libraries

In [1]:
import pandas as pd
import numpy as np
import ast
from IPython.display import display, Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
# Number of neighbours requested from the NearestNeighbors model per query.
# `recommend` discards one of them (the query book itself), so a call yields
# N_NEIGHBORS - 1 recommendations.
N_NEIGHBORS = 6

Load the dataset

In [41]:
# Forward slashes resolve on Windows, Linux and macOS alike (a backslash path
# only works on Windows) and match the '../models/...' paths used further down.
df = pd.read_csv('../datasets/dataset_with_tags.csv')
df.head()

Unnamed: 0,title,author,desc,genre,rating,reviews,totalratings,pages,img,link,isbn,score,tags
0,Between Two Fires: American Indians in the Civ...,['Laurence M. Hauptman'],Reveals that several hundred thousand Indians ...,"['History', 'Military History', 'Civil War', '...",3.52,5,33,0,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/1001053.Betwee...,002914180X,0.0,laurencem.hauptman history militaryhistory civ...
1,Fashion Sourcebook 1920s,"['Charlotte Fiell', 'Emmanuelle Dirix']",Fashion Sourcebook - 1920s is the first book i...,"['Couture', 'Fashion', 'Historical', 'Art', 'N...",4.51,6,41,576,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/10010552-fashi...,1906863482,0.0,charlottefiell emmanuelledirix couture fashion...
2,Hungary 56,['Andy Anderson'],The seminal history and analysis of the Hungar...,"['Politics', 'History']",4.15,2,26,124,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/1001077.Hungar...,948984147,0.0,andyanderson politics history the seminal hist...
3,All-American Anarchist: Joseph A. Labadie and ...,['Carlotta R. Anderson'],"""All-American Anarchist"" chronicles the life a...","['Labor', 'History']",3.83,1,6,324,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/1001079.All_Am...,814327079,0.0,carlottar.anderson labor history all american...
4,Les oiseaux gourmands,['Jean Leveille'],"Aujourdhui, loiseau nous invite sa table, tab...",[],4.0,1,1,177,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/10010880-les-o...,2761920813,0.0,jeanleveille aujourdhui loiseau nous invite s...


In [42]:
# Inspect column dtypes and non-null counts of the loaded dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87687 entries, 0 to 87686
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         87687 non-null  object 
 1   author        87687 non-null  object 
 2   desc          87687 non-null  object 
 3   genre         87687 non-null  object 
 4   rating        87687 non-null  float64
 5   reviews       87687 non-null  int64  
 6   totalratings  87687 non-null  int64  
 7   pages         87687 non-null  int64  
 8   img           87687 non-null  object 
 9   link          87687 non-null  object 
 10  isbn          87687 non-null  object 
 11  score         87687 non-null  float64
 12  tags          87687 non-null  object 
dtypes: float64(2), int64(3), object(8)
memory usage: 8.7+ MB


Function taken from [eda.ipynb](./eda.ipynb)

In [43]:
def display_books_info(books: pd.DataFrame) -> None:
    """
    Print and render a short summary of every book in `books`.

    For each row the title, author(s), page count and link are printed, and the
    cover image found at the row's `img` URL is rendered inline.

    Parameters
    ----------
    books : pd.DataFrame
        Must provide the columns `title`, `author`, `pages`, `link` and `img`.
        `author` may hold either a list of names or the string representation
        of such a list (e.g. "['A', 'B']"), which is parsed with
        `ast.literal_eval`.

    Returns
    -------
    None
    """
    for _, row in books.iterrows():
        print(f"Title: {row['title']}")

        authors = row['author']
        if not isinstance(authors, list):
            # Stringified list -> real list of names.
            authors = ast.literal_eval(authors)
        # Join outside the f-string: reusing the enclosing quote character
        # inside an f-string expression is a SyntaxError before Python 3.12.
        author_names = ", ".join(authors)
        print(f"Author: {author_names}")

        print(f"Pages: {row['pages']}")
        print(f"Link: {row['link']}")
        display(Image(url=row['img'], width=200, height=300))
        print("\n")
    

### Approach 1. `TfidfVectorizer` + `NearestNeighbors`

Initialize `TfidfVectorizer`

In [44]:
# Ignore scikit-learn's built-in English stop words when building the vocabulary.
# NOTE(review): the dataset also contains non-English descriptions (e.g. the
# French entry shown in df.head()), which this stop-word list does not cover.
vectorizer = TfidfVectorizer(stop_words='english')

Vectorize the `tags` column

In [45]:
# Learn the vocabulary from the `tags` text and build the TF-IDF matrix
# (one row per book, in the same order as `df`).
tfidf_matrix = vectorizer.fit_transform(df['tags'])

Initialize `NearestNeighbors` and fit it on the TF-IDF matrix

In [46]:
# Neighbours are ranked by cosine distance between TF-IDF rows; N_NEIGHBORS is
# the default number of neighbours returned by `model.kneighbors`.
model = NearestNeighbors(n_neighbors=N_NEIGHBORS, metric='cosine')
# Last expression of the cell, so the fitted estimator's repr is displayed.
model.fit(tfidf_matrix)

0,1,2
,n_neighbors,6
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


Create two `pandas.Series` for fast lookup: lowercase title → row index, and row index → title

In [47]:
# Lowercased title -> row index of `df`.
# NOTE(review): if two books share the same lowercased title, looking that
# title up returns a Series of indices rather than a single value.
title2index = pd.Series(df.index, index=df['title'].str.lower())
# Row index of `df` -> title in its original casing.
index2title = pd.Series(df['title'], index=df.index)

Save all objects

In [48]:
# Pickle every artefact needed to serve recommendations later: the TF-IDF
# matrix, the fitted neighbours model and the two lookup Series. Files are
# written in the order listed.
for pickle_path, artifact in (
    ('../models/tfidf_matrix.pkl', tfidf_matrix),
    ('../models/model.pkl', model),
    ('../models/title2index.pkl', title2index),
    ('../models/index2title.pkl', index2title),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)


Main `recommend` function

In [49]:
def recommend(title: str, model: NearestNeighbors=model) -> pd.Series:
    """
    Return the titles of the books most similar to `title`.

    Parameters
    ----------
    title : str
        Title of the query book; matched case-insensitively against the
        lowercase keys of `title2index`.
    model : NearestNeighbors
        Fitted neighbours model queried with the book's TF-IDF row. Defaults
        to the model that exists when this function is defined.

    Returns
    -------
    pd.Series
        Up to `model.n_neighbors - 1` titles taken from `index2title`, nearest
        first. The Series index holds the row indices of the recommended books.

    Raises
    ------
    KeyError
        If `title` is not present in `title2index`.
    """
    key = title.lower()
    if key not in title2index.index:
        raise KeyError(f"Title not found in dataset: {title!r}")

    idx = title2index[key]
    if isinstance(idx, pd.Series):
        # Several books share this title; query with the first one.
        idx = idx.iloc[0]

    indices = model.kneighbors(tfidf_matrix[idx], return_distance=False)[0]
    # Remove the query book by its index rather than by position: when another
    # book sits at distance 0 the query is not guaranteed to be the first hit.
    # Then cap the result at the size the positional `[1:]` slice used to give.
    indices = indices[indices != idx][:model.n_neighbors - 1]
    return index2title.iloc[indices]

In [58]:
# The index of the returned Series holds the row indices of the recommended books.
recommend('a game of thrones').index

Index([7864, 28138, 44282, 37902, 66741], dtype='int64')

In [62]:
# Draw one random book whose `score` exceeds 0.2.
# NOTE(review): `.sample` is called without `random_state`, so re-running this
# cell can show a different book than the saved output.
df[df['score'] > 0.2].sample(1)

Unnamed: 0,title,author,desc,genre,rating,reviews,totalratings,pages,img,link,isbn,score,tags
14606,Steve Jobs,['Walter Isaacson'],"Walter Isaacson's ""enthralling"" (The New Yorke...","['Biography', 'Nonfiction', 'Business', 'Scien...",4.14,19077,1026052,627,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/11084145-steve...,1451648537,0.265,walterisaacson biography nonfiction business s...


In [63]:
# Show the query book itself, selected by a case-insensitive title match.
display_books_info(df[df['title'].str.lower() == 'Steve Jobs'.lower()])

Title: Steve Jobs
Author: Walter Isaacson
Pages: 627
Link: https://goodreads.com/book/show/11084145-steve-jobs






In [64]:
# `recommend` returns a Series whose index carries `df` index labels, so the
# matching rows are selected by label with `.loc` (positional `.iloc` is only
# equivalent while `df` keeps its default RangeIndex).
display_books_info(df.loc[recommend('Steve Jobs').index])

Title: I, Steve: Steve Jobs In His Own Words
Author: George Beahm
Pages: 160
Link: https://goodreads.com/book/show/12634780-i-steve




Title: Leading Apple with Steve Jobs: Management Lessons from a Controversial Genius
Author: Jay Elliot
Pages: 192
Link: https://goodreads.com/book/show/13839025-leading-apple-with-steve-jobs




Title: The Steve Jobs Way: iLeadership for a New Generation
Author: Jay Elliot, William L. Simon
Pages: 0
Link: https://goodreads.com/book/show/10589332-the-steve-jobs-way




Title: Einstein: His Life and Universe
Author: Walter Isaacson
Pages: 675
Link: https://goodreads.com/book/show/10884.Einstein




Title: The Zen of Steve Jobs
Author: Caleb Melby, JESS3
Pages: 80
Link: https://goodreads.com/book/show/13078116-the-zen-of-steve-jobs




