#### Import Libraries

In [1]:
import pandas as pd
import re
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

#### Load data

In [2]:
# Load the raw dataset; lines=True tells pandas to parse one JSON record per line of the file.
books = pd.read_json('nyt2.json', lines=True)

In [3]:
books.head()

Unnamed: 0,_id,amazon_product_url,author,bestsellers_date,description,price,published_date,publisher,rank,rank_last_week,title,weeks_on_list
0,{'$oid': '5b4aa4ead3089013507db18b'},http://www.amazon.com/Odd-Hours-Dean-Koontz/dp...,Dean R Koontz,{'$date': {'$numberLong': '1211587200000'}},"Odd Thomas, who can communicate with the dead,...",{'$numberInt': '27'},{'$date': {'$numberLong': '1212883200000'}},Bantam,{'$numberInt': '1'},{'$numberInt': '0'},ODD HOURS,{'$numberInt': '1'}
1,{'$oid': '5b4aa4ead3089013507db18c'},http://www.amazon.com/The-Host-Novel-Stephenie...,Stephenie Meyer,{'$date': {'$numberLong': '1211587200000'}},Aliens have taken control of the minds and bod...,{'$numberDouble': '25.99'},{'$date': {'$numberLong': '1212883200000'}},"Little, Brown",{'$numberInt': '2'},{'$numberInt': '1'},THE HOST,{'$numberInt': '3'}
2,{'$oid': '5b4aa4ead3089013507db18d'},http://www.amazon.com/Love-Youre-With-Emily-Gi...,Emily Giffin,{'$date': {'$numberLong': '1211587200000'}},A woman's happy marriage is shaken when she en...,{'$numberDouble': '24.95'},{'$date': {'$numberLong': '1212883200000'}},St. Martin's,{'$numberInt': '3'},{'$numberInt': '2'},LOVE THE ONE YOU'RE WITH,{'$numberInt': '2'}
3,{'$oid': '5b4aa4ead3089013507db18e'},http://www.amazon.com/The-Front-Garano-Patrici...,Patricia Cornwell,{'$date': {'$numberLong': '1211587200000'}},A Massachusetts state investigator and his tea...,{'$numberDouble': '22.95'},{'$date': {'$numberLong': '1212883200000'}},Putnam,{'$numberInt': '4'},{'$numberInt': '0'},THE FRONT,{'$numberInt': '1'}
4,{'$oid': '5b4aa4ead3089013507db18f'},http://www.amazon.com/Snuff-Chuck-Palahniuk/dp...,Chuck Palahniuk,{'$date': {'$numberLong': '1211587200000'}},An aging porn queens aims to cap her career by...,{'$numberDouble': '24.95'},{'$date': {'$numberLong': '1212883200000'}},Doubleday,{'$numberInt': '5'},{'$numberInt': '0'},SNUFF,{'$numberInt': '1'}


In [4]:
books.shape

(10195, 12)

#### Helper Functions

In [5]:
# get number from dictionary
def get_number(x):
    """Unwrap a ``{'$numberInt': '<digits>'}`` cell into a plain ``int``.

    Parameters
    ----------
    x : object
        A cell from one of the numeric columns. Normally a dict carrying a
        ``'$numberInt'`` key whose value is a digit string.

    Returns
    -------
    int
        The parsed integer. ``0`` is returned when the value cannot be read
        as a ``'$numberInt'``: a missing cell, a dict that uses a different
        wrapper key (for example ``{'$numberDouble': '25.99'}``), or a value
        that is not a valid integer literal. Downstream cells treat ``0`` as
        "missing".
    """
    # Values that are already integers pass through unchanged, so re-running
    # the conversion cell on an already-converted column does not reset it
    # to zeros. bool is excluded because it is not a numeric cell value.
    if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
        return int(x)

    try:
        return int(x['$numberInt'])
    except (KeyError, TypeError, ValueError):
        # KeyError: dict without '$numberInt'; TypeError: non-subscriptable
        # cell such as NaN/None; ValueError: text that is not an integer.
        return 0

# get number of zeros or missing values in column
def get_zeros(column_name, frame=None):
    """Count the entries of a column that are equal to zero.

    Parameters
    ----------
    column_name : str
        Name of the column to inspect.
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``books`` frame
        is used, which matches the original one-argument behaviour.

    Returns
    -------
    int
        Number of rows where ``frame[column_name] == 0``.
    """
    if frame is None:
        # Fall back to the notebook's global frame for existing callers.
        frame = books
    return (frame[column_name] == 0).sum()

# get keywords from description
def get_keywords(description):
    """Extract RAKE keywords from one book description.

    Parameters
    ----------
    description : object
        The description cell. Only ``str`` values are processed.

    Returns
    -------
    str
        ``str(list(...))`` of the words RAKE reports in its word-degree
        table, e.g. ``"['odd', 'thomas']"``. An empty string is returned for
        non-string cells (such as NaN/None).

    Notes
    -----
    Errors raised by RAKE/NLTK on real text are no longer swallowed. If the
    extraction failed silently (for instance because of an environment
    problem), every description would become ``''`` and the similarity
    matrix would be all zeros without any visible error.
    """
    # Non-text cells have nothing to extract; return the same empty marker
    # the previous catch-all handler produced for them.
    if not isinstance(description, str):
        return ''

    # Rake() with default settings (per the original note: NLTK English
    # stopwords, punctuation discarded).
    r = Rake()

    # Feed the text to RAKE, then read back the word -> degree table.
    r.extract_keywords_from_text(description)
    key_words_dict_scores = r.get_word_degrees()

    # Only the words are kept; their scores are not used downstream.
    return str(list(key_words_dict_scores.keys()))
    
# Remove punctuations
def remove_punctuation(s):
    """Return ``s`` with every character that is neither a word character
    nor whitespace removed."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', s)

#### Convert Dictionary to Integers

In [6]:
# Unwrap the dictionary-encoded cells of each numeric column with get_number,
# in the same column order as before.
for numeric_col in ('price', 'rank', 'rank_last_week', 'weeks_on_list'):
    books[numeric_col] = books[numeric_col].apply(get_number)

In [7]:
books.head()

Unnamed: 0,_id,amazon_product_url,author,bestsellers_date,description,price,published_date,publisher,rank,rank_last_week,title,weeks_on_list
0,{'$oid': '5b4aa4ead3089013507db18b'},http://www.amazon.com/Odd-Hours-Dean-Koontz/dp...,Dean R Koontz,{'$date': {'$numberLong': '1211587200000'}},"Odd Thomas, who can communicate with the dead,...",27,{'$date': {'$numberLong': '1212883200000'}},Bantam,1,0,ODD HOURS,1
1,{'$oid': '5b4aa4ead3089013507db18c'},http://www.amazon.com/The-Host-Novel-Stephenie...,Stephenie Meyer,{'$date': {'$numberLong': '1211587200000'}},Aliens have taken control of the minds and bod...,0,{'$date': {'$numberLong': '1212883200000'}},"Little, Brown",2,1,THE HOST,3
2,{'$oid': '5b4aa4ead3089013507db18d'},http://www.amazon.com/Love-Youre-With-Emily-Gi...,Emily Giffin,{'$date': {'$numberLong': '1211587200000'}},A woman's happy marriage is shaken when she en...,0,{'$date': {'$numberLong': '1212883200000'}},St. Martin's,3,2,LOVE THE ONE YOU'RE WITH,2
3,{'$oid': '5b4aa4ead3089013507db18e'},http://www.amazon.com/The-Front-Garano-Patrici...,Patricia Cornwell,{'$date': {'$numberLong': '1211587200000'}},A Massachusetts state investigator and his tea...,0,{'$date': {'$numberLong': '1212883200000'}},Putnam,4,0,THE FRONT,1
4,{'$oid': '5b4aa4ead3089013507db18f'},http://www.amazon.com/Snuff-Chuck-Palahniuk/dp...,Chuck Palahniuk,{'$date': {'$numberLong': '1211587200000'}},An aging porn queens aims to cap her career by...,0,{'$date': {'$numberLong': '1212883200000'}},Doubleday,5,0,SNUFF,1


#### Select Relevant columns

In [8]:
# Keep only the columns used below. .copy() makes the result an independent
# frame, so the later in-place drops and column assignments do not act on a
# slice of the raw data (which can raise SettingWithCopyWarning).
books = books[['title','author','description','price','rank','rank_last_week','weeks_on_list']].copy()

#### Count zero values (treated as missing) in price, rank, rank_last_week and weeks_on_list

In [9]:
get_zeros('price')

9217

In [10]:
get_zeros('rank')

0

In [11]:
get_zeros('rank_last_week')

4174

In [12]:
get_zeros('weeks_on_list')

1965

#### Drop price and rank_last_week because a large share of their values are zero (9,217 and 4,174 of 10,195 rows)

In [13]:
# Rebind to a new frame instead of mutating in place: the current frame was
# derived by column selection, and in-place edits on such a frame can
# trigger SettingWithCopyWarning.
books = books.drop(columns=['price','rank_last_week'])

#### Get important keywords from book description

In [14]:
# Replace each description with the keyword-list string produced by get_keywords.
books['description'] = books['description'].map(get_keywords)

In [15]:
# Start the bag_of_words column from the string form of each keyword list.
books['bag_of_words'] = books['description'].apply(str)

In [16]:
# Lowercase every bag_of_words entry using the vectorized string accessor.
books['bag_of_words'] = books['bag_of_words'].str.lower()

In [17]:
# Strip the brackets, quotes and commas left over from the list-to-string
# conversion (and any other punctuation) so only words and spaces remain.
books['bag_of_words'] = books['bag_of_words'].map(remove_punctuation)

#### Drop unwanted columns

In [18]:
# Rebind to a new frame instead of mutating in place, so this step never
# edits a frame that may be a view of earlier data (SettingWithCopyWarning).
# Only title and bag_of_words are needed from here on.
books = books.drop(columns=['author','description','rank','weeks_on_list'])

#### Drop duplicates

In [19]:
# Keep one row per title, then renumber the rows 0..n-1 so that row labels
# line up with row positions in the similarity matrix built below.
books = books.drop_duplicates(subset='title').reset_index(drop=True)

#### Generate the cosine similarity matrix for recommending similar books

In [20]:
# Term-count matrix: one row per book, built from its bag_of_words text.
count = CountVectorizer()
count_matrix = count.fit_transform(books['bag_of_words'])

# Pairwise cosine similarity between every pair of rows; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)

In [21]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.11952286, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.11952286, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

#### Define the function that takes the book title as the input and returns the top 10 recommended books

In [22]:
def recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the books most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``books['title']``.
    cosine_sim : array-like, optional
        Square similarity matrix whose row/column ``i`` corresponds to row
        ``i`` of ``books``. Defaults to the matrix computed in the notebook.
    top_n : int, optional
        Number of recommendations to return (default 10, as before).

    Returns
    -------
    list of str
        Up to ``top_n`` titles, ordered from most to least similar. The
        queried book itself is never included.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``books``.
    """
    # Locate the row of the requested book; fail with a readable message
    # instead of an anonymous out-of-bounds error when it is unknown.
    matches = books.index[books['title'] == title]
    if len(matches) == 0:
        raise IndexError('title {!r} not found in books'.format(title))
    idx = matches[0]

    # Similarity of the requested book to every book, highest first.
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending=False)

    # Remove the requested book by label rather than assuming it sorted into
    # first place: another book with an identical bag of words also scores
    # 1.0 and may sort ahead of it.
    top_indices = score_series.drop(idx).head(top_n).index

    # Label-based lookup of the titles of the best matching books.
    return books.loc[top_indices, 'title'].tolist()

## Recommendations in Action!!!

In [23]:
recommendations('THE PRESIDENT IS MISSING')

['ISTANBUL PASSAGE',
 'SPYMASTER',
 'THE HELLFIRE CLUB',
 'MOONGLOW',
 'RAIN GODS',
 'PORCH LIGHTS',
 'THE LADY OF THE RIVERS',
 'THE SPIES OF WARSAW',
 'THINK TWICE',
 'OUR SOULS AT NIGHT']