### Imports

In [36]:
# pandas and numpy
import pandas as pd
import numpy as np

# nltk imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# sci-kit learn imports
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.pipeline import Pipeline, make_pipeline

# Presentation and visuals
import seaborn as sns
import matplotlib.pyplot as plt

# This magic line will allow you to generate plots
# within the Jupyter notebook.
%matplotlib inline
from pprint import pprint
pd.options.display.max_seq_items = 2000
pd.options.display.max_rows = 4000
# pd.set_option(display.max_columns), None

# other imports
import json
import lxml
from lxml import html
import random
import regex as re
import requests
import time
import urllib.request
from datetime import datetime



# Data acquisition

In [37]:
isbn = pd.read_csv('../data/data_acquisition/international_for_download.csv')
isbn = isbn.applymap(str)
isbn.dtypes

isbn        object
title       object
authors     object
overview    object
dtype: object

In [38]:
isbn = isbn[0:3000]
isbn

Unnamed: 0,isbn,title,authors,overview
0,9781626921108.0,,,
1,9781626921092.0,,,
2,9781626920941.0,,,
3,9781626920880.0,,,
4,9781626920873.0,,,
5,9781626920828.0,,,
6,9781626920798.0,,,
7,9781626920781.0,,,
8,9781626920675.0,,,
9,9781626920651.0,,,


In [39]:
for j in range(len(isbn)):

    header = {'Authorization': '44023_23ab132f3977ad9849e8f1a5d7dc73bf'}
    base_url = ('https://api2.isbndb.com/book/')
    response = requests.get(base_url + isbn['isbn'][j], headers=header)
    payload = response.json()
      
    try:
        isbn['title'][j] = payload['book']['title']
    
    except:
        isbn['title'][j] = np.nan
    
    try:
        isbn['authors'][j] = payload['book']['authors']
    
    except:
        isbn['authors'][j] = np.nan
              
    try:
        isbn['overview'][j] = payload['book']['overview']
    
    except:
        isbn['overview'][j] = np.nan
  
    print('Info downloaded for book ' + str(j + 1) + ' of ' +  str(len(isbn)) + ' books.')
              
    time.sleep(1)
    

Info downloaded for book 1 of 3000 books.
Info downloaded for book 2 of 3000 books.
Info downloaded for book 3 of 3000 books.
Info downloaded for book 4 of 3000 books.
Info downloaded for book 5 of 3000 books.
Info downloaded for book 6 of 3000 books.
Info downloaded for book 7 of 3000 books.
Info downloaded for book 8 of 3000 books.
Info downloaded for book 9 of 3000 books.
Info downloaded for book 10 of 3000 books.
Info downloaded for book 11 of 3000 books.
Info downloaded for book 12 of 3000 books.
Info downloaded for book 13 of 3000 books.
Info downloaded for book 14 of 3000 books.
Info downloaded for book 15 of 3000 books.
Info downloaded for book 16 of 3000 books.
Info downloaded for book 17 of 3000 books.
Info downloaded for book 18 of 3000 books.
Info downloaded for book 19 of 3000 books.
Info downloaded for book 20 of 3000 books.
Info downloaded for book 21 of 3000 books.
Info downloaded for book 22 of 3000 books.
Info downloaded for book 23 of 3000 books.
Info downloaded for 

Info downloaded for book 190 of 3000 books.
Info downloaded for book 191 of 3000 books.
Info downloaded for book 192 of 3000 books.
Info downloaded for book 193 of 3000 books.
Info downloaded for book 194 of 3000 books.
Info downloaded for book 195 of 3000 books.
Info downloaded for book 196 of 3000 books.
Info downloaded for book 197 of 3000 books.
Info downloaded for book 198 of 3000 books.
Info downloaded for book 199 of 3000 books.
Info downloaded for book 200 of 3000 books.
Info downloaded for book 201 of 3000 books.
Info downloaded for book 202 of 3000 books.
Info downloaded for book 203 of 3000 books.
Info downloaded for book 204 of 3000 books.
Info downloaded for book 205 of 3000 books.
Info downloaded for book 206 of 3000 books.
Info downloaded for book 207 of 3000 books.
Info downloaded for book 208 of 3000 books.
Info downloaded for book 209 of 3000 books.
Info downloaded for book 210 of 3000 books.
Info downloaded for book 211 of 3000 books.
Info downloaded for book 212 of 

Info downloaded for book 377 of 3000 books.
Info downloaded for book 378 of 3000 books.
Info downloaded for book 379 of 3000 books.
Info downloaded for book 380 of 3000 books.
Info downloaded for book 381 of 3000 books.
Info downloaded for book 382 of 3000 books.
Info downloaded for book 383 of 3000 books.
Info downloaded for book 384 of 3000 books.
Info downloaded for book 385 of 3000 books.
Info downloaded for book 386 of 3000 books.
Info downloaded for book 387 of 3000 books.
Info downloaded for book 388 of 3000 books.
Info downloaded for book 389 of 3000 books.
Info downloaded for book 390 of 3000 books.
Info downloaded for book 391 of 3000 books.
Info downloaded for book 392 of 3000 books.
Info downloaded for book 393 of 3000 books.
Info downloaded for book 394 of 3000 books.
Info downloaded for book 395 of 3000 books.
Info downloaded for book 396 of 3000 books.
Info downloaded for book 397 of 3000 books.
Info downloaded for book 398 of 3000 books.
Info downloaded for book 399 of 

Info downloaded for book 564 of 3000 books.
Info downloaded for book 565 of 3000 books.
Info downloaded for book 566 of 3000 books.
Info downloaded for book 567 of 3000 books.
Info downloaded for book 568 of 3000 books.
Info downloaded for book 569 of 3000 books.
Info downloaded for book 570 of 3000 books.
Info downloaded for book 571 of 3000 books.
Info downloaded for book 572 of 3000 books.
Info downloaded for book 573 of 3000 books.
Info downloaded for book 574 of 3000 books.
Info downloaded for book 575 of 3000 books.
Info downloaded for book 576 of 3000 books.
Info downloaded for book 577 of 3000 books.
Info downloaded for book 578 of 3000 books.
Info downloaded for book 579 of 3000 books.
Info downloaded for book 580 of 3000 books.
Info downloaded for book 581 of 3000 books.
Info downloaded for book 582 of 3000 books.
Info downloaded for book 583 of 3000 books.
Info downloaded for book 584 of 3000 books.
Info downloaded for book 585 of 3000 books.
Info downloaded for book 586 of 

Info downloaded for book 751 of 3000 books.
Info downloaded for book 752 of 3000 books.
Info downloaded for book 753 of 3000 books.
Info downloaded for book 754 of 3000 books.
Info downloaded for book 755 of 3000 books.
Info downloaded for book 756 of 3000 books.
Info downloaded for book 757 of 3000 books.
Info downloaded for book 758 of 3000 books.
Info downloaded for book 759 of 3000 books.
Info downloaded for book 760 of 3000 books.
Info downloaded for book 761 of 3000 books.
Info downloaded for book 762 of 3000 books.
Info downloaded for book 763 of 3000 books.
Info downloaded for book 764 of 3000 books.
Info downloaded for book 765 of 3000 books.
Info downloaded for book 766 of 3000 books.
Info downloaded for book 767 of 3000 books.
Info downloaded for book 768 of 3000 books.
Info downloaded for book 769 of 3000 books.
Info downloaded for book 770 of 3000 books.
Info downloaded for book 771 of 3000 books.
Info downloaded for book 772 of 3000 books.
Info downloaded for book 773 of 

Info downloaded for book 938 of 3000 books.
Info downloaded for book 939 of 3000 books.
Info downloaded for book 940 of 3000 books.
Info downloaded for book 941 of 3000 books.
Info downloaded for book 942 of 3000 books.
Info downloaded for book 943 of 3000 books.
Info downloaded for book 944 of 3000 books.
Info downloaded for book 945 of 3000 books.
Info downloaded for book 946 of 3000 books.
Info downloaded for book 947 of 3000 books.
Info downloaded for book 948 of 3000 books.
Info downloaded for book 949 of 3000 books.
Info downloaded for book 950 of 3000 books.
Info downloaded for book 951 of 3000 books.
Info downloaded for book 952 of 3000 books.
Info downloaded for book 953 of 3000 books.
Info downloaded for book 954 of 3000 books.
Info downloaded for book 955 of 3000 books.
Info downloaded for book 956 of 3000 books.
Info downloaded for book 957 of 3000 books.
Info downloaded for book 958 of 3000 books.
Info downloaded for book 959 of 3000 books.
Info downloaded for book 960 of 

Info downloaded for book 1122 of 3000 books.
Info downloaded for book 1123 of 3000 books.
Info downloaded for book 1124 of 3000 books.
Info downloaded for book 1125 of 3000 books.
Info downloaded for book 1126 of 3000 books.
Info downloaded for book 1127 of 3000 books.
Info downloaded for book 1128 of 3000 books.
Info downloaded for book 1129 of 3000 books.
Info downloaded for book 1130 of 3000 books.
Info downloaded for book 1131 of 3000 books.
Info downloaded for book 1132 of 3000 books.
Info downloaded for book 1133 of 3000 books.
Info downloaded for book 1134 of 3000 books.
Info downloaded for book 1135 of 3000 books.
Info downloaded for book 1136 of 3000 books.
Info downloaded for book 1137 of 3000 books.
Info downloaded for book 1138 of 3000 books.
Info downloaded for book 1139 of 3000 books.
Info downloaded for book 1140 of 3000 books.
Info downloaded for book 1141 of 3000 books.
Info downloaded for book 1142 of 3000 books.
Info downloaded for book 1143 of 3000 books.
Info downl

Info downloaded for book 1305 of 3000 books.
Info downloaded for book 1306 of 3000 books.
Info downloaded for book 1307 of 3000 books.
Info downloaded for book 1308 of 3000 books.
Info downloaded for book 1309 of 3000 books.
Info downloaded for book 1310 of 3000 books.
Info downloaded for book 1311 of 3000 books.
Info downloaded for book 1312 of 3000 books.
Info downloaded for book 1313 of 3000 books.
Info downloaded for book 1314 of 3000 books.
Info downloaded for book 1315 of 3000 books.
Info downloaded for book 1316 of 3000 books.
Info downloaded for book 1317 of 3000 books.
Info downloaded for book 1318 of 3000 books.
Info downloaded for book 1319 of 3000 books.
Info downloaded for book 1320 of 3000 books.
Info downloaded for book 1321 of 3000 books.
Info downloaded for book 1322 of 3000 books.
Info downloaded for book 1323 of 3000 books.
Info downloaded for book 1324 of 3000 books.
Info downloaded for book 1325 of 3000 books.
Info downloaded for book 1326 of 3000 books.
Info downl

Info downloaded for book 1488 of 3000 books.
Info downloaded for book 1489 of 3000 books.
Info downloaded for book 1490 of 3000 books.
Info downloaded for book 1491 of 3000 books.
Info downloaded for book 1492 of 3000 books.
Info downloaded for book 1493 of 3000 books.
Info downloaded for book 1494 of 3000 books.
Info downloaded for book 1495 of 3000 books.
Info downloaded for book 1496 of 3000 books.
Info downloaded for book 1497 of 3000 books.
Info downloaded for book 1498 of 3000 books.
Info downloaded for book 1499 of 3000 books.
Info downloaded for book 1500 of 3000 books.
Info downloaded for book 1501 of 3000 books.
Info downloaded for book 1502 of 3000 books.
Info downloaded for book 1503 of 3000 books.
Info downloaded for book 1504 of 3000 books.
Info downloaded for book 1505 of 3000 books.
Info downloaded for book 1506 of 3000 books.
Info downloaded for book 1507 of 3000 books.
Info downloaded for book 1508 of 3000 books.
Info downloaded for book 1509 of 3000 books.
Info downl

Info downloaded for book 1671 of 3000 books.
Info downloaded for book 1672 of 3000 books.
Info downloaded for book 1673 of 3000 books.
Info downloaded for book 1674 of 3000 books.
Info downloaded for book 1675 of 3000 books.
Info downloaded for book 1676 of 3000 books.
Info downloaded for book 1677 of 3000 books.
Info downloaded for book 1678 of 3000 books.
Info downloaded for book 1679 of 3000 books.
Info downloaded for book 1680 of 3000 books.
Info downloaded for book 1681 of 3000 books.
Info downloaded for book 1682 of 3000 books.
Info downloaded for book 1683 of 3000 books.
Info downloaded for book 1684 of 3000 books.
Info downloaded for book 1685 of 3000 books.
Info downloaded for book 1686 of 3000 books.
Info downloaded for book 1687 of 3000 books.
Info downloaded for book 1688 of 3000 books.
Info downloaded for book 1689 of 3000 books.
Info downloaded for book 1690 of 3000 books.
Info downloaded for book 1691 of 3000 books.
Info downloaded for book 1692 of 3000 books.
Info downl

Info downloaded for book 1854 of 3000 books.
Info downloaded for book 1855 of 3000 books.
Info downloaded for book 1856 of 3000 books.
Info downloaded for book 1857 of 3000 books.
Info downloaded for book 1858 of 3000 books.
Info downloaded for book 1859 of 3000 books.
Info downloaded for book 1860 of 3000 books.
Info downloaded for book 1861 of 3000 books.
Info downloaded for book 1862 of 3000 books.
Info downloaded for book 1863 of 3000 books.
Info downloaded for book 1864 of 3000 books.
Info downloaded for book 1865 of 3000 books.
Info downloaded for book 1866 of 3000 books.
Info downloaded for book 1867 of 3000 books.
Info downloaded for book 1868 of 3000 books.
Info downloaded for book 1869 of 3000 books.
Info downloaded for book 1870 of 3000 books.
Info downloaded for book 1871 of 3000 books.
Info downloaded for book 1872 of 3000 books.
Info downloaded for book 1873 of 3000 books.
Info downloaded for book 1874 of 3000 books.
Info downloaded for book 1875 of 3000 books.
Info downl

Info downloaded for book 2037 of 3000 books.
Info downloaded for book 2038 of 3000 books.
Info downloaded for book 2039 of 3000 books.
Info downloaded for book 2040 of 3000 books.
Info downloaded for book 2041 of 3000 books.
Info downloaded for book 2042 of 3000 books.
Info downloaded for book 2043 of 3000 books.
Info downloaded for book 2044 of 3000 books.
Info downloaded for book 2045 of 3000 books.
Info downloaded for book 2046 of 3000 books.
Info downloaded for book 2047 of 3000 books.
Info downloaded for book 2048 of 3000 books.
Info downloaded for book 2049 of 3000 books.
Info downloaded for book 2050 of 3000 books.
Info downloaded for book 2051 of 3000 books.
Info downloaded for book 2052 of 3000 books.
Info downloaded for book 2053 of 3000 books.
Info downloaded for book 2054 of 3000 books.
Info downloaded for book 2055 of 3000 books.
Info downloaded for book 2056 of 3000 books.
Info downloaded for book 2057 of 3000 books.
Info downloaded for book 2058 of 3000 books.
Info downl

Info downloaded for book 2220 of 3000 books.
Info downloaded for book 2221 of 3000 books.
Info downloaded for book 2222 of 3000 books.
Info downloaded for book 2223 of 3000 books.
Info downloaded for book 2224 of 3000 books.
Info downloaded for book 2225 of 3000 books.
Info downloaded for book 2226 of 3000 books.
Info downloaded for book 2227 of 3000 books.
Info downloaded for book 2228 of 3000 books.
Info downloaded for book 2229 of 3000 books.
Info downloaded for book 2230 of 3000 books.
Info downloaded for book 2231 of 3000 books.
Info downloaded for book 2232 of 3000 books.
Info downloaded for book 2233 of 3000 books.
Info downloaded for book 2234 of 3000 books.
Info downloaded for book 2235 of 3000 books.
Info downloaded for book 2236 of 3000 books.
Info downloaded for book 2237 of 3000 books.
Info downloaded for book 2238 of 3000 books.
Info downloaded for book 2239 of 3000 books.
Info downloaded for book 2240 of 3000 books.
Info downloaded for book 2241 of 3000 books.
Info downl

Info downloaded for book 2403 of 3000 books.
Info downloaded for book 2404 of 3000 books.
Info downloaded for book 2405 of 3000 books.
Info downloaded for book 2406 of 3000 books.
Info downloaded for book 2407 of 3000 books.
Info downloaded for book 2408 of 3000 books.
Info downloaded for book 2409 of 3000 books.
Info downloaded for book 2410 of 3000 books.
Info downloaded for book 2411 of 3000 books.
Info downloaded for book 2412 of 3000 books.
Info downloaded for book 2413 of 3000 books.
Info downloaded for book 2414 of 3000 books.
Info downloaded for book 2415 of 3000 books.
Info downloaded for book 2416 of 3000 books.
Info downloaded for book 2417 of 3000 books.
Info downloaded for book 2418 of 3000 books.
Info downloaded for book 2419 of 3000 books.
Info downloaded for book 2420 of 3000 books.
Info downloaded for book 2421 of 3000 books.
Info downloaded for book 2422 of 3000 books.
Info downloaded for book 2423 of 3000 books.
Info downloaded for book 2424 of 3000 books.
Info downl

Info downloaded for book 2586 of 3000 books.
Info downloaded for book 2587 of 3000 books.
Info downloaded for book 2588 of 3000 books.
Info downloaded for book 2589 of 3000 books.
Info downloaded for book 2590 of 3000 books.
Info downloaded for book 2591 of 3000 books.
Info downloaded for book 2592 of 3000 books.
Info downloaded for book 2593 of 3000 books.
Info downloaded for book 2594 of 3000 books.
Info downloaded for book 2595 of 3000 books.
Info downloaded for book 2596 of 3000 books.
Info downloaded for book 2597 of 3000 books.
Info downloaded for book 2598 of 3000 books.
Info downloaded for book 2599 of 3000 books.
Info downloaded for book 2600 of 3000 books.
Info downloaded for book 2601 of 3000 books.
Info downloaded for book 2602 of 3000 books.
Info downloaded for book 2603 of 3000 books.
Info downloaded for book 2604 of 3000 books.
Info downloaded for book 2605 of 3000 books.
Info downloaded for book 2606 of 3000 books.
Info downloaded for book 2607 of 3000 books.
Info downl

Info downloaded for book 2769 of 3000 books.
Info downloaded for book 2770 of 3000 books.
Info downloaded for book 2771 of 3000 books.
Info downloaded for book 2772 of 3000 books.
Info downloaded for book 2773 of 3000 books.
Info downloaded for book 2774 of 3000 books.
Info downloaded for book 2775 of 3000 books.
Info downloaded for book 2776 of 3000 books.
Info downloaded for book 2777 of 3000 books.
Info downloaded for book 2778 of 3000 books.
Info downloaded for book 2779 of 3000 books.
Info downloaded for book 2780 of 3000 books.
Info downloaded for book 2781 of 3000 books.
Info downloaded for book 2782 of 3000 books.
Info downloaded for book 2783 of 3000 books.
Info downloaded for book 2784 of 3000 books.
Info downloaded for book 2785 of 3000 books.
Info downloaded for book 2786 of 3000 books.
Info downloaded for book 2787 of 3000 books.
Info downloaded for book 2788 of 3000 books.
Info downloaded for book 2789 of 3000 books.
Info downloaded for book 2790 of 3000 books.
Info downl

Info downloaded for book 2952 of 3000 books.
Info downloaded for book 2953 of 3000 books.
Info downloaded for book 2954 of 3000 books.
Info downloaded for book 2955 of 3000 books.
Info downloaded for book 2956 of 3000 books.
Info downloaded for book 2957 of 3000 books.
Info downloaded for book 2958 of 3000 books.
Info downloaded for book 2959 of 3000 books.
Info downloaded for book 2960 of 3000 books.
Info downloaded for book 2961 of 3000 books.
Info downloaded for book 2962 of 3000 books.
Info downloaded for book 2963 of 3000 books.
Info downloaded for book 2964 of 3000 books.
Info downloaded for book 2965 of 3000 books.
Info downloaded for book 2966 of 3000 books.
Info downloaded for book 2967 of 3000 books.
Info downloaded for book 2968 of 3000 books.
Info downloaded for book 2969 of 3000 books.
Info downloaded for book 2970 of 3000 books.
Info downloaded for book 2971 of 3000 books.
Info downloaded for book 2972 of 3000 books.
Info downloaded for book 2973 of 3000 books.
Info downl

In [40]:
now = datetime.now()
dt = now.strftime("%d-%m-%Y_%H-%M-%S")

isbn.to_csv('../data/saved/isbn' + dt +'.csv', index = False)

### EDA

In [None]:
# df download_jpg(url, file_path, file_name):
#     full_path = file_path + file_name + '.jpg'
#     urllib.request.urlretrieve(url, full_path)
    
# url =
# file_name = 'https://images.isbndb.com/covers/94/86/9781934759486.jpg'

### Canadian

In [4]:
canadian = pd.read_csv('../data/processed/canadian_processed.csv')

In [5]:
canadian.dtypes

isbn            object
title           object
author          object
Unnamed: 3     float64
description     object
dtype: object

In [10]:
canadian.drop(columns=['Unnamed: 3'], inplace = True)

In [11]:
canadian.shape

(205, 4)

In [12]:
canadian.head(40)

Unnamed: 0,isbn,title,author,description
0,9780773524927,Two Solitudes,Hugh McLennan,"“[A] powerful saga, [Two Solitudes is the stor..."
1,9781552453056,Fifteen Dogs,Andre Alexis,“A bet between the gods Hermes and Apollo lead...
2,9780771030130,Bear,Marian Engel,“A librarian is called to a remote Canadian is...
3,9781554685257,"Green Grass, Running Water",Thomas King,“Alberta is a university professor who would l...
4,9780771055706,No Great Mischief,Alistair MacLeod,"“Alexander, orphaned as a child by a horrific ..."
5,9780312054366,Generation X,Douglas Coupland,"“Andy, Dag and Claire have been handed a socie..."
6,9780676977738,The Birth House,Ami McKay,“As a child in an isolated village in Nova Sco...
7,9780062468475,Lullabies for Little Criminals,Heather O'Neill,"“At thirteen, Baby vacillates between childhoo..."
8,9780006393108,Lost Girls,Andrew Pyper,“Attorney Bartholomew Crane doesn’t belong in ...
9,9781443451352,Birdie,Tracey Lindberg,"“Bernice Meetoos, a Cree woman, leaves her hom..."


### International

### Content accumulation

In [13]:
international = pd.read_csv('../data/processed/international_processed.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 12: invalid start byte

In [None]:
# isbn = pd.read_csv('../data/isbnlist.csv', 'isbn' : str)
# base_url = 'https://www.biblioshare.ca/BNCServices/BNCServices.asmx/ONIX?Token=zvo3vpz7uulcuajs&EAN='
# for isbn in isbn['isbn']:
#     target_url = base_url + isbn

### Preprocessing


In [None]:
# Preprocess the posting content; this should take under 2 minutes
# start an empty list to hold preprocessed postings

for i in range(len(df)):  # for each description
    desc = re.sub('[^a-zA-Z]', ' ', df['description'][i]) # remove non text characters
    desc = desc.lower() # lower-case everything
    desc = desc.split() # split into words
    desc = ' '.join(desc) # reassemble the string
    df['description'][i] = desc


## Modeling

### TF-IDF

In [None]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['description'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

### Recommender

In [None]:
results = {}

for idx, row in df.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], df['id'][i]) for i in similar_indices]
    results[row['id']] = similar_items[1:]

In [None]:
def item(id):
    return df.loc[df['id'] == id]['description'].tolist()[0].split(' - ')[0]

In [None]:
def recommend(item_id, num):
    print("Here " + str(num) + " books that are similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

In [None]:
print('How many new books would you like to learn about?')
num_books = input()

In [None]:
recommend(item_id=109, num=3)

### Visualization

## References

To come