### Imports

In [1]:
# pandas and numpy
import pandas as pd
import numpy as np

# nltk imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# sci-kit learn imports
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.pipeline import Pipeline, make_pipeline

# Presentation and visuals
import seaborn as sns
import matplotlib.pyplot as plt

# This magic line will allow you to generate plots
# within the Jupyter notebook.
%matplotlib inline
from pprint import pprint
pd.options.display.max_seq_items = 2000
pd.options.display.max_rows = 4000
# pd.set_option(display.max_columns), None

# other imports
import json
import lxml
from lxml import html
import random
import regex as re
import requests
import time
import urllib.request
from datetime import datetime



# Data acquisition

In [2]:
isbn = pd.read_csv('../data/data_acquisition/international2_for_download.csv')
isbn = isbn.applymap(str)
isbn.dtypes

isbn        object
title       object
authors     object
overview    object
dtype: object

In [3]:
isbn = isbn[:15000]
isbn

Unnamed: 0,isbn,title,authors,overview
0,9780152017743,,,
1,9780152017750,,,
2,9780152017866,,,
3,9780152017873,,,
4,9780152017903,,,
...,...,...,...,...
14995,9780226735429,,,
14996,9780226735573,,,
14997,9780226736280,,,
14998,9780226736334,,,


In [4]:
for j in range(len(isbn)):

    header = {'Authorization': '44023_23ab132f3977ad9849e8f1a5d7dc73bf'}
    base_url = ('https://api2.isbndb.com/book/')
    response = requests.get(base_url + isbn['isbn'][j], headers=header)
    payload = response.json()
      
    try:
        isbn['title'][j] = payload['book']['title']
    
    except:
        isbn['title'][j] = np.nan
    
    try:
        isbn['authors'][j] = payload['book']['authors']
    
    except:
        isbn['authors'][j] = np.nan
              
    try:
        isbn['overview'][j] = payload['book']['overview']
    
    except:
        isbn['overview'][j] = np.nan
  
    print('Info downloaded for book ' + str(j + 1) + ' of ' +  str(len(isbn)) + ' books.')
              
    time.sleep(1)
    

Info downloaded for book 1 of 15000 books.
Info downloaded for book 2 of 15000 books.
Info downloaded for book 3 of 15000 books.
Info downloaded for book 4 of 15000 books.
Info downloaded for book 5 of 15000 books.
Info downloaded for book 6 of 15000 books.
Info downloaded for book 7 of 15000 books.
Info downloaded for book 8 of 15000 books.
Info downloaded for book 9 of 15000 books.
Info downloaded for book 10 of 15000 books.
Info downloaded for book 11 of 15000 books.
Info downloaded for book 12 of 15000 books.
Info downloaded for book 13 of 15000 books.
Info downloaded for book 14 of 15000 books.
Info downloaded for book 15 of 15000 books.
Info downloaded for book 16 of 15000 books.
Info downloaded for book 17 of 15000 books.
Info downloaded for book 18 of 15000 books.
Info downloaded for book 19 of 15000 books.
Info downloaded for book 20 of 15000 books.
Info downloaded for book 21 of 15000 books.
Info downloaded for book 22 of 15000 books.
Info downloaded for book 23 of 15000 book

Info downloaded for book 186 of 15000 books.
Info downloaded for book 187 of 15000 books.
Info downloaded for book 188 of 15000 books.
Info downloaded for book 189 of 15000 books.
Info downloaded for book 190 of 15000 books.
Info downloaded for book 191 of 15000 books.
Info downloaded for book 192 of 15000 books.
Info downloaded for book 193 of 15000 books.
Info downloaded for book 194 of 15000 books.
Info downloaded for book 195 of 15000 books.
Info downloaded for book 196 of 15000 books.
Info downloaded for book 197 of 15000 books.
Info downloaded for book 198 of 15000 books.
Info downloaded for book 199 of 15000 books.
Info downloaded for book 200 of 15000 books.
Info downloaded for book 201 of 15000 books.
Info downloaded for book 202 of 15000 books.
Info downloaded for book 203 of 15000 books.
Info downloaded for book 204 of 15000 books.
Info downloaded for book 205 of 15000 books.
Info downloaded for book 206 of 15000 books.
Info downloaded for book 207 of 15000 books.
Info downl

Info downloaded for book 369 of 15000 books.
Info downloaded for book 370 of 15000 books.
Info downloaded for book 371 of 15000 books.
Info downloaded for book 372 of 15000 books.
Info downloaded for book 373 of 15000 books.
Info downloaded for book 374 of 15000 books.
Info downloaded for book 375 of 15000 books.
Info downloaded for book 376 of 15000 books.
Info downloaded for book 377 of 15000 books.
Info downloaded for book 378 of 15000 books.
Info downloaded for book 379 of 15000 books.
Info downloaded for book 380 of 15000 books.
Info downloaded for book 381 of 15000 books.
Info downloaded for book 382 of 15000 books.
Info downloaded for book 383 of 15000 books.
Info downloaded for book 384 of 15000 books.
Info downloaded for book 385 of 15000 books.
Info downloaded for book 386 of 15000 books.
Info downloaded for book 387 of 15000 books.
Info downloaded for book 388 of 15000 books.
Info downloaded for book 389 of 15000 books.
Info downloaded for book 390 of 15000 books.
Info downl

Info downloaded for book 552 of 15000 books.
Info downloaded for book 553 of 15000 books.
Info downloaded for book 554 of 15000 books.
Info downloaded for book 555 of 15000 books.
Info downloaded for book 556 of 15000 books.
Info downloaded for book 557 of 15000 books.
Info downloaded for book 558 of 15000 books.
Info downloaded for book 559 of 15000 books.
Info downloaded for book 560 of 15000 books.
Info downloaded for book 561 of 15000 books.
Info downloaded for book 562 of 15000 books.
Info downloaded for book 563 of 15000 books.
Info downloaded for book 564 of 15000 books.
Info downloaded for book 565 of 15000 books.
Info downloaded for book 566 of 15000 books.
Info downloaded for book 567 of 15000 books.
Info downloaded for book 568 of 15000 books.
Info downloaded for book 569 of 15000 books.
Info downloaded for book 570 of 15000 books.
Info downloaded for book 571 of 15000 books.
Info downloaded for book 572 of 15000 books.
Info downloaded for book 573 of 15000 books.
Info downl

Info downloaded for book 735 of 15000 books.
Info downloaded for book 736 of 15000 books.
Info downloaded for book 737 of 15000 books.
Info downloaded for book 738 of 15000 books.
Info downloaded for book 739 of 15000 books.
Info downloaded for book 740 of 15000 books.
Info downloaded for book 741 of 15000 books.
Info downloaded for book 742 of 15000 books.
Info downloaded for book 743 of 15000 books.
Info downloaded for book 744 of 15000 books.
Info downloaded for book 745 of 15000 books.
Info downloaded for book 746 of 15000 books.
Info downloaded for book 747 of 15000 books.
Info downloaded for book 748 of 15000 books.
Info downloaded for book 749 of 15000 books.
Info downloaded for book 750 of 15000 books.
Info downloaded for book 751 of 15000 books.
Info downloaded for book 752 of 15000 books.
Info downloaded for book 753 of 15000 books.
Info downloaded for book 754 of 15000 books.
Info downloaded for book 755 of 15000 books.
Info downloaded for book 756 of 15000 books.
Info downl

Info downloaded for book 918 of 15000 books.
Info downloaded for book 919 of 15000 books.
Info downloaded for book 920 of 15000 books.
Info downloaded for book 921 of 15000 books.
Info downloaded for book 922 of 15000 books.
Info downloaded for book 923 of 15000 books.
Info downloaded for book 924 of 15000 books.
Info downloaded for book 925 of 15000 books.
Info downloaded for book 926 of 15000 books.
Info downloaded for book 927 of 15000 books.
Info downloaded for book 928 of 15000 books.
Info downloaded for book 929 of 15000 books.
Info downloaded for book 930 of 15000 books.
Info downloaded for book 931 of 15000 books.
Info downloaded for book 932 of 15000 books.
Info downloaded for book 933 of 15000 books.
Info downloaded for book 934 of 15000 books.
Info downloaded for book 935 of 15000 books.
Info downloaded for book 936 of 15000 books.
Info downloaded for book 937 of 15000 books.
Info downloaded for book 938 of 15000 books.
Info downloaded for book 939 of 15000 books.
Info downl

Info downloaded for book 1098 of 15000 books.
Info downloaded for book 1099 of 15000 books.
Info downloaded for book 1100 of 15000 books.
Info downloaded for book 1101 of 15000 books.
Info downloaded for book 1102 of 15000 books.
Info downloaded for book 1103 of 15000 books.
Info downloaded for book 1104 of 15000 books.
Info downloaded for book 1105 of 15000 books.
Info downloaded for book 1106 of 15000 books.
Info downloaded for book 1107 of 15000 books.
Info downloaded for book 1108 of 15000 books.
Info downloaded for book 1109 of 15000 books.
Info downloaded for book 1110 of 15000 books.
Info downloaded for book 1111 of 15000 books.
Info downloaded for book 1112 of 15000 books.
Info downloaded for book 1113 of 15000 books.
Info downloaded for book 1114 of 15000 books.
Info downloaded for book 1115 of 15000 books.
Info downloaded for book 1116 of 15000 books.
Info downloaded for book 1117 of 15000 books.
Info downloaded for book 1118 of 15000 books.
Info downloaded for book 1119 of 1

Info downloaded for book 1277 of 15000 books.
Info downloaded for book 1278 of 15000 books.
Info downloaded for book 1279 of 15000 books.
Info downloaded for book 1280 of 15000 books.
Info downloaded for book 1281 of 15000 books.
Info downloaded for book 1282 of 15000 books.
Info downloaded for book 1283 of 15000 books.
Info downloaded for book 1284 of 15000 books.
Info downloaded for book 1285 of 15000 books.
Info downloaded for book 1286 of 15000 books.
Info downloaded for book 1287 of 15000 books.
Info downloaded for book 1288 of 15000 books.
Info downloaded for book 1289 of 15000 books.
Info downloaded for book 1290 of 15000 books.
Info downloaded for book 1291 of 15000 books.
Info downloaded for book 1292 of 15000 books.
Info downloaded for book 1293 of 15000 books.
Info downloaded for book 1294 of 15000 books.
Info downloaded for book 1295 of 15000 books.
Info downloaded for book 1296 of 15000 books.
Info downloaded for book 1297 of 15000 books.
Info downloaded for book 1298 of 1

Info downloaded for book 1456 of 15000 books.
Info downloaded for book 1457 of 15000 books.
Info downloaded for book 1458 of 15000 books.
Info downloaded for book 1459 of 15000 books.
Info downloaded for book 1460 of 15000 books.
Info downloaded for book 1461 of 15000 books.
Info downloaded for book 1462 of 15000 books.
Info downloaded for book 1463 of 15000 books.
Info downloaded for book 1464 of 15000 books.
Info downloaded for book 1465 of 15000 books.
Info downloaded for book 1466 of 15000 books.
Info downloaded for book 1467 of 15000 books.
Info downloaded for book 1468 of 15000 books.
Info downloaded for book 1469 of 15000 books.
Info downloaded for book 1470 of 15000 books.
Info downloaded for book 1471 of 15000 books.
Info downloaded for book 1472 of 15000 books.
Info downloaded for book 1473 of 15000 books.
Info downloaded for book 1474 of 15000 books.
Info downloaded for book 1475 of 15000 books.
Info downloaded for book 1476 of 15000 books.
Info downloaded for book 1477 of 1

SSLError: HTTPSConnectionPool(host='api2.isbndb.com', port=443): Max retries exceeded with url: /book/9780153178344 (Caused by SSLError(SSLError("bad handshake: SysCallError(10060, 'WSAETIMEDOUT')")))

In [5]:
now = datetime.now()
dt = now.strftime("%d-%m-%Y_%H-%M-%S")

isbn.to_csv('../data/saved/isbn' + dt +'.csv', index = False)

### Image acquisition

In [28]:
images = pd.read_csv('../data/processed/canadian_books.csv')
images = images.applymap(str)
images.dtypes

id             object
title          object
author         object
description    object
image          object
dtype: object

In [29]:
images = images[:10]
images

Unnamed: 0,id,title,author,description,image
0,1,01 Nathaniel Mcdaniel and Bigbeards Hook,Evan Solomon,"Meet Nathaniel McDaniel, the mischievous hero ...",https://images.49thshelf.com/var/ezflow_site/s...
1,2,02 Standard of Honor Book Two of the Templar T...,Jack Whyte,Jack Whyte’s thrilling Templar Trilogy continu...,https://images.49thshelf.com/var/ezflow_site/s...
2,3,03 Knights Templar Order in Chaos,Jack Whyte,"In the final novel in the Templar Trilogy, Wil...",https://images.49thshelf.com/var/ezflow_site/s...
3,4,100 Easy-to-Grow Native Plants for Canadian Ga...,Lorraine Johnson,The key to a carefree garden is to know which ...,https://images.49thshelf.com/var/ezflow_site/s...
4,5,10 Women,George Bowering,Ten Women is a new collection of short fiction...,https://images.49thshelf.com/var/ezflow_site/s...


In [37]:
for i in range(len(images)):

    try:
        urllib.request.urlretrieve(images['image'][i], '../img/books/' + id[i] + '.jpg')
        print('Just captured image number ' + images['id'][i])

    except:
        print('Failed to capture image number ' + images['id'][i])
        
    time.sleep(2)


Failed to capture image number 1
Failed to capture image number 2
Failed to capture image number 3
Failed to capture image number 4
Failed to capture image number 5
