# #readMoreCanlit | A Recommender System for Canadian Literature

<center><img src='../img/readMoreCanlit.png'></center>

> Shawn Syms<br>
> https://shawnsyms.github.io/ <br>
> shawn@shawnsyms.com <br>
> 416-843-4169 <br>

### Imports

In [45]:
# pandas and numpy
import pandas as pd
import numpy as np

# nltk imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# sci-kit learn imports
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.pipeline import Pipeline, make_pipeline

# Presentation and visuals
import seaborn as sns
import matplotlib.pyplot as plt

# This magic line will allow you to generate plots
# within the Jupyter notebook.
%matplotlib inline
from pprint import pprint
pd.options.display.max_seq_items = 2000
pd.options.display.max_rows = 4000
# pd.set_option(display.max_columns), None

# other imports
import json
import lxml
import random
import regex as re
import requests
import time
import urllib.request
from datetime import datetime

# Data acquisition

In [46]:
isbn = pd.read_csv('../data/isbn_for_acquisition.csv')
isbn = isbn.applymap(str)
isbn.dtypes

isbn        object
title       object
authors     object
overview    object
dtype: object

In [47]:
isbn = isbn[0:5000]
isbn

Unnamed: 0,isbn,title,authors,overview
0,9784596894830,,,
1,9784596894823,,,
2,9784596894816,,,
3,9784596893536,,,
4,9784596893529,,,
...,...,...,...,...
4995,9781935096825,,,
4996,9781935096818,,,
4997,9781935096801,,,
4998,9781935096788,,,


In [48]:
for j in range(len(isbn)):

    header = {'Authorization': '44023_23ab132f3977ad9849e8f1a5d7dc73bf'}
    base_url = ('https://api2.isbndb.com/book/')
    response = requests.get(base_url + isbn['isbn'][j], headers=header)
    payload = response.json()
      
    try:
        isbn['title'][j] = payload['book']['title']
    
    except:
        isbn['title'][j] = np.nan
    
    try:
        isbn['authors'][j] = payload['book']['authors']
    
    except:
        isbn['authors'][j] = np.nan
              
    try:
        isbn['overview'][j] = payload['book']['overview']
    
    except:
        isbn['overview'][j] = np.nan
  
    print('Info downloaded for book ' + str(j + 1) + ' of ' +  str(len(isbn)) + ' books.')
              
    time.sleep(1)
    

Info downloaded for book 1 of 5000 books.
Info downloaded for book 2 of 5000 books.
Info downloaded for book 3 of 5000 books.
Info downloaded for book 4 of 5000 books.
Info downloaded for book 5 of 5000 books.
Info downloaded for book 6 of 5000 books.
Info downloaded for book 7 of 5000 books.
Info downloaded for book 8 of 5000 books.
Info downloaded for book 9 of 5000 books.
Info downloaded for book 10 of 5000 books.
Info downloaded for book 11 of 5000 books.
Info downloaded for book 12 of 5000 books.
Info downloaded for book 13 of 5000 books.
Info downloaded for book 14 of 5000 books.
Info downloaded for book 15 of 5000 books.
Info downloaded for book 16 of 5000 books.
Info downloaded for book 17 of 5000 books.
Info downloaded for book 18 of 5000 books.
Info downloaded for book 19 of 5000 books.
Info downloaded for book 20 of 5000 books.
Info downloaded for book 21 of 5000 books.
Info downloaded for book 22 of 5000 books.
Info downloaded for book 23 of 5000 books.
Info downloaded for 

Info downloaded for book 190 of 5000 books.
Info downloaded for book 191 of 5000 books.
Info downloaded for book 192 of 5000 books.
Info downloaded for book 193 of 5000 books.
Info downloaded for book 194 of 5000 books.
Info downloaded for book 195 of 5000 books.
Info downloaded for book 196 of 5000 books.
Info downloaded for book 197 of 5000 books.
Info downloaded for book 198 of 5000 books.
Info downloaded for book 199 of 5000 books.
Info downloaded for book 200 of 5000 books.
Info downloaded for book 201 of 5000 books.
Info downloaded for book 202 of 5000 books.
Info downloaded for book 203 of 5000 books.
Info downloaded for book 204 of 5000 books.
Info downloaded for book 205 of 5000 books.
Info downloaded for book 206 of 5000 books.
Info downloaded for book 207 of 5000 books.
Info downloaded for book 208 of 5000 books.
Info downloaded for book 209 of 5000 books.
Info downloaded for book 210 of 5000 books.
Info downloaded for book 211 of 5000 books.
Info downloaded for book 212 of 

Info downloaded for book 377 of 5000 books.
Info downloaded for book 378 of 5000 books.
Info downloaded for book 379 of 5000 books.
Info downloaded for book 380 of 5000 books.
Info downloaded for book 381 of 5000 books.
Info downloaded for book 382 of 5000 books.
Info downloaded for book 383 of 5000 books.
Info downloaded for book 384 of 5000 books.
Info downloaded for book 385 of 5000 books.
Info downloaded for book 386 of 5000 books.
Info downloaded for book 387 of 5000 books.
Info downloaded for book 388 of 5000 books.
Info downloaded for book 389 of 5000 books.
Info downloaded for book 390 of 5000 books.
Info downloaded for book 391 of 5000 books.
Info downloaded for book 392 of 5000 books.
Info downloaded for book 393 of 5000 books.
Info downloaded for book 394 of 5000 books.
Info downloaded for book 395 of 5000 books.
Info downloaded for book 396 of 5000 books.
Info downloaded for book 397 of 5000 books.
Info downloaded for book 398 of 5000 books.
Info downloaded for book 399 of 

Info downloaded for book 564 of 5000 books.
Info downloaded for book 565 of 5000 books.
Info downloaded for book 566 of 5000 books.
Info downloaded for book 567 of 5000 books.
Info downloaded for book 568 of 5000 books.
Info downloaded for book 569 of 5000 books.
Info downloaded for book 570 of 5000 books.
Info downloaded for book 571 of 5000 books.
Info downloaded for book 572 of 5000 books.
Info downloaded for book 573 of 5000 books.
Info downloaded for book 574 of 5000 books.
Info downloaded for book 575 of 5000 books.
Info downloaded for book 576 of 5000 books.
Info downloaded for book 577 of 5000 books.
Info downloaded for book 578 of 5000 books.
Info downloaded for book 579 of 5000 books.
Info downloaded for book 580 of 5000 books.
Info downloaded for book 581 of 5000 books.
Info downloaded for book 582 of 5000 books.
Info downloaded for book 583 of 5000 books.
Info downloaded for book 584 of 5000 books.
Info downloaded for book 585 of 5000 books.
Info downloaded for book 586 of 

Info downloaded for book 751 of 5000 books.
Info downloaded for book 752 of 5000 books.
Info downloaded for book 753 of 5000 books.
Info downloaded for book 754 of 5000 books.
Info downloaded for book 755 of 5000 books.
Info downloaded for book 756 of 5000 books.
Info downloaded for book 757 of 5000 books.
Info downloaded for book 758 of 5000 books.
Info downloaded for book 759 of 5000 books.
Info downloaded for book 760 of 5000 books.
Info downloaded for book 761 of 5000 books.
Info downloaded for book 762 of 5000 books.
Info downloaded for book 763 of 5000 books.
Info downloaded for book 764 of 5000 books.
Info downloaded for book 765 of 5000 books.
Info downloaded for book 766 of 5000 books.
Info downloaded for book 767 of 5000 books.
Info downloaded for book 768 of 5000 books.
Info downloaded for book 769 of 5000 books.
Info downloaded for book 770 of 5000 books.
Info downloaded for book 771 of 5000 books.
Info downloaded for book 772 of 5000 books.
Info downloaded for book 773 of 

ConnectionError: HTTPSConnectionPool(host='api2.isbndb.com', port=443): Max retries exceeded with url: /book/9784041024171 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000025D027D16C8>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [49]:
isbn

Unnamed: 0,isbn,title,authors,overview
0,9784596894830,,,
1,9784596894823,,,
2,9784596894816,,,
3,9784596893536,,,
4,9784596893529,,,
...,...,...,...,...
4995,9781935096825,,,
4996,9781935096818,,,
4997,9781935096801,,,
4998,9781935096788,,,


In [44]:
now = datetime.now()
dt = now.strftime("%d-%m-%Y_%H-%M-%S")

isbn.to_csv('../data/saved/isbn' + dt +'.csv', index = False)

In [None]:
# booknetcanada

import requests
import lxml
from lxml import html
import requests



test_url = 'https://www.biblioshare.ca/BNCServices/BNCServices.asmx/ONIX?Token=zvo3vpz7uulcuajs&EAN=9781927655023'
catalist_url = 'https://www.bnccatalist.ca/viewtitle.aspx?ean=9781440503252'
page = requests.get(catalist_url)
content = html.fromstring(page.content)
tree = html.parse(content)
print(html.tostring(tree))



# https://www.biblioshare.ca/BNCServices/BNCServices.asmx/ONIX?Token=amcfxpnjwt9o6a7f&EAN=9780312573577
    
# for i in isbn['isbn']:

# i = '9780312573577'
# # header = {'Authorization': '44023_23ab132f3977ad9849e8f1a5d7dc73bf'}
# base_url = ('https://www.biblioshare.ca/BNCServices/BNCServices.asmx/ONIX?Token=amcfxpnjwt9o6a7f&EAN=')
# #descriptors = ['title', 'authors', 'image', 'date_published', 'pages', 'overview']
# response = requests.get(base_url + i)
# payload = response.text

# response.

#     try:
#         title = payload['book']['title']
#         print(title)
    
#         authors = payload['book']['authors']
#         print(authors)
    
#         image = payload['book']['image']
#         print(image)
    
#         overview = payload['book']['overview']
#         print(overview)
    
#     except:
#         pass
    
    # time.sleep(1)
    


In [None]:
all = tree.xpath

### EDA

In [None]:
df download_jpg(url, file_path, file_name):
    full_path = file_path + file_name + '.jpg'
    urllib.request.urlretrieve(url, full_path)
    
url =
file_name = 'https://images.isbndb.com/covers/94/86/9781934759486.jpg'

In [None]:
df = pd.read_csv('../data/readmoreCanlit.csv')

In [None]:
df.dtypes

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.author.value_counts()

In [None]:
df.origin.value_counts()

### Content accumulation

In [None]:
# isbn = pd.read_csv('../data/isbnlist.csv', 'isbn' : str)
# base_url = 'https://www.biblioshare.ca/BNCServices/BNCServices.asmx/ONIX?Token=zvo3vpz7uulcuajs&EAN='
# for isbn in isbn['isbn']:
#     target_url = base_url + isbn

### Preprocessing


In [None]:
# Preprocess the posting content; this should take under 2 minutes
# start an empty list to hold preprocessed postings

for i in range(len(df)):  # for each description
    desc = re.sub('[^a-zA-Z]', ' ', df['description'][i]) # remove non text characters
    desc = desc.lower() # lower-case everything
    desc = desc.split() # split into words
    desc = ' '.join(desc) # reassemble the string
    df['description'][i] = desc


## Modeling

### TF-IDF

In [None]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['description'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

### Recommender

In [None]:
results = {}

for idx, row in df.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], df['id'][i]) for i in similar_indices]
    results[row['id']] = similar_items[1:]

In [None]:
def item(id):
    return df.loc[df['id'] == id]['description'].tolist()[0].split(' - ')[0]

In [None]:
def recommend(item_id, num):
    print("Here " + str(num) + " books that are similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

In [None]:
print('How many new books would you like to learn about?')
num_books = input()

In [None]:
recommend(item_id=109, num=3)

### Visualization

In [None]:
sns.pairplot(df)

## References

To come