### Imports

In [1]:
# pandas and numpy
import pandas as pd
import numpy as np

# nltk imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# sci-kit learn imports
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.pipeline import Pipeline, make_pipeline

# Presentation and visuals
import seaborn as sns
import matplotlib.pyplot as plt

# This magic line will allow you to generate plots
# within the Jupyter notebook.
%matplotlib inline
from pprint import pprint
pd.options.display.max_seq_items = 2000
pd.options.display.max_rows = 4000
# pd.set_option(display.max_columns), None

# other imports
import json
import lxml
from lxml import html
import random
import regex as re
import requests
import time
import urllib.request
from datetime import datetime



# Data acquisition

In [2]:
isbn = pd.read_csv('../data/data_acquisition/international_for_download.csv')
isbn = isbn.applymap(str)
isbn.dtypes

isbn        object
title       object
authors     object
overview    object
dtype: object

In [3]:
isbn = isbn[0:3000]
isbn

Unnamed: 0,isbn,title,authors,overview
0,9781631062360,,,
1,9781631062353,,,
2,9781631061585,,,
3,9781631060434,,,
4,9781631060427,,,
5,9781631060342,,,
6,9781631060328,,,
7,9781631060311,,,
8,9781631060304,,,
9,9781631060229,,,


In [None]:
for j in range(len(isbn)):

    header = {'Authorization': '44023_23ab132f3977ad9849e8f1a5d7dc73bf'}
    base_url = ('https://api2.isbndb.com/book/')
    response = requests.get(base_url + isbn['isbn'][j], headers=header)
    payload = response.json()
      
    try:
        isbn['title'][j] = payload['book']['title']
    
    except:
        isbn['title'][j] = np.nan
    
    try:
        isbn['authors'][j] = payload['book']['authors']
    
    except:
        isbn['authors'][j] = np.nan
              
    try:
        isbn['overview'][j] = payload['book']['overview']
    
    except:
        isbn['overview'][j] = np.nan
  
    print('Info downloaded for book ' + str(j + 1) + ' of ' +  str(len(isbn)) + ' books.')
              
    time.sleep(1)
    

Info downloaded for book 1 of 3000 books.
Info downloaded for book 2 of 3000 books.
Info downloaded for book 3 of 3000 books.
Info downloaded for book 4 of 3000 books.
Info downloaded for book 5 of 3000 books.
Info downloaded for book 6 of 3000 books.
Info downloaded for book 7 of 3000 books.
Info downloaded for book 8 of 3000 books.
Info downloaded for book 9 of 3000 books.
Info downloaded for book 10 of 3000 books.
Info downloaded for book 11 of 3000 books.
Info downloaded for book 12 of 3000 books.
Info downloaded for book 13 of 3000 books.
Info downloaded for book 14 of 3000 books.
Info downloaded for book 15 of 3000 books.
Info downloaded for book 16 of 3000 books.
Info downloaded for book 17 of 3000 books.
Info downloaded for book 18 of 3000 books.
Info downloaded for book 19 of 3000 books.
Info downloaded for book 20 of 3000 books.
Info downloaded for book 21 of 3000 books.
Info downloaded for book 22 of 3000 books.
Info downloaded for book 23 of 3000 books.
Info downloaded for 

In [30]:
now = datetime.now()
dt = now.strftime("%d-%m-%Y_%H-%M-%S")

isbn.to_csv('../data/saved/isbn' + dt +'.csv', index = False)

### EDA

In [None]:
# df download_jpg(url, file_path, file_name):
#     full_path = file_path + file_name + '.jpg'
#     urllib.request.urlretrieve(url, full_path)
    
# url =
# file_name = 'https://images.isbndb.com/covers/94/86/9781934759486.jpg'

### Canadian

In [4]:
canadian = pd.read_csv('../data/processed/canadian_processed.csv')

In [5]:
canadian.dtypes

isbn            object
title           object
author          object
Unnamed: 3     float64
description     object
dtype: object

In [10]:
canadian.drop(columns=['Unnamed: 3'], inplace = True)

In [11]:
canadian.shape

(205, 4)

In [12]:
canadian.head(40)

Unnamed: 0,isbn,title,author,description
0,9780773524927,Two Solitudes,Hugh McLennan,"“[A] powerful saga, [Two Solitudes is the stor..."
1,9781552453056,Fifteen Dogs,Andre Alexis,“A bet between the gods Hermes and Apollo lead...
2,9780771030130,Bear,Marian Engel,“A librarian is called to a remote Canadian is...
3,9781554685257,"Green Grass, Running Water",Thomas King,“Alberta is a university professor who would l...
4,9780771055706,No Great Mischief,Alistair MacLeod,"“Alexander, orphaned as a child by a horrific ..."
5,9780312054366,Generation X,Douglas Coupland,"“Andy, Dag and Claire have been handed a socie..."
6,9780676977738,The Birth House,Ami McKay,“As a child in an isolated village in Nova Sco...
7,9780062468475,Lullabies for Little Criminals,Heather O'Neill,"“At thirteen, Baby vacillates between childhoo..."
8,9780006393108,Lost Girls,Andrew Pyper,“Attorney Bartholomew Crane doesn’t belong in ...
9,9781443451352,Birdie,Tracey Lindberg,"“Bernice Meetoos, a Cree woman, leaves her hom..."


### International

### Content accumulation

In [13]:
international = pd.read_csv('../data/processed/international_processed.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 12: invalid start byte

In [None]:
# isbn = pd.read_csv('../data/isbnlist.csv', 'isbn' : str)
# base_url = 'https://www.biblioshare.ca/BNCServices/BNCServices.asmx/ONIX?Token=zvo3vpz7uulcuajs&EAN='
# for isbn in isbn['isbn']:
#     target_url = base_url + isbn

### Preprocessing


In [None]:
# Preprocess the posting content; this should take under 2 minutes
# start an empty list to hold preprocessed postings

for i in range(len(df)):  # for each description
    desc = re.sub('[^a-zA-Z]', ' ', df['description'][i]) # remove non text characters
    desc = desc.lower() # lower-case everything
    desc = desc.split() # split into words
    desc = ' '.join(desc) # reassemble the string
    df['description'][i] = desc


## Modeling

### TF-IDF

In [None]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['description'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

### Recommender

In [None]:
results = {}

for idx, row in df.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], df['id'][i]) for i in similar_indices]
    results[row['id']] = similar_items[1:]

In [None]:
def item(id):
    return df.loc[df['id'] == id]['description'].tolist()[0].split(' - ')[0]

In [None]:
def recommend(item_id, num):
    print("Here " + str(num) + " books that are similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

In [None]:
print('How many new books would you like to learn about?')
num_books = input()

In [None]:
recommend(item_id=109, num=3)

### Visualization

## References

To come