# #readMoreCanlit | Notebook 3: Data cleaning and visualization

### Imports

In [111]:
# pandas and numpy
import pandas as pd
import numpy as np

# nltk imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
# nltk.download('punkt')

# sci-kit learn imports
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.pipeline import Pipeline, make_pipeline

# Presentation and visuals
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# This magic line will allow you to generate plots
# within the Jupyter notebook.
%matplotlib inline
from pprint import pprint
pd.options.display.max_seq_items = 2000
pd.options.display.max_rows = 4000

# pd.set_option(display.max_columns), None

# other imports
from collections import Counter
import json
import lxml
from lxml import html
import random
import regex as re
import requests
from string import punctuation
import time
import urllib.request
from datetime import datetime


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shawn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


## Data cleaning

### Canadian

In [115]:
# Create a stopwords list of meta-critical commentary vocabulary to be removed from the Canadian dataset as part of preprocessing

literary_stopwords = ['amazon best book of the year', 'arthur ellis', 'astonishing', 'author', 'award', 'award-winning', 'best', 'best book of the year', 'bestseller', 'bestselling', 'book', 'book award', 'boston globe', 'canada reads', 'category', 'character', 'classic', 'critically acclaimed', 'debut', 'entertainment weekly', 'epic', 'finalist', 'finalist', 'foremost', 'giller prize', 'giller prize', 'globe and mail', 'governor generals award', 'governor generals literary award', 'harpercollins', 'heralded', 'highly anticipated', 'kobo', 'literary', 'literature', 'longlisted', 'national bestseller', 'nationally', 'new york times', 'novel', 'prize', 'prize-winning', 'publish', 'publishers weekly', 'rogers writers trust', 'scotiabank', 'shortlisted', 'story', 'ubc', 'widely anticipated', 'winner', 'writers trust fiction prize', 'writers trust of canada']

In [78]:
# Read in the Canadian data

canadian = pd.read_csv('../data/processed/canadian_books.csv')
canadian = canadian.applymap(str)
canadian.shape

(6775, 6)

In [79]:
# Create a wordcloud from the Canadian descriptions

# text = canadian.description.values
# wordcloud = WordCloud(
#     width = 3000,
#     height = 2000,
#     background_color = 'white',
#     stopwords = STOPWORDS).generate(str(text))
# fig = plt.figure(
#     figsize = (40, 30),
#     facecolor = 'k',
#     edgecolor = 'k')
# plt.imshow(wordcloud, interpolation = 'bilinear')
# plt.axis('off')
# plt.tight_layout(pad=0)
# plt.show()

In [80]:
canadian.drop(['image'], axis=1, inplace=True)

In [81]:
canadian

Unnamed: 0,id,origin,title,author,description
0,0,Canadian,01 Nathaniel Mcdaniel and Bigbeards Hook,Evan Solomon,"Meet Nathaniel McDaniel, the mischievous hero ..."
1,1,Canadian,02 Standard of Honor Book Two of the Templar T...,Jack Whyte,Jack Whyte’s thrilling Templar Trilogy continu...
2,2,Canadian,03 Knights Templar Order in Chaos,Jack Whyte,"In the final novel in the Templar Trilogy, Wil..."
3,3,Canadian,100 Easy-to-Grow Native Plants for Canadian Ga...,Lorraine Johnson,The key to a carefree garden is to know which ...
4,4,Canadian,10 Women,George Bowering,Ten Women is a new collection of short fiction...
...,...,...,...,...,...
6770,6770,Canadian,Zero Day,Ezekiel Boone,"The wildly entertaining, deeply satisfying fin..."
6771,6771,Canadian,Zip's File,Shannon Maguire,Zip's File: A Romance of Silence explores the ...
6772,6772,Canadian,Zolitude,Paige Cooper,WINNER OF THE 2018 QUEBEC WRITERS' FEDERATION ...
6773,6773,Canadian,Zoo and Crowbar,David Zieroth,The Wind has mysteriously caused the death of ...


In [104]:
canadian['full_entry'] = canadian['title'] + ' by ' + canadian['author'] + ': ' + canadian['description']

In [105]:
canadian.to_csv('../data/processed/canadian.csv')

In [142]:
literary_stopwords = ['amazon best book of the year', 'arthur ellis', 'astonishing', 'author', 'award', 'award-winning', 'best', 'best book of the year', 'bestseller', 'bestselling', 'book', 'book award', 'boston globe', 'canada reads', 'category', 'character', 'classic', 'critically acclaimed', 'debut', 'entertainment weekly', 'epic', 'finalist', 'finalist', 'foremost', 'giller prize', 'giller prize', 'globe and mail', 'governor generals award', 'governor generals literary award', 'harpercollins', 'heralded', 'highly anticipated', 'kobo', 'literary', 'literature', 'longlisted', 'national bestseller', 'nationally', 'new york times', 'novel', 'prize', 'prize-winning', 'publish', 'publishers weekly', 'rogers writers trust', 'scotiabank', 'shortlisted', 'story', 'ubc', 'widely anticipated', 'winner', 'writers trust fiction prize', 'writers trust of canada']
stoplist = set(stopwords.words('english') + list(punctuation) + literary_stopwords)
path = '../data/processed/canadian.csv'
df = pd.read_csv(path)
df = df.applymap(str)
texts = df['description'].str.lower()
word_counts = Counter(word_tokenize('\n'.join(texts)))
words_list = word_counts.most_common()
words_list



[(',', 10551),
 ('the', 10102),
 ('a', 6867),
 ('...', 6795),
 ('of', 6473),
 ('and', 5070),
 ('.', 4236),
 ('in', 4111),
 ('to', 3178),
 ('is', 2941),
 (';', 1796),
 ('her', 1711),
 ('&', 1677),
 ("'s", 1646),
 ('for', 1568),
 ('’', 1396),
 ('his', 1372),
 ('with', 1347),
 ('from', 1191),
 ('an', 1128),
 ('s', 1101),
 ('on', 1047),
 ('nbsp', 934),
 ('by', 913),
 ('that', 876),
 ('novel', 866),
 ('she', 853),
 ('has', 812),
 ('as', 811),
 ('he', 799),
 ('this', 767),
 ('new', 734),
 ('?', 732),
 ('quot', 720),
 ('it', 703),
 ('when', 670),
 ('one', 626),
 ('at', 620),
 ('life', 615),
 ('but', 583),
 ('story', 528),
 ('stories', 525),
 ('who', 514),
 ('author', 503),
 (':', 471),
 ('are', 455),
 ('about', 450),
 ('was', 448),
 ('first', 423),
 ('young', 422),
 ('family', 418),
 ('book', 416),
 ('their', 391),
 ('world', 380),
 ('you', 365),
 ('fiction', 355),
 ('collection', 354),
 ('be', 352),
 ('into', 333),
 ('love', 324),
 ('two', 318),
 ('after', 310),
 ('man', 308),
 ('set', 296),

#### Canadian content word cloud

<img src='../img/canadian_word_cloud.png'>

In [130]:
canadian_words = pd.read_csv('../data/processed/canadian_words.csv')

In [131]:
canadian_words

Unnamed: 0,id,word,count
0,1,life,615
1,2,young,422
2,3,family,418
3,4,world,380
4,5,love,324
5,6,man,308
6,7,woman,288
7,8,years,284
8,9,short,230
9,10,home,228


### International

In [82]:
international = pd.read_csv('../data/processed/international_books.csv', encoding = "ISO-8859-1")

In [83]:
international.head()

Unnamed: 0,id,origin,title,author,description
0,6775,international,3:59,Gretchen Mcneil,<p>From <b>Possess</b> and <b>Ten</b> author G...
1,6776,international,125,000 copies sold in its first edition<br>,
2,6777,international,1776,David Mccullough,"In this masterful book, David McCullough tells..."
3,6778,international,1984,George Orwell,&lt;b&gt;Renowned urban artist Shepard Fairey'...
4,6779,international,2312,Kim Stanley Robinson,The Hugo and Nebula nominated and <i>New York ...


In [84]:
international.shape

(10806, 5)

In [85]:
international.dropna(axis = 0, how ='any', inplace = True)

In [86]:
international.shape

(10704, 5)

In [147]:
international.head()

Unnamed: 0,id,origin,title,author,description
0,6775,international,3:59,Gretchen Mcneil,<p>From <b>Possess</b> and <b>Ten</b> author G...
2,6777,international,1776,David Mccullough,"In this masterful book, David McCullough tells..."
3,6778,international,1984,George Orwell,&lt;b&gt;Renowned urban artist Shepard Fairey'...
4,6779,international,2312,Kim Stanley Robinson,The Hugo and Nebula nominated and <i>New York ...
5,6780,international,2666,Roberto Bolaño,&lt;p&gt;&lt;b&gt;A NATIONAL BOOK CRITICS CIRC...


In [None]:
# literary_stopwords = ['amazon best book of the year', 'arthur ellis', 'astonishing', 'author', 'award', 'award-winning', 'best', 'best book of the year', 'bestseller', 'bestselling', 'book', 'book award', 'boston globe', 'canada reads', 'category', 'character', 'classic', 'critically acclaimed', 'debut', 'entertainment weekly', 'epic', 'finalist', 'finalist', 'foremost', 'giller prize', 'giller prize', 'globe and mail', 'governor generals award', 'governor generals literary award', 'harpercollins', 'heralded', 'highly anticipated', 'kobo', 'literary', 'literature', 'longlisted', 'national bestseller', 'nationally', 'new york times', 'novel', 'prize', 'prize-winning', 'publish', 'publishers weekly', 'rogers writers trust', 'scotiabank', 'shortlisted', 'story', 'ubc', 'widely anticipated', 'winner', 'writers trust fiction prize', 'writers trust of canada']
# stoplist = set(stopwords.words('english') + list(punctuation) + literary_stopwords)
# path = '../data/processed/international_books.csv'
# encoding = ', encoding = "ISO-8859-1"'
# df = pd.read_csv(path, encoding)
# df = df.applymap(str)
# texts = df['description'].str.lower()
# word_counts = Counter(word_tokenize('\n'.join(texts)))
# words_list = word_counts.most_common()
# words_list



In [88]:
books = pd.concat([canadian, international])

In [101]:
books.head(2374)

Unnamed: 0,id,origin,title,author,description,full_entry
0,0,Canadian,01 Nathaniel Mcdaniel and Bigbeards Hook,Evan Solomon,"Meet Nathaniel McDaniel, the mischievous hero ...",01 Nathaniel Mcdaniel and Bigbeards Hook by Ev...
1,1,Canadian,02 Standard of Honor Book Two of the Templar T...,Jack Whyte,Jack Whyte’s thrilling Templar Trilogy continu...,02 Standard of Honor Book Two of the Templar T...
2,2,Canadian,03 Knights Templar Order in Chaos,Jack Whyte,"In the final novel in the Templar Trilogy, Wil...",03 Knights Templar Order in Chaos by Jack Whyt...
3,3,Canadian,100 Easy-to-Grow Native Plants for Canadian Ga...,Lorraine Johnson,The key to a carefree garden is to know which ...,100 Easy-to-Grow Native Plants for Canadian Ga...
4,4,Canadian,10 Women,George Bowering,Ten Women is a new collection of short fiction...,10 Women by George Bowering: Ten Women is a ne...
5,5,Canadian,12 Rose Street,Gail Bowen,The indomitable Joanne Kilbourn is back! From ...,12 Rose Street by Gail Bowen: The indomitable ...
6,6,Canadian,12 Shades of Surrender,Anne Calhoun,Are you ready to step deeper into the shade?If...,12 Shades of Surrender by Anne Calhoun: Are yo...
7,7,Canadian,13 Lives,Michael Pawlowski,"With stories that chronicle the abused, the ho...",13 Lives by Michael Pawlowski: With stories th...
8,8,Canadian,13 Ways of Looking at a Fat Girl,Mona Awad,Winner of the Amazon.ca First Novel AwardShort...,13 Ways of Looking at a Fat Girl by Mona Awad:...
9,9,Canadian,150 Years of Stats Canada!,Stats Canada,Canada's funniest online sensation is back to ...,150 Years of Stats Canada! by Stats Canada: Ca...


In [97]:
books.reset_index(drop=True)

Unnamed: 0,id,origin,title,author,description,full_entry
0,0,Canadian,01 Nathaniel Mcdaniel and Bigbeards Hook,Evan Solomon,"Meet Nathaniel McDaniel, the mischievous hero ...",01 Nathaniel Mcdaniel and Bigbeards Hook by Ev...
1,1,Canadian,02 Standard of Honor Book Two of the Templar T...,Jack Whyte,Jack Whyte’s thrilling Templar Trilogy continu...,02 Standard of Honor Book Two of the Templar T...
2,2,Canadian,03 Knights Templar Order in Chaos,Jack Whyte,"In the final novel in the Templar Trilogy, Wil...",03 Knights Templar Order in Chaos by Jack Whyt...
3,3,Canadian,100 Easy-to-Grow Native Plants for Canadian Ga...,Lorraine Johnson,The key to a carefree garden is to know which ...,100 Easy-to-Grow Native Plants for Canadian Ga...
4,4,Canadian,10 Women,George Bowering,Ten Women is a new collection of short fiction...,10 Women by George Bowering: Ten Women is a ne...
...,...,...,...,...,...,...
17474,17594,international,Yours Until Dawn (avon Historical Romance),Teresa Medeiros,<p>Gabriel Fairchild's valor during battle ear...,Yours Until Dawn (avon Historical Romance) by ...
17475,17595,international,You've Got To Read This Book!: 55 People Tell ...,Jack Canfield; Gay Hendricks,<p>There's nothing better than a book you can'...,You've Got To Read This Book!: 55 People Tell ...
17476,17596,international,Z Is For Moose (booklist Editor's Choice. Book...,Kelly Bingham,<p>Z is for Zebra.</p>\n<p>Zebra is absolutely...,Z Is For Moose (booklist Editor's Choice. Book...
17477,17597,international,Zap The Gaps!: Target Higher Performance And A...,"Blanchard, Kenneth H.",<p>Target Higher Performance and Achieve It!</...,Zap The Gaps!: Target Higher Performance And A...


In [94]:
books['full_entry'] = books['title'] + ' by ' + books['author'] + ': ' + books['description']

In [102]:
books['full_entry'][23]

23    7 Ways to Sunday by Lee Kvern: Lee Kvern's muc...
23    1,001 Ingenious Gardening Ideas: New, Fun And ...
Name: full_entry, dtype: object

In [92]:
books.to_csv('../data/processed/books.csv')